def generateCollectionMultiPageXml(self, colDir, dFileListPerDoc, bStrict):
        """
        Concatenate all pages into a "multi-page PageXml" for each document of the collection.

        colDir          : folder of the collection; the pages of a document are read from
                          <colDir>/<docId>/<page name>.pxml
        dFileListPerDoc : dictionary docId -> list of page names (without the ".pxml" extension);
                          a document whose value is None is skipped
        bStrict         : if True, raise a ValueError when a generated file is invalid against
                          the schema; otherwise only trace a warning

        Each multi-page file is written as <colDir>/<docId> + sMPXMLExtension.

        return the list of XML filenames (one per processed document)
        """
        lsXmlFilename = list()
        traceln("- Generating multi_page PageXml")
        for docId, lPageName in dFileListPerDoc.items():
            if lPageName is None:
                continue

            docDir = os.path.join(colDir, docId)
            lFiles = [os.path.join(docDir, sPageName + ".pxml")
                      for sPageName in lPageName]
            traceln("\t- %s" % docDir)

            doc = self.makeMultiPageXml(lFiles)

            # the file is written before being validated, so an invalid file
            # remains available on disk for inspection
            sXmlFilename = docDir + sMPXMLExtension
            self.writeDom(doc, sXmlFilename, True)
            lsXmlFilename.append(sXmlFilename)

            trace("\t\t- validating the MultiPageXml ...")
            if PageXml.MultiPageXml.validate(doc):
                # report success only when the validation did succeed
                traceln(" Ok!")
            elif bStrict:
                raise ValueError("Invalid XML generated in '%s'" %
                                 sXmlFilename)
            else:
                traceln(
                    "   *** WARNING: XML file is invalid against the schema: '%s'"
                    % sXmlFilename)

            if DEBUG > 1:
                # debugging aid: split the multi-page document back into
                # debug_<n>.xml files in the document folder
                PageXml.MultiPageXml.splitMultiPageXml(doc,
                                                       docDir,
                                                       "debug_%d.xml",
                                                       bIndent=True)

            traceln('\t- %s' % sXmlFilename)

        return lsXmlFilename
    def login(self, trnskrbs_client, trace=None, traceln=None):
        """
        deal with the complicated login variants...

        1. if self.persist is set, try to reuse a persistent session
        2. if that was not requested or did not succeed, authenticate with
           self.loginInfo / self.pwd, or with the stored credentials when no
           login was provided

        trnskrbs_client : object providing reusePersistentSession(),
                          getStoredCredentials(bAsk=...) and auth_login(login, pwd)
        trace, traceln  : optional print methods; each one may be None
                          independently of the other

        return True or raises an exception
        """
        DEBUG = True

        def _say(printer, sMsg):
            # print only in DEBUG mode, and only if that print method was given
            if DEBUG and printer:
                printer(sMsg)

        bOk = False
        if self.persist:
            #try getting some persistent session token
            _say(trace, "  ---login--- Try reusing persistent session ... ")
            try:
                bOk = trnskrbs_client.reusePersistentSession()
                _say(traceln, "OK!")
            except Exception:
                # best effort: any failure here falls back onto a regular login
                # (but Ctrl-C and sys.exit are no longer swallowed)
                _say(traceln, "Failed")

        if not bOk:
            if self.loginInfo:
                login, pwd = self.loginInfo, self.pwd
            else:
                _say(
                    trace,
                    "  ---login--- no login provided, looking for stored credentials... "
                )
                login, pwd = trnskrbs_client.getStoredCredentials(bAsk=False)
                _say(traceln, "OK")

            # this message goes through trace, so it is trace that must be
            # checked (checking traceln crashed when only traceln was given)
            _say(trace, "  ---login--- logging onto Transkribus as %s " % login)
            trnskrbs_client.auth_login(login, pwd)
            _say(traceln, "OK")
            bOk = True

        return bOk
Beispiel #3
0
    # Build a lookup docId -> title from the documents listed for the source collection.
    dName_by_docId = {}
    lDocDic = doer.listDocsByCollectionId(colIdFrom)
    for docDic in lDocDic:
        dName_by_docId[docDic['docId']] = docDic['title']
    # Check all requested documents now, before duplicating anything, so as to avoid partial copies...
    for docId in lDocId:
        try:
            name = dName_by_docId[docId]
        except KeyError as e:
            # the requested document is not listed in the source collection
            traceln()
            traceln("ERROR: document '%d' is not in source collection '%d'" %
                    (docId, colIdFrom))
            # NOTE(review): _exit is defined elsewhere; 3 is presumably the exit status - confirm
            _exit("", 3, e)

    trace(
        "- duplicating from collection %d to collection '%d' the %d documents: "
        % (colIdFrom, colIdTo, len(lDocId)))
    # Duplicate the documents one by one, each copy being given the title of its source document.
    for docId in lDocId:
        name = dName_by_docId[docId]
        trace(" %d  ('%s')" % (docId, name))
        try:
            doer.duplicateDoc(colIdFrom, docId, colIdTo, name)
        except Exception as e:
            # any failure is reported and handed over to _exit with code 4
            # NOTE(review): if _exit terminates the program, documents copied so far remain in the target collection - confirm
            traceln()
            traceln(
                "ERROR: could not copy document '%d' from collection '%d' to collection '%d'"
                % (docId, colIdFrom, colIdTo))
            _exit("", 4, e)
    traceln()
    traceln("- Done for %d documents" % len(lDocId))
    #"-s", "--server",  "-l", "--login" ,   "-p", "--pwd",   "--https_proxy"    OPTIONS
    # Register the basic Transkribus options on the parser; DoLogin.sDefaultServerUrl is
    # passed along (presumably as the default server URL - confirm in __Trnskrbs_basic_options).
    __Trnskrbs_basic_options(parser, DoLogin.sDefaultServerUrl)

    #parse the command line
    (options, args) = parser.parse_args()

    # ---
    #credentials and proxy
    # an empty dictionary when no https proxy was given on the command line
    proxies = {} if not options.https_proxy else {
        'https_proxy': options.https_proxy
    }

    if options.login:
        # credentials given on the command line
        login, pwd = options.login, options.pwd
    else:
        # no login given: fall back on the stored credentials
        # (bAsk=False presumably means "do not prompt the user" - confirm)
        trace("- no login provided, looking for stored credentials... ")
        login, pwd = getStoredCredentials(bAsk=False)
        traceln("OK")

    # ------------------------------------------------------------------------------------------------

    # the object that does the actual job against the server
    doer = DoLogin(options.server, proxies, loggingLevel=logging.INFO)

    try:
        if options.persist:
            traceln(
                "- Logging onto Transkribus as %s and making a persistent session"
                % login)
            doer.cleanPersistentSession()
            resp = doer.auth_login(login, pwd, bPersist=options.persist)
            traceln("\t --> %s" % os.path.join(DoLogin._sSESSION_FOLDER,
    def filter(self,
               colId,
               docId,
               page_filter=None,
               time_filter=None,
               user_filter=(None, None),
               status_filter=(None, None),
               bVerbose=False,
               bLast=False,
               bLastFiltered=False):
        """
        Load the document as a TRP_FullDoc and filter it, in this order:
        - by page (page_filter)
        - by timestamp (time_filter)
        - by user name, then by status; user_filter and status_filter are
          pairs (what to keep, what to exclude), each member being optional
        - if bLastFiltered, keep only the last of the remaining transcripts
          of each page
        If bLast is set, only the last transcript of each page is considered
        from the start.
        If bVerbose is set, each applied filter is traced, together with the
        number of remaining and excluded items.

        return a TRP containing the transcripts, excluding the ones filtered out.
        """
        sTranscriptCountMsg = " --> %d transcripts in-scope (after excluding %d)"

        if bLast and bVerbose:
            traceln("\t[filter] ignore all but last transcript of each page")
        # 1 when only the last transcript of each page is wanted, -1 otherwise
        trp = TRP_FullDoc(self.getDocById(colId, docId, 1 if bLast else -1))

        # --- pages
        if page_filter:
            if bVerbose:
                trace("\t[filter] as per page specification: %s" % page_filter)
                nBefore = len(trp.getPageList())
            trp.filterPageList(page_filter)
            if bVerbose:
                nAfter = len(trp.getPageList())
                traceln(" --> %d pages in-scope (after excluding %d)" %
                        (nAfter, nBefore - nAfter))

        # --- time
        if time_filter:
            if bVerbose:
                trace("\t[filter] as per %s specification: %s" %
                      ("time", time_filter))
                nBefore = len(trp.getTranscriptList())
            trp.filterTranscriptsBySlot(time_filter, "timestamp")
            if bVerbose:
                nAfter = len(trp.getTranscriptList())
                traceln(sTranscriptCountMsg % (nAfter, nBefore - nAfter))

        # --- user, then status: each one with a positive and a negative part
        for (keep, exclude), sLabel, sSlot in (
                (user_filter, "user", "userName"),
                (status_filter, "status", "status")):
            if not (keep or exclude):
                continue
            if bVerbose:
                if keep:
                    trace(
                        "\t[filter] as per %s specification: keeping   %s"
                        % (sLabel, keep))
                if exclude:
                    trace(
                        "\t[filter] as per %s specification: excluding %s"
                        % (sLabel, exclude))
                nBefore = len(trp.getTranscriptList())
            if keep:
                trp.filterTranscriptsBySlot(keep, sSlot)
            if exclude:
                trp.filterTranscriptsBySlot(exclude, sSlot, bNot=True)
            if bVerbose:
                nAfter = len(trp.getTranscriptList())
                traceln(sTranscriptCountMsg % (nAfter, nBefore - nAfter))

        # --- last transcript per page, among those that passed the filters
        if bLastFiltered:
            if bVerbose:
                trace("\t[filter] keep last filtered transcript per page")
                nBefore = len(trp.getTranscriptList())
            trp.filterLastTranscript()
            if bVerbose:
                nAfter = len(trp.getTranscriptList())
                traceln(sTranscriptCountMsg % (nAfter, nBefore - nAfter))

        return trp
    def uploadDocumentTranscript(self,
                                 colid,
                                 docid,
                                 sColDSDir,
                                 sNote="",
                                 sToolName="NLE DU",
                                 sTranscripExt=sTRANSCRIPT_EXTENSION,
                                 iVerbose=0):
        """
        Upload the transcripts of one document in that collection into Transkribus

        The file <sColDSDir>/<docid><sTranscripExt> is parsed, split into pages,
        and each page is posted with self.postPageTranscript, together with
        sNote and sToolName.

        When <sColDSDir>/<docid>/trp.json can be loaded, the parent-Id of each
        posted transcript is the 'tsId' of the first transcript listed for that
        page in trp.json. Otherwise a warning is traced and the transcripts
        are posted without parent-Id.

        iVerbose: 0 = silent, 1 = start/end messages, >1 = per-page progress

        raise ValueError if trp.json is inconsistent with the XML file
              (docid or page number mismatch)
        return nothing
        """
        if iVerbose:
            traceln(
                "- Uploading transcript of document %s from folder %s to collection %s "
                % (docid, sColDSDir, colid))

        sDocFilename = os.path.join(sColDSDir, str(docid) + sTranscripExt)
        doc = etree.parse(sDocFilename)

        #We will also try to set the parent-Id of each transcript, by parsing the trp.json
        trpFilename = os.path.join(sColDSDir, str(docid), "trp.json")
        try:
            # the 'with' makes sure the file gets closed, even on a parsing error
            with open(trpFilename, "r", encoding='utf-8') as fd:
                trp = json.load(fd)
            trpPageList = trp["pageList"]['pages']
        except Exception:
            # best effort: a missing, unreadable or unexpected trp.json only
            # disables the parent-Id (Ctrl-C and sys.exit are not swallowed)
            trpPageList = None
            traceln(
                "Warning: cannot set Parent-ID because file not found: %s" %
                trpFilename)
            traceln(
                "\n  *** Check the consistency of the resulting upload!!!! ***\n"
            )

        if iVerbose > 1: traceln("\t%6s %s" % (docid, sDocFilename))
        for pnum, pageDoc in PageXml.MultiPageXml._iter_splitMultiPageXml(
                doc, bInPlace=True):
            #show some progress: the page number every 10 pages, a dot otherwise
            if iVerbose > 1:
                if pnum % 10 == 0: trace(" %d " % pnum)
                else: trace(".")
                flush()
            sXMlTranscript = etree.tostring(pageDoc,
                                            encoding="utf-8",
                                            pretty_print=True)

            if trpPageList:
                #pages are looked up by position: page number N is at index N-1
                trpPage = trpPageList[int(pnum) - 1]
                trpPnum = trpPage['pageNr']
                ParentId = trpPage['tsList']["transcripts"][0]['tsId']
                #make sure the trp.json does describe this document and this page
                if str(docid) != str(
                        trpPage['tsList']["transcripts"][0]["docId"]):
                    raise ValueError(
                        "trp.json file inconsistent with .pxml files - docid mismatch"
                    )
                if trpPnum != pnum:
                    raise ValueError(
                        "trp.json file inconsistent with .pxml files - pageNr mismatch"
                    )
            else:
                ParentId = None
            self.postPageTranscript(colid,
                                    docid,
                                    pnum,
                                    sXMlTranscript,
                                    parentId=ParentId,
                                    bEncoded=True,
                                    sNote=sNote,
                                    sToolName=sToolName)

        if iVerbose > 1: traceln("")

        if iVerbose:
            traceln("   Done (collection %s, document %s)" % (colid, docid))
        return
            else:
                docId = int(chunk)
                lDocId.append(docId)
    except Exception as e:
        _exit(usage, 2, e)

    # ---
    #credentials and proxy
    # an empty dictionary when no https proxy was given on the command line
    proxies = {} if not options.https_proxy else {
        'https_proxy': options.https_proxy
    }

    # ------------------------------------------------------------------------------------------------
    # the object that does the actual job against the server
    doer = DoAddDocToCollec(options.server, proxies, loggingLevel=logging.INFO)

    # NOTE(review): defined elsewhere; presumably logs doer onto the server as per the options - confirm
    __Trnskrbs_do_login_stuff(doer, options, trace, traceln)

    trace("- adding to collection '%d' the %d documents: " %
          (colId, len(lDocId)))
    # Add the documents one by one; the first failure is reported and re-raised,
    # so the remaining documents are not processed.
    for docId in lDocId:
        trace(" %d" % docId)
        try:
            doer.addDocToCollection(colId, docId)
        except Exception as e:
            traceln()
            traceln("ERROR: could not add document '%d' to collection '%d'" %
                    (docId, colId))
            raise e
    traceln()
    traceln("- Done for %d documents" % len(lDocId))