def generateCollectionMultiPageXml(self, colDir, dFileListPerDoc, bStrict):
    """
    Concatenate all pages of each document of the collection into one
    "multi-page PageXml" file per document.

    colDir          -- collection folder; the pages of a document are read
                       from colDir/<docId>/<name>.pxml and its multi-page
                       file is written to colDir/<docId> + sMPXMLExtension
    dFileListPerDoc -- maps a docId to the list of its page file names,
                       given without the ".pxml" extension; a document
                       mapped to None is skipped
    bStrict         -- if true, a generated XML that fails the validation
                       raises a ValueError; otherwise a warning is traced

    return the list of XML filenames
    raise ValueError in strict mode when a generated XML is invalid (the
    file has already been written at that point)
    """
    lsXmlFilename = []
    traceln("- Generating multi_page PageXml")
    for docId, lPageNames in dFileListPerDoc.items():
        if lPageNames is None:
            continue

        docDir = os.path.join(colDir, docId)
        lFiles = [os.path.join(docDir, sName + ".pxml") for sName in lPageNames]
        traceln("\t- %s" % docDir)

        doc = self.makeMultiPageXml(lFiles)

        sXmlFilename = docDir + sMPXMLExtension
        self.writeDom(doc, sXmlFilename, True)
        lsXmlFilename.append(sXmlFilename)

        trace("\t\t- validating the MultiPageXml ...")
        # Report "Ok!" only when the validation actually succeeded, so that
        # the warning below is never followed by a contradicting "Ok!".
        if PageXml.MultiPageXml.validate(doc):
            traceln(" Ok!")
        elif bStrict:
            raise ValueError("Invalid XML generated in '%s'" % sXmlFilename)
        else:
            traceln(" *** WARNING: XML file is invalid against the schema: '%s'"
                    % sXmlFilename)

        if DEBUG > 1:
            # Also dump each page separately, for inspection.
            PageXml.MultiPageXml.splitMultiPageXml(doc, docDir, "debug_%d.xml",
                                                   bIndent=True)

        traceln('\t- %s' % sXmlFilename)

    return lsXmlFilename
def login(self, trnskrbs_client, trace=None, traceln=None):
    """
    Deal with the complicated login variants...

    When self.persist is set, first try to reuse a persistent session.
    If no session could be reused, log in with self.loginInfo / self.pwd
    or, when self.loginInfo is empty, with the credentials returned by
    trnskrbs_client.getStoredCredentials(bAsk=False).

    trnskrbs_client -- client object providing reusePersistentSession(),
                       getStoredCredentials() and auth_login()
    trace, traceln  -- optional print methods; each one is used only if it
                       was provided, independently of the other

    return True or raises an exception
    """
    DEBUG = True  # local switch for the progress messages below

    def _trace(sMsg):
        # Progress message without end-of-line, if a trace method was given.
        if DEBUG and trace:
            trace(sMsg)

    def _traceln(sMsg):
        # Progress message with end-of-line, if a traceln method was given.
        if DEBUG and traceln:
            traceln(sMsg)

    bOk = False
    if self.persist:
        # try getting some persistent session token
        _trace(" ---login--- Try reusing persistent session ... ")
        try:
            bOk = trnskrbs_client.reusePersistentSession()
        except Exception:
            # Best effort only: fall back to a regular login below.
            # (KeyboardInterrupt and SystemExit are no longer swallowed.)
            _traceln("Failed")
        else:
            _traceln("OK!")

    if not bOk:
        if self.loginInfo:
            login, pwd = self.loginInfo, self.pwd
        else:
            _trace(" ---login--- no login provided, looking for stored credentials... ")
            login, pwd = trnskrbs_client.getStoredCredentials(bAsk=False)
            _traceln("OK")

        # This message goes through trace, so it must be guarded by trace
        # (it used to be guarded by traceln, which crashed when only
        # traceln was provided).
        _trace(" ---login--- logging onto Transkribus as %s " % login)
        trnskrbs_client.auth_login(login, pwd)
        _traceln("OK")
        bOk = True

    return bOk
# Title of each document of the source collection, indexed by document id.
lDocDic = doer.listDocsByCollectionId(colIdFrom)
dName_by_docId = {dDoc['docId']: dDoc['title'] for dDoc in lDocDic}

# Check every requested document first, so as to avoid partial copies...
for docId in lDocId:
    try:
        name = dName_by_docId[docId]
    except KeyError as e:
        traceln()
        traceln("ERROR: document '%d' is not in source collection '%d'"
                % (docId, colIdFrom))
        _exit("", 3, e)

trace("- duplicating from collection %d to collection '%d' the %d documents: "
      % (colIdFrom, colIdTo, len(lDocId)))

# Copy the documents one by one; the first failure aborts with exit code 4.
for docId in lDocId:
    name = dName_by_docId[docId]
    trace(" %d ('%s')" % (docId, name))
    try:
        doer.duplicateDoc(colIdFrom, docId, colIdTo, name)
    except Exception as e:
        traceln()
        traceln("ERROR: could not copy document '%d' from collection '%d' to collection '%d'"
                % (docId, colIdFrom, colIdTo))
        _exit("", 4, e)

traceln()
traceln("- Done for %d documents" % len(lDocId))
#"-s", "--server", "-l", "--login" , "-p", "--pwd", "--https_proxy" OPTIONS __Trnskrbs_basic_options(parser, DoLogin.sDefaultServerUrl) #parse the command line (options, args) = parser.parse_args() # --- #credentials and proxy proxies = {} if not options.https_proxy else { 'https_proxy': options.https_proxy } if options.login: login, pwd = options.login, options.pwd else: trace("- no login provided, looking for stored credentials... ") login, pwd = getStoredCredentials(bAsk=False) traceln("OK") # ------------------------------------------------------------------------------------------------ doer = DoLogin(options.server, proxies, loggingLevel=logging.INFO) try: if options.persist: traceln( "- Logging onto Transkribus as %s and making a persistent session" % login) doer.cleanPersistentSession() resp = doer.auth_login(login, pwd, bPersist=options.persist) traceln("\t --> %s" % os.path.join(DoLogin._sSESSION_FOLDER,
def filter(self, colId, docId, page_filter=None, time_filter=None,
           user_filter=(None, None), status_filter=(None, None),
           bVerbose=False, bLast=False, bLastFiltered=False):
    """
    Load the TRP of a document and apply the requested filters to it.

    The filters are applied in this order: page_filter on the page list,
    time_filter on the "timestamp" slot, user_filter on the "userName"
    slot, status_filter on the "status" slot. user_filter and
    status_filter are (keep, exclude) pairs; an empty member is ignored.
    With bLast, only the last transcript of each page is considered from
    the start; with bLastFiltered, the last transcript per page is kept
    once all the other filters have been applied.
    With bVerbose, each step traces what it does and what remains.

    return a TRP containing the transcripts, excluding the ones filtered out.
    """

    def _reportTranscripts(nBefore):
        # Trace the number of remaining transcripts and how many were dropped.
        nAfter = len(trp.getTranscriptList())
        traceln(" --> %d transcripts in-scope (after excluding %d)"
                % (nAfter, nBefore - nAfter))

    # consider only the last transcript per page when bLast is set
    if bLast and bVerbose:
        traceln("\t[filter] ignore all but last transcript of each page")
    trp = TRP_FullDoc(self.getDocById(colId, docId, 1 if bLast else -1))

    # --- pages
    if page_filter:
        if bVerbose:
            trace("\t[filter] as per page specification: %s" % page_filter)
        nPageBefore = len(trp.getPageList())
        trp.filterPageList(page_filter)
        if bVerbose:
            nPageAfter = len(trp.getPageList())
            traceln(" --> %d pages in-scope (after excluding %d)"
                    % (nPageAfter, nPageBefore - nPageAfter))

    # --- timestamp
    if time_filter:
        if bVerbose:
            trace("\t[filter] as per %s specification: %s"
                  % ("time", time_filter))
        nBefore = len(trp.getTranscriptList())
        trp.filterTranscriptsBySlot(time_filter, "timestamp")
        if bVerbose:
            _reportTranscripts(nBefore)

    # --- user, then status: each one is a (keep, exclude) pair
    for (keepSpec, dropSpec), sLabel, sSlot in (
            (user_filter, "user", "userName"),
            (status_filter, "status", "status")):
        if not (keepSpec or dropSpec):
            continue
        if bVerbose:
            if keepSpec:
                trace("\t[filter] as per %s specification: keeping %s"
                      % (sLabel, keepSpec))
            if dropSpec:
                trace("\t[filter] as per %s specification: excluding %s"
                      % (sLabel, dropSpec))
        nBefore = len(trp.getTranscriptList())
        if keepSpec:
            trp.filterTranscriptsBySlot(keepSpec, sSlot)
        if dropSpec:
            trp.filterTranscriptsBySlot(dropSpec, sSlot, bNot=True)
        if bVerbose:
            _reportTranscripts(nBefore)

    # --- last remaining transcript of each page
    if bLastFiltered:
        if bVerbose:
            trace("\t[filter] keep last filtered transcript per page")
        nBefore = len(trp.getTranscriptList())
        trp.filterLastTranscript()
        if bVerbose:
            _reportTranscripts(nBefore)

    return trp
def uploadDocumentTranscript(self, colid, docid, sColDSDir, sNote="",
                             sToolName="NLE DU",
                             sTranscripExt=sTRANSCRIPT_EXTENSION, iVerbose=0):
    """
    Upload the transcripts of one document in that collection into Transkribus

    The multi-page XML file sColDSDir/<docid><sTranscripExt> is parsed and
    each of its pages is posted with self.postPageTranscript().
    The file sColDSDir/<docid>/trp.json, when it can be read, provides the
    parent-Id of each transcript; otherwise a warning is traced and the
    pages are posted without parent-Id.

    colid, docid  -- target collection and document
    sColDSDir     -- local folder holding the document files
    sNote         -- note passed along with each posted transcript
    sToolName     -- tool name passed along with each posted transcript
    sTranscripExt -- extension of the multi-page XML file
    iVerbose      -- 0: silent, 1: start/end messages, >1: per-page progress

    return nothing
    raise ValueError if trp.json is inconsistent with the pages (docid or
    pageNr mismatch)
    """
    if iVerbose:
        traceln("- Uploading transcript of document %s from folder %s to collection %s "
                % (docid, sColDSDir, colid))

    sDocFilename = os.path.join(sColDSDir, str(docid) + sTranscripExt)
    doc = etree.parse(sDocFilename)

    # We will also try to set the parent-Id of each transcript, by parsing
    # the trp.json
    trpFilename = os.path.join(sColDSDir, str(docid), "trp.json")
    try:
        # "with" closes the file even when the JSON parsing fails
        with open(trpFilename, "r", encoding='utf-8') as fd:
            trp = json.load(fd)
        trpPageList = trp["pageList"]['pages']
    except (OSError, ValueError, KeyError, TypeError):
        # Best effort: missing/unreadable file, invalid JSON or unexpected
        # structure. Upload anyway, without parent-Id.
        trpPageList = None
        traceln("Warning: cannot set Parent-ID because file not found: %s"
                % trpFilename)
        traceln("\n *** Check the consistency of the resulting upload!!!! ***\n")

    if iVerbose > 1:
        traceln("\t%6s %s" % (docid, sDocFilename))

    for pnum, pageDoc in PageXml.MultiPageXml._iter_splitMultiPageXml(
            doc, bInPlace=True):
        if iVerbose > 1:
            # progress: the page number every 10 pages, a dot otherwise
            if pnum % 10 == 0:
                trace(" %d " % pnum)
            else:
                trace(".")
            flush()

        sXMlTranscript = etree.tostring(pageDoc, encoding="utf-8",
                                        pretty_print=True)

        if trpPageList:
            # pages are looked up by position: page number pnum is expected
            # at index pnum-1, which the pageNr check below verifies
            trpPage = trpPageList[int(pnum) - 1]
            trpPnum = trpPage['pageNr']
            dTranscript = trpPage['tsList']["transcripts"][0]
            ParentId = dTranscript['tsId']
            if str(docid) != str(dTranscript["docId"]):
                raise ValueError(
                    "trp.json file inconsistent with .pxml files - docid mismatch")
            if trpPnum != pnum:
                raise ValueError(
                    "trp.json file inconsistent with .pxml files - pageNr mismatch")
        else:
            ParentId = None

        self.postPageTranscript(colid, docid, pnum, sXMlTranscript,
                                parentId=ParentId, bEncoded=True,
                                sNote=sNote, sToolName=sToolName)

    if iVerbose > 1:
        traceln("")
    if iVerbose:
        traceln(" Done (collection %s, document %s)" % (colid, docid))
    return
else: docId = int(chunk) lDocId.append(docId) except Exception as e: _exit(usage, 2, e) # --- #credentials and proxy proxies = {} if not options.https_proxy else { 'https_proxy': options.https_proxy } # ------------------------------------------------------------------------------------------------ doer = DoAddDocToCollec(options.server, proxies, loggingLevel=logging.INFO) __Trnskrbs_do_login_stuff(doer, options, trace, traceln) trace("- adding to collection '%d' the %d documents: " % (colId, len(lDocId))) for docId in lDocId: trace(" %d" % docId) try: doer.addDocToCollection(colId, docId) except Exception as e: traceln() traceln("ERROR: could not add document '%d' to collection '%d'" % (docId, colId)) raise e traceln() traceln("- Done for %d documents" % len(lDocId))