def createStandardFolders(self, colId, destDir):
    """
    Create the standard DU folder structure and return the collection folder.

    destDir must be an existing folder; a "trnskrbs_<colId>" folder is
    created inside it (if needed), together with the standard sub-folders.
    Raises ValueError when destDir does not exist or a path exists but is
    not a folder.
    """
    if not (os.path.exists(destDir) and os.path.isdir(destDir)):
        raise ValueError("Non-existing destination folder %s" % destDir)

    colDir = os.path.join(destDir, "trnskrbs_%s" % colId)

    # Collection folder first (announced on creation)
    if os.path.exists(colDir):
        if not os.path.isdir(colDir):
            raise ValueError("%s exists and is not a folder." % colDir)
    else:
        traceln('- creating folder: %s' % colDir)
        os.mkdir(colDir)

    # Then the standard sub-folders, created silently
    for sSub in (sCOL, "xml", "ref", "run", "out"):
        sPath = os.path.join(colDir, sSub)
        if os.path.exists(sPath) and not os.path.isdir(sPath):
            raise ValueError("%s exists and is not a folder." % sPath)
        if not os.path.exists(sPath):
            os.mkdir(sPath)

    return colDir
Exemple #2
0
 def uploadDocumentTranscript(self,
                              colid,
                              docid,
                              sColDSDir,
                              sNote="",
                              sToolName="",
                              iVerbose=0,
                              status=None):
     """
     Upload the transcripts of one document of the collection into Transkribus.

     Reads <sColDSDir>/<docid>/trp.json and delegates the actual upload to
     uploadDocumentTranscript_by_trp.
     Raises Exception when the trp.json file is missing.
     Return nothing.
     """
     trpFilename = os.path.join(sColDSDir, str(docid), "trp.json")
     traceln(" - reading %s" % trpFilename)
     if not os.path.exists(trpFilename):
         raise Exception(
             "File not found %s. \nData probably created in --trp mode, so upload must be done in --trp mode."
             % trpFilename)
     # BUG FIX: was json.load(open(...)), which leaked the file handle
     with open(trpFilename, "r", encoding='utf-8') as fd:
         trp = json.load(fd)
     self.uploadDocumentTranscript_by_trp(colid,
                                          docid,
                                          trp,
                                          sColDSDir,
                                          sNote=sNote,
                                          sToolName=sToolName,
                                          iVerbose=iVerbose,
                                          status=status)
     return
    def applyHTRForRegions(self, colid, docid, nbpages, modelname, dictionary):
        """
        Apply an HTR model at region level.

        Looks up the model id matching modelname among the collection's RNN
        models, then launches the decoding over pages 1..nbpages.
        Raises Exception when no model matches modelname.
        Return the server response of the decode request.
        """
        htrComp = DoHtrRnn(self.myTrKCient.getServerUrl(),
                           self.myTrKCient.getProxies())
        htrComp._trpMng.setSessionId(self.myTrKCient.getSessionId())
        htrComp.setSessionId(self.myTrKCient.getSessionId())

        _, sPageDesc = htrComp.buildDescription(colid,
                                                "%s/%s" % (docid, nbpages))

        sModelID = None
        # find the model id by comparing against modelname (string compare,
        # so modelname may be given as the numeric id)
        lColModels = self.myTrKCient.listRnns(colid)
        for model in lColModels:
            if str(model['htrId']) == str(modelname):
                sModelID = model['htrId']
                traceln('model id = %s' % sModelID)
        if sModelID is None:
            # BUG FIX: was Python-2 'raise Exception, "..."' syntax, a
            # SyntaxError under Python 3 (which the rest of this file uses)
            raise Exception("no model ID found for %s" % (modelname))
        ret = htrComp.htrRnnDecode(colid,
                                   sModelID,
                                   dictionary,
                                   docid,
                                   sPageDesc,
                                   bDictTemp=False)
        traceln(ret)
        return ret
    def upLoadDocument(self,
                       colid,
                       coldir,
                       docid,
                       sNote="",
                       sTranscripExt='.mpxml'):
        """
        Upload one document's transcript from the local DU structure to
        Transkribus collection colid.
        Return nothing.
        """
        oUploader = TranskribusDUTranscriptUploader(
            self.myTrKCient.getServerUrl(), self.myTrKCient.getProxies())
        oUploader.setSessionId(self.myTrKCient.getSessionId())
        traceln("- uploading document %s to collection %s" % (docid, colid))
        oUploader.uploadDocumentTranscript(colid,
                                           docid,
                                           os.path.join(coldir, sCOL),
                                           sNote,
                                           'NLE Table',
                                           sTranscripExt,
                                           iVerbose=False)
        traceln("- Done")
        return
 def run(self):
     """
     List the HMM HTR models, print them as a table keyed on modelName,
     and return the list of model dictionaries.
     """
     lDic = self.listHmmHtrModels()
     lCols = ["modelName", "modelId", "isUsableInTranskribus",
              "nrOfTokens", "nrOfDictTokens", "nrOfLines"]
     traceln(strTabularFormat(lDic, lCols, "modelName"))
     return lDic
    def deleteTranscripts(self, trp, bVerbose=True):
        """
        Delete the transcripts listed in the trp.

        trp       -- object exposing getCollectionId() and getTranscriptList()
        bVerbose  -- when True, trace each deletion and the server response
        Return True.
        """
        colId = trp.getCollectionId()
        ldTr = trp.getTranscriptList()

        for dTr in ldTr:
            docId = dTr["docId"]
            pnum = dTr["pageNr"]
            sKey = dTr["key"]
            if bVerbose:
                traceln("\tdeleting %s %s p%s transcript %s" %
                        (colId, docId, pnum, sKey))
            # BUG FIX: the deletion itself used to be inside the bVerbose
            # block, so bVerbose=False deleted nothing at all.
            resp = self.deletePageTranscript(colId, docId, pnum, sKey)
            if bVerbose:
                traceln(resp)
        return True
 def run(self, sModelName, colID, lTrain, lTest, options, lcombinations):
     """
     Launch one CITlab HTR training per (learning-rate, epochs, batch-size)
     combination and return the list of job ids.
     """
     ljobid = []
     for lr, epochs, batch in lcombinations:
         xmlconf = self.createXMLConf(sModelName,
                                      colID,
                                      lTrain,
                                      lTest,
                                      sDesc=options.description,
                                      lang='German',
                                      numEpochs=epochs,
                                      learningRate=lr,
                                      noise='preproc',
                                      trainSizePerEpoch=batch)
         jobid = self.htrTrainingCITlab(xmlconf)
         ljobid.append(jobid)
         traceln("job id: %s" % jobid)
     return ljobid
    def generateCollectionMultiPageXml(self, colDir, dFileListPerDoc, bStrict):
        """
        We concatenate all pages into a "multi-page PageXml" for each document
        of the collection.

        colDir          -- collection folder containing one sub-folder per docId
        dFileListPerDoc -- dict docId -> list of page file basenames (or None to skip)
        bStrict         -- when True, raise ValueError on invalid generated XML;
                           otherwise only warn
        Return the list of XML filenames.
        """
        lsXmlFilename = list()
        traceln("- Generating multi_page PageXml")
        for docId, lPageFiles in dFileListPerDoc.items():
            if lPageFiles is None:
                continue
            lFiles = [os.path.join(colDir, docId, s + ".pxml")
                      for s in lPageFiles]
            docDir = os.path.join(colDir, docId)
            traceln("\t- %s" % docDir)

            doc = self.makeMultiPageXml(lFiles)

            sXmlFilename = docDir + sMPXMLExtension
            self.writeDom(doc, sXmlFilename, True)
            lsXmlFilename.append(sXmlFilename)

            trace("\t\t- validating the MultiPageXml ...")
            if PageXml.MultiPageXml.validate(doc):
                # BUG FIX: " Ok!" used to be printed unconditionally, even
                # right after the invalid-schema warning below
                traceln(" Ok!")
            elif bStrict:
                raise ValueError("Invalid XML generated in '%s'" %
                                 sXmlFilename)
            else:
                traceln(
                    "   *** WARNING: XML file is invalid against the schema: '%s'"
                    % sXmlFilename)

            if DEBUG > 1:
                PageXml.MultiPageXml.splitMultiPageXml(doc,
                                                       docDir,
                                                       "debug_%d.xml",
                                                       bIndent=True)

            traceln('\t- %s' % sXmlFilename)

        return lsXmlFilename
    def setTranscriptStatus(self, trp, status, bVerbose=True):
        """
        Set the status of the transcripts listed in the trp.

        trp      -- object exposing getCollectionId() and getTranscriptList()
        status   -- the new status string to apply to each transcript
        bVerbose -- when True, trace each update and the server response
        Return True.
        """
        colId = trp.getCollectionId()
        ldTr = trp.getTranscriptList()

        for dTr in ldTr:
            docId = dTr["docId"]
            pnum = dTr["pageNr"]
            sTSId = dTr["tsId"]
            if bVerbose:
                traceln(
                    "\tsetting status to '%s' for %s %s p%s transcript %s" %
                    (status, colId, docId, pnum, sTSId))
            # BUG FIX: the status update itself used to be inside the
            # bVerbose block, so bVerbose=False updated nothing at all.
            resp = self.updatePageStatus(colId, docId, pnum, sTSId, status,
                                         "setStatus by PyClient")
            if bVerbose:
                traceln(resp)
        return True
    def processCollection(self, coldir):
        """
        Process all documents of a collection folder.

        Scans coldir for *<sMPXMLExtension> files, keeps those whose basename
        is a numeric docid, and calls processDocument on each.
        """
        lsDocFilename = sorted(
            glob.iglob(
                os.path.join(coldir, "*" + TableProcessing.sMPXMLExtension)))
        lDocId = []
        for sDocFilename in lsDocFilename:
            sDocId = os.path.basename(
                sDocFilename)[:-len(TableProcessing.sMPXMLExtension)]
            try:
                lDocId.append(int(sDocId))
            except ValueError:
                traceln("Warning: folder %s : %s invalid docid, IGNORING IT" %
                        (self.coldir, sDocId))
                continue

        # process each document
        for docid in lDocId:
            # BUG FIX: messages used to print the stale sDocId left over from
            # the scanning loop above instead of the current docid, and the
            # "done" message applied a single-%s format to a 2-tuple, which
            # raised TypeError at runtime.
            traceln("Processing %s : %s " % (self.coldir, docid))
            self.processDocument(self.colid, docid)
            traceln("\tProcessing done for %s : %s" % (self.coldir, docid))
    def downloadCollection(self,
                           colid,
                           destDir,
                           docid,
                           bNoImg=True,
                           bForce=False):
        """
        Download collection colid (or the single document docid) locally.

        NOTE: destDir is deliberately overridden with '.' (current folder),
        as per the original design question in the code.
        A config.txt file recording the server and flags is written into the
        downloaded collection folder, then the multi-page PageXml files are
        generated.
        Return the list of downloaded document ids.
        """
        destDir = "."
        oDownloader = TranskribusDownloader(self.myTrKCient.getServerUrl(),
                                            self.myTrKCient.getProxies())
        oDownloader.setSessionId(self.myTrKCient.getSessionId())
        traceln("- Downloading collection %s to folder %s" %
                (colid, os.path.abspath(destDir)))
        col_ts, colDir, ldocids, dFileListPerDoc = oDownloader.downloadCollection(
            colid, destDir, bForce=bForce, bNoImage=bNoImg, sDocId=docid)
        traceln("- Done")

        # keep a record of how the data was fetched
        with open(os.path.join(colDir, "config.txt"), "w") as fd:
            fd.write("server=%s\nforce=%s\nstrict=%s\n" %
                     (self.server, True, False))

        oDownloader.generateCollectionMultiPageXml(
            os.path.join(colDir, TableProcessing.sCOL), dFileListPerDoc, False)

        traceln('- Done, see in %s' % colDir)

        return ldocids
    def login(self, trnskrbs_client, trace=None, traceln=None):
        """
        Deal with the complicated login variants...
            - trace and traceln are optional print methods
        Tries, in order: reusing a persistent session (when self.persist),
        explicit credentials (self.loginInfo / self.pwd), then stored
        credentials.
        return True or raises an exception
        """
        DEBUG = True
        bOk = False
        if self.persist:
            #try getting some persistent session token
            if DEBUG and trace:
                trace("  ---login--- Try reusing persistent session ... ")
            try:
                bOk = trnskrbs_client.reusePersistentSession()
                if DEBUG and traceln: traceln("OK!")
            except Exception:
                # narrowed from a bare 'except:' so KeyboardInterrupt /
                # SystemExit are no longer swallowed
                if DEBUG and traceln: traceln("Failed")

        if not bOk:
            if self.loginInfo:
                login, pwd = self.loginInfo, self.pwd
            else:
                if DEBUG and trace:
                    trace(
                        "  ---login--- no login provided, looking for stored credentials... "
                    )
                login, pwd = trnskrbs_client.getStoredCredentials(bAsk=False)
                if DEBUG and traceln: traceln("OK")

            # BUG FIX: the guard used to test 'traceln' but call 'trace',
            # raising TypeError when traceln was given without trace.
            if DEBUG and trace:
                trace("  ---login--- logging onto Transkribus as %s " % login)
            trnskrbs_client.auth_login(login, pwd)
            if DEBUG and traceln: traceln("OK")
            bOk = True

        return bOk
    def uploadCollectionTranscript(self,
                                   colid,
                                   sColDSDir,
                                   sTranscripExt=sTRANSCRIPT_EXTENSION,
                                   sNote="",
                                   sToolName="NLE DU",
                                   iVerbose=0):
        """
        Upload the transcripts of all documents in that collection folder
        into Transkribus.
        Raises ValueError when no transcript file matches sTranscripExt.
        Return nothing.
        """
        if iVerbose:
            traceln(
                "- Uploading all transcripts from folder %s to collection %s" %
                (sColDSDir, colid))

        sPattern = os.path.join(sColDSDir, "*" + sTranscripExt)
        lsDocFilename = sorted(glob.iglob(sPattern))
        if not lsDocFilename:
            raise ValueError("No file found in %s" % sPattern)
        for sDocFilename in lsDocFilename:
            # the basename minus the extension must be a numeric docid
            sDocId = os.path.basename(sDocFilename)[:-len(sTranscripExt)]
            try:
                docid = int(sDocId)
            except ValueError:
                traceln("Warning: folder %s : %s invalid docid, IGNORING IT" %
                        (sColDSDir, sDocId))
                continue
            self.uploadDocumentTranscript(colid,
                                          docid,
                                          sColDSDir,
                                          sTranscripExt=sTranscripExt,
                                          sNote=sNote,
                                          iVerbose=iVerbose)

        if iVerbose:
            traceln("  Done (collection %s)" % colid)
        return
    def applyLA_URO(self, colid, docid, nbpages):
        """
        Apply the textline finder (CITlab Advanced LA) to each page of the
        document, one job per page.
        Return the list of launched job ids.
        """
        traceln('process %s pages...' % nbpages)
        lretJobIDs = []
        for pnum in range(1, nbpages + 1):
            oLA = DoLAbatch(self.myTrKCient.getServerUrl(),
                            self.myTrKCient.getProxies())
            oLA._trpMng.setSessionId(self.myTrKCient.getSessionId())
            oLA.setSessionId(self.myTrKCient.getSessionId())
            _, sPageDesc = oLA.buildDescription(colid, "%s/%s" % (docid, pnum))
            sPageDesc = oLA.jsonToXMLDescription(sPageDesc)
            _, lJobIDs = oLA.run(colid, sPageDesc, "CITlabAdvancedLaJob",
                                 False)
            traceln(lJobIDs)
            lretJobIDs.extend(lJobIDs)
            traceln("- LA running for page %d job:%s" % (pnum, lJobIDs))
        return lretJobIDs
Exemple #15
0
    def uploadCollectionTranscript(self,
                                   colid,
                                   sColDSDir,
                                   sNote="",
                                   sToolName="",
                                   iVerbose=0,
                                   status=None):
        """
        Upload the transcripts of all documents in that collection into
        Transkribus, as listed in the collection-level trp.json file.
        Raises Exception when the trp.json file is missing.
        Return nothing.
        """
        if iVerbose:
            traceln(
                "- Uploading all transcripts from folder %s to collection %s" %
                (sColDSDir, colid))

        trpFilename = os.path.join(sColDSDir, "trp.json")
        traceln(" - reading %s" % trpFilename)
        if not os.path.exists(trpFilename):
            raise Exception(
                "File not found %s. \nData probably created in --trp mode, so upload must be done in --trp mode."
                % trpFilename)
        # BUG FIX: was json.load(open(...)), which leaked the file handle
        with open(trpFilename, "r", encoding='utf-8') as fd:
            trp = json.load(fd)

        for docid in [d["docId"] for d in trp]:
            self.uploadDocumentTranscript(colid,
                                          docid,
                                          sColDSDir,
                                          sNote=sNote,
                                          sToolName=sToolName,
                                          iVerbose=iVerbose,
                                          status=status)

        if iVerbose:
            traceln("  Done (collection %s)" % colid)
        return
Exemple #16
0
    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    # pass the https proxy to the client only when one was given on the command line
    proxies = {} if not options.https_proxy else {
        'https_proxy': options.https_proxy
    }

    # ---
    #source collection(s)
    # first positional argument: the numeric id of the job to delete
    try:
        jobid = int(args[0])
    except Exception as e:
        _exit(usage, 1, e)

    # ---
    doer = DoDeleteJob(options.server, proxies, loggingLevel=logging.INFO)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    # ---
    # do the job...
    try:
        resp = doer.deleteJob(jobid)
    except Exception as e:
        _exit("", 1, e)

    # the server reports the resulting job state; anything but CANCELED is a failure
    if resp != "CANCELED":
        raise Exception("Job status should be CANCELED not '%s'" % resp)

    traceln("- Done")
Exemple #17
0
    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description
    
    #"-s", "--server",  "-l", "--login" ,   "-p", "--pwd",   "--https_proxy"    OPTIONS
    __Trnskrbs_basic_options(parser, DoGetJobs.sDefaultServerUrl)
        
    # ---   
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {} if not options.https_proxy else {'https_proxy':options.https_proxy}

    # --- 
    #source collection(s)
#     try:
#         jobid = int(args[0])
#     except Exception as e:
#         _exit(usage, 1, e)

    # --- 
    doer = DoGetJobs(options.server, proxies, loggingLevel=logging.INFO)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    # --- 
    # do the job...
    try:
        resp = doer.getJobs()
    except Exception as e:  _exit("", 1, e)
    traceln( json.dumps(resp, sort_keys=True, indent=4, separators=(',', ': ')))
    
    # --- 
    doer = DoGetDocTrp(options.server, proxies, loggingLevel=logging.WARN)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)
    # --- 
    try:                        colId = int(args.pop(0))
    except Exception as e:      _exit(usage, 1, e)
    try:                        docId   = int(args.pop(0))
    except Exception as e:      _exit(usage, 1, e)
    try:                        sPageRangeSpec = args.pop(0)
    except Exception as e:      sPageRangeSpec = None
    if args:                    _exit(usage, 2, Exception("Extra arguments to the command"))

    oPageRange = PageRangeSpec(sPageRangeSpec) if sPageRangeSpec else None
        
    # --- 
    # do the job...
    resp = doer.run(colId, docId, nrOfTranscripts=options.nbTranscript)
    if oPageRange:
        traceln("Filtering response as per page specification: %s"%oPageRange)
        #let's filter the response (not super efficient but easy to code...
        ldPages = resp["pageList"]["pages"]
        ldPagesInRange = [ dPage for dPage in ldPages if dPage["pageNr"] in oPageRange]
        resp["pageList"]["pages"] = ldPagesInRange

    print (json.dumps(resp, sort_keys=True, indent=4, separators=(',', ': ')))
        
    traceln()      
    traceln("- Done")
    
Exemple #19
0
    except Exception as e:
        _exit(usage, 1, e)
    try:
        sDictName = args.pop(0)
    except Exception as e:
        _exit(usage, 1, e)
    try:
        colId = int(args.pop(0))
    except Exception as e:
        _exit(usage, 1, e)
    #     try:                        docId   = int(args.pop(0))
    #     except Exception as e:      _exit(usage, 1, e)
    #     try:                        sPages = args.pop(0)
    #     except Exception as e:      sPages = None

    if args: _exit(usage, 2, Exception("Extra arguments to the command"))

    if options.trp_doc:
        trpdoc = json.load(open(options.trp_doc, "r", encoding='utf-8'))
        docId, sPageDesc = doer.buildDescription(colId, options.docid, trpdoc)
    else:
        docId, sPageDesc = doer.buildDescription(colId, options.docid)

    # do the job...
    jobid = doer.run(sModelID, sDictName, colId, docId, sPageDesc,
                     options.bPylaia, options.dictTemp)
    traceln(jobid)

    traceln()
    traceln("- Done")
    # ---
    try:
        colId = int(args.pop(0))
    except Exception as e:
        _exit(usage, 1, e)
    try:
        docidpages = args.pop(0)
    except Exception as e:
        _exit(usage, 1, e)
    if args: _exit(usage, 2, Exception("Extra arguments to the command"))

    # ---
    # do the job...
    if options.trp_doc:
        trpdoc = json.load(open(options.trp_doc, "r", encoding='utf-8'))
        docId, sPageDesc = doer.buildDescription(colId, docidpages, trpdoc)
    else:
        docId, sPageDesc = doer.buildDescription(colId, docidpages)
#     NcsrLaJob
#     CITlabAdvancedLaJob
    sPageDesc = doer.jsonToXMLDescription(sPageDesc)

    status, jobid = doer.run(colId,
                             sPageDesc,
                             'CITlabAdvancedLaJob',
                             bBlockSeg=options.doRegionSeg,
                             bCreateJobBatch=options.doBatchJob)
    traceln("job ID:", jobid)
    traceln("- Done")
Exemple #21
0
        _exit(usage, 1, e)
    #     try:                        sColDir = args.pop(0)
    #     except Exception as e:      _exit(usage, 1, e)
    try:
        colId = int(args.pop(0))
    except Exception as e:
        _exit(usage, 1, e)
    #     try:                        docId   = int(args.pop(0))
    #     except Exception as e:      _exit(usage, 1, e)
    #     try:                        PagesTSID   = eval(args.pop(0))
    #     except Exception as e:      _exit(usage, 1, e)
    try:
        sPages = args.pop(0)
    except Exception as e:
        sPages = None
    if args: _exit(usage, 2, Exception("Extra arguments to the command"))

    #     # ---
    #     # do the job...
    lTrain, lTest = doer.createTrainTest(colId, options.ltrdoc, options.ltsdoc)
    lcombinations = doer.createParamaterCombinations(options.learningrate,
                                                     options.batchsize,
                                                     options.epochs)
    #     xmlconf = doer.createXMLConf(sModelName,colId,lTrain,lTest,sDesc = options.description, lang='German', learningRate=options.learningrate,numEpochs=options.epochs, trainSizePerEpoch=options.batchsize)
    ljobids = doer.run(sModelName, colId, lTrain, lTest, options,
                       lcombinations)
    # #     traceln(jobid)

    traceln()
    traceln("- training launched with job ID: %s" % ljobids)
    except:
        _exit(usage, 1)

    if args:
        destDir = args[0]
    else:
        destDir = "."

    # ---
    trnkbs2ds = TranskribusDownloader(options.server,
                                      proxies,
                                      loggingLevel=logging.WARN)
    __Trnskrbs_do_login_stuff(trnkbs2ds, options, trace=trace, traceln=traceln)

    if options.trp:
        traceln("- Loading trp data from %s" % options.trp)
        #         trp = json.load(open(options.trp, "rb",encoding='utf-8'))
        trp = json.load(open(options.trp, "rt", encoding='utf-8'))

        traceln(
            "- Downloading collection %s to folder %s, as specified by trp data"
            % (colid, os.path.abspath(destDir)))
        if not options.docid:
            options.docid = trp["md"]["docId"]
            traceln(" read docId from TRP: docId = %s" % options.docid)
        logging.basicConfig(level=logging.INFO)
        col_ts, docFolder, lFileList = trnkbs2ds.download_document_by_trp(
            colid,
            options.docid,
            destDir,
            trp,
    def run(self, colid=None, bListDict=False):
        """
        Print two textual lists: the HTR models (collection-specific ones
        when colid is given) and, optionally, the dictionaries.
        Return (sModels, sColModels, sDicts); unset entries are None.
        """
        sModels, sColModels, sDicts = None, None, None

        if colid is None:
            sModels = self.listRnnsText()
            traceln("\n--- Models ---------------------------")
            traceln(sModels)
        else:
            sColModels = self.listRnns(colid)
            for dModel in sColModels:
                try:
                    traceln("%s\t%s\t%s\ndescription:%s" %
                            (dModel['htrId'], dModel['name'].strip(),
                             dModel['params'].strip(),
                             dModel['description'].strip()))
                except KeyError:
                    # some old? models do not have params field
                    traceln("%s\t%s\tno params" %
                            (dModel['htrId'], dModel['name']))
                traceln()

        if bListDict:
            sDicts = self.listDictsText()
            traceln("\n--- Dictionaries ---------------------")
            traceln(sDicts)

        return sModels, sColModels, sDicts
Exemple #24
0
    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    # pass the https proxy to the client only when one was given on the command line
    proxies = {} if not options.https_proxy else {
        'https_proxy': options.https_proxy
    }

    # ---
    #source collection(s)
    # every positional argument must be a numeric collection id
    try:
        lColId = [int(arg) for arg in args]
    except Exception as e:
        _exit(usage, 1, e)

    # ---
    doer = DoListCollec(options.server, proxies, loggingLevel=logging.INFO)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    # ---
    # do the job...
    # list each collection in turn; abort on the first failure
    for colId in lColId:
        try:
            doer.run(colId)
        except Exception as e:
            traceln()
            traceln("ERROR: could not list collection '%d' " % colId)
            _exit("", 1, e)

    traceln()
    traceln("- Done for %d collection(s)" % len(lColId))
    #parse the command line
    (options, args) = parser.parse_args()

    # ---
    #credentials and proxy
    proxies = {} if not options.https_proxy else {
        'https_proxy': options.https_proxy
    }

    if options.login:
        login, pwd = options.login, options.pwd
    else:
        trace("- no login provided, looking for stored credentials... ")
        login, pwd = getStoredCredentials(bAsk=False)
        traceln("OK")

    # ------------------------------------------------------------------------------------------------

    doer = DoLogin(options.server, proxies, loggingLevel=logging.INFO)

    try:
        if options.persist:
            traceln(
                "- Logging onto Transkribus as %s and making a persistent session"
                % login)
            doer.cleanPersistentSession()
            resp = doer.auth_login(login, pwd, bPersist=options.persist)
            traceln("\t --> %s" % os.path.join(DoLogin._sSESSION_FOLDER,
                                               DoLogin._sSESSION_FILENAME))
        else:
Exemple #26
0
    }

    # ---
    doer = DoHtrRnn(options.server, proxies, loggingLevel=logging.WARN)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)
    # ---
    try:
        dictName = args.pop(0)
    except Exception as e:
        _exit(usage, 1, e)
    #     try:                        filename = args.pop(0)
    #     except Exception as e:      _exit(usage, 1, e)

    try:
        sfullDict = ""
        for filename in options.ldict:
            dictFile = open(filename, 'r', encoding='utf-8').read()
            dictFile = dictFile.replace('\t', ',')
            sfullDict += dictFile  #+ '\n'
            traceln("loaded %s" % (filename))
    except IOError:
        print('not possible to open file :%s' % (filename))

    #     print sfullDict.encode("utf-8")
    # need to normalize the weights when build this different dictionaries???
    response = doer.run(dictName, sfullDict)
    traceln(response)

    traceln()
    traceln("- Done")
Exemple #27
0
def main():
    """
    Command-line entry point: upload DS-structure transcripts to Transkribus.

    Usage: <directory> <colId> [<docId>]
    Parses the command line, locates the 'col' folder produced by
    transkribus_downloader.py, logs in, then uploads either:
      - the pages listed in a trp file (--trp),
      - one document's transcripts (<docId> given),
      - or the whole collection's transcripts (no <docId>).
    Exits via _exit() on bad usage.
    """
    usage = "%s <directory> <coldId> [<docId>]" % sys.argv[0]
    version = "v.01"
    description = """Upload the transcript(s) from the DS structure to Transkribus, either of the collection or one of its document(s). 
The <directory> must have been created by transkribus_downloader.py and should contain the 'col' directory and a trp.json file for the collection, and one per document (the 'out', 'ref', 'run', 'xml' folders are not used).
The page transcript from the single page PageXml files are uploaded. (The multi-page xml file(s) are ignored))    
""" + _Trnskrbs_description

    #prepare for the parsing of the command line
    parser = OptionParser(usage=usage, version=version)
    parser.description = description

    #"-s", "--server",  "-l", "--login" ,   "-p", "--pwd",   "--https_proxy"    OPTIONS
    __Trnskrbs_basic_options(parser,
                             TranskribusTranscriptUploader.sDefaultServerUrl)

    parser.add_option("-q",
                      "--quiet",
                      dest='bQuiet',
                      action="store_true",
                      default=False,
                      help="Quiet mode")
    parser.add_option("--trp",
                      dest='trp',
                      action="store",
                      type="string",
                      help="download the content specified by the trp file.")
    parser.add_option("--toolname",
                      dest='tool',
                      action="store",
                      type="string",
                      default="",
                      help="Set the Toolname metadata in Transkribus.")
    parser.add_option("--message",
                      dest='message',
                      action="store",
                      type="string",
                      default="",
                      help="Set the message metadata in Transkribus.")
    parser.add_option("--set_status",
                      dest='set_status',
                      action="store",
                      type="string",
                      default=None,
                      help="Set the status of the uploaded transcript.")

    # ---
    #parse the command line
    (options, args) = parser.parse_args()
    proxies = {} if not options.https_proxy else {
        'https_proxy': options.https_proxy
    }

    iVerbose = 0 if options.bQuiet else 2
    # ---
    # first positional argument: the DS directory (args.pop raises IndexError when missing)
    try:
        sDSDir = args.pop(0)
    except IndexError:
        _exit(usage, 1)
    # accept either the parent directory or the 'col' directory itself
    if not (sDSDir.endswith(sCOL) or sDSDir.endswith(sCOL + os.path.sep)):
        sColDSDir = os.path.abspath(os.path.join(sDSDir, sCOL))
    else:
        sColDSDir = os.path.abspath(sDSDir)
    if not (os.path.exists(sColDSDir) and os.path.isdir(sColDSDir)):
        raise ValueError("Non-existing folder: %s " % sColDSDir)

    # second positional argument: the collection id (mandatory)
    try:
        colid = args.pop(0)
    except IndexError:
        _exit(usage, 1)

    # third positional argument: the document id (optional)
    try:
        docid = args.pop(0)
    except IndexError:
        docid = None

    # ---
    doer = TranskribusTranscriptUploader(options.server,
                                         proxies,
                                         loggingLevel=logging.WARN)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    if options.trp:
        # --trp mode: upload exactly the pages/transcripts listed in the trp file
        with open(options.trp, "r", encoding='utf-8') as fd:
            trp = json.load(fd)
        traceln("- Uploading to collection %s, as specified by trp data" %
                (colid))
        if not docid:
            # no docId on the command line: take it from the TRP metadata
            docid = trp["md"]["docId"]
            traceln(" read docId from TRP: docId = %s" % docid)
        sToolname = options.tool if options.tool else "Transkribus_uploader (--trp)"
        lFileList = doer.uploadDocumentTranscript_by_trp(
            colid,
            docid,
            trp,
            sColDSDir,
            sNote=options.message,
            sToolName=sToolname,
            iVerbose=iVerbose,
            status=options.set_status)
        #traceln(map(lambda x: x.encode('utf-8'), lFileList))
    else:
        if docid is None:
            # whole-collection upload
            sToolname = options.tool if options.tool else "Transkribus_uploader"
            doer.uploadCollectionTranscript(colid,
                                            sColDSDir,
                                            sNote=options.message,
                                            sToolName=sToolname,
                                            iVerbose=iVerbose,
                                            status=options.set_status)

        else:
            # single-document upload
            sToolname = options.tool if options.tool else "Transkribus_uploader (docid)"
            doer.uploadDocumentTranscript(colid,
                                          docid,
                                          sColDSDir,
                                          sNote=options.message,
                                          sToolName=sToolname,
                                          iVerbose=iVerbose,
                                          status=options.set_status)

    traceln('- DONE, all transcripts were uploaded. See in collection %s' %
            colid)
Exemple #28
0
                docId1, docId2 = [int(i) for i in li]
                lDocId.extend(range(docId1, docId2 + 1))
            else:
                docId = int(chunk)
                lDocId.append(docId)
    except Exception as e:
        _exit(usage, 2, e)

    # ------------------------------------------------------------------------------------------------
    doer = DoCopyDocToCollec(options.server,
                             proxies,
                             loggingLevel=logging.INFO)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    #the only issue is that we need to have the name of each document...
    traceln("- checking existence of each document in source collection '%d'" %
            (colIdFrom))
    dName_by_docId = {}
    lDocDic = doer.listDocsByCollectionId(colIdFrom)
    for docDic in lDocDic:
        dName_by_docId[docDic['docId']] = docDic['title']
    #check now, so as to avoid partial copies...
    for docId in lDocId:
        try:
            name = dName_by_docId[docId]
        except KeyError as e:
            traceln()
            traceln("ERROR: document '%d' is not in source collection '%d'" %
                    (docId, colIdFrom))
            _exit("", 3, e)

    trace(
        'https_proxy': options.https_proxy
    }

    # ---
    #source collection(s)
    try:
        colid = int(args[0])
    except Exception as e:
        _exit(usage, 1, e)
    try:
        docid = int(args[0])
    except Exception as e:
        _exit(usage, 1, e)
    try:
        page = int(args[0])
    except Exception as e:
        _exit(usage, 1, e)

    # ---
    doer = listPageLocks(options.server, proxies, loggingLevel=logging.INFO)
    __Trnskrbs_do_login_stuff(doer, options, trace=trace, traceln=traceln)

    # ---
    # do the job...
    try:
        resp = doer.getListofLockedPages(colid, docid, page)
    except Exception as e:
        _exit("", 1, e)
    traceln(resp)
    traceln("- Done")
Exemple #30
0
    def uploadDocumentTranscript_by_trp(self,
                                        colid,
                                        docid,
                                        trp,
                                        sColDSDir,
                                        sNote="",
                                        sToolName="",
                                        iVerbose=0,
                                        status=None):
        """
        Upload the transcripts of one document in that collection into Transkribus, as specified by the TRP data
            status = None     ==> we get the status from the TRP
            otherwise         ==> we set the given status
        colid       - the Transkribus collection id
        docid       - the document id; if falsy, it is read from the TRP metadata,
                      otherwise it must match the TRP's docId
        trp         - the parsed TRP (JSON) data describing the pages to upload
        sColDSDir   - the local 'col' directory holding one sub-folder per document
        sNote, sToolName - metadata attached to each posted transcript
        iVerbose    - when non-zero, trace progress messages
        return the list of page file basenames (image names without extension) that were uploaded
        raise ValueError on docid/TRP mismatch or missing local files
        """
        if iVerbose:
            traceln(
                "- Uploading as listed in TRP, the transcript(s) of document %s from folder %s to collection %s "
                % (docid, sColDSDir, colid))

        # check the caller-given docid against the TRP metadata, or infer it
        if docid:
            if str(trp["md"]["docId"]) != str(docid):
                raise ValueError(
                    "Document ID does not match docId of TRP data.")
        else:
            docid = trp["md"]["docId"]

        pageList = trp["pageList"]

        docDir = os.path.join(sColDSDir, str(docid))

        if not os.path.exists(docDir):
            raise ValueError("Document directory not found: %s" % docDir)

        lFileList = []
        for dPage in pageList['pages']:
            pagenum = dPage['pageNr']
            logging.info("\t\t- page %s" % pagenum)

            # the image basename also names the local .pxml transcript file
            imgFileName = dPage['imgFileName']
            sBaseName, _ = os.path.splitext(imgFileName)
            lFileList.append(sBaseName)

            # parent transcript: the first transcript listed in the TRP for this page
            _trpTranscript0 = dPage['tsList']["transcripts"][0]
            tsId = _trpTranscript0['tsId']
            xmlFilename = docDir + os.sep + sBaseName + ".pxml"
            logging.info("\t\t\t%s" % xmlFilename)
            # explicit check (an assert would be stripped under python -O)
            if not os.path.exists(xmlFilename):
                raise ValueError("Transcript file not found: %s" % xmlFilename)
            with open(xmlFilename, "r", encoding='utf-8') as fd:
                sXMlTranscript = fd.read()
            # status=None means: keep the status recorded in the TRP
            cur_status = _trpTranscript0["status"] if status is None else status
            traceln("page %5d : %s : %s : %s : %s : %s" %
                    (pagenum, cur_status, sToolName, tsId, sNote, xmlFilename))
            self.postPageTranscript(colid,
                                    docid,
                                    pagenum,
                                    sXMlTranscript,
                                    parentId=tsId,
                                    bEncoded=False,
                                    sNote=sNote,
                                    sToolName=sToolName,
                                    status=cur_status)

        if iVerbose:
            traceln("   Done (collection %s, document %s as per TRP)" %
                    (colid, docid))

        return lFileList