def __init__(self,
             trnkbsServerUrl,
             sHttpProxy=None,
             loggingLevel=logging.WARN):
    """
    Initialise the underlying TranskribusClient.

    trnkbsServerUrl: URL of the Transkribus server; when it is empty or None,
                     the class-level sDefaultServerUrl is used instead.
    sHttpProxy:      proxy settings, forwarded as the client's `proxies`.
    loggingLevel:    logging level forwarded to the client.
    """
    # The URL given by the caller used to be silently ignored: the default
    # server URL was always passed, whatever the caller asked for.
    TranskribusClient.__init__(
        self,
        sServerUrl=trnkbsServerUrl or self.sDefaultServerUrl,
        proxies=sHttpProxy,
        loggingLevel=loggingLevel)
Esempio n. 2
0
    def __init__(self,
                 trnkbsServerUrl,
                 sHttpProxy=None,
                 loggingLevel=logging.WARN):
        """
        Initialise the client and its companion DoTranscript object.

        trnkbsServerUrl: URL of the Transkribus server; when it is empty or
                         None, the class-level sDefaultServerUrl is used.
        sHttpProxy:      proxy settings, forwarded to both objects.
        loggingLevel:    logging level forwarded to both objects.
        """
        # The URL given by the caller used to be silently ignored in favour of
        # the default one; honour it, for the client and for the manager.
        sServerUrl = trnkbsServerUrl or self.sDefaultServerUrl
        TranskribusClient.__init__(self,
                                   sServerUrl=sServerUrl,
                                   proxies=sHttpProxy,
                                   loggingLevel=loggingLevel)

        self._trpMng = DoTranscript(sServerUrl,
                                    sHttpProxy=sHttpProxy,
                                    loggingLevel=loggingLevel)

        # NOTE(review): presumably the share of data kept for testing (10%);
        # its use is not visible here -- confirm.
        self.percTest = 0.1
Esempio n. 3
0
import logging

# Import the version module; if that fails, extend sys.path and try again.
try:  # to ease the use without proper Python installation
    import TranskribusPyClient_version
except ImportError:
    # Not importable as is: strip three path components from the absolute
    # path of the running script (sys.argv[0]), append the resulting folder
    # to sys.path, then retry the import.
    # NOTE(review): relies on `sys` and `os` being imported earlier in the
    # file; only `import logging` is visible just above -- confirm.
    sys.path.append(
        os.path.dirname(
            (os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0]))))))
    import TranskribusPyClient_version

from TranskribusPyClient.test import _colId_A, _docId_b
from TranskribusPyClient.client import TranskribusClient, getStoredCredentials

login, pwd = getStoredCredentials()

conn = TranskribusClient(proxies={'https': 'http://*****:*****@xrce.xerox.com", "trnjluc", sHttpsProxyUrl='http://cornillon:8000')

sXml = u"""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
    <Metadata>
        <Creator>TRP</Creator>
        <Created>2016-08-18T13:35:08.767+07:00</Created>
        <LastChange>2016-12-01T09:59:24.254+01:00</LastChange>
Esempio n. 4
0
class TableProcessing(Component.Component):
    usage = ""
    version = "v.01"
    description = "description: table layout analysis based on template"

    sCOL = "col"
    sMPXMLExtension = ".mpxml"

    def __init__(self):
        """
        Build the component with all its options at their default value.

        The Component constructor is always called first; the options keep
        the defaults below until setParams() overrides them.
        """
        Component.Component.__init__(self, "TableProcessing", self.usage,
                                     self.version, self.description)

        # collection and document identifiers
        self.colid = self.docid = None

        # processing flags
        self.bFullCol = False
        self.useExtForMPXML = False  # generate MPXML using Ext
        self.bRegenerateMPXML = False

        # row model: name and folder
        self.sRowModelName, self.sRowModelDir = None, None

        # HTR model and dictionary
        self.sHTRmodel, self.sDictName = None, None

    def setParams(self, dParams):
        """
        Copy the options found in dParams onto this component.

        Always call first the Component setParams. Each attribute is only
        overwritten when its key is present in dParams; otherwise it keeps its
        current value. The Transkribus connection settings (client, persist,
        login) are reset to their defaults at every call before being read.

        dParams: mapping from option name to option value.
        """
        Component.Component.setParams(self, dParams)
        if "coldir" in dParams:
            self.coldir = dParams["coldir"]
        if "colid" in dParams:
            self.colid = dParams["colid"]
        # This test used to look for "colid": a docid given without a colid
        # was dropped, and a colid given without a docid raised a KeyError.
        if "docid" in dParams:
            self.docid = dParams["docid"]
        if "useExt" in dParams:
            self.useExtForMPXML = dParams["useExt"]

        # the mere presence of the key enables the regeneration
        if 'regMPXML' in dParams:
            self.bRegenerateMPXML = True

        if "rowmodelname" in dParams:
            self.sRowModelName = dParams["rowmodelname"]
        if "rowmodeldir" in dParams:
            self.sRowModelDir = dParams["rowmodeldir"]

        if "htrmodel" in dParams:
            self.sHTRmodel = dParams["htrmodel"]
        if "dictname" in dParams:
            self.sDictName = dParams["dictname"]

        # Connection to Transkribus
        self.myTrKCient = None
        self.persist = False
        self.loginInfo = False
        # `in` instead of dict.has_key(), which no longer exists in Python 3
        if "server" in dParams:
            self.server = dParams["server"]
        if "persist" in dParams:
            self.persist = dParams["persist"]
        if "login" in dParams:
            self.loginInfo = dParams["login"]

    def login(self, trnskrbs_client, trace=None, traceln=None):
        """
        Log onto Transkribus, dealing with the various login variants.

        1. if self.persist is set, try to reuse a persistent session;
        2. if there is no usable session, authenticate with self.loginInfo and
           self.pwd when a login was given, otherwise with the credentials
           returned by trnskrbs_client.getStoredCredentials(bAsk=False).

        trnskrbs_client: the client to log in.
        trace, traceln:  optional print methods; each of them may be None
                         independently of the other.

        Return True, or let the exception of the client propagate.
        """
        DEBUG = True
        bOk = False
        if self.persist:
            # try getting some persistent session token
            if DEBUG and trace:
                trace("  ---login--- Try reusing persistent session ... ")
            try:
                bOk = trnskrbs_client.reusePersistentSession()
                if DEBUG and traceln: traceln("OK!")
            except Exception:
                # best effort only: fall back to an explicit login below
                # (no bare except, so that Ctrl-C / SystemExit still get out)
                if DEBUG and traceln: traceln("Failed")

        if not bOk:
            if self.loginInfo:
                # NOTE(review): self.pwd is not assigned anywhere in the code
                # visible here -- confirm that a caller or base class sets it.
                login, pwd = self.loginInfo, self.pwd
            else:
                if DEBUG and trace:
                    trace(
                        "  ---login--- no login provided, looking for stored credentials... "
                    )
                login, pwd = trnskrbs_client.getStoredCredentials(bAsk=False)
                if DEBUG and traceln: traceln("OK")

            # the guard used to test traceln while calling trace, which
            # crashed when only traceln was provided
            if DEBUG and trace:
                trace("  ---login--- logging onto Transkribus as %s " % login)
            trnskrbs_client.auth_login(login, pwd)
            if DEBUG and traceln: traceln("OK")
            bOk = True

        return bOk

    def downloadCollection(self,
                           colid,
                           destDir,
                           docid,
                           bNoImg=True,
                           bForce=False):
        """
        Download (one document of) a collection and build its multi-page XML.

        The downloader reuses the server URL, proxies and session of
        self.myTrKCient. Once the download is done, a config.txt file is
        written in the collection folder and the multi-page XML files are
        generated in its "col" sub-folder.

        colid:   id of the collection to download.
        destDir: ignored, "." is always passed to the downloader.
        docid:   passed to the downloader as sDocId.
        bNoImg:  passed to the downloader as bNoImage.
        bForce:  passed to the downloader as bForce.

        Return the list of document ids reported by the downloader.
        """
        client = self.myTrKCient
        destDir = "."  # whatever the caller asked for

        dl = TranskribusDownloader(client.getServerUrl(), client.getProxies())
        dl.setSessionId(client.getSessionId())

        traceln("- Downloading collection %s to folder %s" %
                (colid, os.path.abspath(destDir)))
        # the first item of the result (a timestamp, judging by its original
        # name col_ts) is not used
        _, colDir, ldocids, dFileListPerDoc = dl.downloadCollection(
            colid, destDir, bForce=bForce, bNoImage=bNoImg, sDocId=docid)
        traceln("- Done")

        sConfigFile = os.path.join(colDir, "config.txt")
        with open(sConfigFile, "w") as fd:
            fd.write("server=%s\nforce=%s\nstrict=%s\n" %
                     (self.server, True, False))

        sColFolder = os.path.join(colDir, TableProcessing.sCOL)
        dl.generateCollectionMultiPageXml(sColFolder, dFileListPerDoc, False)

        traceln('- Done, see in %s' % colDir)

        return ldocids

    def upLoadDocument(self,
                       colid,
                       coldir,
                       docid,
                       sNote="",
                       sTranscripExt='.mpxml'):
        """
        Upload the transcript of a document to a Transkribus collection.

        The uploader reuses the server URL, proxies and session of
        self.myTrKCient, and reads the transcript from the "col" sub-folder
        of coldir.

        colid:         id of the target collection.
        coldir:        local folder of the collection.
        docid:         id of the document to upload.
        sNote:         note passed along with the transcript.
        sTranscripExt: extension of the transcript file.
        """
        uploader = TranskribusDUTranscriptUploader(
            self.myTrKCient.getServerUrl(), self.myTrKCient.getProxies())
        uploader.setSessionId(self.myTrKCient.getSessionId())
        traceln("- uploading document %s to collection %s" % (docid, colid))
        # sCOL is a class attribute: the bare name is not reachable from a
        # method body, so qualify it as the rest of the class does.
        uploader.uploadDocumentTranscript(colid,
                                          docid,
                                          os.path.join(coldir,
                                                       TableProcessing.sCOL),
                                          sNote,
                                          'NLE Table',
                                          sTranscripExt,
                                          iVerbose=False)
        traceln("- Done")
        return

    def applyLA_URO(self, colid, docid, nbpages):
        """
        Apply the textline finder to every page of a document.

        One "CITlabAdvancedLaJob" is submitted per page, from page 1 to page
        nbpages included, each through a new DoLAbatch object that is given
        the session id of self.myTrKCient.

        Return the list of all job ids, in submission order.
        """
        traceln('process %s pages...' % nbpages)
        lAllJobIDs = []
        for iPage in range(1, nbpages + 1):
            sServerUrl = self.myTrKCient.getServerUrl()
            proxies = self.myTrKCient.getProxies()
            doer = DoLAbatch(sServerUrl, proxies)
            # the session id goes to the object and to its _trpMng member
            doer._trpMng.setSessionId(self.myTrKCient.getSessionId())
            doer.setSessionId(self.myTrKCient.getSessionId())

            # describe the single page "<docid>/<page number>"
            sDocPage = "%s/%s" % (docid, iPage)
            _, sDesc = doer.buildDescription(colid, sDocPage)
            sXmlDesc = doer.jsonToXMLDescription(sDesc)

            _, lPageJobIDs = doer.run(colid, sXmlDesc, "CITlabAdvancedLaJob",
                                      False)
            traceln(lPageJobIDs)
            lAllJobIDs += lPageJobIDs
            traceln("- LA running for page %d job:%s" % (iPage, lPageJobIDs))
        return lAllJobIDs

    def applyHTRForRegions(self, colid, docid, nbpages, modelname, dictionary):
        """
        Apply an HTR model at region level.

        colid:      id of the collection.
        docid:      id of the document.
        nbpages:    used to build the "<docid>/<nbpages>" description.
        modelname:  compared, as a string, with the htrId of the models of
                    the collection.
        dictionary: dictionary passed to the decoder.

        Return what htrRnnDecode returns.
        Raise an Exception when no model of the collection matches modelname.
        """
        htrComp = DoHtrRnn(self.myTrKCient.getServerUrl(),
                           self.myTrKCient.getProxies())
        htrComp._trpMng.setSessionId(self.myTrKCient.getSessionId())
        htrComp.setSessionId(self.myTrKCient.getSessionId())

        _, sPageDesc = htrComp.buildDescription(colid,
                                                "%s/%s" % (docid, nbpages))

        # get modelID (if several models match, the last one wins)
        sModelID = None
        for model in self.myTrKCient.listRnns(colid):
            if str(model['htrId']) == str(modelname):
                sModelID = model['htrId']
                traceln('model id = %s' % sModelID)
        if sModelID is None:
            # call form of raise: valid in Python 2 and in Python 3
            raise Exception("no model ID found for %s" % (modelname, ))
        ret = htrComp.htrRnnDecode(colid,
                                   sModelID,
                                   dictionary,
                                   docid,
                                   sPageDesc,
                                   bDictTemp=False)
        traceln(ret)
        return ret

    def applyHTR(self, colid, docid, nbpages, modelname, dictionary):
        """
        Apply HTR on a document.

        The HTR id is needed: modelname is compared, as a string, with the
        htrId of the models of the collection.

        colid:      id of the collection.
        docid:      id of the document.
        nbpages:    used to build the "<docid>/<nbpages>" description.
        modelname:  identifier of the HTR model to look for.
        dictionary: dictionary passed to the decoder.

        Return what htrRnnDecode returns.
        Raise an Exception when no model of the collection matches modelname.
        """
        htrComp = DoHtrRnn(self.myTrKCient.getServerUrl(),
                           self.myTrKCient.getProxies())
        htrComp._trpMng.setSessionId(self.myTrKCient.getSessionId())
        htrComp.setSessionId(self.myTrKCient.getSessionId())

        _, sPageDesc = htrComp.buildDescription(colid,
                                                "%s/%s" % (docid, nbpages))

        # get modelID (if several models match, the last one wins)
        sModelID = None
        for model in self.myTrKCient.listRnns(colid):
            if str(model['htrId']) == str(modelname):
                sModelID = model['htrId']
                traceln('model id = %s' % sModelID)
        if sModelID is None:
            # call form of raise: valid in Python 2 and in Python 3
            raise Exception("no model ID found for %s" % (modelname, ))
        ret = htrComp.htrRnnDecode(colid,
                                   sModelID,
                                   dictionary,
                                   docid,
                                   sPageDesc,
                                   bDictTemp=False)
        traceln(ret)
        return ret

    def extractFileNamesFromMPXML(self, mpxmldoc):
        """
        List the per-page XML file names of a multi-page XML document, in the
        order of its Page elements (to ensure correct file order!).

        Duplicated from performCVLLA.py

        mpxmldoc: the multi-page XML document.

        Return a list of "<coldir>/col/<docid>/<image name>.xml" paths, where
        the image name is the imageFilename property of each Page without its
        last 4 characters.
        """
        # sCOL is a class attribute: the bare name is not reachable from here
        xmlpath = os.path.abspath(
            os.path.join(self.coldir, TableProcessing.sCOL, self.docid))

        lNd = PageXml.getChildByName(mpxmldoc.getRootElement(), 'Page')
        # A real list, whatever the Python version (map() is lazy in Python 3).
        # NOTE(review): [:-4] assumes a 3-letter image extension -- confirm.
        return [
            "%s%s%s.xml" % (xmlpath, os.sep, nd.prop('imageFilename')[:-4])
            for nd in lNd
        ]

    def processDocument(self, coldir, colid, docid, dom=None):
        """
            Process a single document.

            coldir: local folder of the collection
            colid:  id of the collection
            docid:  id of the document (concatenated with the .mpxml extension
                    when dom is None)
            dom:    optional, already loaded multi-page XML; when None, the
                    file <coldir>/col/<docid>.mpxml is loaded instead

            As the code stands, the method creates a Transkribus client, logs
            in, gets the number of pages, launches the layout analysis jobs
            and returns; all the statements after that first `return` are
            unreachable.

            Command lines of the workflow this method mirrors:

            1 python ../../src/xml_formats/PageXml.py trnskrbs_5400/col/17442 --ext=pxml
            2 python ../../src/tasks/performCVLLA.py  --coldir=trnskrbs_5400/  --docid=17442 -i trnskrbs_5400/col/17442.mpxml  --bl --regTL --form
            3 python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
            4 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            5 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
            6 python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
            7 python ../../../TranskribusPyClient/src/TranskribusCommands/do_htrRnn.py  <model-name> <dictionary-name> 5400 17442

            wait
            8  python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
             # convert to ds
            9  python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            10 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml  --doie --usetemplate

        """

        #create Transkribus client
        self.myTrKCient = TranskribusClient(sServerUrl=self.server,
                                            proxies={},
                                            loggingLevel=logging.WARN)
        #login
        _ = self.login(self.myTrKCient, trace=trace, traceln=traceln)

        #         self.downloadCollection(colid,coldir,docid,bNoImg=False,bForce=True)

        ## load dom
        if dom is None:
            # no dom given: load <coldir>/col/<docid>.mpxml through loadDom()
            self.inputFileName = os.path.abspath(
                os.path.join(coldir, TableProcessing.sCOL,
                             docid + TableProcessing.sMPXMLExtension))
            mpxml_doc = self.loadDom()
            nbPages = MultiPageXml.getNBPages(mpxml_doc)
        else:
            # load provided mpxml
            mpxml_doc = dom
            nbPages = MultiPageXml.getNBPages(mpxml_doc)

#         ### table registration: need to compute/select???   the template
#         # perform LA  separator, table registration, baseline with normalization
#         #python ../../src/tasks/performCVLLA.py  --coldir=trnskrbs_5400/  --docid=17442 -i trnskrbs_5400/col/17442.mpxml  --bl --regTL --form
#         tableregtool= LAProcessor()
# #         latool.setParams(dParams)
#         tableregtool.coldir = coldir
#         tableregtool.docid = docid
#         tableregtool.bTemplate, tableregtool.bSeparator , tableregtool.bBaseLine , tableregtool.bRegularTextLine = True,False,False,False
#         # creates xml and a new mpxml
#         mpxml_doc,nbPages = tableregtool.performLA(mpxml_doc)
#
#

#         self.upLoadDocument(colid, coldir,docid,sNote='NLE workflow;table reg done')

        # launch one layout analysis job per page
        lJobIDs = self.applyLA_URO(colid, docid, nbPages)
        # NOTE(review): unconditional return -- everything below is
        # unreachable as long as it stays here.
        return

        # wait for the last LA job
        # NOTE(review): the loop polls the job status without any pause.
        bWait = True
        assert lJobIDs != []
        jobid = lJobIDs[-1]
        traceln("waiting for job %s" % jobid)
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED']

        ## coldir???
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)

        ##STOP HERE FOR DAS newx testset:
        return

        # tag text for BIES cell
        #python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
        """ 
            needed : doer = DU_ABPTable_TypedCRF(sModelName, sModelDir,
        """
        doer = DU_ABPTable_TypedCRF(self.sRowModelName, self.sRowModelDir)
        doer.load()
        ## needed predict at file level, and do not store dom, but return it
        rowpath = os.path.join(coldir, "col")
        BIESFiles = doer.predict([rowpath], docid)
        BIESDom = self.loadDom(BIESFiles[0])
        #         res= BIESDom.saveFormatFileEnc('test.mpxml', "UTF-8",True)

        # MPXML2DS
        #python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
        dsconv = primaAnalysis()
        DSBIESdoc = dsconv.convert2DS(BIESDom, self.docid)

        # create XMLDOC object
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(
            DSBIESdoc)  #,listPages = range(self.firstPage,self.lastPage+1))
        # create row
        #python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
        rdc = RowDetection()
        rdc.findRowsInDoc(self.ODoc)

        #python ../../src/xml_formats/DS2PageXml.py -i trnskrbs_5400/out/17442.ds_xml --multi
        # DS2MPXML
        DS2MPXML = DS2PageXMLConvertor()
        lPageXml = DS2MPXML.run(self.ODoc.getDom())
        if lPageXml != []:
            #             if DS2MPXML.bMultiPages:
            # keep the first item of each entry and merge them into one
            # multi-page document, written to <coldir>/col/<docid>.mpxml
            newDoc = MultiPageXml.makeMultiPageXmlMemory(
                map(lambda xy: xy[0], lPageXml))
            outputFileName = os.path.join(
                self.coldir, sCOL,
                self.docid + TableProcessing.sMPXMLExtension)
            newDoc.write(outputFileName,
                         xml_declaration=True,
                         encoding="UTF-8",
                         pretty_print=True)


#             else:
#                 DS2MPXML.storePageXmlSetofFiles(lPageXml)

        return

        #upload
        # python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
        self.upLoadDocument(colid,
                            coldir,
                            docid,
                            sNote='NLE workflow;table row done')

        ## apply HTR
        ## how to deal with specific dictionaries?

        ## here need to know the ontology and the template

        nbPages = 1
        jobid = self.applyHTR(colid, docid, nbPages, self.sHTRmodel,
                              self.sDictName)
        # wait for the HTR job
        bWait = True
        traceln("waiting for job %s" % jobid)
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED', 'CANCELED']

        # download  where???
        # python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
        #   coldir is not right!! coldir must refer to the parent folder!
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)

        #done!!

        # IE extr
        ## not here: specific to a use case
        #python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml  --doie --usetemplate

    def processCollection(self, coldir):
        """
        Process all the documents of a collection; needs mpxml files.

        Every "<docid>.mpxml" file of the "col" sub-folder of coldir (where
        processDocument reads its input) stands for one document. Files whose
        base name is not an integer are reported and ignored.

        coldir: local folder of the collection.
        """
        sExt = TableProcessing.sMPXMLExtension
        # Look where processDocument loads <coldir>/col/<docid>.mpxml from.
        # NOTE(review): the files used to be searched directly in coldir --
        # confirm the expected layout.
        lsDocFilename = sorted(
            glob.iglob(os.path.join(coldir, TableProcessing.sCOL,
                                    "*" + sExt)))
        lDocId = []
        for sDocFilename in lsDocFilename:
            sDocId = os.path.basename(sDocFilename)[:-len(sExt)]
            try:
                int(sDocId)  # validation only
            except ValueError:
                traceln("Warning: folder %s : %s invalid docid, IGNORING IT" %
                        (coldir, sDocId))
                continue
            # keep the string: processDocument concatenates it with the
            # file extension, and it matches the file name exactly
            lDocId.append(sDocId)

        # process each document
        for sDocId in lDocId:
            traceln("Processing %s : %s " % (coldir, sDocId))
            # processDocument expects (coldir, colid, docid); coldir was
            # missing from the call
            self.processDocument(coldir, self.colid, sDocId)
            # the format used to have one %s for two values
            traceln("\tProcessing done for %s : %s " % (coldir, sDocId))

    def processParameters(self):
        """
        Act upon the parameters provided by the command line.

        - exit with status 1 when no collection id was given;
        - set self.bFullCol: True when no document id was given, i.e. when
          the whole collection is concerned;
        - when the regeneration was requested and a document id was given,
          rebuild <coldir>/col/<docid>.mpxml from the *.pxml files of
          <coldir>/col/<docid>.

        Return the regenerated multi-page document, or None when nothing was
        regenerated.
        """
        if self.colid is None:
            print('collection id missing!')
            sys.exit(1)

        # the test used to be inverted (True when a docid was given)
        self.bFullCol = self.docid is None

        if self.bRegenerateMPXML and self.docid is not None:
            # sCOL is a class attribute: the bare name is not reachable here.
            # NOTE(review): glob returns the files in arbitrary order --
            # confirm that the page order does not matter at this point.
            l = glob.glob(
                os.path.join(self.coldir, TableProcessing.sCOL, self.docid,
                             "*.pxml"))
            doc = MultiPageXml.makeMultiPageXml(l)
            outputFileName = os.path.join(
                self.coldir, TableProcessing.sCOL,
                self.docid + TableProcessing.sMPXMLExtension)
            doc.write(outputFileName,
                      xml_declaration=True,
                      encoding="UTF-8",
                      pretty_print=True)
            return doc
        return None

    def run(self):
        """
        Process at collection level or at document level.

        The whole collection folder (self.coldir) is processed when no
        document id was given; otherwise only the document self.docid is,
        using the multi-page XML regenerated by processParameters(), if any.
        """
        newMPXML = self.processParameters()
        # The test used to be `self.bFullCol is None`, which never holds for
        # a boolean, so the collection branch could not be reached. Decide on
        # the document id itself.
        if self.docid is None:
            # processCollection expects the folder, not the collection id
            self.processCollection(self.coldir)
        else:
            self.processDocument(self.coldir, self.colid, self.docid, newMPXML)
Esempio n. 5
0
class TableProcessing(Component.Component):
    usage = ""
    version = "v.01"
    description = "description: table layout analysis based on template"

    sCOL = "col"
    sMPXMLExtension = ".mpxml"

    def __init__(self):
        """
        Build the component with all its options at their default value.

        The Component constructor is always called first; the options keep
        the defaults below until setParams() overrides them.
        """
        Component.Component.__init__(self, "TableProcessing", self.usage,
                                     self.version, self.description)

        # collection and document identifiers
        self.colid = self.docid = None

        # processing flags
        self.bFullCol = False
        self.useExtForMPXML = False  # generate MPXML using Ext
        self.bRegenerateMPXML = False

        # row model: name and folder
        self.sRowModelName, self.sRowModelDir = None, None

        # HTR model and dictionary
        self.sHTRmodel, self.sDictName = None, None

    def setParams(self, dParams):
        """
        Copy the options found in dParams onto this component.

        Always call first the Component setParams. Each attribute is only
        overwritten when its key is present in dParams; otherwise it keeps its
        current value. The Transkribus connection settings (client, persist,
        login) are reset to their defaults at every call before being read.

        dParams: mapping from option name to option value.
        """
        Component.Component.setParams(self, dParams)
        if "coldir" in dParams:
            self.coldir = dParams["coldir"]
        if "colid" in dParams:
            self.colid = dParams["colid"]
        # This test used to look for "colid": a docid given without a colid
        # was dropped, and a colid given without a docid raised a KeyError.
        if "docid" in dParams:
            self.docid = dParams["docid"]
        if "useExt" in dParams:
            self.useExtForMPXML = dParams["useExt"]

        if 'mergeTLC' in dParams:
            self.bUROCVLMerge = dParams["mergeTLC"]

        if 'regMPXML' in dParams:
            self.bRegenerateMPXML = dParams["regMPXML"]

        if "rowmodelname" in dParams:
            self.sRowModelName = dParams["rowmodelname"]
        if "rowmodeldir" in dParams:
            self.sRowModelDir = dParams["rowmodeldir"]

        if "htrmodel" in dParams:
            self.sHTRmodel = dParams["htrmodel"]
        if "dictname" in dParams:
            self.sDictName = dParams["dictname"]

        # Connection to Transkribus
        self.myTrKCient = None
        self.persist = False
        self.loginInfo = False
        if "server" in dParams:
            self.server = dParams["server"]
        if "persist" in dParams:
            self.persist = dParams["persist"]
        if "login" in dParams:
            self.loginInfo = dParams["login"]

    def login(self, trnskrbs_client, trace=None, traceln=None):
        """
        Log onto Transkribus, dealing with the various login variants.

        1. if self.persist is set, try to reuse a persistent session;
        2. if there is no usable session, authenticate with self.loginInfo and
           self.pwd when a login was given, otherwise with the credentials
           returned by trnskrbs_client.getStoredCredentials(bAsk=False).

        trnskrbs_client: the client to log in.
        trace, traceln:  optional print methods; each of them may be None
                         independently of the other.

        Return True, or let the exception of the client propagate.
        """
        DEBUG = True
        bOk = False
        if self.persist:
            # try getting some persistent session token
            if DEBUG and trace:
                trace("  ---login--- Try reusing persistent session ... ")
            try:
                bOk = trnskrbs_client.reusePersistentSession()
                if DEBUG and traceln: traceln("OK!")
            except Exception:
                # best effort only: fall back to an explicit login below
                # (no bare except, so that Ctrl-C / SystemExit still get out)
                if DEBUG and traceln: traceln("Failed")

        if not bOk:
            if self.loginInfo:
                # NOTE(review): self.pwd is not assigned anywhere in the code
                # visible here -- confirm that a caller or base class sets it.
                login, pwd = self.loginInfo, self.pwd
            else:
                if DEBUG and trace:
                    trace(
                        "  ---login--- no login provided, looking for stored credentials... "
                    )
                login, pwd = trnskrbs_client.getStoredCredentials(bAsk=False)
                if DEBUG and traceln: traceln("OK")

            # the guard used to test traceln while calling trace, which
            # crashed when only traceln was provided
            if DEBUG and trace:
                trace("  ---login--- logging onto Transkribus as %s " % login)
            trnskrbs_client.auth_login(login, pwd)
            if DEBUG and traceln: traceln("OK")
            bOk = True

        return bOk

    def downloadCollection(self,
                           colid,
                           destDir,
                           docid,
                           bNoImg=True,
                           bForce=False):
        """
        Download (one document of) a collection and build its multi-page XML.

        The downloader reuses the server URL, proxies and session of
        self.myTrKCient. Once the download is done, a config.txt file is
        written in the collection folder and the multi-page XML files are
        generated in its "col" sub-folder.

        colid:   id of the collection to download.
        destDir: ignored, "." is always passed to the downloader.
        docid:   passed to the downloader as sDocId.
        bNoImg:  passed to the downloader as bNoImage.
        bForce:  passed to the downloader as bForce.

        Return the list of document ids reported by the downloader.
        """
        client = self.myTrKCient
        destDir = "."  # whatever the caller asked for

        dl = TranskribusDownloader(client.getServerUrl(), client.getProxies())
        dl.setSessionId(client.getSessionId())

        traceln("- Downloading collection %s to folder %s" %
                (colid, os.path.abspath(destDir)))
        # the first item of the result (a timestamp, judging by its original
        # name col_ts) is not used
        _, colDir, ldocids, dFileListPerDoc = dl.downloadCollection(
            colid, destDir, bForce=bForce, bNoImage=bNoImg, sDocId=docid)
        traceln("- Done")

        sConfigFile = os.path.join(colDir, "config.txt")
        with open(sConfigFile, "w") as fd:
            fd.write("server=%s\nforce=%s\nstrict=%s\n" %
                     (self.server, True, False))

        sColFolder = os.path.join(colDir, TableProcessing.sCOL)
        dl.generateCollectionMultiPageXml(sColFolder, dFileListPerDoc, False)

        traceln('- Done, see in %s' % colDir)

        return ldocids

    def upLoadDocument(self,
                       colid,
                       coldir,
                       docid,
                       sNote="",
                       sTranscripExt='.mpxml'):
        """
        Upload the transcript of a document to a Transkribus collection.

        The uploader reuses the server URL, proxies and session of
        self.myTrKCient, and reads the transcript from the "col" sub-folder
        of coldir.

        colid:         id of the target collection.
        coldir:        local folder of the collection.
        docid:         id of the document to upload.
        sNote:         note passed along with the transcript.
        sTranscripExt: extension of the transcript file.
        """
        uploader = TranskribusDUTranscriptUploader(
            self.myTrKCient.getServerUrl(), self.myTrKCient.getProxies())
        uploader.setSessionId(self.myTrKCient.getSessionId())
        traceln("- uploading document %s to collection %s" % (docid, colid))
        # sCOL is a class attribute: the bare name is not reachable from a
        # method body, so qualify it as the rest of the class does.
        uploader.uploadDocumentTranscript(colid,
                                          docid,
                                          os.path.join(coldir,
                                                       TableProcessing.sCOL),
                                          sNote,
                                          'NLE Table',
                                          sTranscripExt,
                                          iVerbose=False)
        traceln("- Done")
        return

    def applyLA_URO(self, colid, docid, nbpages):
        """
        Apply the textline finder to every page of a document.

        One "CITlabAdvancedLaJob" is submitted per page, from page 1 to page
        nbpages included, each through a new DoLAbatch object that is given
        the session id of self.myTrKCient.

        Return the list of all job ids, in submission order.
        """
        traceln('process %s pages...' % nbpages)
        lAllJobIDs = []
        for iPage in range(1, nbpages + 1):
            sServerUrl = self.myTrKCient.getServerUrl()
            proxies = self.myTrKCient.getProxies()
            doer = DoLAbatch(sServerUrl, proxies)
            # the session id goes to the object and to its _trpMng member
            doer._trpMng.setSessionId(self.myTrKCient.getSessionId())
            doer.setSessionId(self.myTrKCient.getSessionId())

            # describe the single page "<docid>/<page number>"
            sDocPage = "%s/%s" % (docid, iPage)
            _, sDesc = doer.buildDescription(colid, sDocPage)
            sXmlDesc = doer.jsonToXMLDescription(sDesc)

            _, lPageJobIDs = doer.run(colid, sXmlDesc, "CITlabAdvancedLaJob",
                                      False)
            traceln(lPageJobIDs)
            lAllJobIDs += lPageJobIDs
            traceln("- LA running for page %d job:%s" % (iPage, lPageJobIDs))
        return lAllJobIDs

    def applyHTRForRegions(self, colid, docid, nbpages, modelname, dictionary):
        """
        Apply an HTR model at region level.

        colid:      id of the collection.
        docid:      id of the document.
        nbpages:    used to build the "<docid>/<nbpages>" description.
        modelname:  compared, as a string, with the htrId of the models of
                    the collection.
        dictionary: dictionary passed to the decoder.

        Return what htrRnnDecode returns.
        Raise an Exception when no model of the collection matches modelname.
        """
        htrComp = DoHtrRnn(self.myTrKCient.getServerUrl(),
                           self.myTrKCient.getProxies())
        htrComp._trpMng.setSessionId(self.myTrKCient.getSessionId())
        htrComp.setSessionId(self.myTrKCient.getSessionId())

        _, sPageDesc = htrComp.buildDescription(colid,
                                                "%s/%s" % (docid, nbpages))

        # get modelID (if several models match, the last one wins)
        sModelID = None
        for model in self.myTrKCient.listRnns(colid):
            if str(model['htrId']) == str(modelname):
                sModelID = model['htrId']
                traceln('model id = %s' % sModelID)
        if sModelID is None:
            # one-item tuple, so that any modelname value can be formatted
            raise Exception("no model ID found for %s" % (modelname, ))
        ret = htrComp.htrRnnDecode(colid,
                                   sModelID,
                                   dictionary,
                                   docid,
                                   sPageDesc,
                                   bDictTemp=False)
        traceln(ret)
        return ret

    def applyHTR(self, colid, docid, nbpages, modelname, dictionary):
        """
        Apply HTR on a document.

        The HTR id is needed: modelname is compared, as a string, with the
        htrId of the models of the collection.

        colid:      id of the collection.
        docid:      id of the document.
        nbpages:    used to build the "<docid>/<nbpages>" description.
        modelname:  identifier of the HTR model to look for.
        dictionary: dictionary passed to the decoder.

        Return what htrRnnDecode returns.
        Raise an Exception when no model of the collection matches modelname.
        """
        htrComp = DoHtrRnn(self.myTrKCient.getServerUrl(),
                           self.myTrKCient.getProxies())
        htrComp._trpMng.setSessionId(self.myTrKCient.getSessionId())
        htrComp.setSessionId(self.myTrKCient.getSessionId())

        _, sPageDesc = htrComp.buildDescription(colid,
                                                "%s/%s" % (docid, nbpages))

        # get modelID (if several models match, the last one wins)
        sModelID = None
        for model in self.myTrKCient.listRnns(colid):
            if str(model['htrId']) == str(modelname):
                sModelID = model['htrId']
                traceln('model id = %s' % sModelID)
        if sModelID is None:
            # one-item tuple, so that any modelname value can be formatted
            raise Exception("no model ID found for %s" % (modelname, ))
        ret = htrComp.htrRnnDecode(colid,
                                   sModelID,
                                   dictionary,
                                   docid,
                                   sPageDesc,
                                   bDictTemp=False)
        traceln(ret)
        return ret

    def overlapX(self, zoneA, zoneB):
        """
            True if the horizontal extents of the bounding boxes of the two
            zones intersect (boxes that merely touch count as overlapping)
        """
        leftA, _, rightA, _ = zoneA.getBoundingBox()
        leftB, _, rightB, _ = zoneB.getBoundingBox()
        # two intervals intersect iff the largest start is not after the smallest end
        return max(leftA, leftB) <= min(rightA, rightB)

    def overlapY(self, zoneA, zoneB):
        """
            True if the vertical extents of the bounding boxes of the two
            zones intersect (boxes that merely touch count as overlapping)

            The bounding boxes are read as [x1, y1, x2, y2]
        """
        [_, y11, _, y12] = zoneA.getBoundingBox()
        [_, y21, _, y22] = zoneB.getBoundingBox()
        [a1, a2] = y11, y12
        # whole vertical extent of zoneB (top, bottom), not its bottom edge only
        [b1, b2] = y21, y22
        return min(a2, b2) >= max(a1, b1)

    def signedOverlap(self, zoneA, zoneB):
        """
            Share of the bounding box of zoneA which is covered by the bounding
            box of zoneB:  (surface of the intersection) / (surface of zoneA)

            return 0.0 when overlapX or overlapY reports no overlap
        """
        ax1, ay1, ax2, ay2 = zoneA.getBoundingBox()
        bx1, by1, bx2, by2 = zoneB.getBoundingBox()

        if not (self.overlapX(zoneA, zoneB) and self.overlapY(zoneA, zoneB)):
            return 0.0

        surfaceA = (ax2 - ax1) * (ay2 - ay1)
        if surfaceA == 0:
            # degenerated box: avoid the division by zero
            surfaceA = 1.0

        # sides of the intersection rectangle
        interWidth = abs(min(ax2, bx2) - max(ax1, bx1))
        interHeight = abs(min(ay2, by2) - max(ay1, by1))
        inter = interWidth * interHeight

        if inter > 0:
            return inter / surfaceA
        return 0.0

    def mergeBaselineCells(self, coldir, colid, docid):
        """
            Merge the text lines of the files processed on Transkribus into the
            table cells of the CVL template tool files, then regenerate
            <coldir>/<sCOL>/<docid>.mpxml

            In the document folder <coldir>/<sCOL>/<docid>:
              - the *.xml  files (CVL template tool) give the TableRegion and its TableCell
              - the *.pxml files (processed on Transkribus) give the TextLine
            The Coords of each text line are normalised into a box built from
            its baseline (iHeight pixels above it). The line is then moved into
            the cell with the highest signedOverlap(cell, baseline), or into a
            new TextRegion of the page when it overlaps no cell.

            colid is not used.

            raise Exception if the two sets of files do not have the same
                  number of pages, or if a page has no TableRegion
        """
        iHeight = 30  # in pixel: height given to the normalised text lines

        xmlpath = os.path.abspath(os.path.join(coldir, sCOL, docid))
        mpxml = xmlpath + ".mpxml"

        # Pages of the two sets are paired by position: sort the file names so
        # that both lists follow the same order (glob gives no guarantee)
        lxml = sorted(glob.glob(os.path.join(xmlpath, "*.xml")))
        pxmldoc = MultiPageXml.makeMultiPageXml(lxml)

        lhtrxml = sorted(glob.glob(os.path.join(xmlpath, "*.pxml")))
        mpxmldoc = MultiPageXml.makeMultiPageXml(lhtrxml)

        lPXMLPage = PageXml.getChildByName(mpxmldoc.getroot(), 'Page')
        lXMLPage = PageXml.getChildByName(pxmldoc.getroot(), 'Page')
        if len(lXMLPage) != len(lPXMLPage):
            raise Exception("page count mismatch: %d (xml) vs %d (pxml)" %
                            (len(lXMLPage), len(lPXMLPage)))

        for cvlpage, pxmlpage in zip(lXMLPage, lPXMLPage):
            ## remove TextRegion from cvlpage
            for tr in PageXml.getChildByName(cvlpage, 'TextRegion'):
                tr.getparent().remove(tr)

            ## all the text lines of the Transkribus page
            lTL = []
            for region in PageXml.getChildByName(pxmlpage, 'TextRegion'):
                lTL.extend(PageXml.getChildByName(region, 'TextLine'))

            ltable = PageXml.getChildByName(cvlpage, 'TableRegion')
            if len(ltable) == 0:
                # a string cannot be raised: use a real exception
                raise Exception("NO TABLE")
            lCells = PageXml.getChildByName(ltable[0], 'TableCell')

            lC = [Polygon(PageXml.getPointList(c)) for c in lCells]
            lT = [Polygon(PageXml.getPointList(t)) for t in lTL]
            for ndTextLine, tl in zip(lTL, lT):
                ## normalization
                coord = PageXml.getChildByName(ndTextLine, 'Coords')[0]
                coordB = PageXml.getChildByName(ndTextLine, 'Baseline')[0]
                x1, y1, x2, y2 = Polygon(
                    PageXml.getPointList(coordB)).getBoundingBox()
                if coord is not None:
                    coord.set(
                        'points', "%d,%d %d,%d %d,%d %d,%d" %
                        (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))
                    # from now on the line is located by its baseline
                    tl = Polygon(PageXml.getPointList(coordB))

                lOverlap = [self.signedOverlap(c, tl) for c in lC]
                if not lOverlap or max(lOverlap) == 0:
                    ## no cell for this line: region holding only the textline
                    region = PageXml.createPageXmlNode('TextRegion')
                    cvlpage.append(region)
                    region.append(ndTextLine)
                else:
                    cell = lCells[lOverlap.index(max(lOverlap))]
                    cell.append(ndTextLine)

        pxmldoc.write(mpxml)

    """                        
        lOverlap=[]        
        for region in lRegions:
            lOverlap.append(self.signedRatioOverlap(region))
        
        if max(lOverlap) == 0: return None
        return lRegions[lOverlap.index(max(lOverlap))]
    """
    """
            fOverlap = 0.0
        
        if self.overlapX(zone) and self.overlapY(zone):
            [x11,y11,x12,y12] = [x1,y1,x1+w1,y1+h1]
            [x21,y21,x22,y22] = [x2,y2,x2+w2,y2+h2]
            
            s1 = w1 * h1
            
            # possible ?
            if s1 == 0: s1 = 1.0
            
            #intersection
            nx1 = max(x11,x21)
            nx2 = min(x12,x22)
            ny1 = max(y11,y21)
            ny2 = min(y12,y22)
            h = abs(nx2 - nx1)
            w = abs(ny2 - ny1)
            
            inter = h * w
            if inter > 0 :
                fOverlap = inter/s1
            else:
                # if overX and Y this is not possible !
                fOverlap = 0.0
            
        return  fOverlap   
    
    """
    def extractFileNamesFromMPXML(self, mpxmldoc):
        """
            Give, following the order of the Page nodes of mpxmldoc, the name
            of the .xml file of each page, so as to ensure a correct file order

            The files are located in <self.coldir>/<sCOL>/<self.docid>

            (duplicated from performCVLLA.py)
        """
        sDocDir = os.path.abspath(os.path.join(self.coldir, sCOL, self.docid))

        def pageFileName(ndPage):
            # imageFilename without its last 4 characters (presumably its
            # extension), with .xml instead
            return "%s%s%s.xml" % (sDocDir, os.sep,
                                   ndPage.prop('imageFilename')[:-4])

        lNdPage = PageXml.getChildByName(mpxmldoc.getRootElement(), 'Page')
        return map(pageFileName, lNdPage)

    def processDocument(self, coldir, colid, docid, dom=None):
        """
            Process a single document on the Transkribus server

            coldir -- local folder of the collection: the document is read from
                      <coldir>/<TableProcessing.sCOL>/<docid><TableProcessing.sMPXMLExtension>
            colid  -- collection id
            docid  -- document id (concatenated to the extension, so a string is needed)
            dom    -- multi-page document already loaded; loaded from disk when None

            As the code stands, only these steps are executed: create a client
            on self.server, log in, get the document to count its pages, call
            self.apply_URO, and return None.
            All the statements after this first 'return' are unreachable: they
            are the remaining steps of the workflow below.

            The workflow, as command lines:
            1 python ../../src/xml_formats/PageXml.py trnskrbs_5400/col/17442 --ext=pxml
            2 python ../../src/tasks/performCVLLA.py  --coldir=trnskrbs_5400/  --docid=17442 -i trnskrbs_5400/col/17442.mpxml  --bl --regTL --form
            3 python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
            4 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            5 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
            6 python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
            7 python ../../../TranskribusPyClient/src/TranskribusCommands/do_htrRnn.py  <model-name> <dictionary-name> 5400 17442

            wait
            8  python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
             #convert to ds
            9  python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            10 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml  --doie --usetemplate

        """

        #create Transkribus client
        self.myTrKCient = TranskribusClient(sServerUrl=self.server,
                                            proxies={},
                                            loggingLevel=logging.WARN)
        #login
        _ = self.login(self.myTrKCient, trace=trace, traceln=traceln)

        #         self.downloadCollection(colid,coldir,docid,bNoImg=False,bForce=True)

        ## load dom
        if dom is None:
            # NOTE(review): docid must be a string here (an int raises TypeError)
            self.inputFileName = os.path.abspath(
                os.path.join(coldir, TableProcessing.sCOL,
                             docid + TableProcessing.sMPXMLExtension))
            mpxml_doc = self.loadDom()
            nbPages = MultiPageXml.getNBPages(mpxml_doc)
        else:
            # load provided mpxml
            mpxml_doc = dom
            nbPages = MultiPageXml.getNBPages(mpxml_doc)

#         ### table registration: need to compute/select???   the template
#         # perform LA  separator, table registration, baseline with normalization
#         #python ../../src/tasks/performCVLLA.py  --coldir=trnskrbs_5400/  --docid=17442 -i trnskrbs_5400/col/17442.mpxml  --bl --regTL --form
#         tableregtool= LAProcessor()
# #         latool.setParams(dParams)
#         tableregtool.coldir = coldir
#         tableregtool.docid = docid
#         tableregtool.bTemplate, tableregtool.bSeparator , tableregtool.bBaseLine , tableregtool.bRegularTextLine = True,False,False,False
#         # creates xml and a new mpxml
#         mpxml_doc,nbPages = tableregtool.performLA(mpxml_doc)
#
#

#         self.upLoadDocument(colid, coldir,docid,sNote='NLE workflow;table reg done')

        lJobIDs = self.apply_URO(colid, docid, nbPages)
        # The method stops here (the job ids are not used):
        # everything below this return is unreachable
        return

        # wait for the last job
        # NOTE(review): polling without any pause between two requests, and
        # 'CANCELED' is not a final state here while it is one in the loop
        # which waits for the HTR job, further down
        bWait = True
        assert lJobIDs != []
        jobid = lJobIDs[-1]
        traceln("waiting for job %s" % jobid)
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED']

        ## coldir???
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)

        ##STOP HERE FOR DAS newx testset:
        return

        # tag text for BIES cell
        #python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
        """ 
            needed : doer = DU_ABPTable_TypedCRF(sModelName, sModelDir,
        """
        doer = DU_ABPTable_TypedCRF(self.sRowModelName, self.sRowModelDir)
        doer.load()
        ## needed predict at file level, and do not store dom, but return it
        rowpath = os.path.join(coldir, "col")
        BIESFiles = doer.predict([rowpath], docid)
        BIESDom = self.loadDom(BIESFiles[0])
        #         res= BIESDom.saveFormatFileEnc('test.mpxml', "UTF-8",True)

        # MPXML2DS
        #python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
        # NOTE(review): from here self.docid and self.coldir are used instead
        # of the docid and coldir parameters
        dsconv = primaAnalysis()
        DSBIESdoc = dsconv.convert2DS(BIESDom, self.docid)

        # create XMLDOC object
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(
            DSBIESdoc)  #,listPages = range(self.firstPage,self.lastPage+1))
        # create row
        #python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
        rdc = RowDetection()
        rdc.findRowsInDoc(self.ODoc)

        #python ../../src/xml_formats/DS2PageXml.py -i trnskrbs_5400/out/17442.ds_xml --multi
        # DS2MPXML
        DS2MPXML = DS2PageXMLConvertor()
        lPageXml = DS2MPXML.run(self.ODoc.getDom())
        if lPageXml != []:
            #             if DS2MPXML.bMultiPages:
            # only the first item of each element of lPageXml is given
            newDoc = MultiPageXml.makeMultiPageXmlMemory(
                map(lambda xy: xy[0], lPageXml))
            outputFileName = os.path.join(
                self.coldir, sCOL,
                self.docid + TableProcessing.sMPXMLExtension)
            newDoc.write(outputFileName,
                         xml_declaration=True,
                         encoding="UTF-8",
                         pretty_print=True)


#             else:
#                 DS2MPXML.storePageXmlSetofFiles(lPageXml)

        return

        #upload
        # python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
        self.upLoadDocument(colid,
                            coldir,
                            docid,
                            sNote='NLE workflow;table row done')

        ## apply HTR
        ## how to deal with specific dictionaries?
        ## here need to know the ontology and the template
        ## OPTION: put it after LA on server  (just one download needed )

        # NOTE(review): the number of pages computed above is replaced by 1
        nbPages = 1
        jobid = self.applyHTR(colid, docid, nbPages, self.sHTRmodel,
                              self.sDictName)
        bWait = True
        traceln("waiting for job %s" % jobid)
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED', 'CANCELED']

        # download  where???
        # python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
        #   coldir is not right!! coldir must refer to the parent folder!
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)

        #done!!

        # IE extr
        ## not here: specific to a use case
        #python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml  --doie --usetemplate

    def processCollection(self, coldir):
        """
            Process all the documents of a collection

            A document is a <docid>.mpxml file, docid being an integer. The
            files are looked for in <coldir>/<TableProcessing.sCOL>, which is
            where processDocument loads a document from. A .mpxml file whose
            name is not an integer is reported and ignored.

            coldir -- local folder of the collection
        """
        sExt = TableProcessing.sMPXMLExtension
        lsDocFilename = sorted(
            glob.iglob(os.path.join(coldir, TableProcessing.sCOL,
                                    "*" + sExt)))
        lsDocId = []
        for sDocFilename in lsDocFilename:
            sDocId = os.path.basename(sDocFilename)[:-len(sExt)]
            try:
                int(sDocId)
            except ValueError:
                traceln("Warning: folder %s : %s invalid docid, IGNORING IT" %
                        (coldir, sDocId))
                continue
            # keep the id as a string: processDocument concatenates it to the
            # file extension
            lsDocId.append(sDocId)

        # process each document
        for sDocId in lsDocId:
            traceln("Processing %s : %s " % (coldir, sDocId))
            self.processDocument(coldir, self.colid, sDocId)
            traceln("\tProcessing done for %s : %s " % (coldir, sDocId))

    def processParameters(self):
        """
            Check the parameters provided by the command line, and prepare the run

            - exit (status 1) if the collection id is missing
            - set self.bFullCol: True when no document id is given, i.e. when
              the full collection is to be processed
            - if self.bRegenerateMPXML is set and a document id is given:
              rebuild <coldir>/<sCOL>/<docid>.mpxml from the *.pxml files of
              the document folder

            return the regenerated multi-page document, None if there is none
        """
        if self.colid is None:
            print('collection id missing!')
            sys.exit(1)

        self.bFullCol = self.docid is None

        if not (self.bRegenerateMPXML and self.docid is not None):
            return None

        lsPageFile = glob.glob(
            os.path.join(self.coldir, sCOL, self.docid, "*.pxml"))
        doc = MultiPageXml.makeMultiPageXml(lsPageFile)
        outputFileName = os.path.join(
            self.coldir, sCOL,
            self.docid + TableProcessing.sMPXMLExtension)
        doc.write(outputFileName,
                  xml_declaration=True,
                  encoding="UTF-8",
                  pretty_print=True)
        return doc

    def run(self):
        """
            Process at collection level or at document level

            - no document id (self.docid is None): process all the documents
              found in self.coldir
            - self.bUROCVLMerge: only merge the text lines and the table cells
              of the document (see mergeBaselineCells)
            - otherwise: process the document, reusing the multi-page document
              regenerated by processParameters if any
        """
        newMPXML = self.processParameters()

        # the choice relies on docid itself, not on a flag derived from it
        if self.docid is None:
            # processCollection expects the folder, not the collection id
            self.processCollection(self.coldir)
            return

        if self.bUROCVLMerge:
            self.mergeBaselineCells(self.coldir, self.colid, self.docid)
            return
        self.processDocument(self.coldir, self.colid, self.docid, newMPXML)
Esempio n. 6
0
    def processDocument(self, coldir, colid, docid, dom=None):
        """
            Process a single document on the Transkribus server

            coldir -- local folder of the collection: the document is read from
                      <coldir>/<TableProcessing.sCOL>/<docid><TableProcessing.sMPXMLExtension>
            colid  -- collection id
            docid  -- document id (concatenated to the extension, so a string is needed)
            dom    -- multi-page document already loaded; loaded from disk when None

            As the code stands, only these steps are executed: create a client
            on self.server, log in, get the document to count its pages, call
            self.applyLA_URO, and return None.
            All the statements after this first 'return' are unreachable: they
            are the remaining steps of the workflow below.

            The workflow, as command lines:
            1 python ../../src/xml_formats/PageXml.py trnskrbs_5400/col/17442 --ext=pxml
            2 python ../../src/tasks/performCVLLA.py  --coldir=trnskrbs_5400/  --docid=17442 -i trnskrbs_5400/col/17442.mpxml  --bl --regTL --form
            3 python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
            4 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            5 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
            6 python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
            7 python ../../../TranskribusPyClient/src/TranskribusCommands/do_htrRnn.py  <model-name> <dictionary-name> 5400 17442

            wait
            8  python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
             #convert to ds
            9  python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
            10 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml  --doie --usetemplate

        """

        #create Transkribus client
        self.myTrKCient = TranskribusClient(sServerUrl=self.server,
                                            proxies={},
                                            loggingLevel=logging.WARN)
        #login
        _ = self.login(self.myTrKCient, trace=trace, traceln=traceln)

        #         self.downloadCollection(colid,coldir,docid,bNoImg=False,bForce=True)

        ## load dom
        if dom is None:
            # NOTE(review): docid must be a string here (an int raises TypeError)
            self.inputFileName = os.path.abspath(
                os.path.join(coldir, TableProcessing.sCOL,
                             docid + TableProcessing.sMPXMLExtension))
            mpxml_doc = self.loadDom()
            nbPages = MultiPageXml.getNBPages(mpxml_doc)
        else:
            # load provided mpxml
            mpxml_doc = dom
            nbPages = MultiPageXml.getNBPages(mpxml_doc)

#         ### table registration: need to compute/select???   the template
#         # perform LA  separator, table registration, baseline with normalization
#         #python ../../src/tasks/performCVLLA.py  --coldir=trnskrbs_5400/  --docid=17442 -i trnskrbs_5400/col/17442.mpxml  --bl --regTL --form
#         tableregtool= LAProcessor()
# #         latool.setParams(dParams)
#         tableregtool.coldir = coldir
#         tableregtool.docid = docid
#         tableregtool.bTemplate, tableregtool.bSeparator , tableregtool.bBaseLine , tableregtool.bRegularTextLine = True,False,False,False
#         # creates xml and a new mpxml
#         mpxml_doc,nbPages = tableregtool.performLA(mpxml_doc)
#
#

#         self.upLoadDocument(colid, coldir,docid,sNote='NLE workflow;table reg done')

        lJobIDs = self.applyLA_URO(colid, docid, nbPages)
        # The method stops here (the job ids are not used):
        # everything below this return is unreachable
        return

        # wait for the last job
        # NOTE(review): polling without any pause between two requests, and
        # 'CANCELED' is not a final state here while it is one in the loop
        # which waits for the HTR job, further down
        bWait = True
        assert lJobIDs != []
        jobid = lJobIDs[-1]
        traceln("waiting for job %s" % jobid)
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED']

        ## coldir???
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)

        ##STOP HERE FOR DAS newx testset:
        return

        # tag text for BIES cell
        #python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
        """ 
            needed : doer = DU_ABPTable_TypedCRF(sModelName, sModelDir,
        """
        doer = DU_ABPTable_TypedCRF(self.sRowModelName, self.sRowModelDir)
        doer.load()
        ## needed predict at file level, and do not store dom, but return it
        rowpath = os.path.join(coldir, "col")
        BIESFiles = doer.predict([rowpath], docid)
        BIESDom = self.loadDom(BIESFiles[0])
        #         res= BIESDom.saveFormatFileEnc('test.mpxml', "UTF-8",True)

        # MPXML2DS
        #python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml  --docid=17442
        # NOTE(review): from here self.docid and self.coldir are used instead
        # of the docid and coldir parameters
        dsconv = primaAnalysis()
        DSBIESdoc = dsconv.convert2DS(BIESDom, self.docid)

        # create XMLDOC object
        self.ODoc = XMLDSDocument()
        self.ODoc.loadFromDom(
            DSBIESdoc)  #,listPages = range(self.firstPage,self.lastPage+1))
        # create row
        #python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
        rdc = RowDetection()
        rdc.findRowsInDoc(self.ODoc)

        #python ../../src/xml_formats/DS2PageXml.py -i trnskrbs_5400/out/17442.ds_xml --multi
        # DS2MPXML
        DS2MPXML = DS2PageXMLConvertor()
        lPageXml = DS2MPXML.run(self.ODoc.getDom())
        if lPageXml != []:
            #             if DS2MPXML.bMultiPages:
            # only the first item of each element of lPageXml is given
            newDoc = MultiPageXml.makeMultiPageXmlMemory(
                map(lambda xy: xy[0], lPageXml))
            outputFileName = os.path.join(
                self.coldir, sCOL,
                self.docid + TableProcessing.sMPXMLExtension)
            newDoc.write(outputFileName,
                         xml_declaration=True,
                         encoding="UTF-8",
                         pretty_print=True)


#             else:
#                 DS2MPXML.storePageXmlSetofFiles(lPageXml)

        return

        #upload
        # python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
        self.upLoadDocument(colid,
                            coldir,
                            docid,
                            sNote='NLE workflow;table row done')

        ## apply HTR
        ## how to deal with specific dictionaries?

        ## here need to know the ontology and the template

        # NOTE(review): the number of pages computed above is replaced by 1
        nbPages = 1
        jobid = self.applyHTR(colid, docid, nbPages, self.sHTRmodel,
                              self.sDictName)
        bWait = True
        traceln("waiting for job %s" % jobid)
        while bWait:
            dInfo = self.myTrKCient.getJobStatus(jobid)
            bWait = dInfo['state'] not in ['FINISHED', 'FAILED', 'CANCELED']

        # download  where???
        # python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
        #   coldir is not right!! coldir must refer to the parent folder!
        self.downloadCollection(colid, coldir, docid, bNoImg=True, bForce=True)
import sys, os
import logging

try: #to ease the use without proper Python installation
    import TranskribusPyClient_version
except ImportError:
    sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
    import TranskribusPyClient_version

from TranskribusPyClient.test import _colId_A
from TranskribusPyClient.client import TranskribusClient, getStoredCredentials


# Demo: list the editorial declaration features of collection _colId_A
login, pwd = getStoredCredentials()

conn = TranskribusClient(proxies={'https':'http://cornillon:8000'}
                         , loggingLevel=logging.INFO)

sessionID = conn.auth_login(login, pwd)
doc = conn.listEditDeclFeatures(_colId_A)
# "-" as file name: presumably dumps the XML on the standard output
doc.saveFormatFileEnc("-", "UTF-8", True)
conn.xmlFreeDoc(doc)

# print() with a single argument behaves the same on Python 2 and Python 3
print(conn.auth_logout())

"""
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<edFeatures>
  <edFeature>
    <featureId>1</featureId>
    <title>Long S</title>
    <description>Source uses long "s"</description>
# -*- coding: utf-8 -*-

#optional: useful if you want to choose the logging level to something else than logging.WARN
import sys, os
import logging

try:  #to ease the use without proper Python installation
    import TranskribusPyClient_version
except ImportError:
    sys.path.append(
        os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0]))))
    import TranskribusPyClient_version

from TranskribusPyClient.test import _coldId_Sandbox, _docId_a
from TranskribusPyClient.client import TranskribusClient, getStoredCredentials

# Demo: add document _docId_a to the sandbox collection
login, pwd = getStoredCredentials()

conn = TranskribusClient(proxies={'https': 'http://cornillon:8000'},
                         loggingLevel=logging.INFO)
sessionID = conn.auth_login(login, pwd)

data = conn.addDocToCollection(_coldId_Sandbox, _docId_a)
"""
True or Exception
"""

# print() with a single argument behaves the same on Python 2 and Python 3
print(conn.auth_logout())
Esempio n. 9
0
# -*- coding: utf-8 -*-

#optional: useful if you want to choose the logging level to something else than logging.WARN
import sys, os
import logging

try:  #to ease the use without proper Python installation
    import TranskribusPyClient_version
except ImportError:
    sys.path.append(
        os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0]))))
    import TranskribusPyClient_version

from TranskribusPyClient.test import _colId_A, _coldId_Sandbox, _docId_c, _docId_d
from TranskribusPyClient.client import TranskribusClient, getStoredCredentials

# Demo: duplicate two documents of collection _colId_A into the sandbox
# collection, the first one with an explicit name
login, pwd = getStoredCredentials()

conn = TranskribusClient(proxies={'https': 'http://cornillon:8000'},
                         loggingLevel=logging.INFO)
sessionID = conn.auth_login(login, pwd)

data = conn.duplicateDoc(_colId_A, _docId_c, _coldId_Sandbox, "named_by_JL")
data = conn.duplicateDoc(_colId_A, _docId_d, _coldId_Sandbox)
"""
True or Exception
"""

# print() with a single argument behaves the same on Python 2 and Python 3
print(conn.auth_logout())
Esempio n. 10
0
import sys, os
import logging

try: #to ease the use without proper Python installation
    import TranskribusPyClient_version
except ImportError:
    sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
    import TranskribusPyClient_version

from TranskribusPyClient.test import _colId_A, _docId_a
from TranskribusPyClient.client import TranskribusClient, getStoredCredentials

login, pwd = getStoredCredentials()

conn = TranskribusClient(proxies={'https':'http://*****:*****@xrce.xerox.com'}]},
  u'createdFromTimestamp': 33175290,
  u'createdToTimestamp': 33175290,
Esempio n. 11
0
import sys, os
import logging

try:  #to ease the use without proper Python installation
    import TranskribusPyClient_version
except ImportError:
    sys.path.append(
        os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0]))))
    import TranskribusPyClient_version

from TranskribusPyClient.test import _colId_A, _docId_a
from TranskribusPyClient.client import TranskribusClient, getStoredCredentials

login, pwd = getStoredCredentials()

conn = TranskribusClient(proxies={'https': 'http://*****:*****@xrce.xerox.com", "trnjluc", sHttpsProxyUrl='http://cornillon:8000')

# ret = conn.getDocumentFromServer(colid, docid)
#ret = conn.getDocumentFromServer("3571", "7750")
data = conn.getDocByIdAsXml(_colId_A, str(_docId_a))  #str just to stress-test
#data = conn.getDocByIdAsXml(3571, "7750")
# print() with a single argument behaves the same on Python 2 and Python 3
print(data)
"""