def __init__(self, sModelName, sModelDir, sComment=None):
    """Set up the CRF task using the Logit-based feature extractor.

    Node/edge features are chi2-selected character n-grams; a Logistic
    Regression model is attached as baseline.
    """
    dFeatCfg = {
        'nbClass': 3,
        'n_feat_node': 500,
        't_ngrams_node': (2, 4),
        'b_node_lc': False,
        'n_feat_edge': 250,
        't_ngrams_edge': (2, 4),
        'b_edge_lc': False,
        # n_jobs when fitting the internal Logit feat extractor model by grid search
        'n_jobs': 8,
    }
    dLearnCfg = {
        'C': .1,
        'njobs': 8,
        'inference_cache': 50,
        # 'tol': .1
        'tol': .05,
        'save_every': 50,  # save every 50 iterations, for warm start
        'max_iter': 1000,
        'uniform_classweight': True,
    }
    DU_CRF_Task.__init__(self, sModelName, sModelDir,
                         dodge_graph.DU_GRAPH,
                         dFeatureConfig=dFeatCfg,
                         dLearnerConfig=dLearnCfg,
                         sComment=sComment,
                         cFeatureDefinition=FeatureDefinition_PageXml_LogitExtractor)
    # use a LR model as baseline
    self.addBaseline_LogisticRegression()
def __init__(self, sModelName, sModelDir, sComment=None):
    """Set up the CRF task with TF-IDF n-gram features.

    No cFeatureDefinition is passed (None) so that the same default
    features as for PageXml are used. A Logistic Regression baseline
    is attached.
    """
    DU_CRF_Task.__init__(
        self, sModelName, sModelDir,
        dodge_graph.DU_GRAPH,
        dFeatureConfig={
            'n_tfidf_node': 500,
            't_ngrams_node': (2, 4),
            'b_tfidf_node_lc': False,
            'n_tfidf_edge': 250,
            't_ngrams_edge': (2, 4),
            'b_tfidf_edge_lc': False,
        },
        dLearnerConfig={
            'C': .1,
            'njobs': 4,
            'inference_cache': 50,
            # 'tol': .1
            'tol': .05,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': 1000,
        },
        sComment=sComment,
        # SO THAT WE USE THE SAME FEATURES AS FOR PageXml (because it is the features by default)
        cFeatureDefinition=None,
    )
    # use a LR model as baseline
    self.addBaseline_LogisticRegression()
def __init__(self, sModelName, sModelDir, sComment=None,
             C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
    """Set up the CRF task with the LogitExtractorV2 feature definition.

    NOTE: C, tol, max_iter, inference_cache may each be a list
    (in case of grid search); None selects the default value.
    """
    dLearnCfg = {
        'C': .1 if C is None else C,
        'njobs': 5 if njobs is None else njobs,
        'inference_cache': 50 if inference_cache is None else inference_cache,
        'tol': .05 if tol is None else tol,
        'save_every': 50,  # save every 50 iterations, for warm start
        'max_iter': 1000 if max_iter is None else max_iter,
    }
    DU_CRF_Task.__init__(
        self, sModelName, sModelDir,
        DU_GRAPH,
        dFeatureConfig={
            'nbClass': 3,
            't_ngrams_node': (2, 4),
            'b_node_lc': False,
            't_ngrams_edge': (2, 4),
            'b_edge_lc': False,
            # n_jobs when fitting the internal Logit feat extractor model by grid search
            'n_jobs': 5,
        },
        dLearnerConfig=dLearnCfg,
        sComment=sComment,
        cFeatureDefinition=FeatureDefinition_PageXml_LogitExtractorV2,
    )
    # so that we check if all classes are represented in the training set
    self.setNbClass(3)
    # use a LR model trained by GridSearch as baseline
    self.bsln_mdl = self.addBaseline_LogisticRegression()
def __init__(self, sModelName, sModelDir, sComment=None,
             C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
    """Set up a text-less CRF task on self.DU_GRAPH.

    Hyperparameters default to project-standard values when None;
    a Logistic Regression baseline (GridSearch-trained) is attached.
    """
    DU_CRF_Task.__init__(
        self, sModelName, sModelDir,
        self.DU_GRAPH,
        dLearnerConfig={
            'C': .1 if C is None else C,
            'njobs': 8 if njobs is None else njobs,
            'inference_cache': 50 if inference_cache is None else inference_cache,
            # 'tol': .1
            'tol': .05 if tol is None else tol,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': 1000 if max_iter is None else max_iter,
        },
        sComment=sComment,
        cFeatureDefinition=FeatureDefinition_PageXml_StandardOnes_noText,
        # Typed-graph alternative, kept for reference:
        # cFeatureDefinition=FeatureDefinition_T_PageXml_StandardOnes_noText,
        # dFeatureConfig={
        #     # config for the extractor of nodes of each type
        #     "text": None, "sprtr": None,
        #     # config for the extractor of edges of each type
        #     "text_text": None, "text_sprtr": None,
        #     "sprtr_text": None, "sprtr_sprtr": None,
        # },
    )
    traceln("- classes: ", self.DU_GRAPH.getLabelNameList())
    # use a LR model trained by GridSearch as baseline
    self.bsln_mdl = self.addBaseline_LogisticRegression()
def __init__(self, sModelName, sModelDir, sComment=None,
             C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
    """Set up the CRF task with TF-IDF n-gram features (default feature definition).

    Hyperparameters default to project-standard values when None.
    """
    dFeatCfg = {
        'n_tfidf_node': 500,
        't_ngrams_node': (2, 4),
        'b_tfidf_node_lc': False,
        'n_tfidf_edge': 250,
        't_ngrams_edge': (2, 4),
        'b_tfidf_edge_lc': False,
    }
    # Earlier fixed configuration, kept for reference:
    # dLearnerConfig={'C': .1, 'njobs': 4, 'inference_cache': 50,
    #                 'tol': .1, 'save_every': 50, 'max_iter': 250}
    dLearnCfg = {
        'C': .1 if C is None else C,
        'njobs': 5 if njobs is None else njobs,
        'inference_cache': 50 if inference_cache is None else inference_cache,
        # 'tol': .1
        'tol': .05 if tol is None else tol,
        'save_every': 50,  # save every 50 iterations, for warm start
        'max_iter': 1000 if max_iter is None else max_iter,
    }
    DU_CRF_Task.__init__(self, sModelName, sModelDir,
                         DU_GRAPH,
                         dFeatureConfig=dFeatCfg,
                         dLearnerConfig=dLearnCfg,
                         sComment=sComment)
    # deprecated
    self.setNbClass(5+1)
    # use a LR model as baseline
    self.addBaseline_LogisticRegression()
def __init__(self, sModelName, sModelDir, sComment=None):
    """Set up the CRF task using the LogitExtractorV2 feature definition.

    Reads the number of classes from the module-level node type ``nt``;
    attaches a Logistic Regression baseline.
    """
    DU_CRF_Task.__init__(
        self, sModelName, sModelDir,
        DU_GRAPH,
        dFeatureConfig={
            'nbClass': nt.nCls,       # number of classes
            'n_feat_node': 500,       # number of ngrams to extract by chi2
            't_ngrams_node': (2, 4),
            'b_node_lc': False,
            'n_feat_edge': 250,       # number of ngrams to extract by chi2
            't_ngrams_edge': (2, 4),
            'b_edge_lc': False,
            # n_jobs when fitting the internal Logit feat extractor model by grid search
            'n_jobs': 10,
        },
        dLearnerConfig={
            'C': .1,
            'njobs': 4,
            'inference_cache': 50,
            # 'tol': .1
            'tol': .05,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': 250,
        },
        sComment=sComment,
        cFeatureDefinition=FeatureDefinition_PageXml_LogitExtractorV2,
    )
    # use a LR model as baseline
    self.addBaseline_LogisticRegression()
# NOTE(review): this span opens mid-expression (the tail of a constraint list whose
# start is outside this view) and the __main__ script appears truncated at the end
# (presumably train/test handling follows the `options.rm` branch — confirm against
# the full file). Left byte-identical: it cannot be safely reformatted in isolation.
# It declares DU_BL_V1, a thin DU_Baseline wrapper bound to DU_GRAPH and a logit id,
# and a CLI entry point that instantiates DU_BL_V1(..., 'logit_5') from argv.
, ('ATMOSTONE', nt, 'title' , False) #0 or 1 heading pare page ] ) # =============================================================================================================== class DU_BL_V1(DU_Baseline): def __init__(self, sModelName, sModelDir,logitID,sComment=None): DU_Baseline.__init__(self, sModelName, sModelDir,DU_GRAPH,logitID) if __name__ == "__main__": version = "v.01" usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(sys.argv[0], version) # --- #parse the command line (options, args) = parser.parse_args() # --- try: sModelDir, sModelName = args except Exception as e: _exit(usage, 1, e) doer = DU_BL_V1(sModelName, sModelDir,'logit_5') if options.rm: doer.rm() sys.exit(0)
def __init__(self, sModelName, sModelDir, sComment=None,
             C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
    """Set up the BIESO row-labelling CRF task (text-less node features).

    Builds the graph class and node type locally (TextLine nodes labelled
    RB/RI/RE/RS/RO), then initializes the parent DU_CRF_Task and attaches
    a GridSearch-trained Logistic Regression baseline.

    Fix: the class-reduction branch used Python-2 ``print`` statements,
    which are a SyntaxError under Python 3; converted to ``print()`` calls.
    """
    # ===========================================================================
    lLabels = ['RB', 'RI', 'RE', 'RS', 'RO']
    lIgnoredLabels = None
    nbClass = len(lLabels)

    # If you play with a toy collection, which does not have all expected
    # classes, you can reduce those by listing the seen class indices here.
    lActuallySeen = None
    if lActuallySeen:
        print("REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
        lIgnoredLabels = [lLabels[i] for i in range(len(lLabels))
                          if i not in lActuallySeen]
        lLabels = [lLabels[i] for i in lActuallySeen]
        print(len(lLabels), lLabels)
        print(len(lIgnoredLabels), lIgnoredLabels)
        nbClass = len(lLabels) + 1  # because the ignored labels will become OTHER

    # DEFINING THE CLASS OF GRAPH WE USE
    DU_GRAPH = Graph_MultiSinglePageXml
    nt = NodeType_PageXml_type_woText(
        "abp",  # some short prefix because labels below are prefixed with it
        lLabels,
        lIgnoredLabels,
        False,  # no label means OTHER
        # we reduce overlap in this way
        BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)),
    )
    # ntA variant (TextLine | TextRegion nodes) removed; see project history.
    nt.setXpathExpr((".//pc:TextLine"     # how to find the nodes
                     , "./pc:TextEquiv")  # how to get their text
                    )
    DU_GRAPH.addNodeType(nt)
    # ===========================================================================

    DU_CRF_Task.__init__(
        self, sModelName, sModelDir,
        DU_GRAPH,
        dFeatureConfig={},
        dLearnerConfig={
            'C': .1 if C is None else C,
            'njobs': 8 if njobs is None else njobs,
            'inference_cache': 50 if inference_cache is None else inference_cache,
            # 'tol': .1
            'tol': .05 if tol is None else tol,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': 1000 if max_iter is None else max_iter,
        },
        sComment=sComment,
        # cFeatureDefinition=FeatureDefinition_PageXml_StandardOnes_noText
        cFeatureDefinition=FeatureDefinition_PageXml_NoNodeFeat_v3,
    )
    # self.setNbClass(3)  # so that we check if all classes are represented in the training set
    # use a LR model trained by GridSearch as baseline
    self.bsln_mdl = self.addBaseline_LogisticRegression()
def predict(self, lsColDir):
    """Predict on the given collection directories.

    Forces the ``*.mpxml`` filename pattern before delegating to the
    parent class. Returns the list of produced files.
    """
    self.sXmlFilenamePattern = "*.mpxml"
    return DU_CRF_Task.predict(self, lsColDir)