Esempio n. 1
0
    def __init__(self, sModelName, sModelDir, sComment=None):
        """
        Initialise the CRF task on dodge_graph.DU_GRAPH with a fixed
        feature and learner configuration.

        sModelName, sModelDir and sComment are forwarded unchanged to
        DU_CRF_Task.__init__. A logistic regression baseline is added last.
        """
        # passed as dFeatureConfig, together with the LogitExtractor
        # feature definition class
        dFeatureCfg = {
            'nbClass': 3,
            'n_feat_node': 500,
            't_ngrams_node': (2, 4),
            'b_node_lc': False,
            'n_feat_edge': 250,
            't_ngrams_edge': (2, 4),
            'b_edge_lc': False,
            # n_jobs when fitting the internal Logit feat extractor model
            # by grid search
            'n_jobs': 8,
        }

        # passed as dLearnerConfig
        dLearnerCfg = {
            'C': .1,
            'njobs': 8,
            'inference_cache': 50,
            'tol': .05,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': 1000,
            'uniform_classweight': True,
        }

        DU_CRF_Task.__init__(self, sModelName, sModelDir,
                             dodge_graph.DU_GRAPH,
                             dFeatureConfig=dFeatureCfg,
                             dLearnerConfig=dLearnerCfg,
                             sComment=sComment,
                             cFeatureDefinition=FeatureDefinition_PageXml_LogitExtractor)

        # use a LR model as baseline
        self.addBaseline_LogisticRegression()
Esempio n. 2
0
    def __init__(self, sModelName, sModelDir, sComment=None):
        """
        Initialise the CRF task on dodge_graph.DU_GRAPH with a tf-idf style
        feature configuration and the default feature definition.

        sModelName, sModelDir and sComment are forwarded unchanged to
        DU_CRF_Task.__init__. A logistic regression baseline is added last.
        """
        dFeatureCfg = {
            'n_tfidf_node': 500,
            't_ngrams_node': (2, 4),
            'b_tfidf_node_lc': False,
            'n_tfidf_edge': 250,
            't_ngrams_edge': (2, 4),
            'b_tfidf_edge_lc': False,
        }

        dLearnerCfg = {
            'C': .1,
            'njobs': 4,
            'inference_cache': 50,
            'tol': .05,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': 1000,
        }

        # None SO THAT WE USE THE SAME FEATURES AS FOR PageXml
        # (because it is the features by default)
        cFeatDef = None

        DU_CRF_Task.__init__(self, sModelName, sModelDir,
                             dodge_graph.DU_GRAPH,
                             dFeatureConfig=dFeatureCfg,
                             dLearnerConfig=dLearnerCfg,
                             sComment=sComment,
                             cFeatureDefinition=cFeatDef)

        # use a LR model as baseline
        self.addBaseline_LogisticRegression()
Esempio n. 3
0
 def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
     """
     Initialise the CRF task on DU_GRAPH with the LogitExtractorV2
     feature definition.

     sModelName, sModelDir and sComment are forwarded unchanged to
     DU_CRF_Task.__init__. C, tol, njobs, max_iter and inference_cache
     replace the built-in learner defaults when they are not None.

     NOTE: we might get a list in C tol max_iter inference_cache
     (in case of gridsearch)
     """
     # fall back on the built-in defaults for whatever was not given
     if C is None:
         C = .1
     if njobs is None:
         njobs = 5
     if inference_cache is None:
         inference_cache = 50
     if tol is None:
         tol = .05
     if max_iter is None:
         max_iter = 1000

     dFeatureCfg = {
         'nbClass': 3,
         't_ngrams_node': (2, 4),
         'b_node_lc': False,
         't_ngrams_edge': (2, 4),
         'b_edge_lc': False,
         # n_jobs when fitting the internal Logit feat extractor model
         # by grid search
         'n_jobs': 5,
     }

     dLearnerCfg = {
         'C': C,
         'njobs': njobs,
         'inference_cache': inference_cache,
         'tol': tol,
         'save_every': 50,  # save every 50 iterations, for warm start
         'max_iter': max_iter,
     }

     DU_CRF_Task.__init__(self, sModelName, sModelDir, DU_GRAPH,
                          dFeatureConfig=dFeatureCfg,
                          dLearnerConfig=dLearnerCfg,
                          sComment=sComment,
                          cFeatureDefinition=FeatureDefinition_PageXml_LogitExtractorV2)

     # so that we check if all classes are represented in the training set
     self.setNbClass(3)

     # use a LR model trained by GridSearch as baseline
     self.bsln_mdl = self.addBaseline_LogisticRegression()
Esempio n. 4
0
    def __init__(self,
                 sModelName,
                 sModelDir,
                 sComment=None,
                 C=None,
                 tol=None,
                 njobs=None,
                 max_iter=None,
                 inference_cache=None):
        """
        Initialise the CRF task on self.DU_GRAPH with the
        StandardOnes_noText feature definition.

        sModelName, sModelDir and sComment are forwarded unchanged to
        DU_CRF_Task.__init__. C, tol, njobs, max_iter and inference_cache
        replace the built-in learner defaults when they are not None.
        No dFeatureConfig is passed.
        """
        # fall back on the built-in defaults for whatever was not given
        if C is None:
            C = .1
        if njobs is None:
            njobs = 8
        if inference_cache is None:
            inference_cache = 50
        if tol is None:
            tol = .05
        if max_iter is None:
            max_iter = 1000

        dLearnerCfg = {
            'C': C,
            'njobs': njobs,
            'inference_cache': inference_cache,
            'tol': tol,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': max_iter,
        }

        DU_CRF_Task.__init__(self, sModelName, sModelDir, self.DU_GRAPH,
                             dLearnerConfig=dLearnerCfg,
                             sComment=sComment,
                             cFeatureDefinition=FeatureDefinition_PageXml_StandardOnes_noText)

        traceln("- classes: ", self.DU_GRAPH.getLabelNameList())

        # use a LR model trained by GridSearch as baseline
        self.bsln_mdl = self.addBaseline_LogisticRegression()
Esempio n. 5
0
    def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
        """
        Initialise the CRF task on DU_GRAPH with a tf-idf style feature
        configuration; no cFeatureDefinition is passed.

        sModelName, sModelDir and sComment are forwarded unchanged to
        DU_CRF_Task.__init__. C, tol, njobs, max_iter and inference_cache
        replace the built-in learner defaults when they are not None.
        """
        # fall back on the built-in defaults for whatever was not given
        if C is None:
            C = .1
        if njobs is None:
            njobs = 5
        if inference_cache is None:
            inference_cache = 50
        if tol is None:
            tol = .05
        if max_iter is None:
            max_iter = 1000

        dFeatureCfg = {
            'n_tfidf_node': 500,
            't_ngrams_node': (2, 4),
            'b_tfidf_node_lc': False,
            'n_tfidf_edge': 250,
            't_ngrams_edge': (2, 4),
            'b_tfidf_edge_lc': False,
        }

        dLearnerCfg = {
            'C': C,
            'njobs': njobs,
            'inference_cache': inference_cache,
            'tol': tol,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': max_iter,
        }

        DU_CRF_Task.__init__(self, sModelName, sModelDir, DU_GRAPH,
                             dFeatureConfig=dFeatureCfg,
                             dLearnerConfig=dLearnerCfg,
                             sComment=sComment)

        # self.setNbClass(5+1) is deprecated and therefore not called

        # use a LR model as baseline
        self.addBaseline_LogisticRegression()
Esempio n. 6
0
    def __init__(self, sModelName, sModelDir, sComment=None):
        """
        Initialise the CRF task on DU_GRAPH with the LogitExtractorV2
        feature definition and a fixed configuration.

        sModelName, sModelDir and sComment are forwarded unchanged to
        DU_CRF_Task.__init__. A logistic regression baseline is added last.
        """
        dFeatureCfg = {
            'nbClass': nt.nCls,  # number of classes
            'n_feat_node': 500,  # number of ngrams to extract by chi2
            't_ngrams_node': (2, 4),
            'b_node_lc': False,
            'n_feat_edge': 250,  # number of ngrams to extract by chi2
            't_ngrams_edge': (2, 4),
            'b_edge_lc': False,
            # n_jobs when fitting the internal Logit feat extractor model
            # by grid search
            'n_jobs': 10,
        }

        dLearnerCfg = {
            'C': .1,
            'njobs': 4,
            'inference_cache': 50,
            'tol': .05,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': 250,
        }

        DU_CRF_Task.__init__(self, sModelName, sModelDir, DU_GRAPH,
                             dFeatureConfig=dFeatureCfg,
                             dLearnerConfig=dLearnerCfg,
                             sComment=sComment,
                             cFeatureDefinition=FeatureDefinition_PageXml_LogitExtractorV2)

        # use a LR model as baseline
        self.addBaseline_LogisticRegression()
Esempio n. 7
0
                                   , ('ATMOSTONE', nt, 'title'    , False)    #0 or 1 heading pare page
                                 ] )

# ===============================================================================================================


class DU_BL_V1(DU_Baseline):
    """
    Baseline task bound to the module-level DU_GRAPH.
    """

    def __init__(self, sModelName, sModelDir, logitID, sComment=None):
        """
        Forward the model name, model directory and logitID to
        DU_Baseline.__init__, together with DU_GRAPH.

        NOTE(review): sComment is accepted but not forwarded - confirm
        against the DU_Baseline signature whether it should be.
        """
        DU_Baseline.__init__(
            self,
            sModelName,
            sModelDir,
            DU_GRAPH,
            logitID,
        )



if __name__ == "__main__":
    # Command-line entry point: build a DU_BL_V1 task from the two positional
    # arguments and, when the rm option is set, remove it and stop.

    version = "v.01"
    usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(sys.argv[0], version)

    # ---
    # parse the command line
    (options, args) = parser.parse_args()
    # ---
    # exactly two positional arguments are expected: the model directory,
    # then the model name
    try:
        sModelDir, sModelName = args
    except Exception as e:
        # presumably reports the usage and the error, then exits with
        # status 1 - TODO confirm against the definition of _exit
        _exit(usage, 1, e)

    # 'logit_5' is passed as the logitID argument of DU_BL_V1
    doer = DU_BL_V1(sModelName, sModelDir,'logit_5')

    # when the rm option is set, call doer.rm() and exit with status 0
    if options.rm:
        doer.rm()
        sys.exit(0)
    def __init__(self,
                 sModelName,
                 sModelDir,
                 sComment=None,
                 C=None,
                 tol=None,
                 njobs=None,
                 max_iter=None,
                 inference_cache=None):
        """
        Declare the node type and the graph class, then initialise the CRF
        task with the NoNodeFeat_v3 feature definition.

        sModelName, sModelDir and sComment are forwarded unchanged to
        DU_CRF_Task.__init__. C, tol, njobs, max_iter and inference_cache
        replace the learner defaults (.1, .05, 8, 1000 and 50 respectively)
        when they are not None.

        NOTE(review): addNodeType is called on Graph_MultiSinglePageXml
        itself (presumably a class), so every instantiation registers the
        node type again - confirm that this is intended.
        """
        lLabels = ['RB', 'RI', 'RE', 'RS', 'RO']
        lIgnoredLabels = None

        # If you play with a toy collection, which does not have all expected
        # classes, you can reduce those: set lActuallySeen to the indices
        # (in lLabels) of the labels to keep. The other labels are passed as
        # ignored labels and will become OTHER.
        lActuallySeen = None
        if lActuallySeen:
            # single-argument print(...) calls give the same output under
            # Python 2 and Python 3
            print("REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
            lIgnoredLabels = [
                sLabel for i, sLabel in enumerate(lLabels)
                if i not in lActuallySeen
            ]
            lLabels = [lLabels[i] for i in lActuallySeen]
            print("%s %s" % (len(lLabels), lLabels))
            print("%s %s" % (len(lIgnoredLabels), lIgnoredLabels))

        # DEFINING THE CLASS OF GRAPH WE USE
        DU_GRAPH = Graph_MultiSinglePageXml
        nt = NodeType_PageXml_type_woText(
            "abp",  # some short prefix because labels below are prefixed with it
            lLabels,
            lIgnoredLabels,
            False,  # no label means OTHER
            # we reduce overlap in this way
            # NOTE(review): v / 3 is an integer division under Python 2 when
            # v is an int, and a true division under Python 3
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)))

        nt.setXpathExpr((
            ".//pc:TextLine",  # how to find the nodes
            "./pc:TextEquiv",  # how to get their text
        ))

        DU_GRAPH.addNodeType(nt)

        dLearnerCfg = {
            'C': .1 if C is None else C,
            'njobs': 8 if njobs is None else njobs,
            'inference_cache': 50 if inference_cache is None else inference_cache,
            'tol': .05 if tol is None else tol,
            'save_every': 50,  # save every 50 iterations, for warm start
            'max_iter': 1000 if max_iter is None else max_iter,
        }

        DU_CRF_Task.__init__(
            self,
            sModelName,
            sModelDir,
            DU_GRAPH,
            dFeatureConfig={},
            dLearnerConfig=dLearnerCfg,
            sComment=sComment,
            cFeatureDefinition=FeatureDefinition_PageXml_NoNodeFeat_v3)

        # use a LR model trained by GridSearch as baseline
        self.bsln_mdl = self.addBaseline_LogisticRegression()
 def predict(self, lsColDir):
     """
     Set the XML filename pattern to "*.mpxml", then delegate to
     DU_CRF_Task.predict.

     Return the list of produced files.
     """
     self.sXmlFilenamePattern = "*.mpxml"
     lsProducedFiles = DU_CRF_Task.predict(self, lsColDir)
     return lsProducedFiles