def _testBaselines(self, lX, lY, lLabelName=None, lsDocName=None):
        """
        Test the baseline models, one node type at a time.

        For each type index in range(self.nbType), the X and Y of that type
        are obtained from self._getXY_forType, and every baseline model
        predicts with its sub-model for that type (mdl[itype]).

        lX, lY      -- features and labels, forwarded to self._getXY_forType
        lLabelName  -- optional label names, forwarded to TestReport
        lsDocName   -- optional document names; if given, must have the
                       same length as lX

        Return a list of TestReport, one per (type, baseline model) pair.
        The list is empty when no baseline model is set.
        """
        if lsDocName: assert len(lX) == len(lsDocName), "Internal error"

        lTstRpt = []
        if not self._lMdlBaseline:
            return lTstRpt

        for itype in range(self.nbType):
            X_flat, Y_flat = self._getXY_forType(lX, lY, itype)
            traceln("\t\t type %d   #nodes=%d  #features=%d" %
                    (itype, X_flat.shape[0], X_flat.shape[1]))
            for mdl in self._lMdlBaseline:
                chronoOn("_testBaselines_T")
                Y_pred_flat = mdl[itype].predict(X_flat)
                traceln("\t\t [%.1fs] done\n" %
                        chronoOff("_testBaselines_T"))
                lTstRpt.append(
                    TestReport(str(mdl),
                               Y_pred_flat,
                               Y_flat,
                               lLabelName,
                               lsDocName=lsDocName))
                # drop our reference to the prediction as soon as possible
                del Y_pred_flat
            # release the per-type arrays inside the loop: deleting them
            # after the loop fails with an UnboundLocalError when
            # self.nbType is 0, since the names are then never bound
            del X_flat, Y_flat
        return lTstRpt
 def _testBaselinesEco(self, lX, lY, lLabelName=None, lsDocName=None):
     """
     Memory-friendly test of the baseline models: the graphs are processed
     one by one, and type by type, instead of building one huge X.

     lX, lY      -- features and labels, iterated in parallel
     lLabelName  -- optional label names, forwarded to the reports
     lsDocName   -- optional document names; if given, must have the
                    same length as lX

     Return a list of TestReportConfusion, one per baseline model.
     """
     if lsDocName:
         assert len(lX) == len(lsDocName), "Internal error"

     lReport = []
     for mdl in self._lMdlBaseline:
         chronoOn("_testBaselinesEco_T")
         # a confusion-matrix-based report can accumulate partial results
         oConfusion = TestReportConfusion(str(mdl), [], lLabelName,
                                          lsDocName=lsDocName)
         for X, Y in zip(lX, lY):
             for itype in range(self.nbType):
                 X_type, Y_type = self._getXY_forType([X], [Y], itype)
                 Y_type_pred = mdl[itype].predict(X_type)
                 oPartial = TestReport(str(mdl), Y_type_pred, Y_type,
                                       lLabelName, lsDocName=lsDocName)
                 oConfusion.accumulate(oPartial)
         fElapsed = chronoOff("_testBaselinesEco_T")
         traceln("\t\t [%.1fs] done\n" % fElapsed)
         lReport.append(oConfusion)
     return lReport
Ejemplo n.º 3
0
 def _testBaselinesEco(self, lX, lY, lLabelName=None, lsDocName=None):
     """
     Memory-friendly test of the baseline models: the graphs are processed
     one by one instead of building one huge X.

     lX, lY      -- features and labels, iterated in parallel
     lLabelName  -- optional label names, forwarded to the reports
     lsDocName   -- optional document names; if given, must have the
                    same length as lX

     Return a list of TestReportConfusion, one per baseline model.
     """
     if lsDocName:
         assert len(lX) == len(lsDocName), "Internal error"

     lReport = []
     for mdl in self._lMdlBaseline:
         chronoOn()
         # a confusion-matrix-based report can accumulate partial results
         oConfusion = TestReportConfusion(str(mdl), [], lLabelName,
                                          lsDocName=lsDocName)
         for X, Y in zip(lX, lY):
             # NOTE (JLM June 2017): a bug is suspected here, because
             # X_flat is probably required instead of X.
             Y_graph_pred = mdl.predict(X)
             oPartial = TestReport(str(mdl), Y_graph_pred, Y,
                                   lLabelName, lsDocName=lsDocName)
             oConfusion.accumulate(oPartial)
         fElapsed = chronoOff()
         traceln("\t\t [%.1fs] done\n" % fElapsed)
         lReport.append(oConfusion)
     return lReport
Ejemplo n.º 4
0
    def _testEdgeBaselines(self, lX, lY, lLabelName=None, lsDocName=None):
        """
        Test the edge baseline model, if any.

        lX, lY      -- features and labels, forwarded to self._getEdgeXEdgeY
        lLabelName  -- optional node label names; the edge label names are
                       built from every ordered pair of them
        lsDocName   -- optional document names; if given, must have the
                       same length as lX

        Return a list of TestReport: a singleton when an edge baseline
        model is set, an empty list otherwise.
        """
        oEdgeMdl = self._EdgeBaselineModel
        if not oEdgeMdl:
            return []

        if lsDocName:
            assert len(lX) == len(lsDocName), "Internal error"

        # one edge label name per ordered pair of node label names
        if lLabelName:
            lEdgeLabelName = ["%s_%s" % (sLbl1, sLbl2)
                              for sLbl1 in lLabelName
                              for sLbl2 in lLabelName]
        else:
            lEdgeLabelName = None

        X_flat, Y_flat = self._getEdgeXEdgeY(lX, lY)
        chronoOn("_testEdgeBaselines")
        Y_pred_flat = oEdgeMdl.predict(X_flat)
        traceln("\t\t [%.1fs] done\n" % chronoOff("_testEdgeBaselines"))
        oReport = TestReport(str(oEdgeMdl),
                             Y_pred_flat,
                             Y_flat,
                             lEdgeLabelName,
                             lsDocName=lsDocName)

        del X_flat, Y_flat, Y_pred_flat
        return [oReport]
Ejemplo n.º 5
0
    def test(self, lGraph, lsDocName=None, predict_proba=False):
        """
        Test the model on the given graphs.

        lGraph        -- non-empty list of graphs
        lsDocName     -- optional document names, forwarded to the reports
        predict_proba -- if True, return the output of
                         self.gcn_model.predict_prob_lG instead of a report

        Return the TestReport of the model, with the baseline reports
        attached to it, or the predicted probabilities when predict_proba
        is True (no baseline is tested in that case).
        """
        # the model is assumed to have been created or loaded beforehand
        assert lGraph

        gcn_graph_test, lX, lY = self._prepare_for_test(lGraph)
        lLabelName = lGraph[0].getLabelNameList()

        chronoOn("test2")
        oSession = self.tf_session
        if not predict_proba:
            lY_pred = self.gcn_model.predict_lG(oSession,
                                                gcn_graph_test,
                                                verbose=False)
            traceln(" [%.1fs] done\n" % chronoOff("test2"))

            ret = TestReport(self.sName, lY_pred, lY, lLabelName,
                             lsDocName=lsDocName)
            ret.attach(self._testBaselines(lX, lY, lLabelName,
                                           lsDocName=lsDocName))
        else:
            # TODO: this branch should be split into its own function
            ret = self.gcn_model.predict_prob_lG(oSession,
                                                 gcn_graph_test,
                                                 verbose=False)
            traceln(" [%.1fs] done\n" % chronoOff("test2"))

        # do some garbage collection
        del lX, lY
        gc.collect()

        return ret
Ejemplo n.º 6
0
    def test(self, lGraph, lsDocName=None):
        """
        Test the model on the given graphs.

        The features are computed, the model predicts (with the page
        constraints if the first graph declares some), and the node and
        edge baseline models, if any, are tested as well.

        lGraph    -- non-empty list of graphs
        lsDocName -- optional document names, forwarded to the reports

        Return the TestReport of the model, with the baseline reports
        attached to it.
        """
        assert lGraph
        oFirstGraph = lGraph[0]
        lLabelName = oFirstGraph.getLabelNameList()
        bConstraint = oFirstGraph.getPageConstraint()

        traceln("- computing features on test set")
        chronoOn("test")
        lX, lY = self.get_lX_lY(lGraph)
        traceln("\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph))
        # the number of node and edge features is discovered dynamically
        self._computeModelCaracteristics(lX)
        traceln("\t %s" % self._getNbFeatureAsText())
        traceln("[%.1fs] done\n" % chronoOff("test"))

        traceln("- predicting on test set")
        chronoOn("test2")
        if not bConstraint:
            lY_pred = self.ssvm.predict(lX)
        else:
            lConstraints = [g.instanciatePageConstraints() for g in lGraph]
            lY_pred = self._ssvm_ad3plus_predict(lX, lConstraints)
        traceln(" [%.1fs] done\n" % chronoOff("test2"))

        tstRpt = TestReport(self.sName, lY_pred, lY, lLabelName,
                            lsDocName=lsDocName)

        # node baselines first, then edge baselines
        for fnBaselineTest in (self._testBaselines, self._testEdgeBaselines):
            tstRpt.attach(
                fnBaselineTest(lX, lY, lLabelName, lsDocName=lsDocName))

        # do some garbage collection
        del lX, lY
        gc.collect()

        return tstRpt
Ejemplo n.º 7
0
    def train(self,
              lGraph,
              bWarmStart=True,
              expiration_timestamp=None,
              verbose=0):
        """
        Train the ensemble: compute the features, store the label binarizer
        and the model configuration on disk, create the sub-models and
        train each of them on lGraph.

        lGraph               -- the training graphs
        bWarmStart           -- forwarded to the train method of each
                                sub-model
        expiration_timestamp -- not used by this implementation
        verbose              -- not used by this implementation
        """
        print('Ensemble ECN Training')
        traceln("\t- computing features on training set")
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph))
        chronoOn()
        lX, lY = self.get_lX_lY(lGraph)

        # we discover here dynamically the number of features of nodes and
        # edges: self._tNF_EF contains the number of node features and
        # edge features
        self._computeModelCaracteristics(lX)
        traceln("\t\t %s" % self._getNbFeatureAsText())
        traceln("\t [%.1fs] done\n" % chronoOff())

        nb_class = self.getNbClass()  # Is it better to do Y.shape ?
        print('nb_class', nb_class)

        self.model_config['node_dim'] = self._tNF_EF[0]
        self.model_config['edge_dim'] = self._tNF_EF[1]
        self.model_config['nb_class'] = nb_class
        traceln("\t- creating the sub-models")

        # TODO: do we need this conversion, can we share the label
        # binarizer and so on...
        # The conversion is called for its side effect: it sets the label
        # binarizer. Its result is not used here.
        self.convert_lX_lY_to_GCNDataset(lX, lY, training=True)

        # Save the label binarizer for prediction usage.
        # 'with' guarantees the file is closed even if pickling fails.
        with open(self.getlabelBinarizerFilename(), 'wb') as fd_lb:
            pickle.dump(self.labelBinarizer, fd_lb)
        # Save the model config in order to restore the model later
        with open(self.getModelConfigFilename(), 'wb') as fd_mc:
            pickle.dump(self.model_config, fd_mc)

        # This creates all the sub-models
        self._init_model()

        for du_model in self.models:
            # The train will create a tf graph and create the model
            du_model.train(lGraph, bWarmStart=bWarmStart)
Ejemplo n.º 8
0
    def _prepare_for_test(self, lGraph):
        """
        Compute the features and labels of the test graphs and convert
        them with self.convert_lX_lY_to_GCNDataset (training=False,
        test=True).

        Return the tuple (converted dataset, lX, lY).
        """
        traceln("\t- computing features on test set")
        tNodeEdgeCount = Graph.getNodeEdgeTotalNumber(lGraph)
        traceln("\t\t #nodes=%d  #edges=%d " % tNodeEdgeCount)

        chronoOn()
        lX, lY = self.get_lX_lY(lGraph)
        fElapsed = chronoOff()
        traceln("\t [%.1fs] done\n" % fElapsed)

        oDataset = self.convert_lX_lY_to_GCNDataset(lX, lY,
                                                    training=False,
                                                    test=True)
        return oDataset, lX, lY
Ejemplo n.º 9
0
 def _trainBaselines(self, lX, lY):
     """
     Train the baseline models, if any, one node type at a time: for each
     type, every baseline model fits its sub-model for that type
     (mdl[itype]) on the X and Y of that type.

     Always return True.
     """
     if not self._lMdlBaseline:
         return True

     for itype in range(self.nbType):
         X_type, Y_type = self._getXY_forType(lX, lY, itype)
         if False:  # disabled debugging aid: dump the per-type data
             with open("XY_flat_Type%d.pkl"%(itype), "wb") as fd:
                 pickle.dump((X_type, Y_type), fd)
         for oBaseline in self._lMdlBaseline:
             chronoOn()
             traceln("\t - training baseline model: %s"%str(oBaseline))
             oBaseline[itype].fit(X_type, Y_type)
             traceln("\t [%.1fs] done\n"%chronoOff())
         del X_type, Y_type
     return True
Ejemplo n.º 10
0
 def _trainEdgeBaseline(self, lX, lY):
     """
     Train the edge baseline model, which predicts the pair of labels of
     each edge. The model is obtained from self.getEdgeModel() and stored
     in self._EdgeBaselineModel; nothing is trained if there is none.
     This code assumes a single type.

     Always return True.
     """
     oEdgeMdl = self.getEdgeModel()
     self._EdgeBaselineModel = oEdgeMdl
     if not oEdgeMdl:
         traceln("\t - no edge baseline model for this model")
         return True

     X_flat, Y_flat = self._getEdgeXEdgeY(lX, lY)

     # NOTE(review): the data is always dumped in the current working
     # directory; this looks like a debugging leftover - confirm it is
     # wanted.
     with open("edgeXedgeY_flat.pkl", "wb") as fd:
         pickle.dump((X_flat, Y_flat), fd)

     chronoOn()
     traceln("\t - training edge baseline model: %s"%str(oEdgeMdl))
     oEdgeMdl.fit(X_flat, Y_flat)
     traceln("\t [%.1fs] done\n"%chronoOff())
     del X_flat, Y_flat

     return True
Ejemplo n.º 11
0
    def test(self, lGraph, lsDocName=None, predict_proba=False):
        """
        Test the ensemble on the given graphs: each sub-model predicts
        probabilities, which are averaged with
        DU_Ensemble_ECN.average_prediction to get the final prediction.

        lGraph        -- non-empty list of graphs
        lsDocName     -- optional document names, forwarded to the
                         sub-models and to the report
        predict_proba -- not used by this implementation (the sub-models
                         are always called with predict_proba=True)

        Return a TestReport object. No baseline model is tested here.
        """
        # Assume the model was created or loaded

        assert lGraph
        lLabelName = lGraph[0].getLabelNameList()
        traceln("\t- computing features on test set")
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph))
        chronoOn()
        lY = self.get_lY(lGraph)
        # close the timer opened above, as the sibling methods do: it was
        # previously left open, unmatched by any chronoOff()
        traceln("\t [%.1fs] done\n" % chronoOff())

        lY_pred_proba = []
        for du_model in self.models:
            model_pred = du_model.test(lGraph,
                                       lsDocName=lsDocName,
                                       predict_proba=True)
            lY_pred_proba.append(model_pred)

        print('Number of Models', len(lY_pred_proba))
        lY_pred, _ = DU_Ensemble_ECN.average_prediction(lY_pred_proba)
        tstRpt = TestReport(self.sName,
                            lY_pred,
                            lY,
                            lLabelName,
                            lsDocName=lsDocName)

        # do some garbage collection
        del lY
        gc.collect()

        return tstRpt
Ejemplo n.º 12
0
    def _trainBaselines(self, lX, lY):
        """
        Train the baseline models, if any, on the flattened X and the
        horizontally stacked Y; then train the edge baseline model when
        self.bTrainEdgeBaseline is set.

        Always return True.
        """
        lBaseline = self._lMdlBaseline
        if lBaseline:
            X_all = self._get_X_flat(lX)
            Y_all = np.hstack(lY)
            if False:  # disabled debugging aid: dump the flat data
                with open("XY_flat.pkl", "wb") as fd:
                    pickle.dump((X_all, Y_all), fd)
            for oBaseline in lBaseline:
                chronoOn()
                traceln("\t - training baseline model: %s" % str(oBaseline))
                oBaseline.fit(X_all, Y_all)
                traceln("\t [%.1fs] done\n" % chronoOff())
            del X_all, Y_all

        if self.bTrainEdgeBaseline:
            traceln(' - training edge baseline')
            # a predefined model is trained on the edges
            self._trainEdgeBaseline(lX, lY)

        return True
Ejemplo n.º 13
0
    def _testBaselines(self, lX, lY, lLabelName=None, lsDocName=None):
        """
        Test the baseline models on the flattened X and the horizontally
        stacked Y.

        lLabelName -- optional label names, forwarded to TestReport
        lsDocName  -- optional document names; if given, must have the
                      same length as lX

        Return a list of TestReport, one per baseline model (empty when
        no baseline model is set).
        """
        if lsDocName:
            assert len(lX) == len(lsDocName), "Internal error"

        lReport = []
        if not self._lMdlBaseline:
            return lReport

        X_all = self._get_X_flat(lX)
        Y_all = np.hstack(lY)
        for oBaseline in self._lMdlBaseline:
            chronoOn("_testBaselines")
            Y_all_pred = oBaseline.predict(X_all)
            traceln("\t\t [%.1fs] done\n" % chronoOff("_testBaselines"))
            oReport = TestReport(str(oBaseline), Y_all_pred, Y_all,
                                 lLabelName, lsDocName=lsDocName)
            lReport.append(oReport)

        # the list of models is not empty here, so all three names are bound
        del X_all, Y_all, Y_all_pred
        return lReport
Ejemplo n.º 14
0
    def testFiles(self, lsFilename, loadFun, bBaseLine=False):
        """
        Test the ensemble using those files, one file at a time. The
        corresponding graphs are loaded using the loadFun function (which
        must return a singleton list). Each sub-model predicts
        probabilities for the graph, and the predictions are averaged with
        DU_Ensemble_ECN.average_prediction.

        lsFilename -- the files to test on, also used as document names in
                      the report
        loadFun    -- function loading the graph(s) of one file
        bBaseLine  -- not used by this implementation: no baseline model
                      is tested here

        Return a TestReport object
        """
        traceln("- predicting on test set")
        chronoOn("testFiles")
        lY, lY_pred = [], []
        lLabelName = None

        for sFilename in lsFilename:

            lg = loadFun(sFilename)  # returns a singleton list
            for g in lg:
                if g.bConjugate: g.computeEdgeLabels()
                [X], [Y] = self.get_lX_lY([g])

                [gcn_graph_test
                 ] = self.convert_lX_lY_to_GCNDataset([X], [Y],
                                                      training=False,
                                                      test=True)

                # identity test: "== None" would be evaluated element-wise
                # if the label names ever come as an array
                if lLabelName is None:
                    lLabelName = g.getLabelNameList()
                    traceln("\t #nodes=%d  #edges=%d " %
                            Graph.getNodeEdgeTotalNumber([g]))
                    tNF_EF = (X[0].shape[1], X[2].shape[1])
                    traceln("node-dim,edge-dim", tNF_EF)

                m_pred = []
                for du_model in self.models:
                    [Y_pred] = du_model.gcn_model.predict_prob_lG(
                        du_model.tf_session, [gcn_graph_test], verbose=False)
                    m_pred.append([Y_pred])

                [Y_pred], [_Y_pred_proba
                           ] = DU_Ensemble_ECN.average_prediction(m_pred)

                # X is not kept: only Y and the prediction are needed
                lY.append(Y)
                lY_pred.append(Y_pred)
                del _Y_pred_proba
                g.detachFromDoc()
                del g  # this can be very large
            gc.collect()

        traceln("[%.1fs] done\n" % chronoOff("testFiles"))

        tstRpt = TestReport(self.sName,
                            lY_pred,
                            lY,
                            lLabelName,
                            lsDocName=lsFilename)

        del lY, lY_pred
        gc.collect()

        return tstRpt
Ejemplo n.º 15
0
    def runForExternalMLMethod(self,
                               lsColDir,
                               storeX,
                               applyY,
                               bRevertEdges=False):
        """
        HACK: to test new ML methods, not yet integrated in our SW.

        Two mutually exclusive modes:
        - storeX: the X of each loaded graph is computed with
          self._mdl.get_lX, and the list of X is stored in the file storeX
        - applyY: a list of Y is loaded from the file applyY, and the Y are
          applied in turn to the loaded graphs (one Y per graph, in loading
          order); each annotated document is saved under a "_du" filename

        lsColDir     -- passed to self.listMaxTimestampFile to list the files
        bRevertEdges -- if True, g.revertEdges() is called on each graph

        Raise a ValueError if both storeX and applyY are given, and an
        Exception if no model is loaded.

        NOTE(review): if neither storeX nor applyY is given, lY is never
        bound and the first loaded graph triggers a NameError.

        Return the list of produced files (empty in storeX mode)
        """

        self.traceln("-" * 50)
        if storeX: traceln("Loading data and storing [X] (1 X per graph)")
        if applyY:
            traceln(
                "Loading data, loading Y, labelling data, storing annotated data"
            )
        self.traceln("-" * 50)

        if storeX and applyY:
            raise ValueError("Either store X or applyY, not both")

        if not self._mdl:
            raise Exception("The model must be loaded beforehand!")

        #list files
        _, lFilename = self.listMaxTimestampFile(lsColDir,
                                                 self.sXmlFilenamePattern)

        DU_GraphClass = self.getGraphClass()

        # trace the page constraints, if any
        lPageConstraint = DU_GraphClass.getPageConstraint()
        if lPageConstraint:
            for dat in lPageConstraint:
                self.traceln("\t\t%s" % str(dat))

        if applyY:
            self.traceln("LOADING [Y] from %s" % applyY)
            lY = self._mdl.gzip_cPickle_load(applyY)
        if storeX: lX = []

        chronoOn("predict")
        self.traceln(
            "- loading collection as graphs, and processing each in turn. (%d files)"
            % len(lFilename))
        du_postfix = "_du" + MultiPageXml.sEXT
        lsOutputFilename = []
        for sFilename in lFilename:
            # skip the files produced by a previous run
            if sFilename.endswith(du_postfix): continue  #:)
            chronoOn("predict_1")
            lg = DU_GraphClass.loadGraphs([sFilename],
                                          bDetach=False,
                                          bLabelled=False,
                                          iVerbose=1)
            #normally, we get one graph per file, but in case we load one graph per page, for instance, we have a list
            if lg:
                for g in lg:
                    # doc is used after this loop: it is then the doc of the last graph
                    doc = g.doc
                    if bRevertEdges:
                        g.revertEdges()  #revert the directions of the edges
                    if lPageConstraint:
                        self.traceln(
                            "\t- prediction with logical constraints: %s" %
                            sFilename)
                    else:
                        self.traceln("\t- prediction : %s" % sFilename)
                    if storeX:
                        [X] = self._mdl.get_lX([g])
                        lX.append(X)
                    else:
                        # consume the next Y of the loaded list
                        Y = lY.pop(0)
                        g.setDomLabels(Y)
                del lg

                if applyY:
                    MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator,
                                             self.sMetadata_Comments)
                    # replace the file extension by the "_du" postfix
                    sDUFilename = sFilename[:-len(MultiPageXml.sEXT
                                                  )] + du_postfix
                    doc.saveFormatFileEnc(sDUFilename, "utf-8",
                                          True)  #True to indent the XML
                    doc.freeDoc()
                    lsOutputFilename.append(sDUFilename)
            else:
                self.traceln("\t- no prediction to do for: %s" % sFilename)

            self.traceln("\t done [%.2fs]" % chronoOff("predict_1"))
        self.traceln(" done [%.2fs]" % chronoOff("predict"))

        if storeX:
            self.traceln("STORING [X] in %s" % storeX)
            self._mdl.gzip_cPickle_dump(storeX, lX)

        return lsOutputFilename
Ejemplo n.º 16
0
    def predict(self, lsColDir, docid=None):
        """
        Predict the labels of the documents of the given collection(s), and
        save each annotated document under a "_du" filename.

        lsColDir -- the collection directories
        docid    -- if given, predict only for the document of that name in
                    the first collection directory

        Raise an Exception if no model is loaded, or if docid is given
        without any collection directory.

        Return the list of produced files
        """
        sSeparator = "-" * 50
        self.traceln(sSeparator)
        self.traceln("Predicting for collection(s):", lsColDir)
        self.traceln(sSeparator)

        if not self._mdl:
            raise Exception("The model must be loaded beforehand!")

        if docid is not None:
            # predict for this file only
            try:
                sDocPath = os.path.join(lsColDir[0],
                                        docid + MultiPageXml.sEXT)
                lFilename = [os.path.abspath(sDocPath)]
            except IndexError:
                raise Exception("a collection directory must be provided!")
        else:
            # list the files of the collection(s)
            _, lFilename = self.listMaxTimestampFile(lsColDir,
                                                     self.sXmlFilenamePattern)

        DU_GraphClass = self.getGraphClass()

        lPageConstraint = DU_GraphClass.getPageConstraint()
        if lPageConstraint:
            for dat in lPageConstraint:
                self.traceln("\t\t%s" % str(dat))

        # the trace of each prediction depends on the presence of constraints
        if lPageConstraint:
            sPredictionTrace = "\t- prediction with logical constraints: %s"
        else:
            sPredictionTrace = "\t- prediction : %s"

        chronoOn("predict")
        self.traceln(
            "- loading collection as graphs, and processing each in turn. (%d files)"
            % len(lFilename))
        du_postfix = "_du" + MultiPageXml.sEXT
        lsOutputFilename = []
        for sFilename in lFilename:
            if sFilename.endswith(du_postfix):
                continue  # this file was produced by a previous prediction
            chronoOn("predict_1")
            # normally one graph per file, but there can be several, for
            # instance when one graph is loaded per page
            lg = DU_GraphClass.loadGraphs([sFilename],
                                          bDetach=False,
                                          bLabelled=False,
                                          iVerbose=1)
            if not lg:
                self.traceln("\t- no prediction to do for: %s" % sFilename)
            else:
                for g in lg:
                    doc = g.doc
                    self.traceln(sPredictionTrace % sFilename)
                    Y = self._mdl.predict(g)

                    g.setDomLabels(Y)
                    del Y
                del lg

                # doc is the document of the last graph of the file
                MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator,
                                         self.sMetadata_Comments)
                nExtLength = len(MultiPageXml.sEXT)
                sDUFilename = sFilename[:-nExtLength] + du_postfix
                doc.write(sDUFilename,
                          xml_declaration=True,
                          encoding="utf-8",
                          pretty_print=True)

                lsOutputFilename.append(sDUFilename)

            self.traceln("\t done [%.2fs]" % chronoOff("predict_1"))
        self.traceln(" done [%.2fs]" % chronoOff("predict"))

        return lsOutputFilename
Ejemplo n.º 17
0
    def _train_save_test(self, sModelName, bWarm, lFilename_trn, ts_trn,
                         lFilename_tst, bPickleOnly):
        """
        used both by train_save_test and _nfold_runFold

        Create and configure a model, load the training graphs, load or
        create the feature extractors, then either train and save the model
        or, if bPickleOnly, pickle the data and exit the process.
        The model is stored in self._mdl.

        sModelName    -- name of the model, created in self.sModelDir
        bWarm         -- if False (and bPickleOnly is False), a model
                         already on disk is an error
        lFilename_trn -- the training files
        ts_trn        -- passed to mdl.loadTransformers and mdl.train
        lFilename_tst -- the test files, possibly empty
        bPickleOnly   -- if True, the data is pickled with self._pickleData
                         instead of training and testing, and exit(0) is
                         called

        Raise a crf.Model.ModelException if the model exists on disk while
        neither bWarm nor bPickleOnly is set.

        Return the report of mdl.test on the test graphs, or None if there
        is no test file.
        """
        mdl = self.cModelClass(sModelName, self.sModelDir)

        if os.path.exists(
                mdl.getModelFilename()) and not bWarm and not bPickleOnly:
            raise crf.Model.ModelException(
                "Model exists on disk already (%s), either remove it first or warm-start the training."
                % mdl.getModelFilename())

        mdl.configureLearner(**self.config_learner_kwargs)
        mdl.setBaselineModelList(self._lBaselineModel)
        mdl.saveConfiguration(
            (self.config_extractor_kwargs, self.config_learner_kwargs))
        self.traceln("\t - configuration: ", self.config_learner_kwargs)

        self.traceln("- loading training graphs")
        lGraph_trn = self.cGraphClass.loadGraphs(lFilename_trn,
                                                 bDetach=True,
                                                 bLabelled=True,
                                                 iVerbose=1)
        self.traceln(" %d graphs loaded" % len(lGraph_trn))

        assert self.nbClass and self.lNbClass, "internal error: I expected the number of class to be automatically computed at that stage"
        # a single number of classes if there is one CRF type, else the list
        if self.iNbCRFType == 1:
            mdl.setNbClass(self.nbClass)
        else:
            mdl.setNbClass(self.lNbClass)

        #for this check, we load the Y once...
        self.checkLabelCoverage(
            mdl.get_lY(lGraph_trn)
        )  #NOTE that Y are in bad order if multiptypes. Not a pb here

        self.traceln("- retrieving or creating feature extractors...")
        chronoOn("FeatExtract")
        try:
            mdl.loadTransformers(ts_trn)
        except crf.Model.ModelException:
            # the transformers could not be loaded: fit and save new ones
            fe = self.cFeatureDefinition(**self.config_extractor_kwargs)
            fe.fitTranformers(lGraph_trn)
            fe.cleanTransformers()
            mdl.setTranformers(fe.getTransformers())
            mdl.saveTransformers()
        self.traceln(" done [%.1fs]" % chronoOff("FeatExtract"))

        if bPickleOnly:
            self._pickleData(mdl, lGraph_trn, "trn")
        else:
            self.traceln("- training model...")
            chronoOn("MdlTrn")
            # NOTE(review): the second argument is hard-coded to True and
            # bWarm is not forwarded - presumably the warm-start flag of
            # mdl.train, confirm against the model class
            mdl.train(lGraph_trn,
                      True,
                      ts_trn,
                      verbose=1 if self.bVerbose else 0)
            mdl.save()
            self.traceln(" done [%.1fs]" % chronoOff("MdlTrn"))

        # OK!!
        self._mdl = mdl

        if lFilename_tst:
            self.traceln("- loading test graphs")
            lGraph_tst = self.cGraphClass.loadGraphs(lFilename_tst,
                                                     bDetach=True,
                                                     bLabelled=True,
                                                     iVerbose=1)
            self.traceln(" %d graphs loaded" % len(lGraph_tst))
            if bPickleOnly:
                # oReport is not bound on this path; exit(0) below is
                # reached before the return statement
                self._pickleData(mdl, lGraph_tst, "tst")
            else:
                oReport = mdl.test(lGraph_tst)
        else:
            oReport = None

        if bPickleOnly:
            self.traceln("- pickle done, exiting")
            exit(0)

        return oReport
Ejemplo n.º 18
0
    def testFiles(self, lsFilename, loadFun, bBaseLine=False):
        """
        Test the model using those files, one file at a time. The
        corresponding graphs are loaded using the loadFun function (which
        must return a singleton list).

        lsFilename -- the files to test on, also used as document names in
                      the reports
        loadFun    -- function loading the graph(s) of one file
        bBaseLine  -- if True, the baseline models are tested as well, with
                      self._testBaselinesEco, and their reports are
                      attached to the returned report

        Return a TestReport object
        """
        lX, lY, lY_pred = [], [], []
        lLabelName = None
        traceln("- predicting on test set")
        chronoOn("testFiles")
        for sFilename in lsFilename:
            lg = loadFun(sFilename)  #returns a singleton list
            for g in lg:
                if self.bConjugate: g.computeEdgeLabels()
                [X], [Y] = self.get_lX_lY([g])

                # identity test: "== None" would be evaluated element-wise
                # if the label names ever come as an array
                if lLabelName is None:
                    lLabelName = g.getLabelNameList()
                    traceln("\t #nodes=%d  #edges=%d " %
                            Graph.getNodeEdgeTotalNumber([g]))
                    #we discover here dynamically the number of features of nodes and edges
                    self._computeModelCaracteristics([X])
                    traceln("\t %s" % self._getNbFeatureAsText())
                else:
                    assert lLabelName == g.getLabelNameList(
                    ), "Inconsistency among label spaces"

                # since we pass a single graph, let force n_jobs to 1 !!
                # The previous value is restored even if the prediction
                # fails, so that the model is not left altered.
                n_jobs = self.ssvm.n_jobs
                self.ssvm.n_jobs = 1
                try:
                    if g.getPageConstraint():
                        lConstraints = g.instanciatePageConstraints()
                        [Y_pred] = self._ssvm_ad3plus_predict([X],
                                                              [lConstraints])
                    else:
                        [Y_pred] = self.ssvm.predict([X])
                finally:
                    self.ssvm.n_jobs = n_jobs

                lX.append(X)
                lY.append(Y)
                lY_pred.append(Y_pred)
                #g.detachFromDOM()
                del g  #this can be very large
                gc.collect()
        traceln("[%.1fs] done\n" % chronoOff("testFiles"))

        tstRpt = TestReport(self.sName,
                            lY_pred,
                            lY,
                            lLabelName,
                            lsDocName=lsFilename)

        if bBaseLine:
            lBaselineTestReport = self._testBaselinesEco(lX,
                                                         lY,
                                                         lLabelName,
                                                         lsDocName=lsFilename)
            tstRpt.attach(lBaselineTestReport)

        #do some garbage collection
        del lX, lY
        gc.collect()

        return tstRpt
Ejemplo n.º 19
0
 def DYNAMIC_IMPORT(name, package=None):
     """
     Import the module `name` (relative to `package`, if given) and return it.

     The import is timed with the "import" chrono and traced.
     """
     chronoOn("import")
     trace("SETUP: Dynamic import of '%s' from '%s'" % (name, package))
     module = import_module(name, package)
     elapsed = chronoOff("import")
     traceln("  done [%.1fs]" % elapsed)
     return module
Ejemplo n.º 20
0
    def _train_save_test(self, sModelName, bWarm, lFilename_trn, ts_trn, lFilename_tst, lFilename_vld, bPickleXY
                         , ratio_train_val=None):
        """
        used both by train_save_test and _nfold_runFold
        if provided, try using lFilename_vld as validation set to make best model.
        """
        mdl = self.cModelClass(sModelName, self.sModelDir)
        
        if os.path.exists(mdl.getModelFilename()) and not bWarm: 
            raise GraphModelException("Model exists on disk already (%s), either remove it first or warm-start the training."%mdl.getModelFilename())
            
        mdl.configureLearner(**self.config_learner_kwargs)
        mdl.setBaselineModelList(self._lBaselineModel)
        mdl.saveConfiguration( (self.config_extractor_kwargs, self.config_learner_kwargs) )
        self.traceln("\t - configuration: ", self.config_learner_kwargs )

        self.traceln("- loading training graphs")
        lGraph_trn = self.cGraphClass.loadGraphs(self.cGraphClass, lFilename_trn, bDetach=True, bLabelled=True, iVerbose=1)
        self.traceln(" %d training graphs loaded"%len(lGraph_trn))

        if lFilename_vld:
            self.traceln("- loading validation graphs")
            lGraph_vld = self.cGraphClass.loadGraphs(self.cGraphClass, lFilename_vld, bDetach=True, bLabelled=True, iVerbose=1)
            self.traceln(" %d validation graphs loaded"%len(lGraph_vld))
        else:
            lGraph_vld = []
            if ratio_train_val is None:
                self.traceln("- no validation graphs")
            else:
                lG = [g for g in lGraph_trn]
                split_idx = int(ratio_train_val * len(lG))
                lGraph_vld = lG[:split_idx ]
                lGraph_trn = lG[ split_idx:]
                del lG
                self.traceln("- extracted %d validation graphs, got %d training graphs (ratio=%.3f)" 
                             % (len(lGraph_vld), len(lGraph_trn), ratio_train_val))

        #for this check, we load the Y once...
        if self.bConjugate:
            mdl.setNbClass(self.nbEdgeLabel)
            for _g in lGraph_trn: _g.computeEdgeLabels()
            for _g in lGraph_vld: _g.computeEdgeLabels()
        else:
            assert self.nbClass and self.lNbClass, "internal error: I expected the number of class to be automatically computed at that stage"
            if self.iNbNodeType == 1:
                mdl.setNbClass(self.nbClass)
            else:
                mdl.setNbClass(self.lNbClass)
            self.checkLabelCoverage(mdl.get_lY(lGraph_trn)) #NOTE that Y are in bad order if multiptypes. Not a pb here
            
        self.traceln("- retrieving or creating feature extractors...")
        chronoOn("FeatExtract")
        try:
            mdl.loadTransformers(ts_trn)
        except GraphModelException:
            fe = self.cFeatureDefinition(**self.config_extractor_kwargs)         
            fe.fitTranformers(lGraph_trn)
            fe.cleanTransformers()
            mdl.setTranformers(fe.getTransformers())
            mdl.saveTransformers()
        
        # pretty print of features extractors
        self.traceln("""--- Features ---
--- NODES : %s

--- EDGES : %s
--- -------- ---
""" % mdl.getTransformers())
        
        self.traceln(" done [%.1fs]"%chronoOff("FeatExtract"))
        
        if bPickleXY:
            self._pickleData(mdl, lGraph_trn, "trn")

        self.traceln("- training model...")
        chronoOn("MdlTrn")
        mdl.train(lGraph_trn, lGraph_vld, True, ts_trn, verbose=1 if self.bVerbose else 0)
        mdl.save()
        self.traceln(" done [%.1fs]"%chronoOff("MdlTrn"))
        
        # OK!!
        self._mdl = mdl
        
        if lFilename_tst:
            self.traceln("- loading test graphs")
            lGraph_tst = self.cGraphClass.loadGraphs(self.cGraphClass, lFilename_tst, bDetach=True, bLabelled=True, iVerbose=1)
            if self.bConjugate:
                for _g in lGraph_tst: _g.computeEdgeLabels()
            self.traceln(" %d graphs loaded"%len(lGraph_tst))
            if bPickleXY:
                self._pickleData(mdl, lGraph_tst, "tst")
            else:
                oReport = mdl.test(lGraph_tst)
        else:
            oReport = None

        if bPickleXY:
            self.traceln("- pickle done, exiting")
            exit(0)
        
        return oReport
Ejemplo n.º 21
0
    def _prepare_for_train(self, lGraph, lGraph_vld):
        """
        Prepare for training eCN or EnsembleECN

        Compute the features of the training graphs, fill in self.model_config
        ('node_dim', 'edge_dim', 'nb_class'), convert the data into GCN datasets,
        store the label binarizer on disk and clean the temporary checkpoint files.

        lGraph     : the labelled training graphs (must not be empty, since the
                     number of classes is read from lGraph[0])
        lGraph_vld : the validation graphs; if empty, the validation set is taken
                     from the (shuffled) training set, using
                     self.model_config['ratio_train_val'], with at least one graph

        Return the pair (training dataset, validation dataset)
        """
        traceln('ECN Training ', self.sName)
        traceln("\t- computing features on training set")
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph))
        chronoOn()

        lX, lY = self.get_lX_lY(lGraph)
        self._computeModelCaracteristics(
            lX
        )  # we discover here dynamically the number of features of nodes and edges
        # self._tNF_EF contains the number of node features and edge features
        traceln("\t\t %s" % self._getNbFeatureAsText())
        traceln("\t [%.1fs] done\n" % chronoOff())
        nb_class = len(
            lGraph[0].getLabelNameList())  #Is it better to do Y.shape ?
        traceln("\t- %d classes" % nb_class)

        traceln("\t- retrieving or creating model...")

        self.model_config['node_dim'] = self._tNF_EF[0]
        self.model_config['edge_dim'] = self._tNF_EF[1]
        self.model_config['nb_class'] = nb_class

        #This converts the lX,lY in the format necessary for GCN Models
        gcn_graph = self.convert_lX_lY_to_GCNDataset(lX, lY, training=True)

        #Save the label Binarizer for prediction usage
        # (the with-statement closes the file even if pickling fails)
        with open(self.getlabelBinarizerFilename(), 'wb') as fd_lb:
            pickle.dump(self.labelBinarizer, fd_lb)

        #TODO Save the validation set too to reproduce experiments
        random.shuffle(gcn_graph)

        if lGraph_vld:
            gcn_graph_train = gcn_graph
            lX_vld, lY_vld = self.get_lX_lY(lGraph_vld)
            gcn_graph_val = self.convert_lX_lY_to_GCNDataset(lX_vld,
                                                             lY_vld,
                                                             test=True)
            del lX_vld, lY_vld
        else:
            #Get a validation set from the training set
            # NOTE: at least one graph goes to validation, so a single training
            # graph leaves the training set empty
            split_idx = max(
                1, int(self.model_config['ratio_train_val'] * len(gcn_graph)))
            traceln(" - using %d train graphs as validation graphs" %
                    split_idx)
            gcn_graph_val = list(gcn_graph[:split_idx])
            gcn_graph_train = list(gcn_graph[split_idx:])
        traceln("%d training graphs --  %d validation graphs" %
                (len(gcn_graph_train), len(gcn_graph_val)))
        self._cleanTmpCheckpointFiles()

        return gcn_graph_train, gcn_graph_val
Ejemplo n.º 22
0
    def testFiles(self, lsFilename, loadFun, bBaseLine=False):
        """
        Test the model using those files. The corresponding graphs are loaded using the loadFun function (which must return a singleton list).
        It reports results on stderr

        if some baseline model(s) were set, they are also tested

        Return a Report object

        NOTE: this method is currently disabled: it raises NotImplementedError
        unconditionally, so the code after the raise is an unreachable draft.
        """
        raise NotImplementedError
        # ---- unreachable draft implementation below ----
        lX, lY, lY_pred = [], [], []
        lLabelName = None
        traceln("- predicting on test set")
        chronoOn("testFiles")

        # ? Iterate over files or over models

        for du_model in self.models:
            #du_model.load()

            # predictions of this model, one per file
            # NOTE(review): m_pred is filled below but never read afterwards
            m_pred = []
            #with tf.Session(graph=du_model.tf_graph) as session:
            #session.run(du_model.gcn_model.init)
            #du_model.gcn_model.restore_model(session, du_model.getModelFilename())

            for sFilename in lsFilename:
                [g] = loadFun(sFilename)  # returns a singleton list
                [X], [Y] = self.get_lX_lY([g])

                gcn_graph_test = self.convert_lX_lY_to_GCNDataset(
                    [X], [Y], training=False, test=True)
                if lLabelName == None:
                    # first graph: record the label names, and trace some figures
                    lLabelName = g.getLabelNameList()
                    traceln("\t #nodes=%d  #edges=%d " %
                            Graph.getNodeEdgeTotalNumber([g]))
                    tNF_EF = (X[0].shape[1], X[2].shape[1])
                    traceln("node-dim,edge-dim", tNF_EF)
                else:
                    assert lLabelName == g.getLabelNameList(
                    ), "Inconsistency among label spaces"

                model_pred = du_model.test(gcn_graph_test, predict_proba=True)

                m_pred.append(model_pred[0])
                # NOTE(review): X and Y are appended once per model and per file,
                # so lX and lY hold duplicates when there are several models
                lX.append(X)
                lY.append(Y)
                g.detachFromDOM()
                del g  # this can be very large
                gc.collect()
            # NOTE(review): this appends the prediction of the last file only;
            # presumably m_pred was intended here - to be confirmed
            lY_pred.append(model_pred)

        lY_pred, _ = DU_Ensemble_ECN.average_prediction(lY_pred)
        traceln("[%.1fs] done\n" % chronoOff("testFiles"))

        tstRpt = TestReport(self.sName,
                            lY_pred,
                            lY,
                            lLabelName,
                            lsDocName=lsFilename)

        if bBaseLine:
            lBaselineTestReport = self._testBaselinesEco(lX,
                                                         lY,
                                                         lLabelName,
                                                         lsDocName=lsFilename)
            tstRpt.attach(lBaselineTestReport)

        # do some garbage collection
        del lX, lY
        gc.collect()

        return tstRpt
Ejemplo n.º 23
0
    def train(self,
              lGraph,
              bWarmStart=True,
              expiration_timestamp=None,
              verbose=0):
        """
        Train an ECN model using the given labelled graphs.

        The features are computed, the tensorflow graph is built, the label
        binarizer and the model configuration are pickled on disk, and the model
        is trained with a validation set taken from the shuffled training data
        (self.model_config['ratio_train_val'] gives its proportion).
        The validation scores are pickled into self.getValScoreFilename().

        The train method is expected to save the model into self.getModelFilename(), at least at end of training

        NOTE: bWarmStart, expiration_timestamp and verbose are accepted for
        signature compatibility, but this implementation does not read them.

        Return nothing.
        """
        print('ECN Training', self.sName)
        traceln("\t- computing features on training set")
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph))
        chronoOn()
        lX, lY = self.get_lX_lY(lGraph)

        self._computeModelCaracteristics(
            lX
        )  # we discover here dynamically the number of features of nodes and edges
        # self._tNF_EF contains the number of node features and edge features
        traceln("\t\t %s" % self._getNbFeatureAsText())
        traceln("\t [%.1fs] done\n" % chronoOff())

        traceln("\t- retrieving or creating model...")

        nb_class = self.getNbClass()  #Is it better to do Y.shape ?

        self.model_config['node_dim'] = self._tNF_EF[0]
        self.model_config['edge_dim'] = self._tNF_EF[1]
        self.model_config['nb_class'] = nb_class

        #This call the ECN internal constructor and defines the tensorflow graph
        tf_graph = tf.Graph()
        with tf_graph.as_default():
            self._init_model()
        self.tf_graph = tf_graph

        #This converts the lX,lY in the format necessary for GCN Models
        gcn_graph = self.convert_lX_lY_to_GCNDataset(lX, lY, training=True)

        #Save the label Binarizer for prediction usage
        # (the with-statements close the files even if pickling fails)
        with open(self.getlabelBinarizerFilename(), 'wb') as fd_lb:
            pickle.dump(self.labelBinarizer, fd_lb)
        #Save the model config in order to restore the model later
        with open(self.getModelConfigFilename(), 'wb') as fd_mc:
            pickle.dump(self.model_config, fd_mc)

        #TODO Save the validation set too to reproduce experiments
        #Get a validation set from the training set
        split_idx = int(self.model_config['ratio_train_val'] * len(gcn_graph))
        random.shuffle(gcn_graph)
        gcn_graph_val = list(gcn_graph[:split_idx])
        gcn_graph_train = list(gcn_graph[split_idx:])

        self._cleanTmpCheckpointFiles()

        # without an explicit patience, the number of iterations is used instead
        patience = self.model_config[
            'patience'] if 'patience' in self.model_config else self.model_config[
                'nb_iter']
        with tf.Session(graph=self.tf_graph) as session:
            session.run([self.gcn_model.init])

            R = self.gcn_model.train_with_validation_set(
                session,
                gcn_graph_train,
                gcn_graph_val,
                self.model_config['nb_iter'],
                eval_iter=10,
                patience=patience,
                save_model_path=self.getTmpModelFilename())
            # keep the result of the training with validation set on disk
            with open(self.getValScoreFilename(), 'wb') as f:
                pickle.dump(R, f)

        #This save the model
        self._getBestModelVal()
        self._cleanTmpCheckpointFiles()
        #We reopen a session here and load the selected model if we need one
        self.restore()
Ejemplo n.º 24
0
    def test(self, lGraph, lsDocName=None, predict_proba=False):
        """
        Run the model (assumed to be created or loaded already) on the given
        graphs, tracing the progress.

        With predict_proba set, the result of predict_prob_lG is returned as is:
        no report is built and the baseline models are not tested.
        Otherwise, return a TestReport object, to which the test reports of the
        baseline models are attached.
        """
        assert lGraph
        label_names = lGraph[0].getLabelNameList()

        traceln("\t- computing features on test set")
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph))
        chronoOn()
        lX, lY = self.get_lX_lY(lGraph)
        traceln("\t [%.1fs] done\n" % chronoOff())

        dataset = self.convert_lX_lY_to_GCNDataset(
            lX, lY, training=False, test=True)

        chronoOn("test2")
        # the session kept by the model is used, no new session is opened here
        tf_session = self.tf_session

        if predict_proba:
            #TODO Should split that function diryt
            probas = self.gcn_model.predict_prob_lG(
                tf_session, dataset, verbose=False)
            traceln(" [%.1fs] done\n" % chronoOff("test2"))

            del lX, lY
            gc.collect()
            return probas

        predictions = self.gcn_model.predict_lG(
            tf_session, dataset, verbose=False)

        # plain lists are used since pickle does not seem to like the arrays
        predictions_as_lists = [list(y) for y in predictions]

        traceln(" [%.1fs] done\n" % chronoOff("test2"))
        report = TestReport(self.sName,
                            predictions_as_lists,
                            lY,
                            label_names,
                            lsDocName=lsDocName)

        baseline_reports = self._testBaselines(
            lX, lY, label_names, lsDocName=lsDocName)
        report.attach(baseline_reports)

        # free the possibly large feature data
        del lX, lY
        gc.collect()

        return report
Ejemplo n.º 25
0
    def train(self,
              lGraph_trn,
              lGraph_vld,
              bWarmStart=True,
              expiration_timestamp=None,
              verbose=0):
        """
        Train a CRF model using the list of labelled graph as training
        if bWarmStart if True, try to continue from previous training, IF the stored model is older than expiration_timestamp!!
            , otherwise, starts from scratch

        lGraph_trn           : the labelled training graphs
        lGraph_vld           : the validation graphs; if empty, the classical fit
                               method is used instead of fit_with_valid
        bWarmStart           : try to reload a stored model (first the "._last_"
                               one, then the best one) and to continue its training
        expiration_timestamp : passed to self._loadIfFresh when reloading a model
        verbose              : verbosity given to the SSVM learner

        If self.bGridSearch is set, a grid search is done instead (without the
        validation graphs) and its result is returned.
        Otherwise return nothing.
        """
        if self.bGridSearch:
            return self.gridsearch(lGraph_trn, verbose=verbose)

        traceln("\t- computing features on training set")
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph_trn))
        lX, lY = self.get_lX_lY(lGraph_trn)
        lX_vld, lY_vld = self.get_lX_lY(lGraph_vld)
        bMakeSlim = not bWarmStart  # for warm-start mode, we do not make the model slimer!"

        traceln("\t- retrieving or creating model...")
        self.ssvm = None
        sModelFN = self.getModelFilename()
        if bWarmStart:
            try:
                try:
                    self.ssvm = self._loadIfFresh(
                        sModelFN + "._last_", expiration_timestamp,
                        lambda x: SaveLogger(x).load())
                    traceln(
                        "\t- warmstarting with last saved model (not necessarily best one)!"
                    )
                except Exception:
                    # not a bare except: a Ctrl-C or a SystemExit must not be
                    # turned into a fall-back on the best model
                    self.ssvm = self._loadIfFresh(
                        sModelFN, expiration_timestamp,
                        lambda x: SaveLogger(x).load())
                    traceln("\t- warmstarting from last best model!")
                #we allow to change the max_iter of the model
                try:
                    # the comparison raises AttributeError if the loaded object
                    # has no max_iter attribute
                    if self.ssvm.max_iter != self.max_iter:
                        traceln(
                            "\t- changing max_iter value from (stored) %d to %d"
                            % (self.ssvm.max_iter, self.max_iter))
                        self.ssvm.max_iter = self.max_iter
                except AttributeError:
                    traceln("\t- cannot access or change the max_iter value")

                try:
                    # same logic for the number of jobs
                    if self.ssvm.n_jobs != self.njobs:
                        traceln(
                            "\t- changing n_jobs value from (stored) %d to %d"
                            % (self.ssvm.n_jobs, self.njobs))
                        self.ssvm.n_jobs = self.njobs
                except AttributeError:
                    traceln("\t- cannot access or change the n_jobs value")

            except Exception as e:
                self.ssvm = None
                traceln("\t- Cannot warmstart: %s" % e)
            #self.ssvm is either None or containing a nice ssvm model!!

        chronoOn("train")
        traceln("\t- training graph-based model")
        traceln("\t\t solver parameters:", " inference_cache=",
                self.inference_cache, " C=", self.C, " tol=", self.tol,
                " n_jobs=", self.njobs)

        if not self.ssvm:
            traceln("\t- creating a new SSVM-trained CRF model")

            traceln("\t\t- computing class weight:")
            if self.balanced:
                traceln("\t\tusing balanced weights")
                self.setBalancedWeights()
            clsWeights = self.computeClassWeight(lY)
            traceln("\t\t\t --> %s" % clsWeights)

            #clsWeights = np.array([1, 4.5])
            # These weights are tuned for best performance of LR and SVM and hence consistently used here
            crf = self._getCRFModel(clsWeights)

            self.ssvm = OneSlackSSVM(crf,
                                     inference_cache=self.inference_cache,
                                     C=self.C,
                                     tol=self.tol,
                                     n_jobs=self.njobs,
                                     logger=SaveLogger(
                                         sModelFN, save_every=self.save_every),
                                     max_iter=self.max_iter,
                                     show_loss_every=10,
                                     verbose=verbose)
            # a brand new model cannot be warm-started
            bWarmStart = False

        if lGraph_vld:
            self.ssvm.fit_with_valid(lX,
                                     lY,
                                     lX_vld,
                                     lY_vld,
                                     warm_start=bWarmStart,
                                     valid_every=self.save_every)
        else:
            # old classical method
            self.ssvm.fit(lX, lY, warm_start=bWarmStart)
        traceln("\t [%.1fs] done (graph-CRF model is trained) \n" %
                chronoOff("train"))

        #traceln(self.getModelInfo())

        #cleaning useless data that takes MB on disk
        if bMakeSlim:
            self.ssvm.alphas = None
            self.ssvm.constraints_ = None
            self.ssvm.inference_cache_ = None
            traceln(
                "\t\t(model made slimmer. Not sure you can efficiently warm-start it later on. See option -w.)"
            )

        #the baseline model(s) if any
        self._trainBaselines(lX, lY)

        #do some garbage collection
        del lX, lY, lX_vld, lY_vld
        gc.collect()

        return
Ejemplo n.º 26
0
    def gridsearch(self, lGraph, verbose=0):
        """
        do a grid search instead of a normal training

        The grid is made of self.C, self.tol, self.inference_cache and
        self.max_iter, each being either a list of values or a single value.
        The best estimator becomes self.ssvm and is saved in the model file, the
        best parameters are stored via self.storeBestParams, and the GridSearchCV
        object itself is dumped in a file next to the model file.
        The baseline models, if any, are trained as well.

        lGraph  : the labelled training graphs
        verbose : verbosity given to GridSearchCV

        Return nothing.
        """
        traceln("--- GRID SEARCH FOR CRF MODEL ---")
        traceln("\t- computing features on training set")
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph))
        # NOTE(review): chronoOn() is called twice in this method, but chronoOff()
        # only once - check that the chrono utility copes with that
        chronoOn()
        lX, lY = self.get_lX_lY(lGraph)

        # each parameter of the grid must be a list of candidate values
        dPrm = {
            sPrmName: (value if isinstance(value, list) else [value])
            for sPrmName, value in (
                ('C', self.C),
                ('tol', self.tol),
                ('inference_cache', self.inference_cache),
                ('max_iter', self.max_iter),
            )
        }

        traceln("\t- creating a SSVM-trained CRF model")

        traceln("\t\t- computing class weight:")
        clsWeights = self.computeClassWeight(lY)
        traceln("\t\t\t%s" % clsWeights)

        crf = self._getCRFModel(clsWeights)

        # C, tol, inference_cache and max_iter are not given here, they are set
        # by the grid search. No logger either: the best model is saved below.
        self._ssvm = OneSlackSSVM(crf,
                                  n_jobs=self.njobs,
                                  show_loss_every=10,
                                  verbose=1)

        self._gs_ssvm = GridSearchCV(self._ssvm,
                                     dPrm,
                                     n_jobs=1,
                                     verbose=verbose)
        self.ssvm = None

        chronoOn()
        traceln("\t - training by grid search a graph-based model")
        traceln("\t\t solver parameters for grid search:", " inference_cache=",
                self.inference_cache, " C=", self.C, " tol=", self.tol,
                " n_jobs=", self.njobs, " max_iter=", self.max_iter)
        self._gs_ssvm.fit(lX, lY)
        traceln(
            "\t [%.1fs] done (graph-based model is trained with best parameters, selected by grid search) \n"
            % chronoOff())

        self.ssvm = self._gs_ssvm.best_estimator_  #Estimator that was chosen by the search

        try:
            #win32
            dBestParams = self._gs_ssvm.best_params_
        except Exception:
            # best_params_ not available: read the values from the best estimator
            # (not a bare except, so that Ctrl-C and SystemExit go through)
            dBestParams = {
                'C': self.ssvm.C,
                'inference_cache': self.ssvm.inference_cache,
                'max_iter': self.ssvm.max_iter,
                'tol': self.ssvm.tol
            }

        self.storeBestParams(dBestParams)
        traceln("\t", "- " * 20)
        traceln("\tBest parameters: ", dBestParams)
        traceln("\t", "- " * 20)

        # best effort: failing to make the model slimmer is only reported
        try:
            self.ssvm.alphas = None
            self.ssvm.constraints_ = None
            self.ssvm.inference_cache_ = None
            traceln(
                "\t\t(model made slimmer. Not sure you can efficiently warm-start it later on. See option -w.)"
            )
        except Exception as e:
            traceln(
                "\t\t(COULD NOT make the model slimmer. Got exception: %s" %
                str(e))

        logger = SaveLogger(self.getModelFilename())
        logger(self.ssvm)  #save this model!

        traceln(self.getModelInfo())

        #Also save the details of this grid search
        # NOTE(review): the last 4 characters of the model filename are dropped,
        # presumably its extension - to be confirmed
        sFN = self.getModelFilename()[:-4] + "GridSearchCV.pkl"
        try:
            self.gzip_cPickle_dump(sFN, self._gs_ssvm)
            traceln("\n\n--- GridSearchCV details: (also in %s)" % sFN)
            traceln("--- Best parameters set found on development set:")
            traceln(self._gs_ssvm.best_params_)
            traceln("--- Grid scores on development set:")
            means = self._gs_ssvm.cv_results_['mean_test_score']
            stds = self._gs_ssvm.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds,
                                         self._gs_ssvm.cv_results_['params']):
                traceln("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
            traceln("--- ---")
        except Exception as e:
            traceln(
                "WARNING: error while dealing with the GridSearchCV object.")
            traceln(e)

        #the baseline model(s) if any
        self._trainBaselines(lX, lY)

        #do some garbage collection
        del lX, lY
        gc.collect()
        return
Ejemplo n.º 27
0
    def testFiles(self, lsFilename, loadFun, bBaseLine=False):
        """
        Test the model using those files. The corresponding graphs are loaded using the loadFun function (which must return a singleton list).
        It reports results on stderr

        lsFilename : the files to test on; also given as document names to the report
        loadFun    : function loading the graph(s) of one file
        bBaseLine  : currently IGNORED: the baseline models are not tested here
                     (see the TODO at the end of the method)

        Return a Report object
        """
        lX, lY, lY_pred = [], [], []
        lLabelName = None
        traceln("- predicting on test set")
        chronoOn("testFiles")

        # NOTE: no session is opened and no model is restored here, the session
        # kept in self.tf_session is used (the original code doing it was
        # commented out, for an undocumented reason)

        for sFilename in lsFilename:

            lg = loadFun(sFilename)  # returns a singleton list
            for g in lg:
                if g.bConjugate: g.computeEdgeLabels()
                [X], [Y] = self.get_lX_lY([g])

                gcn_graph_test = self.convert_lX_lY_to_GCNDataset(
                    [X], [Y], training=False, test=True)
                if lLabelName is None:
                    # first graph: record the label names, and trace some figures
                    lLabelName = g.getEdgeLabelNameList(
                    ) if g.bConjugate else g.getLabelNameList()
                    traceln("\t #nodes=%d  #edges=%d " %
                            Graph.getNodeEdgeTotalNumber([g]))
                    tNF_EF = (X[0].shape[1], X[2].shape[1])
                    traceln("node-dim,edge-dim", tNF_EF)
                # NOTE: the consistency of the label names of the next graphs is
                # not checked (the corresponding assert was commented out)

                [Y_pred] = self.gcn_model.predict_lG(self.tf_session,
                                                     gcn_graph_test,
                                                     verbose=False)
                lY_pred.append(Y_pred)

                lX.append(X)
                lY.append(Y)
                del g  # this can be very large
                gc.collect()

        traceln("[%.1fs] done\n" % chronoOff("testFiles"))

        tstRpt = TestReport(self.sName,
                            lY_pred,
                            lY,
                            lLabelName,
                            lsDocName=lsFilename)

        #TODO test the baseline models when bBaseLine is set, e.g.
        #    tstRpt.attach(self._testBaselinesEco(lX, lY, lLabelName, lsDocName=lsFilename))

        # do some garbage collection
        del lX, lY
        gc.collect()

        return tstRpt
Ejemplo n.º 28
0
    def train(self, lGraph, bWarmStart=True, expiration_timestamp=None, verbose=0):
        """
        Train a CRF model from the given list of labelled graphs.

        When bWarmStart is True, the stored model is reloaded via _loadIfFresh
        (which gets expiration_timestamp) and its training continues; if it
        cannot be reloaded, or if bWarmStart is False, a new model is created.
        When self.bGridSearch is set, a grid search is run instead.
        The baseline models, if any, are trained as well.
        Return nothing (or the result of the grid search).
        """
        if self.bGridSearch:
            return self.gridsearch(lGraph, verbose=verbose)

        traceln("\t- computing features on training set")
        traceln("\t\t #nodes=%d  #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
        chronoOn()
        lX, lY = self.get_lX_lY(lGraph)
        # the number of node and edge features is discovered from the data
        self._computeModelCaracteristics(lX)
        traceln("\t\t %s" % self._getNbFeatureAsText())
        traceln("\t [%.1fs] done\n" % chronoOff())

        # a warm-started model is kept as is, so that training can go on later
        slim_down = not bWarmStart

        traceln("\t- retrieving or creating model...")
        self.ssvm = None
        model_path = self.getModelFilename()
        if bWarmStart:
            try:
                self.ssvm = self._loadIfFresh(
                    model_path,
                    expiration_timestamp,
                    lambda fn: SaveLogger(fn).load(),
                )
                traceln("\t- warmstarting!")

                # the stored max_iter may be overridden by the current setting
                try:
                    stored_max_iter = self.ssvm.max_iter
                    if stored_max_iter != self.max_iter:
                        traceln("\t- changing max_iter value from (stored) %d to %d"
                                % (stored_max_iter, self.max_iter))
                        self.ssvm.max_iter = self.max_iter
                except AttributeError:
                    traceln("\t- cannot access or change the max_iter value")

                # and so may the stored n_jobs
                try:
                    stored_n_jobs = self.ssvm.n_jobs
                    if stored_n_jobs != self.njobs:
                        traceln("\t- changing n_jobs value from (stored) %d to %d"
                                % (stored_n_jobs, self.njobs))
                        self.ssvm.n_jobs = self.njobs
                except AttributeError:
                    traceln("\t- cannot access or change the n_jobs value")

            except Exception as err:
                self.ssvm = None
                traceln("\t- Cannot warmstart: %s" % err)
            # at this point, self.ssvm is either None or a reloaded model

        if not self.ssvm:
            traceln("\t- creating a new SSVM-trained CRF model")

            traceln("\t\t- computing class weight:")
            class_weights = self.computeClassWeight(lY)
            traceln("\t\t\t --> %s" % class_weights)

            crf_model = self._getCRFModel(class_weights)

            self.ssvm = OneSlackSSVM(
                crf_model,
                inference_cache=self.inference_cache,
                C=self.C,
                tol=self.tol,
                n_jobs=self.njobs,
                logger=SaveLogger(model_path, save_every=self.save_every),
                max_iter=self.max_iter,
                show_loss_every=10,
                verbose=verbose,
            )
            # nothing to continue from
            bWarmStart = False

        chronoOn()
        traceln("\t- training graph-based model")
        traceln("\t\t solver parameters:",
                " inference_cache=", self.inference_cache,
                " C=", self.C,
                " tol=", self.tol,
                " n_jobs=", self.njobs)
        self.ssvm.fit(lX, lY, warm_start=bWarmStart)
        traceln("\t [%.1fs] done (graph-based model is trained) \n" % chronoOff())

        traceln(self.getModelInfo())

        # drop the data that takes MB on disk and is only useful for warm-starting
        if slim_down:
            self.ssvm.alphas = None
            self.ssvm.constraints_ = None
            self.ssvm.inference_cache_ = None
            traceln("\t\t(model made slimmer. Not sure you can efficiently warm-start it later on. See option -w.)")

        # the baseline model(s) if any
        self._trainBaselines(lX, lY)

        # free the possibly large feature data
        del lX, lY
        gc.collect()
        return