def test(self, lGraph, lsDocName=None, predict_proba=False):
    """
    Run the model on the given graphs.

    lGraph must be a non-empty list of graphs (asserted).
    lsDocName is forwarded to the test reports.
    When predict_proba is true, return whatever gcn_model.predict_prob_lG
    produced; otherwise return a TestReport to which the baseline test
    reports are attached.
    """
    # the model is assumed to have been created or loaded already
    assert lGraph
    dataset, lX, lY = self._prepare_for_test(lGraph)
    label_names = lGraph[0].getLabelNameList()

    chronoOn("test2")
    tf_session = self.tf_session
    if not predict_proba:
        predictions = self.gcn_model.predict_lG(tf_session, dataset, verbose=False)
        traceln(" [%.1fs] done\n" % chronoOff("test2"))
        outcome = TestReport(self.sName, predictions, lY, label_names, lsDocName=lsDocName)
        baseline_reports = self._testBaselines(lX, lY, label_names, lsDocName=lsDocName)
        outcome.attach(baseline_reports)
    else:
        # TODO: this branch should be split into its own method
        outcome = self.gcn_model.predict_prob_lG(tf_session, dataset, verbose=False)
        traceln(" [%.1fs] done\n" % chronoOff("test2"))

    # drop the feature/label lists before handing the result back
    del lX, lY
    gc.collect()
    return outcome
def test(self, lGraph, lsDocName=None):
    """
    Run the SSVM model on the given graphs and build a test report.

    lGraph must be a non-empty list of graphs (asserted); the label names
    and the page-constraint flag are read from its first graph.
    Return a TestReport with the node-baseline and edge-baseline reports
    attached.
    """
    assert lGraph
    first_graph = lGraph[0]
    label_names = first_graph.getLabelNameList()
    use_constraints = first_graph.getPageConstraint()

    traceln("- computing features on test set")
    chronoOn("test")
    lX, lY = self.get_lX_lY(lGraph)
    traceln("\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    # the number of node and edge features is discovered dynamically here
    self._computeModelCaracteristics(lX)
    traceln("\t %s" % self._getNbFeatureAsText())
    traceln("[%.1fs] done\n" % chronoOff("test"))

    traceln("- predicting on test set")
    chronoOn("test2")
    if not use_constraints:
        predictions = self.ssvm.predict(lX)
    else:
        constraints = []
        for graph in lGraph:
            constraints.append(graph.instanciatePageConstraints())
        predictions = self._ssvm_ad3plus_predict(lX, constraints)
    traceln(" [%.1fs] done\n" % chronoOff("test2"))

    report = TestReport(self.sName, predictions, lY, label_names, lsDocName=lsDocName)
    report.attach(self._testBaselines(lX, lY, label_names, lsDocName=lsDocName))
    report.attach(self._testEdgeBaselines(lX, lY, label_names, lsDocName=lsDocName))

    # free the feature/label lists
    del lX, lY
    gc.collect()
    return report
def _testBaselinesEco(self, lX, lY, lLabelName=None, lsDocName=None):
    """
    Test the baseline models graph by graph, so that no single huge X
    has to be built in memory.

    Return a list holding one accumulated TestReportConfusion per
    baseline model.
    """
    if lsDocName:
        assert len(lX) == len(lsDocName), "Internal error"

    reports = []
    for baseline in self._lMdlBaseline:
        chronoOn("_testBaselinesEco_T")
        # a confusion-matrix based report can accumulate partial results
        accumulated = TestReportConfusion(str(baseline), [], lLabelName, lsDocName=lsDocName)
        for graph_x, graph_y in zip(lX, lY):
            for type_idx in range(self.nbType):
                flat_x, flat_y = self._getXY_forType([graph_x], [graph_y], type_idx)
                flat_pred = baseline[type_idx].predict(flat_x)
                partial = TestReport(str(baseline), flat_pred, flat_y, lLabelName, lsDocName=lsDocName)
                accumulated.accumulate(partial)
        traceln("\t\t [%.1fs] done\n" % chronoOff("_testBaselinesEco_T"))
        reports.append(accumulated)
    return reports
def _testBaselines(self, lX, lY, lLabelName=None, lsDocName=None):
    """
    Test every baseline model on every node type.

    Return a list of TestReport objects, one per (type, baseline model)
    pair; the list is empty when no baseline model is set.
    """
    if lsDocName:
        assert len(lX) == len(lsDocName), "Internal error"

    reports = []
    if not self._lMdlBaseline:
        return reports

    for type_idx in range(self.nbType):
        flat_x, flat_y = self._getXY_forType(lX, lY, type_idx)
        traceln("\t\t type %d #nodes=%d #features=%d" % (type_idx, flat_x.shape[0], flat_x.shape[1]))
        for baseline in self._lMdlBaseline:
            # written in extenso so that the flat arrays can be deleted below
            chronoOn("_testBaselines_T")
            flat_pred = baseline[type_idx].predict(flat_x)
            traceln("\t\t [%.1fs] done\n" % chronoOff("_testBaselines_T"))
            reports.append(TestReport(str(baseline), flat_pred, flat_y, lLabelName, lsDocName=lsDocName))
    del flat_x, flat_y, flat_pred
    return reports
def _testBaselinesEco(self, lX, lY, lLabelName=None, lsDocName=None):
    """
    Test the baseline models graph by graph, so that no single huge X
    has to be built in memory.

    Return a list holding one accumulated TestReportConfusion per
    baseline model.
    """
    if lsDocName:
        assert len(lX) == len(lsDocName), "Internal error"

    reports = []
    for baseline in self._lMdlBaseline:
        chronoOn()
        # a confusion-matrix based report can accumulate partial results
        accumulated = TestReportConfusion(str(baseline), [], lLabelName, lsDocName=lsDocName)
        for graph_x, graph_y in zip(lX, lY):
            # I suspect a bug here. (JLM June 2017) Because X_flat is probably required.
            graph_pred = baseline.predict(graph_x)
            partial = TestReport(str(baseline), graph_pred, graph_y, lLabelName, lsDocName=lsDocName)
            accumulated.accumulate(partial)
        traceln("\t\t [%.1fs] done\n" % chronoOff())
        reports.append(accumulated)
    return reports
def _testEdgeBaselines(self, lX, lY, lLabelName=None, lsDocName=None):
    """
    Test the edge baseline model, if one is set.

    Return a list of test reports: a singleton when an edge baseline
    model exists, an empty list otherwise.
    """
    edge_model = self._EdgeBaselineModel
    if not edge_model:
        return []

    if lsDocName:
        assert len(lX) == len(lsDocName), "Internal error"

    # an edge label is the pair of the labels of its two nodes
    if lLabelName:
        edge_label_names = []
        for first_label in lLabelName:
            for second_label in lLabelName:
                edge_label_names.append("%s_%s" % (first_label, second_label))
    else:
        edge_label_names = None

    flat_x, flat_y = self._getEdgeXEdgeY(lX, lY)
    chronoOn("_testEdgeBaselines")
    flat_pred = edge_model.predict(flat_x)
    traceln("\t\t [%.1fs] done\n" % chronoOff("_testEdgeBaselines"))

    reports = [TestReport(str(edge_model), flat_pred, flat_y, edge_label_names, lsDocName=lsDocName)]
    del flat_x, flat_y, flat_pred
    return reports
def train(self, lGraph, bWarmStart=True, expiration_timestamp=None, verbose=0):
    """
    Train every sub-model of the ensemble on the given labelled graphs.

    The node/edge feature dimensions and the number of classes are
    stored in self.model_config, the label binarizer and the model
    configuration are pickled to disk, then each model in self.models
    is trained on lGraph.

    bWarmStart is forwarded to each sub-model's train method.
    expiration_timestamp and verbose are accepted but not used here.
    Return nothing.
    """
    print('Ensemble ECN Training')
    traceln("\t- computing features on training set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    chronoOn()
    lX, lY = self.get_lX_lY(lGraph)
    # we discover here dynamically the number of features of nodes and edges
    # self._tNF_EF contains the number of node features and edge features
    self._computeModelCaracteristics(lX)
    traceln("\t\t %s" % self._getNbFeatureAsText())
    traceln("\t [%.1fs] done\n" % chronoOff())

    nb_class = self.getNbClass()  # Is it better to do Y.shape ?
    print('nb_class', nb_class)
    self.model_config['node_dim'] = self._tNF_EF[0]
    self.model_config['edge_dim'] = self._tNF_EF[1]
    self.model_config['nb_class'] = nb_class

    traceln("\t- creating the sub-models")
    # TODO: do we need this conversion here, can the label binarizer be shared?
    # The call is kept because, per the original note, it sets the label
    # binarizer; its return value is not needed.
    self.convert_lX_lY_to_GCNDataset(lX, lY, training=True)

    # Save the label binarizer for prediction usage; the context manager
    # closes the file even if pickling fails.
    with open(self.getlabelBinarizerFilename(), 'wb') as fd_lb:
        pickle.dump(self.labelBinarizer, fd_lb)

    # Save the model config in order to restore the model later
    with open(self.getModelConfigFilename(), 'wb') as fd_mc:
        pickle.dump(self.model_config, fd_mc)

    # This creates all the sub-models
    self._init_model()

    for du_model in self.models:
        # the train call creates a tf graph and the model
        du_model.train(lGraph, bWarmStart=bWarmStart)
def _prepare_for_test(self, lGraph):
    """
    Compute the features and labels of the given graphs and convert them
    to the dataset format used for testing.

    Return the tuple (converted dataset, lX, lY).
    """
    traceln("\t- computing features on test set")
    node_edge_counts = Graph.getNodeEdgeTotalNumber(lGraph)
    traceln("\t\t #nodes=%d #edges=%d " % node_edge_counts)

    chronoOn()
    features, labels = self.get_lX_lY(lGraph)
    elapsed = chronoOff()
    traceln("\t [%.1fs] done\n" % elapsed)

    dataset = self.convert_lX_lY_to_GCNDataset(features, labels, training=False, test=True)
    return dataset, features, labels
def _trainBaselines(self, lX, lY):
    """
    Fit each baseline model, if any, on the flattened data of each node
    type. Always return True.
    """
    baselines = self._lMdlBaseline
    if not baselines:
        return True

    for type_idx in range(self.nbType):
        flat_x, flat_y = self._getXY_forType(lX, lY, type_idx)
        if False:
            # disabled debugging dump of the flattened data
            with open("XY_flat_Type%d.pkl"%(type_idx), "wb") as fd:
                pickle.dump((flat_x, flat_y), fd)
        for baseline in baselines:
            chronoOn()
            traceln("\t - training baseline model: %s"%str(baseline))
            baseline[type_idx].fit(flat_x, flat_y)
            traceln("\t [%.1fs] done\n"%chronoOff())
        del flat_x, flat_y
    return True
def _trainEdgeBaseline(self, lX, lY):
    """
    Train the edge baseline model returned by getEdgeModel(), which
    predicts the pair of labels of each edge.
    This code assumes a single node type.

    The flattened edge data is also pickled to "edgeXedgeY_flat.pkl".
    Always return True.
    """
    edge_model = self.getEdgeModel()
    self._EdgeBaselineModel = edge_model
    if not edge_model:
        traceln("\t - no edge baseline model for this model")
        return True

    flat_x, flat_y = self._getEdgeXEdgeY(lX, lY)
    with open("edgeXedgeY_flat.pkl", "wb") as fd:
        pickle.dump((flat_x, flat_y), fd)

    chronoOn()
    traceln("\t - training edge baseline model: %s"%str(edge_model))
    edge_model.fit(flat_x, flat_y)
    traceln("\t [%.1fs] done\n"%chronoOff())
    del flat_x, flat_y
    return True
def _trainBaselines(self, lX, lY):
    """
    Fit each baseline model, if any, on the flattened node data, then
    train the edge baseline when self.bTrainEdgeBaseline is set.
    Always return True.
    """
    baselines = self._lMdlBaseline
    if baselines:
        flat_x = self._get_X_flat(lX)
        flat_y = np.hstack(lY)
        if False:
            # disabled debugging dump of the flattened data
            with open("XY_flat.pkl", "wb") as fd:
                pickle.dump((flat_x, flat_y), fd)
        for baseline in baselines:
            chronoOn()
            traceln("\t - training baseline model: %s" % str(baseline))
            baseline.fit(flat_x, flat_y)
            traceln("\t [%.1fs] done\n" % chronoOff())
        del flat_x, flat_y

    if self.bTrainEdgeBaseline:
        traceln(' - training edge baseline')
        # a predefined model is always trained on the edges
        self._trainEdgeBaseline(lX, lY)

    return True
def _testBaselines(self, lX, lY, lLabelName=None, lsDocName=None):
    """
    Test every baseline model on the flattened node data.

    Return a list of TestReport objects, one per baseline model; the
    list is empty when no baseline model is set.
    """
    if lsDocName:
        assert len(lX) == len(lsDocName), "Internal error"

    reports = []
    if not self._lMdlBaseline:
        return reports

    flat_x = self._get_X_flat(lX)
    flat_y = np.hstack(lY)
    for baseline in self._lMdlBaseline:
        # written in extenso so that the prediction array can be deleted below
        chronoOn("_testBaselines")
        flat_pred = baseline.predict(flat_x)
        traceln("\t\t [%.1fs] done\n" % chronoOff("_testBaselines"))
        reports.append(TestReport(str(baseline), flat_pred, flat_y, lLabelName, lsDocName=lsDocName))
    del flat_x, flat_y, flat_pred
    return reports
def testFiles(self, lsFilename, loadFun, bBaseLine=False):
    """
    Test the ensemble on the given files, one graph at a time.

    lsFilename: names of the files to test; also used as document names
                in the report.
    loadFun:    function that loads the graph(s) of one file (expected to
                return a singleton list).
    bBaseLine:  accepted for interface compatibility; this implementation
                does not test any baseline model.

    Each graph is predicted by every model of self.models and the
    predictions are combined by DU_Ensemble_ECN.average_prediction.
    Return a TestReport.
    """
    traceln("- predicting on test set")
    chronoOn("testFiles")
    lY, lY_pred = [], []
    lLabelName = None
    for sFilename in lsFilename:
        lg = loadFun(sFilename)  # returns a singleton list
        for g in lg:
            if g.bConjugate:
                g.computeEdgeLabels()
            [X], [Y] = self.get_lX_lY([g])
            [gcn_graph_test] = self.convert_lX_lY_to_GCNDataset([X], [Y], training=False, test=True)

            # identity test: "== None" would go through the __eq__ of
            # whatever getLabelNameList() returned
            if lLabelName is None:
                lLabelName = g.getLabelNameList()
                traceln("\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber([g]))
                tNF_EF = (X[0].shape[1], X[2].shape[1])
                traceln("node-dim,edge-dim", tNF_EF)

            # one probability prediction per sub-model, averaged afterwards
            m_pred = []
            for du_model in self.models:
                [Y_pred] = du_model.gcn_model.predict_prob_lG(
                    du_model.tf_session, [gcn_graph_test], verbose=False)
                m_pred.append([Y_pred])
            [Y_pred], [_Y_pred_proba] = DU_Ensemble_ECN.average_prediction(m_pred)

            lY.append(Y)
            lY_pred.append(Y_pred)
            del _Y_pred_proba

            g.detachFromDoc()
            del g  # this can be very large
            gc.collect()
    traceln("[%.1fs] done\n" % chronoOff("testFiles"))

    tstRpt = TestReport(self.sName, lY_pred, lY, lLabelName, lsDocName=lsFilename)

    del lY, lY_pred
    gc.collect()
    return tstRpt
def runForExternalMLMethod(self, lsColDir, storeX, applyY, bRevertEdges=False):
    """
    HACK: to test new ML methods, not yet integrated in our SW.

    Exactly one of the two modes is expected:
    - storeX: compute one X per graph and dump the list of X into the
              file named by storeX
    - applyY: load a list of Y from the file named by applyY, label the
              graphs with them and save each annotated document
    Passing both raises ValueError.
    NOTE(review): passing neither reaches the lY.pop(0) branch with lY
    unbound - confirm callers always set one of them.

    bRevertEdges: when true, revertEdges() is called on each graph.

    Return the list of produced files (filled in applyY mode only).
    """
    self.traceln("-" * 50)
    if storeX:
        traceln("Loading data and storing [X] (1 X per graph)")
    if applyY:
        traceln("Loading data, loading Y, labelling data, storing annotated data")
    self.traceln("-" * 50)

    if storeX and applyY:
        raise ValueError("Either store X or applyY, not both")

    if not self._mdl:
        raise Exception("The model must be loaded beforehand!")

    #list files
    _, lFilename = self.listMaxTimestampFile(lsColDir, self.sXmlFilenamePattern)

    DU_GraphClass = self.getGraphClass()

    lPageConstraint = DU_GraphClass.getPageConstraint()
    if lPageConstraint:
        for dat in lPageConstraint:
            self.traceln("\t\t%s" % str(dat))

    if applyY:
        self.traceln("LOADING [Y] from %s" % applyY)
        lY = self._mdl.gzip_cPickle_load(applyY)
    if storeX:
        lX = []

    chronoOn("predict")
    self.traceln("- loading collection as graphs, and processing each in turn. (%d files)" % len(lFilename))
    du_postfix = "_du" + MultiPageXml.sEXT
    lsOutputFilename = []
    for sFilename in lFilename:
        # skip files whose name ends with the output postfix
        if sFilename.endswith(du_postfix):
            continue #:)
        chronoOn("predict_1")
        lg = DU_GraphClass.loadGraphs([sFilename], bDetach=False, bLabelled=False, iVerbose=1)
        #normally, we get one graph per file, but in case we load one graph per page, for instance, we have a list
        if lg:
            for g in lg:
                # doc keeps the value of the last graph and is used after the loop
                doc = g.doc
                if bRevertEdges:
                    g.revertEdges() #revert the directions of the edges
                if lPageConstraint:
                    self.traceln("\t- prediction with logical constraints: %s" % sFilename)
                else:
                    self.traceln("\t- prediction : %s" % sFilename)
                if storeX:
                    [X] = self._mdl.get_lX([g])
                    lX.append(X)
                else:
                    # the loaded Y are consumed in order, one per graph
                    Y = lY.pop(0)
                    g.setDomLabels(Y)
            del lg

            if applyY:
                MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator, self.sMetadata_Comments)
                sDUFilename = sFilename[:-len(MultiPageXml.sEXT)] + du_postfix
                doc.saveFormatFileEnc(sDUFilename, "utf-8", True) #True to indent the XML
                doc.freeDoc()
                lsOutputFilename.append(sDUFilename)
        else:
            self.traceln("\t- no prediction to do for: %s" % sFilename)

        self.traceln("\t done [%.2fs]" % chronoOff("predict_1"))
    self.traceln(" done [%.2fs]" % chronoOff("predict"))

    if storeX:
        self.traceln("STORING [X] in %s" % storeX)
        self._mdl.gzip_cPickle_dump(storeX, lX)

    return lsOutputFilename
def predict(self, lsColDir, docid=None):
    """
    Predict the labels of the documents of the given collection(s) and
    write one annotated file per processed document.

    When docid is given, only the file built from lsColDir[0] and docid
    is processed; otherwise the files are listed by listMaxTimestampFile.
    Raise an Exception if no model is loaded, or if docid is given
    without any collection directory.
    Return the list of produced files.
    """
    separator = "-" * 50
    self.traceln(separator)
    self.traceln("Predicting for collection(s):", lsColDir)
    self.traceln(separator)

    if not self._mdl:
        raise Exception("The model must be loaded beforehand!")

    if docid is not None:
        # predict for this file only
        try:
            single_path = os.path.join(lsColDir[0], docid + MultiPageXml.sEXT)
            lFilename = [os.path.abspath(single_path)]
        except IndexError:
            raise Exception("a collection directory must be provided!")
    else:
        # list files
        _, lFilename = self.listMaxTimestampFile(lsColDir, self.sXmlFilenamePattern)

    DU_GraphClass = self.getGraphClass()
    lPageConstraint = DU_GraphClass.getPageConstraint()
    if lPageConstraint:
        for constraint in lPageConstraint:
            self.traceln("\t\t%s" % str(constraint))

    chronoOn("predict")
    self.traceln("- loading collection as graphs, and processing each in turn. (%d files)" % len(lFilename))
    du_postfix = "_du" + MultiPageXml.sEXT
    produced = []
    for xml_path in lFilename:
        if xml_path.endswith(du_postfix):
            continue  # files carrying the output postfix are skipped
        chronoOn("predict_1")
        graphs = DU_GraphClass.loadGraphs([xml_path], bDetach=False, bLabelled=False, iVerbose=1)
        # normally one graph per file, but there may be several, e.g. one per page
        if not graphs:
            self.traceln("\t- no prediction to do for: %s" % xml_path)
        else:
            for graph in graphs:
                doc = graph.doc
                if lPageConstraint:
                    self.traceln("\t- prediction with logical constraints: %s" % xml_path)
                else:
                    self.traceln("\t- prediction : %s" % xml_path)
                labels = self._mdl.predict(graph)
                graph.setDomLabels(labels)
                del labels
            del graphs

            MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator, self.sMetadata_Comments)
            out_path = xml_path[:-len(MultiPageXml.sEXT)] + du_postfix
            doc.write(out_path,
                      xml_declaration=True,
                      encoding="utf-8",
                      pretty_print=True)
            produced.append(out_path)

        self.traceln("\t done [%.2fs]" % chronoOff("predict_1"))
    self.traceln(" done [%.2fs]" % chronoOff("predict"))

    return produced
def _train_save_test(self, sModelName, bWarm, lFilename_trn, ts_trn, lFilename_tst, bPickleOnly):
    """
    Create a model, train and save it, then test it if test files are
    given. Used both by train_save_test and _nfold_runFold.

    sModelName:    name given to the model class constructor
    bWarm:         when false (and not in pickle-only mode), an existing
                   model file on disk is an error
    lFilename_trn: files of the training graphs
    ts_trn:        passed to mdl.loadTransformers and mdl.train
    lFilename_tst: files of the test graphs (may be empty)
    bPickleOnly:   when true, the data is pickled by _pickleData instead
                   of training/testing, and the function ends in exit(0)

    Raise crf.Model.ModelException if the model file already exists and
    neither bWarm nor bPickleOnly is set.
    Return the test report, or None if there is no test file.
    """
    mdl = self.cModelClass(sModelName, self.sModelDir)

    if os.path.exists(mdl.getModelFilename()) and not bWarm and not bPickleOnly:
        raise crf.Model.ModelException("Model exists on disk already (%s), either remove it first or warm-start the training." % mdl.getModelFilename())

    mdl.configureLearner(**self.config_learner_kwargs)
    mdl.setBaselineModelList(self._lBaselineModel)
    mdl.saveConfiguration((self.config_extractor_kwargs, self.config_learner_kwargs))
    self.traceln("\t - configuration: ", self.config_learner_kwargs)

    self.traceln("- loading training graphs")
    lGraph_trn = self.cGraphClass.loadGraphs(lFilename_trn, bDetach=True, bLabelled=True, iVerbose=1)
    self.traceln(" %d graphs loaded" % len(lGraph_trn))

    assert self.nbClass and self.lNbClass, "internal error: I expected the number of class to be automatically computed at that stage"
    # with one CRF type the model gets the single class count, otherwise the per-type list
    if self.iNbCRFType == 1:
        mdl.setNbClass(self.nbClass)
    else:
        mdl.setNbClass(self.lNbClass)

    #for this check, we load the Y once...
    self.checkLabelCoverage(mdl.get_lY(lGraph_trn)) #NOTE that Y are in bad order if multiptypes. Not a pb here

    self.traceln("- retrieving or creating feature extractors...")
    chronoOn("FeatExtract")
    try:
        mdl.loadTransformers(ts_trn)
    except crf.Model.ModelException:
        # the transformers could not be loaded: fit new ones on the training graphs and save them
        fe = self.cFeatureDefinition(**self.config_extractor_kwargs)
        fe.fitTranformers(lGraph_trn)
        fe.cleanTransformers()
        mdl.setTranformers(fe.getTransformers())
        mdl.saveTransformers()
    self.traceln(" done [%.1fs]" % chronoOff("FeatExtract"))

    if bPickleOnly:
        self._pickleData(mdl, lGraph_trn, "trn")
    else:
        self.traceln("- training model...")
        chronoOn("MdlTrn")
        # NOTE(review): the second argument is always True, whatever bWarm is - confirm this is intended
        mdl.train(lGraph_trn, True, ts_trn, verbose=1 if self.bVerbose else 0)
        mdl.save()
        self.traceln(" done [%.1fs]" % chronoOff("MdlTrn"))

    # OK!!
    self._mdl = mdl

    if lFilename_tst:
        self.traceln("- loading test graphs")
        lGraph_tst = self.cGraphClass.loadGraphs(lFilename_tst, bDetach=True, bLabelled=True, iVerbose=1)
        self.traceln(" %d graphs loaded" % len(lGraph_tst))
        if bPickleOnly:
            # oReport stays unbound on this path; exit(0) below is reached before the return
            self._pickleData(mdl, lGraph_tst, "tst")
        else:
            oReport = mdl.test(lGraph_tst)
    else:
        oReport = None

    if bPickleOnly:
        self.traceln("- pickle done, exiting")
        exit(0)

    return oReport
def testFiles(self, lsFilename, loadFun, bBaseLine=False):
    """
    Test the model using those files, one graph at a time.

    lsFilename: names of the files to test; also used as document names
                in the reports.
    loadFun:    function that loads the graph(s) of one file (expected to
                return a singleton list).
    bBaseLine:  when true, the baseline models are tested as well (with
                _testBaselinesEco) and their reports are attached.

    All graphs must share the label names of the first one (asserted).
    Return a TestReport.
    """
    lX, lY, lY_pred = [], [], []
    lLabelName = None
    traceln("- predicting on test set")
    chronoOn("testFiles")
    for sFilename in lsFilename:
        lg = loadFun(sFilename)  # returns a singleton list
        for g in lg:
            if self.bConjugate:
                g.computeEdgeLabels()
            [X], [Y] = self.get_lX_lY([g])

            # identity test: "== None" would go through the __eq__ of
            # whatever getLabelNameList() returned
            if lLabelName is None:
                lLabelName = g.getLabelNameList()
                traceln("\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber([g]))
                # we discover here dynamically the number of features of nodes and edges
                self._computeModelCaracteristics([X])
                traceln("\t %s" % self._getNbFeatureAsText())
            else:
                assert lLabelName == g.getLabelNameList(), "Inconsistency among label spaces"

            # since we pass a single graph, force n_jobs to 1; the finally
            # clause restores the configured value even if prediction fails
            n_jobs = self.ssvm.n_jobs
            self.ssvm.n_jobs = 1
            try:
                if g.getPageConstraint():
                    lConstraints = g.instanciatePageConstraints()
                    [Y_pred] = self._ssvm_ad3plus_predict([X], [lConstraints])
                else:
                    [Y_pred] = self.ssvm.predict([X])
            finally:
                self.ssvm.n_jobs = n_jobs

            lX.append(X)
            lY.append(Y)
            lY_pred.append(Y_pred)

            del g  # this can be very large
            gc.collect()
    traceln("[%.1fs] done\n" % chronoOff("testFiles"))

    tstRpt = TestReport(self.sName, lY_pred, lY, lLabelName, lsDocName=lsFilename)

    if bBaseLine:
        lBaselineTestReport = self._testBaselinesEco(lX, lY, lLabelName, lsDocName=lsFilename)
        tstRpt.attach(lBaselineTestReport)

    # do some garbage collection
    del lX, lY
    gc.collect()

    return tstRpt
def DYNAMIC_IMPORT(name, package=None):
    """
    Import the module `name` (relative to `package`, if given) with
    import_module, tracing the time it took. Return the module.
    """
    chronoOn("import")
    announcement = "SETUP: Dynamic import of '%s' from '%s'" % (name, package)
    trace(announcement)
    loaded_module = import_module(name, package)
    elapsed = chronoOff("import")
    traceln(" done [%.1fs]" % elapsed)
    return loaded_module
def _train_save_test(self, sModelName, bWarm, lFilename_trn, ts_trn, lFilename_tst, lFilename_vld, bPickleXY, ratio_train_val=None):
    """
    Create a model, train and save it, then test it if test files are
    given. Used both by train_save_test and _nfold_runFold.

    If provided, lFilename_vld is used as validation set to make the best
    model. Otherwise, if ratio_train_val is not None, the first
    int(ratio_train_val * nb_graphs) training graphs are taken out of the
    training set and used as validation graphs.

    bPickleXY: when true, the training and test data are pickled by
               _pickleData, the test is skipped and the function ends in
               exit(0).

    Raise GraphModelException if the model file already exists and bWarm
    is false.
    Return the test report, or None if there is no test file.
    """
    mdl = self.cModelClass(sModelName, self.sModelDir)

    if os.path.exists(mdl.getModelFilename()) and not bWarm:
        raise GraphModelException("Model exists on disk already (%s), either remove it first or warm-start the training."%mdl.getModelFilename())

    mdl.configureLearner(**self.config_learner_kwargs)
    mdl.setBaselineModelList(self._lBaselineModel)
    mdl.saveConfiguration( (self.config_extractor_kwargs, self.config_learner_kwargs) )
    self.traceln("\t - configuration: ", self.config_learner_kwargs )

    self.traceln("- loading training graphs")
    lGraph_trn = self.cGraphClass.loadGraphs(self.cGraphClass, lFilename_trn, bDetach=True, bLabelled=True, iVerbose=1)
    self.traceln(" %d training graphs loaded"%len(lGraph_trn))

    if lFilename_vld:
        self.traceln("- loading validation graphs")
        lGraph_vld = self.cGraphClass.loadGraphs(self.cGraphClass, lFilename_vld, bDetach=True, bLabelled=True, iVerbose=1)
        self.traceln(" %d validation graphs loaded"%len(lGraph_vld))
    else:
        lGraph_vld = []
        if ratio_train_val is None:
            self.traceln("- no validation graphs")
        else:
            # carve the validation graphs out of the head of the training list
            lG = [g for g in lGraph_trn]
            split_idx = int(ratio_train_val * len(lG))
            lGraph_vld = lG[:split_idx ]
            lGraph_trn = lG[ split_idx:]
            del lG
            self.traceln("- extracted %d validation graphs, got %d training graphs (ratio=%.3f)" % (len(lGraph_vld), len(lGraph_trn), ratio_train_val))

    #for this check, we load the Y once...
    if self.bConjugate:
        mdl.setNbClass(self.nbEdgeLabel)
        for _g in lGraph_trn:
            _g.computeEdgeLabels()
        for _g in lGraph_vld:
            _g.computeEdgeLabels()
    else:
        assert self.nbClass and self.lNbClass, "internal error: I expected the number of class to be automatically computed at that stage"
        # with one node type the model gets the single class count, otherwise the per-type list
        if self.iNbNodeType == 1:
            mdl.setNbClass(self.nbClass)
        else:
            mdl.setNbClass(self.lNbClass)

    self.checkLabelCoverage(mdl.get_lY(lGraph_trn)) #NOTE that Y are in bad order if multiptypes. Not a pb here

    self.traceln("- retrieving or creating feature extractors...")
    chronoOn("FeatExtract")
    try:
        mdl.loadTransformers(ts_trn)
    except GraphModelException:
        # the transformers could not be loaded: fit new ones on the training graphs and save them
        fe = self.cFeatureDefinition(**self.config_extractor_kwargs)
        fe.fitTranformers(lGraph_trn)
        fe.cleanTransformers()
        mdl.setTranformers(fe.getTransformers())
        mdl.saveTransformers()

    # pretty print of features extractors
    self.traceln("""--- Features --- --- NODES : %s --- EDGES : %s --- -------- --- """ % mdl.getTransformers())

    self.traceln(" done [%.1fs]"%chronoOff("FeatExtract"))

    if bPickleXY:
        self._pickleData(mdl, lGraph_trn, "trn")

    self.traceln("- training model...")
    chronoOn("MdlTrn")
    # NOTE(review): the third argument is always True, whatever bWarm is - confirm this is intended
    mdl.train(lGraph_trn, lGraph_vld, True, ts_trn, verbose=1 if self.bVerbose else 0)
    mdl.save()
    self.traceln(" done [%.1fs]"%chronoOff("MdlTrn"))

    # OK!!
    self._mdl = mdl

    if lFilename_tst:
        self.traceln("- loading test graphs")
        lGraph_tst = self.cGraphClass.loadGraphs(self.cGraphClass, lFilename_tst, bDetach=True, bLabelled=True, iVerbose=1)
        if self.bConjugate:
            for _g in lGraph_tst:
                _g.computeEdgeLabels()
        self.traceln(" %d graphs loaded"%len(lGraph_tst))
        if bPickleXY:
            # oReport stays unbound on this path; exit(0) below is reached before the return
            self._pickleData(mdl, lGraph_tst, "tst")
        else:
            oReport = mdl.test(lGraph_tst)
    else:
        oReport = None

    if bPickleXY:
        self.traceln("- pickle done, exiting")
        exit(0)

    return oReport
def _prepare_for_train(self, lGraph, lGraph_vld):
    """
    Prepare for training ECN or EnsembleECN.

    Compute the features of the training graphs, store the node/edge
    feature dimensions and the number of classes in self.model_config,
    convert the data to the GCN dataset format, pickle the label
    binarizer and shuffle the training dataset.

    If lGraph_vld is non-empty it becomes the validation set; otherwise
    the first max(1, int(ratio_train_val * nb_graphs)) shuffled training
    graphs are used as validation graphs.

    Return the tuple (training dataset, validation dataset).
    """
    traceln('ECN Training ', self.sName)
    traceln("\t- computing features on training set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    chronoOn()
    lX, lY = self.get_lX_lY(lGraph)
    # we discover here dynamically the number of features of nodes and edges
    # self._tNF_EF contains the number of node features and edge features
    self._computeModelCaracteristics(lX)
    traceln("\t\t %s" % self._getNbFeatureAsText())
    traceln("\t [%.1fs] done\n" % chronoOff())

    nb_class = len(lGraph[0].getLabelNameList())  # Is it better to do Y.shape ?
    traceln("\t- %d classes" % nb_class)

    traceln("\t- retrieving or creating model...")
    self.model_config['node_dim'] = self._tNF_EF[0]
    self.model_config['edge_dim'] = self._tNF_EF[1]
    self.model_config['nb_class'] = nb_class

    # This converts the lX, lY in the format necessary for GCN models
    gcn_graph = self.convert_lX_lY_to_GCNDataset(lX, lY, training=True)

    # Save the label binarizer for prediction usage; the context manager
    # closes the file even if pickling fails.
    with open(self.getlabelBinarizerFilename(), 'wb') as fd_lb:
        pickle.dump(self.labelBinarizer, fd_lb)

    # TODO Save the validation set too to reproduce experiments
    random.shuffle(gcn_graph)

    if lGraph_vld:
        gcn_graph_train = gcn_graph
        lX_vld, lY_vld = self.get_lX_lY(lGraph_vld)
        gcn_graph_val = self.convert_lX_lY_to_GCNDataset(lX_vld, lY_vld, test=True)
        del lX_vld, lY_vld
    else:
        # Get a validation set from the training set (at least one graph)
        split_idx = max(1, int(self.model_config['ratio_train_val'] * len(gcn_graph)))
        traceln(" - using %d train graphs as validation graphs" % split_idx)
        gcn_graph_val = list(gcn_graph[:split_idx])
        gcn_graph_train = list(gcn_graph[split_idx:])
    traceln("%d training graphs -- %d validation graphs" % (len(gcn_graph_train), len(gcn_graph_val)))

    self._cleanTmpCheckpointFiles()

    return gcn_graph_train, gcn_graph_val
def testFiles(self, lsFilename, loadFun, bBaseLine=False): """ Test the model using those files. The corresponding graphs are loaded using the loadFun function (which must return a singleton list). It reports results on stderr if some baseline model(s) were set, they are also tested Return a Report object """ raise NotImplementedError lX, lY, lY_pred = [], [], [] lLabelName = None traceln("- predicting on test set") chronoOn("testFiles") # ? Iterate over files or over models for du_model in self.models: #du_model.load() m_pred = [] #with tf.Session(graph=du_model.tf_graph) as session: #session.run(du_model.gcn_model.init) #du_model.gcn_model.restore_model(session, du_model.getModelFilename()) for sFilename in lsFilename: [g] = loadFun(sFilename) # returns a singleton list [X], [Y] = self.get_lX_lY([g]) gcn_graph_test = self.convert_lX_lY_to_GCNDataset( [X], [Y], training=False, test=True) if lLabelName == None: lLabelName = g.getLabelNameList() traceln("\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber([g])) tNF_EF = (X[0].shape[1], X[2].shape[1]) traceln("node-dim,edge-dim", tNF_EF) else: assert lLabelName == g.getLabelNameList( ), "Inconsistency among label spaces" model_pred = du_model.test(gcn_graph_test, predict_proba=True) m_pred.append(model_pred[0]) lX.append(X) lY.append(Y) g.detachFromDOM() del g # this can be very large gc.collect() lY_pred.append(model_pred) lY_pred, _ = DU_Ensemble_ECN.average_prediction(lY_pred) traceln("[%.1fs] done\n" % chronoOff("testFiles")) tstRpt = TestReport(self.sName, lY_pred, lY, lLabelName, lsDocName=lsFilename) if bBaseLine: lBaselineTestReport = self._testBaselinesEco(lX, lY, lLabelName, lsDocName=lsFilename) tstRpt.attach(lBaselineTestReport) del lX, lY gc.collect() return tstRpt
def train(self, lGraph, bWarmStart=True, expiration_timestamp=None, verbose=0):
    """
    Train the ECN model using the given labelled graphs.

    The label binarizer, the model configuration and the validation
    scores are pickled to disk. A validation set is split off the
    shuffled training data according to model_config['ratio_train_val'].
    After training, _getBestModelVal() is called to save the model and
    restore() reloads it.

    bWarmStart, expiration_timestamp and verbose are accepted for
    interface compatibility but are not used in this method.
    Return nothing.
    """
    print('ECN Training', self.sName)
    traceln("\t- computing features on training set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    chronoOn()
    lX, lY = self.get_lX_lY(lGraph)
    # we discover here dynamically the number of features of nodes and edges
    # self._tNF_EF contains the number of node features and edge features
    self._computeModelCaracteristics(lX)
    traceln("\t\t %s" % self._getNbFeatureAsText())
    traceln("\t [%.1fs] done\n" % chronoOff())

    traceln("\t- retrieving or creating model...")
    nb_class = self.getNbClass()  # Is it better to do Y.shape ?
    self.model_config['node_dim'] = self._tNF_EF[0]
    self.model_config['edge_dim'] = self._tNF_EF[1]
    self.model_config['nb_class'] = nb_class

    # This calls the ECN internal constructor and defines the tensorflow graph
    tf_graph = tf.Graph()
    with tf_graph.as_default():
        self._init_model()
    self.tf_graph = tf_graph

    # This converts the lX, lY in the format necessary for GCN models
    gcn_graph = self.convert_lX_lY_to_GCNDataset(lX, lY, training=True)

    # Save the label binarizer for prediction usage; the context managers
    # below close the files even if pickling fails.
    with open(self.getlabelBinarizerFilename(), 'wb') as fd_lb:
        pickle.dump(self.labelBinarizer, fd_lb)

    # Save the model config in order to restore the model later
    with open(self.getModelConfigFilename(), 'wb') as fd_mc:
        pickle.dump(self.model_config, fd_mc)

    # TODO Save the validation set too to reproduce experiments
    # Get a validation set from the training set
    split_idx = int(self.model_config['ratio_train_val'] * len(gcn_graph))
    random.shuffle(gcn_graph)
    gcn_graph_val = list(gcn_graph[:split_idx])
    gcn_graph_train = list(gcn_graph[split_idx:])

    self._cleanTmpCheckpointFiles()

    # without an explicit patience, allow as many rounds as there are iterations
    patience = self.model_config['patience'] if 'patience' in self.model_config else self.model_config['nb_iter']
    with tf.Session(graph=self.tf_graph) as session:
        session.run([self.gcn_model.init])

        R = self.gcn_model.train_with_validation_set(
            session, gcn_graph_train, gcn_graph_val,
            self.model_config['nb_iter'],
            eval_iter=10, patience=patience,
            save_model_path=self.getTmpModelFilename())
        with open(self.getValScoreFilename(), 'wb') as f:
            pickle.dump(R, f)

    # This saves the model
    self._getBestModelVal()
    self._cleanTmpCheckpointFiles()
    # We reopen a session here and load the selected model if we need one
    self.restore()
def test(self, lGraph, lsDocName=None, predict_proba=False):
    """
    Run the model on the given graphs.

    lGraph must be a non-empty list of graphs (asserted).
    lsDocName is forwarded to the test reports.
    When predict_proba is true, return whatever gcn_model.predict_prob_lG
    produced; otherwise return a TestReport to which the baseline test
    reports are attached.
    """
    # the model is assumed to have been created or loaded already
    assert lGraph
    label_names = lGraph[0].getLabelNameList()

    traceln("\t- computing features on test set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    chronoOn()
    lX, lY = self.get_lX_lY(lGraph)
    traceln("\t [%.1fs] done\n" % chronoOff())

    dataset = self.convert_lX_lY_to_GCNDataset(lX, lY, training=False, test=True)

    chronoOn("test2")
    tf_session = self.tf_session
    if not predict_proba:
        raw_predictions = self.gcn_model.predict_lG(tf_session, dataset, verbose=False)
        # lists are used because pickle reportedly copes with them better than with arrays
        predictions = [list(row) for row in raw_predictions]
        traceln(" [%.1fs] done\n" % chronoOff("test2"))
        outcome = TestReport(self.sName, predictions, lY, label_names, lsDocName=lsDocName)
        baseline_reports = self._testBaselines(lX, lY, label_names, lsDocName=lsDocName)
        outcome.attach(baseline_reports)
    else:
        # TODO: this branch should be split into its own method
        outcome = self.gcn_model.predict_prob_lG(tf_session, dataset, verbose=False)
        traceln(" [%.1fs] done\n" % chronoOff("test2"))

    # drop the feature/label lists before handing the result back
    del lX, lY
    gc.collect()
    return outcome
def train(self, lGraph_trn, lGraph_vld, bWarmStart=True, expiration_timestamp=None, verbose=0):
    """
    Train a CRF model using the list of labelled graphs as training set.

    lGraph_trn           : list of labelled graphs used for training
    lGraph_vld           : list of labelled graphs used for validation; when it
                           is empty or None the model is fitted with the plain
                           fit() method and no validation features are computed
    bWarmStart           : if True, try to continue from a previous training
                           (the "._last_" saved model first, then the stored
                           best model), IF the stored model is older than
                           expiration_timestamp; otherwise start from scratch
    expiration_timestamp : forwarded to _loadIfFresh
    verbose              : forwarded to gridsearch() or to OneSlackSSVM

    Return nothing, except in grid-search mode (self.bGridSearch), where the
    result of gridsearch() is returned.
    """
    if self.bGridSearch:
        return self.gridsearch(lGraph_trn, verbose=verbose)

    traceln("\t- computing features on training set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph_trn))
    lX, lY = self.get_lX_lY(lGraph_trn)
    # Validation features are only needed (and only computable) when a
    # validation set was actually given.
    if lGraph_vld:
        lX_vld, lY_vld = self.get_lX_lY(lGraph_vld)
    else:
        lX_vld, lY_vld = None, None

    # for warm-start mode, we do not make the model slimmer
    bMakeSlim = not bWarmStart

    traceln("\t- retrieving or creating model...")
    self.ssvm = None
    sModelFN = self.getModelFilename()
    if bWarmStart:
        try:
            try:
                self.ssvm = self._loadIfFresh(sModelFN + "._last_",
                                              expiration_timestamp,
                                              lambda x: SaveLogger(x).load())
                traceln("\t- warmstarting with last saved model (not necessarily best one)!")
            except Exception:
                # Fall back on the stored best model. Exception (not a bare
                # except) so that Ctrl-C / SystemExit are not swallowed here.
                self.ssvm = self._loadIfFresh(sModelFN,
                                              expiration_timestamp,
                                              lambda x: SaveLogger(x).load())
                traceln("\t- warmstarting from last best model!")

            # we allow to change the max_iter of the model
            try:
                self.ssvm.max_iter  # bare access: raises AttributeError if the attribute is missing
                if self.ssvm.max_iter != self.max_iter:
                    traceln("\t- changing max_iter value from (stored) %d to %d"
                            % (self.ssvm.max_iter, self.max_iter))
                    self.ssvm.max_iter = self.max_iter
            except AttributeError:
                traceln("\t- cannot access or change the max_iter value")

            # same for the number of jobs
            try:
                self.ssvm.n_jobs  # bare access: raises AttributeError if the attribute is missing
                if self.ssvm.n_jobs != self.njobs:
                    traceln("\t- changing n_jobs value from (stored) %d to %d"
                            % (self.ssvm.n_jobs, self.njobs))
                    self.ssvm.n_jobs = self.njobs
            except AttributeError:
                traceln("\t- cannot access or change the n_jobs value")

        except Exception as e:
            self.ssvm = None
            traceln("\t- Cannot warmstart: %s" % e)
    # here self.ssvm is either None or a loaded ssvm model

    chronoOn("train")
    traceln("\t- training graph-based model")
    traceln("\t\t solver parameters:",
            " inference_cache=", self.inference_cache,
            " C=", self.C, " tol=", self.tol, " n_jobs=", self.njobs)

    if not self.ssvm:
        traceln("\t- creating a new SSVM-trained CRF model")

        traceln("\t\t- computing class weight:")
        if self.balanced:
            traceln("\t\tusing balanced weights")
            self.setBalancedWeights()
        clsWeights = self.computeClassWeight(lY)
        traceln("\t\t\t --> %s" % clsWeights)

        crf = self._getCRFModel(clsWeights)

        self.ssvm = OneSlackSSVM(crf,
                                 inference_cache=self.inference_cache,
                                 C=self.C,
                                 tol=self.tol,
                                 n_jobs=self.njobs,
                                 logger=SaveLogger(sModelFN, save_every=self.save_every),
                                 max_iter=self.max_iter,
                                 show_loss_every=10,
                                 verbose=verbose)
        # a freshly created model cannot be warm-started
        bWarmStart = False

    if lGraph_vld:
        self.ssvm.fit_with_valid(lX, lY, lX_vld, lY_vld,
                                 warm_start=bWarmStart,
                                 valid_every=self.save_every)
    else:
        # old classical method
        self.ssvm.fit(lX, lY, warm_start=bWarmStart)

    traceln("\t [%.1fs] done (graph-CRF model is trained) \n" % chronoOff("train"))

    # cleaning useless data that takes MB on disk
    if bMakeSlim:
        self.ssvm.alphas = None
        self.ssvm.constraints_ = None
        self.ssvm.inference_cache_ = None
        traceln("\t\t(model made slimmer. Not sure you can efficiently warm-start it later on. See option -w.)")

    # the baseline model(s) if any
    self._trainBaselines(lX, lY)

    # do some garbage collection (validation features included)
    del lX, lY, lX_vld, lY_vld
    gc.collect()

    return
def gridsearch(self, lGraph, verbose=0):
    """
    Do a grid search instead of a normal training.

    lGraph  : list of labelled graphs used for the search
    verbose : forwarded to GridSearchCV (the underlying OneSlackSSVM is
              created with verbose=1)

    Each of self.C, self.tol, self.inference_cache and self.max_iter may be
    either a single value or a list of candidate values.

    The best estimator becomes self.ssvm, is made slimmer when possible and is
    saved under the model filename. The best parameters are passed to
    storeBestParams() and the whole GridSearchCV object is dumped next to the
    model. Baseline model(s), if any, are trained afterwards.

    Return nothing.
    """
    def _as_candidates(value):
        # the parameter grid needs a list of candidate values per parameter
        return value if isinstance(value, list) else [value]

    traceln("--- GRID SEARCH FOR CRF MODEL ---")
    traceln("\t- computing features on training set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    chronoOn()
    lX, lY = self.get_lX_lY(lGraph)
    # Every chronoOn() is closed by a chronoOff(), as train() does for the same step.
    traceln("\t [%.1fs] done\n" % chronoOff())

    dPrm = {}
    dPrm['C'] = _as_candidates(self.C)
    dPrm['tol'] = _as_candidates(self.tol)
    dPrm['inference_cache'] = _as_candidates(self.inference_cache)
    dPrm['max_iter'] = _as_candidates(self.max_iter)

    traceln("\t- creating a SSVM-trained CRF model")

    traceln("\t\t- computing class weight:")
    clsWeights = self.computeClassWeight(lY)
    traceln("\t\t\t%s" % clsWeights)

    crf = self._getCRFModel(clsWeights)

    # C, tol, inference_cache and max_iter are deliberately left out here:
    # they are set by the grid search itself.
    self._ssvm = OneSlackSSVM(crf,
                              n_jobs=self.njobs,
                              show_loss_every=10,
                              verbose=1)

    self._gs_ssvm = GridSearchCV(self._ssvm, dPrm, n_jobs=1, verbose=verbose)
    self.ssvm = None

    chronoOn()
    traceln("\t - training by grid search a graph-based model")
    traceln("\t\t solver parameters for grid search:",
            " inference_cache=", self.inference_cache,
            " C=", self.C, " tol=", self.tol, " n_jobs=", self.njobs,
            " max_iter=", self.max_iter)
    self._gs_ssvm.fit(lX, lY)
    traceln("\t [%.1fs] done (graph-based model is trained with best parameters, selected by grid search) \n" % chronoOff())

    # estimator that was chosen by the search
    self.ssvm = self._gs_ssvm.best_estimator_

    try:
        dBestParams = self._gs_ssvm.best_params_
    except Exception:
        # best_params_ was reportedly not available on every platform:
        # rebuild the dictionary from the best estimator itself.
        # Exception (not a bare except) so that Ctrl-C is not swallowed.
        dBestParams = {'C': self.ssvm.C,
                       'inference_cache': self.ssvm.inference_cache,
                       'max_iter': self.ssvm.max_iter,
                       'tol': self.ssvm.tol}
    self.storeBestParams(dBestParams)
    traceln("\t", "- " * 20)
    traceln("\tBest parameters: ", dBestParams)
    traceln("\t", "- " * 20)

    # best effort: drop the data that only takes space on disk
    try:
        self.ssvm.alphas = None
        self.ssvm.constraints_ = None
        self.ssvm.inference_cache_ = None
        traceln("\t\t(model made slimmer. Not sure you can efficiently warm-start it later on. See option -w.)")
    except Exception as e:
        traceln("\t\t(COULD NOT make the model slimmer. Got exception: %s" % str(e))

    logger = SaveLogger(self.getModelFilename())
    logger(self.ssvm)  # save this model!

    traceln(self.getModelInfo())

    # Also save the details of this grid search; the last 4 characters of the
    # model filename are replaced (presumably its extension -- TODO confirm).
    sFN = self.getModelFilename()[:-4] + "GridSearchCV.pkl"
    try:
        self.gzip_cPickle_dump(sFN, self._gs_ssvm)
        traceln("\n\n--- GridSearchCV details: (also in %s)" % sFN)
        traceln("--- Best parameters set found on development set:")
        traceln(self._gs_ssvm.best_params_)
        traceln("--- Grid scores on development set:")
        means = self._gs_ssvm.cv_results_['mean_test_score']
        stds = self._gs_ssvm.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, self._gs_ssvm.cv_results_['params']):
            traceln("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        traceln("--- ---")
    except Exception as e:
        # reporting is optional: the model itself has already been saved
        traceln("WARNING: error while dealing with the GridSearchCV object.")
        traceln(e)

    # the baseline model(s) if any
    self._trainBaselines(lX, lY)

    # do some garbage collection
    del lX, lY
    gc.collect()
    return
def testFiles(self, lsFilename, loadFun, bBaseLine=False):
    """
    Test the model using those files, loading and predicting one file at a time.

    lsFilename : list of filenames; also used as document names in the report
    loadFun    : function called with a filename, which must return a
                 singleton list containing the corresponding graph
    bBaseLine  : currently ignored, since the baseline testing is disabled
                 (see the TODO at the end of the method)

    The label names are taken from the first graph met: the edge label names
    if that graph is a conjugate one, its label names otherwise.

    Return a TestReport object.
    """
    # Only the labels and predictions are accumulated. The features X are not
    # kept, since nothing reads them while the baselines are disabled.
    lY, lY_pred = [], []
    lLabelName = None
    traceln("- predicting on test set")
    chronoOn("testFiles")

    for sFilename in lsFilename:
        lg = loadFun(sFilename)  # returns a singleton list
        for g in lg:
            if g.bConjugate:
                g.computeEdgeLabels()
            [X], [Y] = self.get_lX_lY([g])

            gcn_graph_test = self.convert_lX_lY_to_GCNDataset([X], [Y], training=False, test=True)

            if lLabelName is None:
                # first graph: fix the label space and trace some information
                lLabelName = g.getEdgeLabelNameList() if g.bConjugate else g.getLabelNameList()
                traceln("\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber([g]))
                tNF_EF = (X[0].shape[1], X[2].shape[1])
                traceln("node-dim,edge-dim", tNF_EF)
            # NOTE: the consistency of the label space of the following graphs
            # is not checked.

            # prediction runs in the session stored on the instance
            [Y_pred] = self.gcn_model.predict_lG(self.tf_session, gcn_graph_test, verbose=False)
            lY_pred.append(Y_pred)
            lY.append(Y)

            del g  # this can be very large
            gc.collect()

    traceln("[%.1fs] done\n" % chronoOff("testFiles"))

    tstRpt = TestReport(self.sName, lY_pred, lY, lLabelName, lsDocName=lsFilename)

    # TODO: baseline testing is disabled. Re-enabling it requires collecting
    # the features of each graph again.
    # if bBaseLine:
    #     lBaselineTestReport = self._testBaselinesEco(lX, lY, lLabelName, lsDocName=lsFilename)
    #     tstRpt.attach(lBaselineTestReport)

    del lY
    gc.collect()

    return tstRpt
def train(self, lGraph, bWarmStart=True, expiration_timestamp=None, verbose=0):
    """
    Train a CRF model on the given list of labelled graphs.

    lGraph               : list of labelled graphs used as training set
    bWarmStart           : if True, try to resume from the stored model, IF it
                           is older than expiration_timestamp; when that is not
                           possible, a new model is trained from scratch
    expiration_timestamp : forwarded to _loadIfFresh
    verbose              : forwarded to gridsearch() or to OneSlackSSVM

    Return nothing, except in grid-search mode (self.bGridSearch), where the
    result of gridsearch() is returned.
    """
    if self.bGridSearch:
        return self.gridsearch(lGraph, verbose=verbose)

    traceln("\t- computing features on training set")
    node_edge_counts = Graph.getNodeEdgeTotalNumber(lGraph)
    traceln("\t\t #nodes=%d #edges=%d " % node_edge_counts)
    chronoOn()
    lX, lY = self.get_lX_lY(lGraph)
    # the number of node and edge features is discovered dynamically here
    self._computeModelCaracteristics(lX)
    traceln("\t\t %s" % self._getNbFeatureAsText())
    traceln("\t [%.1fs] done\n" % chronoOff())

    # a model that is warm-started is not made slimmer afterwards
    slim_after_training = not bWarmStart

    traceln("\t- retrieving or creating model...")
    self.ssvm = None
    model_path = self.getModelFilename()
    warm_start = bWarmStart
    if warm_start:
        try:
            self.ssvm = self._loadIfFresh(model_path,
                                          expiration_timestamp,
                                          lambda fn: SaveLogger(fn).load())
            traceln("\t- warmstarting!")

            # the stored max_iter may be overridden by the current one
            try:
                stored_max_iter = self.ssvm.max_iter
                if stored_max_iter != self.max_iter:
                    traceln("\t- changing max_iter value from (stored) %d to %d"
                            % (stored_max_iter, self.max_iter))
                    self.ssvm.max_iter = self.max_iter
            except AttributeError:
                traceln("\t- cannot access or change the max_iter value")

            # likewise for the number of jobs
            try:
                stored_n_jobs = self.ssvm.n_jobs
                if stored_n_jobs != self.njobs:
                    traceln("\t- changing n_jobs value from (stored) %d to %d"
                            % (stored_n_jobs, self.njobs))
                    self.ssvm.n_jobs = self.njobs
            except AttributeError:
                traceln("\t- cannot access or change the n_jobs value")

        except Exception as err:
            self.ssvm = None
            traceln("\t- Cannot warmstart: %s" % err)
    # at this point self.ssvm is either None or a loaded model

    if not self.ssvm:
        traceln("\t- creating a new SSVM-trained CRF model")

        traceln("\t\t- computing class weight:")
        class_weights = self.computeClassWeight(lY)
        traceln("\t\t\t --> %s" % class_weights)

        crf_model = self._getCRFModel(class_weights)
        save_logger = SaveLogger(model_path, save_every=self.save_every)
        self.ssvm = OneSlackSSVM(crf_model,
                                 inference_cache=self.inference_cache,
                                 C=self.C,
                                 tol=self.tol,
                                 n_jobs=self.njobs,
                                 logger=save_logger,
                                 max_iter=self.max_iter,
                                 show_loss_every=10,
                                 verbose=verbose)
        # nothing to resume from with a brand new model
        warm_start = False

    chronoOn()
    traceln("\t- training graph-based model")
    traceln("\t\t solver parameters:",
            " inference_cache=", self.inference_cache,
            " C=", self.C, " tol=", self.tol, " n_jobs=", self.njobs)
    self.ssvm.fit(lX, lY, warm_start=warm_start)
    traceln("\t [%.1fs] done (graph-based model is trained) \n" % chronoOff())

    traceln(self.getModelInfo())

    # drop the data that only takes MB on disk
    if slim_after_training:
        self.ssvm.alphas = None
        self.ssvm.constraints_ = None
        self.ssvm.inference_cache_ = None
        traceln("\t\t(model made slimmer. Not sure you can efficiently warm-start it later on. See option -w.)")

    # the baseline model(s) if any
    self._trainBaselines(lX, lY)

    # do some garbage collection
    del lX, lY
    gc.collect()

    return