def getConfiguredGraphClass(cls):
    Graph.resetNodeTypes()

    DU_GRAPH = Graph_MultiPageXml

    nt = NodeType_PageXml("TR"      # some short prefix, because the labels below get prefixed with it
                          , ['catch-word', 'header', 'heading', 'marginalia', 'page-number']    # EXACTLY as in the GT data!
                          , []      # no ignored label: a node bears one of the labels above or none, otherwise an Exception is raised
                          , True    # no label means OTHER
                          )
    nt.setXpathExpr((".//pc:TextRegion"     # how to find the nodes
                     , "./pc:TextEquiv")    # how to get their text
                    )
    DU_GRAPH.addNodeType(nt)

    return DU_GRAPH
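# Illustration (not part of the class): a minimal, standalone sketch of what the two
# XPath expressions configured above select in a PAGE XML file. The 'pc' namespace URL
# and the helper name are assumptions for this example.
from lxml import etree

def show_text_regions(sFilename):
    dNS = {"pc": "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
    doc = etree.parse(sFilename)
    for ndRegion in doc.xpath(".//pc:TextRegion", namespaces=dNS):      # how to find the nodes
        lNdText = ndRegion.xpath("./pc:TextEquiv", namespaces=dNS)      # how to get their text
        print(ndRegion.get("id"),
              [etree.tostring(nd, method="text", encoding=str).strip() for nd in lNdText])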
def train(self, lGraph, bWarmStart=True, expiration_timestamp=None, verbose=0):
    print('Ensemble ECN Training')
    traceln("\t- computing features on training set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    chronoOn()
    lX, lY = self.get_lX_lY(lGraph)

    # we discover here, dynamically, the number of features of nodes and edges;
    # self._tNF_EF contains the number of node features and edge features
    self._computeModelCaracteristics(lX)
    traceln("\t\t %s" % self._getNbFeatureAsText())
    traceln("\t [%.1fs] done\n" % chronoOff())

    nb_class = self.getNbClass()  # Is it better to do Y.shape?
    print('nb_class', nb_class)

    self.model_config['node_dim'] = self._tNF_EF[0]
    self.model_config['edge_dim'] = self._tNF_EF[1]
    self.model_config['nb_class'] = nb_class

    traceln("\t- creating the sub-models")

    # TODO: do we need this conversion here? Can we share the label binarizer, etc.?
    # This converts lX, lY into the format required by the GCN models, and sets the label binarizer.
    gcn_graph = self.convert_lX_lY_to_GCNDataset(lX, lY, training=True)

    # save the label binarizer for prediction usage
    fd_lb = open(self.getlabelBinarizerFilename(), 'wb')
    pickle.dump(self.labelBinarizer, fd_lb)
    fd_lb.close()

    # save the model config in order to restore the model later
    fd_mc = open(self.getModelConfigFilename(), 'wb')
    pickle.dump(self.model_config, fd_mc)
    fd_mc.close()

    # this creates all the DU models
    self._init_model()

    for du_model in self.models:
        # each sub-model's train() creates its own tf graph and model
        du_model.train(lGraph, bWarmStart=bWarmStart)
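# Sketch of the restore side of the two pickled artifacts saved in train() above,
# e.g. before prediction. 'model' stands for any object exposing the same two getter
# methods; this is an illustration under those assumptions, not code from the class.
import pickle

def load_saved_artifacts(model):
    with open(model.getlabelBinarizerFilename(), 'rb') as fd_lb:
        labelBinarizer = pickle.load(fd_lb)     # maps class labels to/from one-hot rows
    with open(model.getModelConfigFilename(), 'rb') as fd_mc:
        model_config = pickle.load(fd_mc)       # node_dim, edge_dim, nb_class, ...
    return labelBinarizer, model_config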
def test(self, lGraph, lsDocName=None, predict_proba=False):
    """
    Test the model using those graphs and report results on stderr.

    If some baseline model(s) were set, they are also tested.

    Return a Report object
    """
    # Assume the model was created or loaded
    assert lGraph
    lLabelName = lGraph[0].getLabelNameList()
    traceln("\t- computing features on test set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    chronoOn()
    lY = self.get_lY(lGraph)

    # collect the probability predictions of each sub-model, then average them
    lY_pred_proba = []
    for du_model in self.models:
        model_pred = du_model.test(lGraph, lsDocName=lsDocName, predict_proba=True)
        lY_pred_proba.append(model_pred)
    print('Number of Models', len(lY_pred_proba))

    lY_pred, _ = DU_Ensemble_ECN.average_prediction(lY_pred_proba)

    tstRpt = TestReport(self.sName, lY_pred, lY, lLabelName, lsDocName=lsDocName)

    # do some garbage collection
    del lY
    gc.collect()

    return tstRpt
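# The body of DU_Ensemble_ECN.average_prediction is not shown in this section; the
# following is a plausible minimal sketch of the averaging step, assuming each element
# of lY_pred_proba is one model's list of per-graph arrays of shape (#nodes, #classes).
import numpy as np

def average_prediction_sketch(lY_pred_proba):
    lY_pred, lY_proba = [], []
    for i in range(len(lY_pred_proba[0])):      # for each graph...
        # ...average the class-probability arrays over all models
        avg_proba = np.mean([model_proba[i] for model_proba in lY_pred_proba], axis=0)
        lY_proba.append(avg_proba)
        lY_pred.append(np.argmax(avg_proba, axis=1))    # one label index per node
    return lY_pred, lY_proba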
def predict(self, g):
    """
    predict the class of each node of the graph

    return a numpy array, which is a 1-dim array of size the number of nodes of the graph
    """
    [X], [Y] = self.get_lX_lY([g])
    gcn_graph_test = self.convert_lX_lY_to_GCNDataset([X], [Y], training=False, predict=True)

    # report the label space and the node/edge feature dimensions of this single graph
    lLabelName = g.getLabelNameList()
    traceln("\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber([g]))
    tNF_EF = (X[0].shape[1], X[2].shape[1])
    traceln("node-dim,edge-dim:", tNF_EF)

    lY_pred = self.gcn_model.predict_lG(self.tf_session, gcn_graph_test, verbose=False)
    return lY_pred[0]
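# Hypothetical usage sketch: mapping the label indices returned by predict() back to
# label names for one document. 'model' and 'g' (an already-loaded Graph) are assumptions.
def predict_label_names(model, g):
    Y_pred = model.predict(g)               # 1-dim array, one label index per node
    lLabelName = g.getLabelNameList()
    return [lLabelName[i] for i in Y_pred]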
def testFiles(self, lsFilename, loadFun, bBaseLine=False):
    """
    Test the model using those files. The corresponding graphs are loaded using the
    loadFun function (which must return a singleton list).
    It reports results on stderr.

    If some baseline model(s) were set, they are also tested.

    Return a Report object
    """
    raise NotImplementedError

    lX, lY, lY_pred = [], [], []
    lLabelName = None
    traceln("- predicting on test set")
    chronoOn("testFiles")

    # ? iterate over files or over models
    for du_model in self.models:
        # du_model.load()
        m_pred = []
        # with tf.Session(graph=du_model.tf_graph) as session:
        #     session.run(du_model.gcn_model.init)
        #     du_model.gcn_model.restore_model(session, du_model.getModelFilename())
        for sFilename in lsFilename:
            [g] = loadFun(sFilename)  # returns a singleton list
            [X], [Y] = self.get_lX_lY([g])
            gcn_graph_test = self.convert_lX_lY_to_GCNDataset([X], [Y], training=False, test=True)

            if lLabelName is None:
                lLabelName = g.getLabelNameList()
                traceln("\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber([g]))
                tNF_EF = (X[0].shape[1], X[2].shape[1])
                traceln("node-dim,edge-dim", tNF_EF)
            else:
                assert lLabelName == g.getLabelNameList(), "Inconsistency among label spaces"

            model_pred = du_model.test(gcn_graph_test, predict_proba=True)
            m_pred.append(model_pred[0])
            lX.append(X)
            lY.append(Y)
            g.detachFromDOM()
            del g  # this can be very large
            gc.collect()
        lY_pred.append(m_pred)  # this model's predictions, one per file

    lY_pred, _ = DU_Ensemble_ECN.average_prediction(lY_pred)
    traceln("[%.1fs] done\n" % chronoOff("testFiles"))

    tstRpt = TestReport(self.sName, lY_pred, lY, lLabelName, lsDocName=lsFilename)

    if bBaseLine:
        lBaselineTestReport = self._testBaselinesEco(lX, lY, lLabelName, lsDocName=lsFilename)
        tstRpt.attach(lBaselineTestReport)

    del lX, lY
    gc.collect()

    return tstRpt
def test(self, lGraph, lsDocName=None, predict_proba=False):
    """
    Test the model using those graphs and report results on stderr.

    If some baseline model(s) were set, they are also tested.

    Return a Report object
    """
    # Assume the model was created or loaded
    assert lGraph
    lLabelName = lGraph[0].getLabelNameList()
    traceln("\t- computing features on test set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    chronoOn()
    lX, lY = self.get_lX_lY(lGraph)
    traceln("\t [%.1fs] done\n" % chronoOff())

    gcn_graph_test = self.convert_lX_lY_to_GCNDataset(lX, lY, training=False, test=True)

    chronoOn("test2")
    # with tf.Session(graph=self.tf_graph) as session:
    #     session.run(self.gcn_model.init)
    #     self.gcn_model.restore_model(session, self.getModelFilename())
    session = self.tf_session

    if predict_proba:
        # TODO: this function should be split in two (probability vs. label prediction)
        lY_pred_proba = self.gcn_model.predict_prob_lG(session, gcn_graph_test, verbose=False)
        traceln(" [%.1fs] done\n" % chronoOff("test2"))

        del lX, lY
        gc.collect()

        return lY_pred_proba
    else:
        lY_pred = self.gcn_model.predict_lG(session, gcn_graph_test, verbose=False)

        # convert each array to a list, as Python pickle does not seem to like the
        # arrays, while lists can be pickled
        lY_list = []
        for x in lY_pred:
            lY_list.append(list(x))

        traceln(" [%.1fs] done\n" % chronoOff("test2"))
        tstRpt = TestReport(self.sName, lY_list, lY, lLabelName, lsDocName=lsDocName)
        lBaselineTestReport = self._testBaselines(lX, lY, lLabelName, lsDocName=lsDocName)
        tstRpt.attach(lBaselineTestReport)

        # do some garbage collection
        del lX, lY
        gc.collect()

        return tstRpt
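# How the two return modes above relate, as a sketch: with predict_proba=True the
# method returns one array of shape (#nodes, #classes) per graph, and taking the argmax
# over the class axis recovers the hard labels of the predict_proba=False branch
# (assuming the same label binarizer ordering in both paths).
import numpy as np

def proba_to_labels(lY_pred_proba):
    return [np.argmax(Y_proba, axis=1) for Y_proba in lY_pred_proba]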
def train(self, lGraph, bWarmStart=True, expiration_timestamp=None, verbose=0):
    """
    Return a model trained using the given labelled graphs.
    The train method is expected to save the model into self.getModelFilename(), at least
    at the end of training.
    If bWarmStart==True, the model is loaded from disk, if present and fresher than the
    given timestamp, and training restarts from it.

    If some baseline model(s) were set, they are also trained, using the node features.
    """
    print('ECN Training', self.sName)
    traceln("\t- computing features on training set")
    traceln("\t\t #nodes=%d #edges=%d " % Graph.getNodeEdgeTotalNumber(lGraph))
    chronoOn()
    lX, lY = self.get_lX_lY(lGraph)

    # we discover here, dynamically, the number of features of nodes and edges;
    # self._tNF_EF contains the number of node features and edge features
    self._computeModelCaracteristics(lX)
    traceln("\t\t %s" % self._getNbFeatureAsText())
    traceln("\t [%.1fs] done\n" % chronoOff())

    traceln("\t- retrieving or creating model...")
    nb_class = self.getNbClass()  # Is it better to do Y.shape?

    self.model_config['node_dim'] = self._tNF_EF[0]
    self.model_config['edge_dim'] = self._tNF_EF[1]
    self.model_config['nb_class'] = nb_class

    # this calls the ECN internal constructor and defines the tensorflow graph
    tf_graph = tf.Graph()
    with tf_graph.as_default():
        self._init_model()
    self.tf_graph = tf_graph

    # this converts lX, lY into the format required by the GCN models
    gcn_graph = self.convert_lX_lY_to_GCNDataset(lX, lY, training=True)

    # save the label binarizer for prediction usage
    fd_lb = open(self.getlabelBinarizerFilename(), 'wb')
    pickle.dump(self.labelBinarizer, fd_lb)
    fd_lb.close()

    # save the model config in order to restore the model later
    fd_mc = open(self.getModelConfigFilename(), 'wb')
    pickle.dump(self.model_config, fd_mc)
    fd_mc.close()

    # TODO: save the validation set too, to reproduce experiments

    # get a validation set from the training set
    split_idx = int(self.model_config['ratio_train_val'] * len(gcn_graph))
    random.shuffle(gcn_graph)
    gcn_graph_train = []
    gcn_graph_val = []
    gcn_graph_val.extend(gcn_graph[:split_idx])
    gcn_graph_train.extend(gcn_graph[split_idx:])

    self._cleanTmpCheckpointFiles()

    patience = self.model_config.get('patience', self.model_config['nb_iter'])

    with tf.Session(graph=self.tf_graph) as session:
        session.run([self.gcn_model.init])
        R = self.gcn_model.train_with_validation_set(session,
                                                     gcn_graph_train,
                                                     gcn_graph_val,
                                                     self.model_config['nb_iter'],
                                                     eval_iter=10,
                                                     patience=patience,
                                                     save_model_path=self.getTmpModelFilename())
        f = open(self.getValScoreFilename(), 'wb')
        pickle.dump(R, f)
        f.close()

    # this saves the model selected on the validation score
    self._getBestModelVal()
    self._cleanTmpCheckpointFiles()

    # we reopen a session here and load the selected model, if we need one
    self.restore()
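# A tiny worked example of the split performed in train() above, extracted as a
# standalone sketch: with ratio_train_val=0.1 and 100 graphs, split_idx=10, so after
# shuffling the FIRST 10 graphs become the validation set and the remaining 90 the
# training set, i.e. the ratio is the validation share.
import random

def split_train_val(gcn_graph, ratio_train_val):
    split_idx = int(ratio_train_val * len(gcn_graph))
    split_idx = max(1, split_idx)                           # keep at least one validation graph
    random.shuffle(gcn_graph)                               # in-place, as in train()
    return gcn_graph[split_idx:], gcn_graph[:split_idx]     # (train, val)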