コード例 #1
0
 def getConfiguredGraphClass(cls):
     """
     Reset the registered node types, declare the single "TR" node type
     on Graph_MultiPageXml, and return that graph class.

     The node type is built from pc:TextRegion elements, whose text is
     read from their pc:TextEquiv child.
     """
     Graph.resetNodeTypes()

     # Short prefix: the labels below get prefixed with it
     sPrefix = "TR"
     # Labels must be spelled EXACTLY as in the GT data
     lsLabel = ['catch-word', 'header', 'heading', 'marginalia', 'page-number']
     # No ignored label. Only labels listed above (or nothing) are allowed
     # here, otherwise an Exception is raised.
     lsIgnoredLabel = []
     # True: a node without label is considered as OTHER
     bOther = True

     nt = NodeType_PageXml(sPrefix, lsLabel, lsIgnoredLabel, bOther)

     sNodeXpath = ".//pc:TextRegion"   # how to find the nodes
     sTextXpath = "./pc:TextEquiv"     # how to get their text
     nt.setXpathExpr((sNodeXpath, sTextXpath))

     DU_GRAPH = Graph_MultiPageXml
     DU_GRAPH.addNodeType(nt)
     return DU_GRAPH
コード例 #2
0
    def train(self,
              lGraph,
              bWarmStart=True,
              expiration_timestamp=None,
              verbose=0):
        """
        Train every sub-model of the ensemble on the given labelled graphs.

        The node/edge feature dimensions and the number of classes are
        computed from lGraph and stored in self.model_config. The label
        binarizer and the model configuration are then pickled to disk so
        that they can be reloaded for prediction. Finally the sub-models are
        created by self._init_model() and each model of self.models is
        trained in turn.

        lGraph: the labelled graphs used for training
        bWarmStart: forwarded unchanged to each sub-model's train method
        expiration_timestamp, verbose: part of the signature, but not read
            by this method

        Return nothing.
        """
        print('Ensemble ECN Training')
        traceln("\t- computing features on training set")
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph))
        chronoOn()
        lX, lY = self.get_lX_lY(lGraph)

        # We discover here dynamically the number of features of nodes and
        # edges: self._tNF_EF contains the number of node features and edge
        # features.
        self._computeModelCaracteristics(lX)
        traceln("\t\t %s" % self._getNbFeatureAsText())
        traceln("\t [%.1fs] done\n" % chronoOff())

        nb_class = self.getNbClass()  # Is it better to do Y.shape ?
        print('nb_class', nb_class)

        self.model_config['node_dim'] = self._tNF_EF[0]
        self.model_config['edge_dim'] = self._tNF_EF[1]
        self.model_config['nb_class'] = nb_class
        traceln("\t- creating the sub-models")

        # TODO: do we need this conversion here, or can the label binarizer
        # be shared with the sub-models?
        # The converted dataset itself is not used by the ensemble; per the
        # original note the call is kept because it sets the label binarizer
        # that is pickled just below.
        self.convert_lX_lY_to_GCNDataset(lX, lY, training=True)

        # Save the label binarizer for prediction usage. The context manager
        # closes the file even if pickling fails.
        with open(self.getlabelBinarizerFilename(), 'wb') as fd_lb:
            pickle.dump(self.labelBinarizer, fd_lb)
        # Save the model config in order to restore the model later
        with open(self.getModelConfigFilename(), 'wb') as fd_mc:
            pickle.dump(self.model_config, fd_mc)

        # This creates all the sub-models (self.models)
        self._init_model()

        for du_model in self.models:
            # The train will create a tf graph and create the model
            du_model.train(lGraph, bWarmStart=bWarmStart)
コード例 #3
0
    def test(self, lGraph, lsDocName=None, predict_proba=False):
        """
        Evaluate the ensemble on the given graphs.

        Each model of self.models is asked for its class probabilities, the
        per-model outputs are combined by DU_Ensemble_ECN.average_prediction,
        and the combined prediction is compared to the ground-truth labels.

        lsDocName is passed on to the sub-models and to the report.
        predict_proba is not read by this method: the sub-models are always
        called with predict_proba=True.

        The model is assumed to have been created or loaded beforehand.

        Return a TestReport object
        """
        assert lGraph
        lsLabelName = lGraph[0].getLabelNameList()

        traceln("\t- computing features on test set")
        nb_nodes_edges = Graph.getNodeEdgeTotalNumber(lGraph)
        traceln("\t\t #nodes=%d  #edges=%d " % nb_nodes_edges)
        chronoOn()
        lY = self.get_lY(lGraph)

        # One probability prediction per sub-model
        lModelProba = [
            du_model.test(lGraph, lsDocName=lsDocName, predict_proba=True)
            for du_model in self.models
        ]

        print('Number of Models', len(lModelProba))
        lY_pred, _ = DU_Ensemble_ECN.average_prediction(lModelProba)
        oReport = TestReport(self.sName,
                             lY_pred,
                             lY,
                             lsLabelName,
                             lsDocName=lsDocName)

        # do some garbage collection
        del lY
        gc.collect()

        return oReport
コード例 #4
0
    def predict(self, g):
        """
        Predict the class of each node of the graph g.

        The graph is converted with convert_lX_lY_to_GCNDataset (predict
        mode) and given to self.gcn_model.predict_lG, using the already open
        self.tf_session.

        Return the prediction for g, i.e. the first (and only) item of the
        list produced by predict_lG. Per the original documentation this is
        a 1-dim numpy array of size the number of nodes of the graph
        (TODO confirm against predict_lG).
        """
        [X], [Y] = self.get_lX_lY([g])
        gcn_graph_test = self.convert_lX_lY_to_GCNDataset([X], [Y],
                                                          training=False,
                                                          predict=True)

        traceln("\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber([g]))
        # Feature counts are read from the 2nd dimension of X[0] and X[2]
        tNF_EF = (X[0].shape[1], X[2].shape[1])
        traceln("node-dim,edge-dim:", tNF_EF)

        lY_pred = self.gcn_model.predict_lG(self.tf_session,
                                            gcn_graph_test,
                                            verbose=False)
        return lY_pred[0]
コード例 #5
0
    def testFiles(self, lsFilename, loadFun, bBaseLine=False):
        """
        Test the model using those files. The corresponding graphs are loaded using the loadFun function (which must return a singleton list).
        It reports results on stderr

        if some baseline model(s) were set, they are also tested

        Return a Report object
        """
        raise NotImplementedError
        lX, lY, lY_pred = [], [], []
        lLabelName = None
        traceln("- predicting on test set")
        chronoOn("testFiles")

        # ? Iterate over files or over models

        for du_model in self.models:
            #du_model.load()

            m_pred = []
            #with tf.Session(graph=du_model.tf_graph) as session:
            #session.run(du_model.gcn_model.init)
            #du_model.gcn_model.restore_model(session, du_model.getModelFilename())

            for sFilename in lsFilename:
                [g] = loadFun(sFilename)  # returns a singleton list
                [X], [Y] = self.get_lX_lY([g])

                gcn_graph_test = self.convert_lX_lY_to_GCNDataset(
                    [X], [Y], training=False, test=True)
                if lLabelName == None:
                    lLabelName = g.getLabelNameList()
                    traceln("\t #nodes=%d  #edges=%d " %
                            Graph.getNodeEdgeTotalNumber([g]))
                    tNF_EF = (X[0].shape[1], X[2].shape[1])
                    traceln("node-dim,edge-dim", tNF_EF)
                else:
                    assert lLabelName == g.getLabelNameList(
                    ), "Inconsistency among label spaces"

                model_pred = du_model.test(gcn_graph_test, predict_proba=True)

                m_pred.append(model_pred[0])
                lX.append(X)
                lY.append(Y)
                g.detachFromDOM()
                del g  # this can be very large
                gc.collect()
            lY_pred.append(model_pred)

        lY_pred, _ = DU_Ensemble_ECN.average_prediction(lY_pred)
        traceln("[%.1fs] done\n" % chronoOff("testFiles"))

        tstRpt = TestReport(self.sName,
                            lY_pred,
                            lY,
                            lLabelName,
                            lsDocName=lsFilename)

        if bBaseLine:
            lBaselineTestReport = self._testBaselinesEco(lX,
                                                         lY,
                                                         lLabelName,
                                                         lsDocName=lsFilename)
            tstRpt.attach(lBaselineTestReport)

        del lX, lY
        gc.collect()

        return tstRpt
コード例 #6
0
    def test(self, lGraph, lsDocName=None, predict_proba=False):
        """
        Run the model on the given graphs.

        If predict_proba is true, return what self.gcn_model.predict_prob_lG
        produces for those graphs.
        Otherwise, return a TestReport object comparing the predicted labels
        to the ground truth, to which the report(s) of self._testBaselines
        are attached.

        The model is assumed to have been created or loaded beforehand:
        self.tf_session and self.gcn_model are used as they are.
        """
        assert lGraph
        lsLabelName = lGraph[0].getLabelNameList()

        traceln("\t- computing features on test set")
        nb_nodes_edges = Graph.getNodeEdgeTotalNumber(lGraph)
        traceln("\t\t #nodes=%d  #edges=%d " % nb_nodes_edges)
        chronoOn()
        lX, lY = self.get_lX_lY(lGraph)
        traceln("\t [%.1fs] done\n" % chronoOff())

        dataset = self.convert_lX_lY_to_GCNDataset(lX,
                                                   lY,
                                                   training=False,
                                                   test=True)

        chronoOn("test2")
        tf_session = self.tf_session

        if predict_proba:
            # TODO: this probability path should be split into its own method
            proba = self.gcn_model.predict_prob_lG(tf_session,
                                                   dataset,
                                                   verbose=False)
            traceln(" [%.1fs] done\n" % chronoOff("test2"))

            del lX, lY
            gc.collect()

            return proba

        predicted = self.gcn_model.predict_lG(tf_session,
                                              dataset,
                                              verbose=False)

        # Convert to plain lists: pickle does not seem to like the arrays,
        # while lists can be pickled
        lY_list = [list(y_pred) for y_pred in predicted]

        traceln(" [%.1fs] done\n" % chronoOff("test2"))
        oReport = TestReport(self.sName,
                             lY_list,
                             lY,
                             lsLabelName,
                             lsDocName=lsDocName)

        oBaselineReport = self._testBaselines(lX,
                                              lY,
                                              lsLabelName,
                                              lsDocName=lsDocName)
        oReport.attach(oBaselineReport)

        # do some garbage collection
        del lX, lY
        gc.collect()

        return oReport
コード例 #7
0
    def train(self,
              lGraph,
              bWarmStart=True,
              expiration_timestamp=None,
              verbose=0):
        """
        Train the model using the given labelled graphs.

        Steps:
        - compute the features and store the node/edge feature dimensions and
          the number of classes in self.model_config;
        - build the model in a fresh tensorflow graph (self.tf_graph);
        - pickle the label binarizer and the model configuration;
        - shuffle the dataset and split it into a validation and a training
          part, according to self.model_config['ratio_train_val'];
        - train with validation, checkpointing to self.getTmpModelFilename(),
          and pickle the value returned by the training into
          self.getValScoreFilename();
        - call self._getBestModelVal(), clean the temporary checkpoint files
          and call self.restore().

        lGraph: the labelled graphs used for training
        bWarmStart, expiration_timestamp, verbose: part of the signature, but
            not read by this method (NOTE(review): no warm start is performed
            here, the model is always rebuilt -- confirm this is intended)

        Return nothing.
        """
        print('ECN Training', self.sName)
        traceln("\t- computing features on training set")
        traceln("\t\t #nodes=%d  #edges=%d " %
                Graph.getNodeEdgeTotalNumber(lGraph))
        chronoOn()
        lX, lY = self.get_lX_lY(lGraph)

        # We discover here dynamically the number of features of nodes and
        # edges: self._tNF_EF contains the number of node features and edge
        # features.
        self._computeModelCaracteristics(lX)
        traceln("\t\t %s" % self._getNbFeatureAsText())
        traceln("\t [%.1fs] done\n" % chronoOff())

        traceln("\t- retrieving or creating model...")

        nb_class = self.getNbClass()  # Is it better to do Y.shape ?

        self.model_config['node_dim'] = self._tNF_EF[0]
        self.model_config['edge_dim'] = self._tNF_EF[1]
        self.model_config['nb_class'] = nb_class

        # This calls the ECN internal constructor and defines the tensorflow
        # graph
        tf_graph = tf.Graph()
        with tf_graph.as_default():
            self._init_model()
        self.tf_graph = tf_graph

        # This converts the lX, lY in the format necessary for GCN models
        gcn_graph = self.convert_lX_lY_to_GCNDataset(lX, lY, training=True)

        # Save the label binarizer for prediction usage. The context manager
        # closes the file even if pickling fails.
        with open(self.getlabelBinarizerFilename(), 'wb') as fd_lb:
            pickle.dump(self.labelBinarizer, fd_lb)
        # Save the model config in order to restore the model later
        with open(self.getModelConfigFilename(), 'wb') as fd_mc:
            pickle.dump(self.model_config, fd_mc)

        # TODO Save the validation set too to reproduce experiments
        # Get a validation set from the training set: the split index is
        # computed first, then the dataset is shuffled in place; the first
        # split_idx items form the validation set, the rest the training set.
        split_idx = int(self.model_config['ratio_train_val'] * len(gcn_graph))
        random.shuffle(gcn_graph)
        gcn_graph_val = list(gcn_graph[:split_idx])
        gcn_graph_train = list(gcn_graph[split_idx:])

        self._cleanTmpCheckpointFiles()

        # Without an explicit 'patience', fall back to the number of
        # iterations
        if 'patience' in self.model_config:
            patience = self.model_config['patience']
        else:
            patience = self.model_config['nb_iter']

        with tf.Session(graph=self.tf_graph) as session:
            session.run([self.gcn_model.init])

            R = self.gcn_model.train_with_validation_set(
                session,
                gcn_graph_train,
                gcn_graph_val,
                self.model_config['nb_iter'],
                eval_iter=10,
                patience=patience,
                save_model_path=self.getTmpModelFilename())
            # Keep the value returned by the training next to the model
            with open(self.getValScoreFilename(), 'wb') as fd_val:
                pickle.dump(R, fd_val)

        # This saves the model (per the original note)
        self._getBestModelVal()
        self._cleanTmpCheckpointFiles()
        # We reopen a session here and load the selected model if we need one
        self.restore()