Ejemplo n.º 1
0
 def trainListFile(self, listTrainFile, listmanualfiles):
     """Train a dlib SVM-rank model from paired files and return its weights.

     ``listTrainFile[i]`` and ``listmanualfiles[i]`` are treated as one query:
     every item ``myTokenizer`` yields for the manual file is added as a
     *relevant* sample, and every item found in the train file but not in the
     manual file is added as a *non-relevant* sample.  Feature vectors are
     looked up in ``self.dicVector`` after ``self.genAllVector()`` has run on
     the train file's text.

     Args:
         listTrainFile: paths of the full-text input files.
         listmanualfiles: paths of the matching manual (relevant) files;
             must have the same length as ``listTrainFile``.

     Returns:
         A plain list holding the learned ranking weights.

     If the two lists differ in length the method prints ``"Co loi"`` and
     terminates the process via ``sys.exit()``.
     """
     if len(listmanualfiles) != len(listTrainFile):
         print("Co loi")
         sys.exit()
     self.reset()

     queries = dlib.ranking_pairs()

     for trainPath, manualPath in zip(listTrainFile, listmanualfiles):
         # State is rebuilt from scratch for each document pair.
         self.reset()
         data = dlib.ranking_pair()

         # Context managers make sure the handles are closed even if
         # tokenizing or vector generation raises.
         with open(trainPath, 'r') as trainFile:
             inputNonRelevant = " ".join(trainFile.readlines())
         tpAllSent = myTokenizer(inputNonRelevant)
         self.inputFromString(inputNonRelevant)

         with open(manualPath, 'r') as manualFile:
             inputRelevant = " ".join(manualFile.readlines())
         tpRelevant = myTokenizer(inputRelevant)

         # Set difference: order of the non-relevant samples is arbitrary.
         tpNonRelevant = list(set(tpAllSent).difference(set(tpRelevant)))

         self.genAllVector()
         # NOTE(review): dicVector.get() yields None for an item that has no
         # vector (e.g. a manual sentence absent from the train text); how
         # dlib.vector handles None is not visible here -- confirm.
         for sent in tpRelevant:
             data.relevant.append(dlib.vector(self.dicVector.get(sent.strip())))
         for sent in tpNonRelevant:
             data.nonrelevant.append(dlib.vector(self.dicVector.get(sent.strip())))

         queries.append(data)

     trainer = dlib.svm_rank_trainer()
     trainer.c = 10
     rank = trainer.train(queries)

     # Copy the weights into a plain Python list, element by element.
     return [rank.weights[i] for i in range(len(rank.weights))]
Ejemplo n.º 2
0
# ranking function that gives every relevant vector a higher score than every
# non-relevant vector.  Sometimes what you want to do is a little more complex
# than this.
#
# For example, in the web page ranking example we have to rank pages based on a
# user's query.  In this case, each query will have its own set of relevant and
# non-relevant documents.  What might be relevant to one query may well be
# non-relevant to another.  So in this case we don't have a single global set of
# relevant web pages and another set of non-relevant web pages.
#
# To handle cases like this, we can simply give multiple ranking_pair instances
# to the trainer.  Therefore, each ranking_pair would represent the
# relevant/non-relevant sets for a particular query.  An example is shown below
# (for simplicity, we reuse our data from above to make 4 identical "queries").

# Build a collection of four identical "queries" by reusing the same
# ranking_pair instance.
queries = dlib.ranking_pairs()
for _ in range(4):
    queries.append(data)

# Training on a ranking_pairs collection works exactly like training on a
# single ranking_pair.
rank = trainer.train(queries)

# Now that we have multiple ranking_pair instances, we can also use
# cross_validate_ranking_trainer().  This performs cross-validation by splitting
# the queries up into folds.  That is, it lets the trainer train on a subset of
# ranking_pair instances and tests on the rest.  It does this over 4 different
# splits and returns the overall ranking accuracy based on the held out data.
# Just like test_ranking_function(), it reports both the ordering accuracy and
# mean average precision.
Ejemplo n.º 3
0
    def train(self, directoryPlain, directoryManual):
        """Train a dlib SVM-rank model from two directories and return its weights.

        Files are paired by base name (the part after the last ``/``).  For
        each name, the items ``myTokenizer`` yields for the manual file become
        *relevant* samples and the items found only in the plain file become
        *non-relevant* samples.  Feature vectors come from ``self.dicVector``
        after ``self.genAllVector()`` has run on the plain file's text.

        Side effects:
            * every processed name is appended to ``completefile.txt``;
            * a ``<vector>\\t<sentence>`` dump for each document is written
              into the hard-coded ``valuevector`` directory;
            * the name and a running counter are printed.

        Args:
            directoryPlain: directory passed to ``listAllFileInFolder`` to
                find the full-text files.
            directoryManual: directory passed to ``listAllFileInFolder`` to
                find the manual (relevant) files.

        Returns:
            A plain list holding the learned ranking weights.
        """
        self.reset()

        # Map base name -> full path for each directory.
        dicPlainFile = {}
        for path in listAllFileInFolder(directoryPlain):
            dicPlainFile[path.strip().split('/')[-1]] = path

        dicManualFile = {}
        for path in listAllFileInFolder(directoryManual):
            dicManualFile[path.strip().split('/')[-1]] = path

        # Union of the names seen in either directory (order is arbitrary).
        # NOTE(review): a name present in only one directory maps to None in
        # the other dict, so the corresponding open() below will fail.
        listFile = list(set(dicPlainFile) | set(dicManualFile))

        vectorDir = "/home/hien/Data/Work/Wordnet_naiscorp/test/valuevector/"
        queries = dlib.ranking_pairs()

        # The progress log is closed (and flushed) when the loop ends or raises.
        with open("completefile.txt", 'w') as outfile:
            for countt, name in enumerate(listFile, 1):
                # Names are already stripped base names, so they can be
                # appended to the output directory directly.
                with open(vectorDir + name, 'w') as outvecfile:
                    outfile.write(name + '\n')
                    print (name, countt)
                    self.reset()

                    with open(dicPlainFile.get(name), 'r') as plainFile:
                        inputNonRelevant = " ".join(plainFile.readlines())
                    tpAllSent = myTokenizer(inputNonRelevant)
                    self.inputFromString(inputNonRelevant)

                    with open(dicManualFile.get(name), 'r') as manualFile:
                        inputRelevant = " ".join(manualFile.readlines())
                    tpRelevant = myTokenizer(inputRelevant)
                    tpNonRelevant = list(set(tpAllSent).difference(set(tpRelevant)))

                    self.genAllVector()
                    # Dump every sentence of the plain text with its vector.
                    for sent in tpAllSent:
                        outvecfile.write(str(self.dicVector.get(sent.strip())) + "\t" + sent.strip() + '\n')

                data = dlib.ranking_pair()
                for sent in tpRelevant:
                    data.relevant.append(dlib.vector(self.dicVector.get(sent.strip())))
                for sent in tpNonRelevant:
                    data.nonrelevant.append(dlib.vector(self.dicVector.get(sent.strip())))

                queries.append(data)

        trainer = dlib.svm_rank_trainer()
        trainer.c = 10
        rank = trainer.train(queries)

        # Copy the weights into a plain Python list, element by element.
        return [rank.weights[i] for i in range(len(rank.weights))]
Ejemplo n.º 4
0
# relevant set and non-relevant set.  The trainer is attempting to find a
# ranking function that gives every relevant vector a higher score than every
# non-relevant vector.  Sometimes what you want to do is a little more complex
# than this. 
#
# For example, in the web page ranking example we have to rank pages based on a
# user's query.  In this case, each query will have its own set of relevant and
# non-relevant documents.  What might be relevant to one query may well be
# non-relevant to another.  So in this case we don't have a single global set of
# relevant web pages and another set of non-relevant web pages.  
#
# To handle cases like this, we can simply give multiple ranking_pair instances
# to the trainer.  Therefore, each ranking_pair would represent the
# relevant/non-relevant sets for a particular query.  An example is shown below
# (for simplicity, we reuse our data from above to make 4 identical "queries").
# Reuse the same ranking_pair four times to form the query collection.
queries = dlib.ranking_pairs()
for _ in range(4):
    queries.append(data)

# The trainer is invoked the same way as for a single ranking_pair.
rank = trainer.train(queries)

# Now that we have multiple ranking_pair instances, we can also use
# cross_validate_ranking_trainer().  This performs cross-validation by splitting
# the queries up into folds.  That is, it lets the trainer train on a subset of
# ranking_pair instances and tests on the rest.  It does this over 4 different
# splits and returns the overall ranking accuracy based on the held out data.
# Just like test_ranking_function(), it reports both the ordering accuracy and
# mean average precision.