Example #1
0
 def trainListFile(self, listTrainFile, listmanualfiles):
     """Train a dlib SVM-Rank model from paired files and return its weights.

     ``listTrainFile[i]`` and ``listmanualfiles[i]`` form one training query:
     the tokens produced by ``myTokenizer`` for the manual file are treated
     as relevant, and the tokens of the train file that do not appear among
     them are treated as non-relevant.

     Args:
         listTrainFile: Paths of the full input texts, one per query.
         listmanualfiles: Paths of the manually prepared texts, aligned
             index-by-index with ``listTrainFile``.

     Returns:
         A plain Python list holding the entries of the learned
         ``rank.weights``.

     Raises:
         SystemExit: If the two lists differ in length.
     """
     if len(listmanualfiles) != len(listTrainFile):
         print("Co loi")
         # Exit with a non-zero status so the calling process can tell that
         # the run failed; a bare sys.exit() would report success.
         sys.exit(1)
     self.reset()

     queries = dlib.ranking_pairs()

     # The length check above guarantees zip() pairs every file.
     for trainPath, manualPath in zip(listTrainFile, listmanualfiles):
         self.reset()
         data = dlib.ranking_pair()

         # Context managers close each file even if tokenizing fails later.
         with open(trainPath, 'r') as trainHandle:
             inputNonRelevant = " ".join(trainHandle.readlines())
         tpAllSent = myTokenizer(inputNonRelevant)
         self.inputFromString(inputNonRelevant)

         with open(manualPath, 'r') as manualHandle:
             inputRelevant = " ".join(manualHandle.readlines())
         tpRelevant = myTokenizer(inputRelevant)
         # Set difference: duplicates collapse and the order is arbitrary.
         tpNonRelevant = list(set(tpAllSent).difference(set(tpRelevant)))

         self.genAllVector()
         # NOTE(review): dicVector.get() yields None for a token that has no
         # entry -- presumably genAllVector() fills every token; confirm.
         for sent in tpRelevant:
             data.relevant.append(dlib.vector(self.dicVector.get(sent.strip())))
         for sent in tpNonRelevant:
             data.nonrelevant.append(dlib.vector(self.dicVector.get(sent.strip())))

         queries.append(data)

     trainer = dlib.svm_rank_trainer()
     trainer.c = 10
     rank = trainer.train(queries)
     # Copy the weights element by element into a plain Python list.
     return [rank.weights[i] for i in range(len(rank.weights))]
Example #2
0
#   Dlib comes with a compiled python interface for python 2.7 on MS Windows.  If
#   you are using another python version or operating system then you need to
#   compile the dlib python interface before you can use this file.  To do this,
#   run compile_dlib_python_module.bat.  This should work on any operating system
#   so long as you have CMake and boost-python installed.  On Ubuntu, this can be
#   done easily by running the command:  sudo apt-get install libboost-python-dev cmake

import dlib

# Now let's make some testing data.  To make it really simple, let's suppose that
# we are ranking 2D vectors and that vectors with positive values in the first
# dimension should rank higher than other vectors.  So what we do is make
# examples of relevant (i.e. high ranking) and non-relevant (i.e. low ranking)
# vectors and store them into a ranking_pair object like so:

data = dlib.ranking_pair()
# Here we add two examples.  In real applications, you would want lots of
# examples of relevant and non-relevant vectors.
# [1, 0] has a positive first component, so it goes into the relevant list;
# [0, 1] does not, so it goes into the non-relevant list.
data.relevant.append(dlib.vector([1, 0]))
data.nonrelevant.append(dlib.vector([0, 1]))

# Now that we have some data, we can use a machine learning method to learn a
# function that will give high scores to the relevant vectors and low scores to
# the non-relevant vectors.
trainer = dlib.svm_rank_trainer()
# Note that the trainer object has some parameters that control how it behaves.
# For example, since this is the SVM-Rank algorithm it has a C parameter that
# controls the trade-off between trying to fit the training data exactly or
# selecting a "simpler" solution which might generalize better.
# Here C is set to 10 for this example.
trainer.c = 10
Example #3
0
    def train(self, directoryPlain, directoryManual,
              vectorDir="/home/hien/Data/Work/Wordnet_naiscorp/test/valuevector/"):
        """Train a dlib SVM-Rank model from two directories of paired files.

        Files are paired by base name (the part after the last ``/``).  For
        each pair, the tokens that ``myTokenizer`` produces for the manual
        file are treated as relevant, and the tokens of the plain file that
        do not appear among them are treated as non-relevant.

        Side effects: the base name of every processed file is written to
        ``completefile.txt`` in the current working directory, and one
        ``<vector>\\t<token>`` dump per file is written into ``vectorDir``.

        Args:
            directoryPlain: Directory passed to ``listAllFileInFolder`` to
                find the full input texts.
            directoryManual: Directory passed to ``listAllFileInFolder`` to
                find the manually prepared texts.
            vectorDir: Existing directory that receives the per-file vector
                dumps.  Defaults to the location that was previously
                hard-coded.

        Returns:
            A plain Python list holding the entries of the learned
            ``rank.weights``.
        """
        # Local import keeps this method self-contained with respect to the
        # module's import block.
        import os

        self.reset()

        # Map base name -> full path for each directory.
        dicPlainFile = {}
        for path in listAllFileInFolder(directoryPlain):
            dicPlainFile[path.strip().split('/')[-1]] = path
        dicManualFile = {}
        for path in listAllFileInFolder(directoryManual):
            dicManualFile[path.strip().split('/')[-1]] = path

        queries = dlib.ranking_pairs()

        countt = 0
        # The context manager guarantees the progress log is flushed and closed.
        with open("completefile.txt", 'w') as outfile:
            # Sorted so that runs are reproducible.
            for fname in sorted(set(dicPlainFile) | set(dicManualFile)):
                plainPath = dicPlainFile.get(fname)
                manualPath = dicManualFile.get(fname)
                if plainPath is None or manualPath is None:
                    # A file present in only one directory has no partner to
                    # compare against; opening the missing side would fail.
                    print("Skipping unpaired file: " + fname)
                    continue

                countt = countt + 1
                outfile.write(fname + '\n')
                print(fname, countt)
                self.reset()
                data = dlib.ranking_pair()

                with open(plainPath, 'r') as plainHandle:
                    inputNonRelevant = " ".join(plainHandle.readlines())
                tpAllSent = myTokenizer(inputNonRelevant)
                self.inputFromString(inputNonRelevant)

                with open(manualPath, 'r') as manualHandle:
                    inputRelevant = " ".join(manualHandle.readlines())
                tpRelevant = myTokenizer(inputRelevant)
                # Set difference: duplicates collapse and the order is arbitrary.
                tpNonRelevant = list(set(tpAllSent).difference(set(tpRelevant)))

                self.genAllVector()

                # Dump every token's vector next to the token itself.
                with open(os.path.join(vectorDir, fname), 'w') as outvecfile:
                    for sent in tpAllSent:
                        token = sent.strip()
                        outvecfile.write(str(self.dicVector.get(token)) + "\t" + token + '\n')

                # NOTE(review): dicVector.get() yields None for a token that
                # has no entry -- presumably genAllVector() fills every
                # token; confirm.
                for sent in tpRelevant:
                    data.relevant.append(dlib.vector(self.dicVector.get(sent.strip())))
                for sent in tpNonRelevant:
                    data.nonrelevant.append(dlib.vector(self.dicVector.get(sent.strip())))

                queries.append(data)

        trainer = dlib.svm_rank_trainer()
        trainer.c = 10
        rank = trainer.train(queries)
        # Copy the weights element by element into a plain Python list.
        return [rank.weights[i] for i in range(len(rank.weights))]
Example #4
0
#
#   Compiling dlib should work on any operating system so long as you have
#   CMake installed.  On Ubuntu, this can be done easily by running the
#   command:
#       sudo apt-get install cmake
#

import dlib


# Now let's make some testing data.  To make it really simple, let's suppose
# that we are ranking 2D vectors and that vectors with positive values in the
# first dimension should rank higher than other vectors.  So what we do is make
# examples of relevant (i.e. high ranking) and non-relevant (i.e. low ranking)
# vectors and store them into a ranking_pair object like so:
data = dlib.ranking_pair()
# Here we add two examples.  In real applications, you would want lots of
# examples of relevant and non-relevant vectors.
# [1, 0] has a positive first component, so it goes into the relevant list;
# [0, 1] does not, so it goes into the non-relevant list.
data.relevant.append(dlib.vector([1, 0]))
data.nonrelevant.append(dlib.vector([0, 1]))

# Now that we have some data, we can use a machine learning method to learn a
# function that will give high scores to the relevant vectors and low scores to
# the non-relevant vectors.
trainer = dlib.svm_rank_trainer()
# Note that the trainer object has some parameters that control how it behaves.
# For example, since this is the SVM-Rank algorithm it has a C parameter that
# controls the trade-off between trying to fit the training data exactly or
# selecting a "simpler" solution which might generalize better.
# Here C is set to 10 for this example.
trainer.c = 10