def trainListFile(self, listTrainFile, listmanualfiles):
    """Train an SVM-Rank model from two parallel lists of file paths.

    ``listTrainFile[i]`` and ``listmanualfiles[i]`` together form one
    training query.  Items that ``myTokenizer`` extracts from the manual
    file are the relevant examples; items extracted from the plain file
    that do not occur in the manual file are the non-relevant examples.
    (The local names suggest the items are sentences -- confirm against
    ``myTokenizer``.)

    Args:
        listTrainFile: paths of the plain input files.
        listmanualfiles: paths of the matching manual files, in the same
            order as ``listTrainFile``.

    Returns:
        list: the entries of the trained ranker's ``weights``, copied into
        a plain Python list.

    Note:
        When the two lists differ in length the method prints "Co loi"
        and terminates the process through ``sys.exit()``.
    """
    if len(listmanualfiles) != len(listTrainFile):
        print("Co loi")
        sys.exit()

    self.reset()
    queries = dlib.ranking_pairs()
    for trainPath, manualPath in zip(listTrainFile, listmanualfiles):
        # Per-document state must not leak from one query into the next.
        self.reset()
        data = dlib.ranking_pair()

        # Lines keep their trailing newline and are joined with a space,
        # exactly as before; the handle is now closed deterministically.
        with open(trainPath, 'r') as handle:
            inputNonRelevant = " ".join(handle.readlines())
        tpAllSent = myTokenizer(inputNonRelevant)
        self.inputFromString(inputNonRelevant)

        with open(manualPath, 'r') as handle:
            inputRelevant = " ".join(handle.readlines())
        tpRelevant = myTokenizer(inputRelevant)

        # Everything in the plain file that is absent from the manual file.
        tpNonRelevant = list(set(tpAllSent).difference(set(tpRelevant)))

        self.genAllVector()
        for sent in tpRelevant:
            data.relevant.append(dlib.vector(self.dicVector.get(sent.strip())))
        for sent in tpNonRelevant:
            data.nonrelevant.append(dlib.vector(self.dicVector.get(sent.strip())))
        queries.append(data)

    trainer = dlib.svm_rank_trainer()
    trainer.c = 10
    rank = trainer.train(queries)

    # Copy by index so the result is a plain list rather than a dlib object.
    return [rank.weights[i] for i in range(len(rank.weights))]
# dlib ships a compiled Python interface for Python 2.7 on MS Windows.  With
# any other Python version or operating system, the dlib Python interface has
# to be compiled before this file can be used; run
# compile_dlib_python_module.bat to do that.  The build should work on any
# operating system as long as CMake and boost-python are installed.  On
# Ubuntu both can be installed easily with the command:
#     sudo apt-get install libboost-python-dev cmake
import dlib

# Build some testing data.  To keep things really simple, the task is to rank
# 2D vectors, where vectors with a positive value in the first dimension
# should rank higher than other vectors.  Examples of relevant (i.e. high
# ranking) and non-relevant (i.e. low ranking) vectors are stored in a
# ranking_pair object.
data = dlib.ranking_pair()

# Only two examples are added here.  A real application would want lots of
# relevant and non-relevant vectors.
relevant_example = dlib.vector([1, 0])
nonrelevant_example = dlib.vector([0, 1])
data.relevant.append(relevant_example)
data.nonrelevant.append(nonrelevant_example)

# With the data in place, a machine learning method can learn a function that
# gives high scores to the relevant vectors and low scores to the
# non-relevant ones.
trainer = dlib.svm_rank_trainer()

# The trainer exposes parameters that control its behaviour.  Because this is
# the SVM-Rank algorithm it has a C parameter, which sets the trade-off
# between fitting the training data exactly and selecting a "simpler"
# solution that might generalize better.
trainer.c = 10
def train(self, directoryPlain, directoryManual,
          vectorDirectory="/home/hien/Data/Work/Wordnet_naiscorp/test/valuevector/",
          completeFilePath="completefile.txt"):
    """Train an SVM-Rank model from a directory of plain files and a
    directory of manual files, paired by file name.

    For every file name found in both directories one training query is
    built.  Items that ``myTokenizer`` extracts from the manual file are
    the relevant examples; items from the plain file that do not occur in
    the manual file are the non-relevant examples.  (The local names
    suggest the items are sentences -- confirm against ``myTokenizer``.)

    Side effects:
        * Each processed file name is appended as a line to
          ``completeFilePath`` and printed together with a running count.
        * For each processed file, a file of the same name is written to
          ``vectorDirectory`` holding one ``<vector>\\t<item>`` line per
          tokenized item of the plain file.

    Args:
        directoryPlain: directory passed to ``listAllFileInFolder`` to
            obtain the plain input files.
        directoryManual: directory passed to ``listAllFileInFolder`` to
            obtain the manual files.
        vectorDirectory: directory that receives the per-file vector
            dumps.  Defaults to the path that was previously hard-coded.
        completeFilePath: path of the progress log.  Defaults to the
            previously hard-coded ``completefile.txt``.

    Returns:
        list: the entries of the trained ranker's ``weights``, copied into
        a plain Python list.
    """
    import os  # function-scope import; only needed to build the dump path

    self.reset()
    listPlainFile = listAllFileInFolder(directoryPlain)
    listManualFile = listAllFileInFolder(directoryManual)

    # Index both directories by bare file name (last '/'-separated part).
    dicPlainFile = {path.strip().split('/')[-1]: path for path in listPlainFile}
    dicManualFile = {path.strip().split('/')[-1]: path for path in listManualFile}

    # Sorted so that runs are reproducible; the previous set-based order
    # was arbitrary.
    listFile = sorted(set(dicPlainFile) | set(dicManualFile))

    queries = dlib.ranking_pairs()
    countt = 0
    # The progress log is now closed (and flushed) even if a file fails.
    with open(completeFilePath, 'w') as outfile:
        for fname in listFile:
            plainPath = dicPlainFile.get(fname)
            manualPath = dicManualFile.get(fname)
            if plainPath is None or manualPath is None:
                # Previously this case crashed inside open(None).
                print("Skipping %s: not present in both directories" % fname)
                continue

            countt = countt + 1
            outfile.write(fname + '\n')
            print(fname, countt)

            # Per-document state must not leak from one query into the next.
            self.reset()
            data = dlib.ranking_pair()

            # Lines keep their trailing newline and are joined with a space.
            with open(plainPath, 'r') as handle:
                inputNonRelevant = " ".join(handle.readlines())
            tpAllSent = myTokenizer(inputNonRelevant)
            self.inputFromString(inputNonRelevant)

            with open(manualPath, 'r') as handle:
                inputRelevant = " ".join(handle.readlines())
            tpRelevant = myTokenizer(inputRelevant)

            # Everything in the plain file that is absent from the manual file.
            tpNonRelevant = list(set(tpAllSent).difference(set(tpRelevant)))

            self.genAllVector()

            # Dump the vector computed for every item of the plain file.
            with open(os.path.join(vectorDirectory, fname), 'w') as outvecfile:
                for sent in tpAllSent:
                    key = sent.strip()
                    outvecfile.write(str(self.dicVector.get(key)) + "\t" + key + '\n')

            for sent in tpRelevant:
                data.relevant.append(dlib.vector(self.dicVector.get(sent.strip())))
            for sent in tpNonRelevant:
                data.nonrelevant.append(dlib.vector(self.dicVector.get(sent.strip())))
            queries.append(data)

    trainer = dlib.svm_rank_trainer()
    trainer.c = 10
    rank = trainer.train(queries)

    # Copy by index so the result is a plain list rather than a dlib object.
    return [rank.weights[i] for i in range(len(rank.weights))]
#
# dlib should compile on any operating system, provided CMake is installed.
# On Ubuntu, CMake can be installed easily by running the command:
#     sudo apt-get install cmake
#
import dlib

# Build some testing data.  To keep things really simple, the task is to rank
# 2D vectors, where vectors with a positive value in the first dimension
# should rank higher than other vectors.  Examples of relevant (i.e. high
# ranking) and non-relevant (i.e. low ranking) vectors are stored in a
# ranking_pair object.
data = dlib.ranking_pair()

# Only two examples are added here.  A real application would want lots of
# relevant and non-relevant vectors.
relevant_example = dlib.vector([1, 0])
nonrelevant_example = dlib.vector([0, 1])
data.relevant.append(relevant_example)
data.nonrelevant.append(nonrelevant_example)

# With the data in place, a machine learning method can learn a function that
# gives high scores to the relevant vectors and low scores to the
# non-relevant ones.
trainer = dlib.svm_rank_trainer()

# The trainer exposes parameters that control its behaviour.  Because this is
# the SVM-Rank algorithm it has a C parameter, which sets the trade-off
# between fitting the training data exactly and selecting a "simpler"
# solution that might generalize better.
trainer.c = 10