def trainClassifier(self, train_data):
    """Train an SVM on ``train_data`` and save it as 'opinion-classifier'.

    Every instance is passed through ``self.getFeatures(instance,
    train=True)``.  The first element of the returned sequence is used as
    the class label and the remaining elements as the feature vector.
    Only an SVM is trained here.
    """
    targets = []
    vectors = []
    for sample in train_data:
        row = self.getFeatures(sample, train=True)
        # row is laid out as [label, f1, f2, f3, ...]; split the label from
        # the feature values.
        targets.append(row[0])
        vectors.append(row[1:])

    dataset = VectorDataSet(vectors, L=targets)

    classifier = SVM()
    classifier.train(dataset, saveSpace=False)
    classifier.save('opinion-classifier')
class SVMImpl:
    """SVM wrapper used for table-line detection and table decomposition.

    Holds a single ``SVM`` instance plus two feature extractors:
    ``SVMFeatures`` for the "S"/"NS" (table line / non table line) task and
    ``SVMTDFeatures`` for the "HEADER"/"DATA" task.
    """

    def __init__(self):
        self.Features = SVMFeatures()
        self.TDFeatures = SVMTDFeatures()
        self.svminstance = SVM()

    def domaintrain(self, annotatedxmllist):
        """Train the table-line classifier from annotated documents.

        Each item is indexed as ``item[0]`` (pages, each a list of columns,
        each a list of entries) and ``item[1]`` (handed unchanged to the
        feature extractor).  An entry is indexed as ``entry[0]`` (an
        int-convertible line type) and ``entry[1]`` (an object with a
        ``text`` attribute).

        Columns with fewer than two entries are skipped.  Entries whose text
        is ``None`` or only whitespace are removed from the column in place
        before features are computed.
        """
        datalist = []
        labelslist = []
        for annotatedxml in annotatedxmllist:
            for page in annotatedxml[0]:
                for col in page:
                    if len(col) < 2:
                        continue
                    # Filter with a slice assignment instead of calling
                    # col.remove() while iterating: removing during
                    # iteration skips the entry that follows each removed
                    # one, so adjacent blank entries used to survive.
                    col[:] = [
                        tup for tup in col
                        if tup[1].text is not None
                        and tup[1].text.strip() != ''
                    ]
                    for i, tup in enumerate(col):
                        if int(tup[0]) == SparseType.TABLELINE:
                            labelslist.append("S")
                        else:
                            labelslist.append("NS")
                        datalist.append(
                            self.Features.domainfindfeatureFunction(
                                i, col, annotatedxml[1]))
        self.train(datalist, labelslist)

    def domaintrainforTableDecomposition(self, tableslist):
        """Train the header/data classifier from a list of tables.

        Each table row is indexed as ``row[0]`` (an int-convertible line
        type); rows equal to ``SparseType.HEADER`` are labelled "HEADER",
        all others "DATA".
        """
        labelslist = []
        datalist = []
        for table in tableslist:
            for i, row in enumerate(table):
                if int(row[0]) == SparseType.HEADER:
                    labelslist.append("HEADER")
                else:
                    labelslist.append("DATA")
                datalist.append(
                    self.TDFeatures.domainfindfeatureFunction(i, table, None))
        self.trainforTD(datalist, labelslist)

    def domainpredictforTableDecomposition(self, table):
        """Classify every row of ``table`` as header or data, in place.

        Returns ``[table, errorcount, sparseerror]``: ``errorcount`` is the
        number of rows whose prediction differs from the existing
        ``row[0]`` value and ``sparseerror`` the number of those mismatches
        where the prediction was ``SparseType.HEADER``.  ``row[0]`` is
        overwritten with the prediction.
        """
        errorcount = 0
        sparseerror = 0
        for i, row in enumerate(table):
            test_list = [
                self.TDFeatures.domainfindfeatureFunction(i, table, None)]
            if self.predict(test_list) == 'HEADER':
                predicted = SparseType.HEADER
            else:
                predicted = SparseType.DATA
            if predicted != int(row[0]):
                errorcount += 1
                if predicted == SparseType.HEADER:
                    sparseerror += 1
            row[0] = predicted
        return [table, errorcount, sparseerror]

    def domainpredict(self, col, fontdict):
        """Classify every entry of ``col`` as table line or not, in place.

        Returns ``[col, errorcount, sparseerror]``: ``errorcount`` is the
        number of entries whose prediction differs from the existing
        ``entry[0]`` value and ``sparseerror`` the number of those
        mismatches where the prediction was ``SparseType.NONTABLELINE``.
        ``entry[0]`` is overwritten with the prediction.
        """
        errorcount = 0
        sparseerror = 0
        for i, entry in enumerate(col):
            test_list = [
                self.Features.domainfindfeatureFunction(i, col, fontdict)]
            if self.predict(test_list) == 'S':
                predicted = SparseType.TABLELINE
            else:
                predicted = SparseType.NONTABLELINE
            if predicted != int(entry[0]):
                errorcount += 1
                if predicted == SparseType.NONTABLELINE:
                    sparseerror += 1
            entry[0] = predicted
        return [col, errorcount, sparseerror]

    def train(self, datalist, labelslist):
        """Fit the SVM for the table-line task (C=20, gaussian kernel)."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.C = 20
        # NOTE(review): `degree` is usually a polynomial-kernel parameter;
        # confirm that the gaussian kernel actually honours it.
        data.attachKernel('gaussian', degree=5)
        self.svminstance.train(data)

    def trainforTD(self, datalist, labelslist):
        """Fit the SVM for the header/data task with its current settings."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.train(data)

    def predict(self, datalist):
        """Return the predicted label of the first pattern in ``datalist``."""
        data = SparseDataSet(datalist)
        results = self.svminstance.test(data)
        return results.getPredictedLabels()[0]

    def save(self, filename):
        """Save the underlying SVM to ``filename``."""
        self.svminstance.save(filename)
def svm_prediction(peptides, job_id, input_train="SVM_POS_NEG.fasta"):
    """
    Score peptides with an SVM trained on positive and negative examples.

    This is used for prediction on blind datasets.  The model is trained
    from ``input_train`` (looked up under the module-level ``PATH``) and
    the peptides are written to a temporary FASTA file under the
    module-level ``TMP_PATH``.  All temporary files are removed before
    returning, including when an error is raised.

    :param peptides: input peptides
    :param job_id: job id assigned before predicting; used as the prefix
        of the temporary file names
    :param input_train: file with the positive and negative examples used
        for training
    :return: list with one SVM score per input peptide, in input order
    """
    print("Begin SVM")

    # Result caching through the sqlite helpers is currently disabled, so
    # there is nothing to do only when no peptides were given.
    if len(peptides) == 0:
        print("End SVM")
        return []

    input_svm = "%s_svm.fasta" % job_id
    model_svm = "%s_svm_model.txt" % job_id
    output_svm = "%s_svm.txt" % job_id
    input_path = os.path.join(TMP_PATH, input_svm)
    model_path = os.path.join(TMP_PATH, model_svm)
    output_path = os.path.join(TMP_PATH, output_svm)

    # Suppress SVM output.  Remember the caller's streams so they can be
    # put back afterwards, even if training or testing raises.
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    devnull = open(os.devnull, 'w')
    sys.stdout, sys.stderr = devnull, devnull
    try:
        # Each peptide is written twice, first labelled 1 and then -1
        # (presumably so that the test set contains both classes -- TODO
        # confirm).  Only the entries labelled 1 are kept further below.
        with open(input_path, "w") as output_tmp:
            count = 0
            for label in (1, -1):
                for peptide in peptides:
                    count += 1
                    output_tmp.write(
                        "> %i label=%s\n%s\n" % (count, label, peptide))

        # train data
        train_data = SequenceData(os.path.join(PATH, input_train),
                                  mink=1, maxk=1, maxShift=0,
                                  headerHandler=svm_process_header)
        train_data.attachKernel('cosine')
        cval = 1
        s = SVM(C=cval)
        s.train(train_data)
        s.save(model_path)

        # load trained SVM
        loaded_svm = loadSVM(model_path, train_data)

        # test data
        test_data = SequenceData(input_path,
                                 mink=1, maxk=1, maxShift=0,
                                 headerHandler=svm_process_header)
        test_data.attachKernel('cosine')
        results = loaded_svm.test(test_data)
        results.toFile(output_path)

        # Read the tab-separated results back: field 0 is the entry
        # number, field 1 the score and field 3 the label.
        scores = []
        with open(output_path, "r") as infile:
            for line in infile:
                fields = line.rstrip("\r\n").split("\t")
                try:
                    entry = int(fields[0])
                    score = float(fields[1])
                    label = int(fields[3])
                except (ValueError, IndexError):
                    # header, comment or otherwise non-data line
                    continue
                # Compare with the integer -1: the label was already
                # converted with int(), so comparing it with the string
                # "-1" never filtered anything out.
                if label != -1:
                    scores.append((entry, score))
    finally:
        sys.stdout, sys.stderr = saved_stdout, saved_stderr
        devnull.close()
        # Remove the temporary input, model and result files.  Each file
        # is handled on its own so one missing file does not keep the
        # others from being deleted.
        for path in (input_path, model_path, output_path):
            try:
                os.remove(path)
            except OSError:
                pass

    # Order by entry number so the scores line up with ``peptides``.
    scores.sort(key=lambda pair: pair[0])
    svm_scores = [score for _entry, score in scores]

    print("End SVM")
    return svm_scores
class SVMImpl:
    """SVM wrapper used for table-line detection and table decomposition.

    Holds a single ``SVM`` instance plus two feature extractors:
    ``SVMFeatures`` for the "S"/"NS" (table line / non table line) task and
    ``SVMTDFeatures`` for the "HEADER"/"DATA" task.
    """

    def __init__(self):
        self.Features = SVMFeatures()
        self.TDFeatures = SVMTDFeatures()
        self.svminstance = SVM()

    def domaintrain(self, annotatedxmllist):
        """Train the table-line classifier from annotated documents.

        Each item is indexed as ``item[0]`` (pages, each a list of columns,
        each a list of entries) and ``item[1]`` (handed unchanged to the
        feature extractor).  An entry is indexed as ``entry[0]`` (an
        int-convertible line type) and ``entry[1]`` (an object with a
        ``text`` attribute).

        Columns with fewer than two entries are skipped.  Entries whose text
        is ``None`` or only whitespace are removed from the column in place
        before features are computed.
        """
        datalist = []
        labelslist = []
        for annotatedxml in annotatedxmllist:
            fontinfo = annotatedxml[1]
            for page in annotatedxml[0]:
                for col in page:
                    if len(col) < 2:
                        continue
                    # Rebuild the column contents in one pass.  Calling
                    # col.remove() inside a loop over col shifts the
                    # remaining items and makes the loop skip the entry
                    # after each removal, leaving blank entries behind.
                    kept = []
                    for tup in col:
                        text = tup[1].text
                        if text is not None and text.strip() != '':
                            kept.append(tup)
                    col[:] = kept
                    for i, tup in enumerate(col):
                        if int(tup[0]) == SparseType.TABLELINE:
                            labelslist.append("S")
                        else:
                            labelslist.append("NS")
                        datalist.append(
                            self.Features.domainfindfeatureFunction(
                                i, col, fontinfo))
        self.train(datalist, labelslist)

    def domaintrainforTableDecomposition(self, tableslist):
        """Train the header/data classifier from a list of tables.

        Each table row is indexed as ``row[0]`` (an int-convertible line
        type); rows equal to ``SparseType.HEADER`` are labelled "HEADER",
        all others "DATA".
        """
        labelslist = []
        datalist = []
        for table in tableslist:
            for i, row in enumerate(table):
                if int(row[0]) == SparseType.HEADER:
                    labelslist.append("HEADER")
                else:
                    labelslist.append("DATA")
                datalist.append(
                    self.TDFeatures.domainfindfeatureFunction(i, table, None))
        self.trainforTD(datalist, labelslist)

    def domainpredictforTableDecomposition(self, table):
        """Classify every row of ``table`` as header or data, in place.

        Returns ``[table, errorcount, sparseerror]``: ``errorcount`` is the
        number of rows whose prediction differs from the existing
        ``row[0]`` value and ``sparseerror`` the number of those mismatches
        where the prediction was ``SparseType.HEADER``.  ``row[0]`` is
        overwritten with the prediction.
        """
        errorcount = 0
        sparseerror = 0
        for i, row in enumerate(table):
            test_list = [
                self.TDFeatures.domainfindfeatureFunction(i, table, None)]
            if self.predict(test_list) == 'HEADER':
                predicted = SparseType.HEADER
            else:
                predicted = SparseType.DATA
            if predicted != int(row[0]):
                errorcount += 1
                if predicted == SparseType.HEADER:
                    sparseerror += 1
            row[0] = predicted
        return [table, errorcount, sparseerror]

    def domainpredict(self, col, fontdict):
        """Classify every entry of ``col`` as table line or not, in place.

        Returns ``[col, errorcount, sparseerror]``: ``errorcount`` is the
        number of entries whose prediction differs from the existing
        ``entry[0]`` value and ``sparseerror`` the number of those
        mismatches where the prediction was ``SparseType.NONTABLELINE``.
        ``entry[0]`` is overwritten with the prediction.
        """
        errorcount = 0
        sparseerror = 0
        for i, entry in enumerate(col):
            test_list = [
                self.Features.domainfindfeatureFunction(i, col, fontdict)]
            if self.predict(test_list) == 'S':
                predicted = SparseType.TABLELINE
            else:
                predicted = SparseType.NONTABLELINE
            if predicted != int(entry[0]):
                errorcount += 1
                if predicted == SparseType.NONTABLELINE:
                    sparseerror += 1
            entry[0] = predicted
        return [col, errorcount, sparseerror]

    def train(self, datalist, labelslist):
        """Fit the SVM for the table-line task (C=20, gaussian kernel)."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.C = 20
        # NOTE(review): `degree` is usually a polynomial-kernel parameter;
        # confirm that the gaussian kernel actually honours it.
        data.attachKernel('gaussian', degree=5)
        self.svminstance.train(data)

    def trainforTD(self, datalist, labelslist):
        """Fit the SVM for the header/data task with its current settings."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.train(data)

    def predict(self, datalist):
        """Return the predicted label of the first pattern in ``datalist``."""
        data = SparseDataSet(datalist)
        results = self.svminstance.test(data)
        return results.getPredictedLabels()[0]

    def save(self, filename):
        """Save the underlying SVM to ``filename``."""
        self.svminstance.save(filename)