def learn(classified, histograms): clf = SVM() total_samples = 0 for c in classified.keys(): cim = classified[c] total_samples = total_samples + len(cim) samples = [] labels = [] for c in classified.keys(): cim = classified[c] for im in cim: hist = histograms[im] row = [] for j in range(NUM_BINS): row.append(cv.QueryHistValue_1D(hist, j)) samples.append(row) labels.append(c) data = VectorDataSet(samples, L=labels) print str(data) clf.train(data) return clf
class SVMImpl:
    """Sparse/non-sparse line classifier backed by a PyML SVM."""

    def __init__(self):
        self.Features = SVMFeatures()
        self.svminstance = SVM()

    def domaintrain(self, annotatedxmllist):
        """Gather features and labels from annotated XML pages, then train."""
        features = []
        targets = []
        for annotatedxml in annotatedxmllist:
            fontdict = annotatedxml[1]
            for page in annotatedxml[0]:
                for col in page:
                    if len(col) < 2:
                        # Too short to carry useful context; skip.
                        continue
                    for idx in xrange(len(col)):
                        sparse = int(col[idx][0]) == SparseType.OTHERSPARSE
                        targets.append("S" if sparse else "NS")
                        features.append(
                            self.Features.domainfindfeatureFunction(
                                idx, col, fontdict))
        self.train(features, targets)

    def domainpredict(self, col, fontdict):
        """Predict sparse/non-sparse for every line of col, in place."""
        for idx in xrange(len(col)):
            sample = [self.Features.domainfindfeatureFunction(idx, col,
                                                              fontdict)]
            if self.predict(sample) == 'S':
                col[idx][0] = SparseType.OTHERSPARSE
            else:
                col[idx][0] = SparseType.NONSPARSE
        return col

    def train(self, datalist, labelslist):
        """Train the SVM (C=20) on a gaussian kernel of degree 5."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.C = 20
        data.attachKernel('gaussian', degree=5)
        self.svminstance.train(data)

    def predict(self, datalist):
        """Return the predicted label for the supplied feature list."""
        data = SparseDataSet(datalist)
        results = self.svminstance.test(data)
        return results.getPredictedLabels()[0]
def trainClassifier(self, train_data):
    """Train an SVM classifier on the given training instances.

    Each instance is turned into a feature row ``[label, f1, f2, ...]``
    by ``self.getFeatures``; the leading label is split off and the
    remainder used as the feature vector.  The trained model is saved
    to the file 'opinion-classifier'.

    :param train_data: iterable of training instances
    """
    feature_set = []
    labels = []
    for instance in train_data:
        feat = self.getFeatures(instance, train=True)
        labels.append(feat[0])
        feature_set.append(feat[1:])  # drop the leading label
    vector_data = VectorDataSet(feature_set, L=labels)
    svm = SVM()
    # saveSpace=False keeps the support vectors so the model can be
    # serialized with save().
    svm.train(vector_data, saveSpace=False)
    svm.save('opinion-classifier')
def run(self, **kwargs):
    """Run cross-validated SVM classification on the configured features.

    Expects ``kwargs`` to contain at least 'gamma', 'folds' and 'save'.
    Results are stored on ``self.pyml_result``; when 'save' is truthy
    they are also persisted via ``__save_results``.
    """
    self.run_parameters = kwargs
    data = self.database.get_pyml_dataset(self.features, **kwargs)
    # Gaussian kernel with cosine normalization over the raw features.
    data.attachKernel('gaussian',
                      gamma=kwargs['gamma'],
                      normalization='cosine')
    if self.classifier == Classifier.SVM:
        svm = SVM(optimizer='pegasos')
        training, testing = self.database.get_cv_folds(kwargs['folds'])
        self.pyml_result = cvFromFolds(svm, data, training, testing,
                                       numFolds=kwargs['folds'],
                                       verbose=False)
        self.get_rfpp()
        if kwargs['save']:
            self.__save_results()
def pyMLSVM(basePath, isMulti=False):
    """Run 5-fold cross-validated SVM on image training data.

    :param basePath: directory containing the training images
    :param isMulti: True to run one-against-rest multiclass
        classification instead of binary classification
    Prints the confusion matrix and ROC of the CV run.
    """
    trainLabels = []
    trainRawData = []
    # enumerate replaces the hand-maintained counter; start at 1 to
    # preserve the original progress-message cadence.
    for cnt, img in enumerate(MLUtilities.readImages(basePath), 1):
        if cnt % 1000 == 0:
            print("Creating row " + str(cnt))
        if isMulti:
            row = MLUtilities.createPyMLSVMRow(img)
        else:
            row = MLUtilities.createPyMLBinarySVMRow(img)
        trainLabels.append(row[0])
        trainRawData.append(row[1])
    trainData = VectorDataSet(trainRawData, L=trainLabels)
    # A different kernel can be attached here, e.g.:
    # trainData.attachKernel(ker.Polynomial(degree=2))
    if isMulti:
        r = multi.OneAgainstRest(SVM()).cv(trainData, numFolds=5)
    else:
        r = SVM().cv(trainData, numFolds=5)
    print(r.getConfusionMatrix())
    print(r.getROC())
def __init__(self):
    # Feature extractors for line classification and table
    # decomposition, plus the shared PyML SVM instance used by both
    # the train and predict paths.
    self.Features = SVMFeatures()
    self.TDFeatures = SVMTDFeatures()
    self.svminstance = SVM()
class SVMImpl:
    """SVM classifier for table-line detection and table decomposition."""

    def __init__(self):
        self.Features = SVMFeatures()
        self.TDFeatures = SVMTDFeatures()
        self.svminstance = SVM()

    def domaintrain(self, annotatedxmllist):
        """Train the table-line classifier from annotated XML pages.

        Each element of annotatedxmllist is (pages, fontdict); lines
        annotated SparseType.TABLELINE are labelled "S", others "NS".
        """
        datalist = []
        labelslist = []
        for annotatedxml in annotatedxmllist:
            for page in annotatedxml[0]:
                for col in page:
                    if len(col) < 2:
                        continue
                    # BUG FIX: the original removed items from `col`
                    # while iterating over it, which skips the element
                    # after each removal.  Rebuild the filtered list and
                    # assign it back in place instead.
                    col[:] = [tup for tup in col
                              if tup[1].text is not None
                              and tup[1].text.strip() != '']
                    for i in xrange(len(col)):
                        if int(col[i][0]) == SparseType.TABLELINE:
                            labelslist.append("S")
                        else:
                            labelslist.append("NS")
                        datalist.append(
                            self.Features.domainfindfeatureFunction(
                                i, col, annotatedxml[1]))
        self.train(datalist, labelslist)

    def domaintrainforTableDecomposition(self, tableslist):
        """Train the header/data row classifier for table decomposition."""
        labelslist = []
        datalist = []
        for table in tableslist:
            for i in xrange(len(table)):
                if int(table[i][0]) == SparseType.HEADER:
                    labelslist.append("HEADER")
                else:
                    labelslist.append("DATA")
                datalist.append(
                    self.TDFeatures.domainfindfeatureFunction(i, table, None))
        self.trainforTD(datalist, labelslist)

    def domainpredictforTableDecomposition(self, table):
        """Classify each table row as HEADER or DATA, in place.

        :return: [table, errorcount, sparseerror] where errorcount is
            the number of mispredictions against the annotated labels
            and sparseerror the subset of those predicted as HEADER.
        """
        errorcount = 0
        sparseerror = 0
        for i in xrange(len(table)):
            test_list = [
                self.TDFeatures.domainfindfeatureFunction(i, table, None)]
            if self.predict(test_list) == 'HEADER':
                predicted = SparseType.HEADER
            else:
                predicted = SparseType.DATA
            if predicted != int(table[i][0]):
                errorcount += 1
                if predicted == SparseType.HEADER:
                    sparseerror += 1
            table[i][0] = predicted
        return [table, errorcount, sparseerror]

    def domainpredict(self, col, fontdict):
        """Classify each line as table line or not, in place.

        :return: [col, errorcount, sparseerror] where errorcount counts
            mispredictions and sparseerror the subset predicted as
            NONTABLELINE (missed table lines).
        """
        errorcount = 0
        sparseerror = 0
        for i in xrange(len(col)):
            test_list = [
                self.Features.domainfindfeatureFunction(i, col, fontdict)]
            if self.predict(test_list) == 'S':
                predicted = SparseType.TABLELINE
            else:
                predicted = SparseType.NONTABLELINE
            if predicted != int(col[i][0]):
                errorcount += 1
                if predicted == SparseType.NONTABLELINE:
                    sparseerror += 1
            col[i][0] = predicted
        return [col, errorcount, sparseerror]

    def train(self, datalist, labelslist):
        """Train the table-line SVM (C=20, gaussian kernel, degree=5)."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.C = 20
        data.attachKernel('gaussian', degree=5)
        self.svminstance.train(data)

    def trainforTD(self, datalist, labelslist):
        """Train the header/data SVM with PyML's default kernel."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.train(data)

    def predict(self, datalist):
        """Return the predicted label for one feature vector."""
        data = SparseDataSet(datalist)
        results = self.svminstance.test(data)
        return results.getPredictedLabels()[0]

    def save(self, filename):
        """Persist the trained SVM model to disk."""
        self.svminstance.save(filename)
from PyML import SparseDataSet, SVM __author__ = 'basir' data = SparseDataSet('data/heartSparse.data', labelsColumn=-1) svm = SVM() res = svm.cv(data, 5) for fold in res: print fold print res # print data # help(sequenceData.spectrum_data)
def init():
    '''Inits classifier with optimal options.

    BUG FIX: PyML's SVM takes the keyword ``optimizer`` (as used
    elsewhere in this codebase, e.g. SVM(optimizer='pegasos')), not
    ``optimization`` -- the misspelled keyword meant the liblinear
    optimizer was never actually selected.
    '''
    return SVM(C=10, optimizer='liblinear')
def solve(self, sample, c, kernel):
    r"""
    Solve the SVM classification optimization problem corresponding to
    the supplied sample, according to specified value for the tradeoff
    constant `C`.

    INPUT:

    - ``sample`` -- list or tuple of ``LabeledExample`` instances whose
      labels are all set either to `1` or `-1`.

    - ``c`` -- float or None (the former choice selects the soft-margin
      version of the algorithm) value for the tradeoff constant `C`.

    - ``kernel`` -- ``Kernel`` instance defining the kernel to be used.

    OUTPUT:

    list of float values -- optimal values for the optimization problem.

    EXAMPLES:

    Consider the following representation of the AND binary function,
    and a default instantiation for ``PyMLClassificationSolver``:

    ::

        >>> from yaplf.data import LabeledExample
        >>> and_sample = [LabeledExample((1, 1), 1),
        ... LabeledExample((0, 0), -1), LabeledExample((0, 1), -1),
        ... LabeledExample((1, 0), -1)]
        >>> from yaplf.algorithms.svm.classification.solvers \
        ... import PyMLClassificationSolver
        >>> s = PyMLClassificationSolver()

    Once the solver instance is available, it is possible to invoke its
    ``solve`` function, specifying a labeled sample such as
    ``and_sample``, a positive value for the constant `C` and a kernel
    instance in order to get the solution of the corresponding SV
    classification optimization problem:

    ::

        >>> from yaplf.models.kernel import LinearKernel
        >>> alphas = s.solve(and_sample, 2, LinearKernel()) # doctest:+ELLIPSIS
        Cpos, Cneg...
        >>> print alphas
        [2.0, 0.0, 1.0, 1.0]

    The value for `C` can be set to ``None``, in order to build and
    solve the original optimization problem rather than the soft-margin
    formulation:

    ::

        >>> alphas = s.solve(and_sample, None, LinearKernel()) # doctest:+ELLIPSIS
        Cpos, Cneg...
        >>> print alphas
        [3.984375, 0.0, 1.9921875, 1.9921875]

    Note however that this class should never be used directly. It is
    automatically used by ``SVMClassificationAlgorithm``.

    AUTHORS:

    - Dario Malchiodi (2010-04-06)

    """
    # Convert patterns and labels to float arrays, the form PyML's
    # VectorDataSet expects.
    patterns = array([[float(p) for p in e.pattern] for e in sample])
    # was
    # patterns = array([map(float, e.pattern) for e in sample])
    labels = array([float(e.label) for e in sample])
    data = VectorDataSet(patterns, L=labels)
    # Map yaplf kernel classes onto PyML's named kernels; the linear
    # kernel is PyML's default, so nothing needs attaching there.
    if kernel.__class__.__name__ == "LinearKernel":
        pass
    elif kernel.__class__.__name__ == "GaussianKernel":
        data.attachKernel("gaussian",
            gamma=float(1.0 / (kernel.sigma ** 2)))
    elif kernel.__class__.__name__ == "PolynomialKernel":
        data.attachKernel("poly", degree=int(kernel.degree),
            additiveConst=float(1))
    elif kernel.__class__.__name__ == "HomogeneousPolynomialKernel":
        data.attachKernel("poly", degree=int(kernel.degree),
            additiveConst=float(0))
    else:
        raise NotImplementedError(str(kernel) + "not implemented in PyML")
    solver = SVM(Cmode="equal")
    # c=None selects the hard-margin formulation, approximated here by
    # a very large soft-margin constant.
    solver.C = float(c) if c is not None else 100000000.0
    solver.train(data, saveSpace=False)
    # PyML stores only the support vectors (svID/alpha pairs); all
    # other positions keep alpha = 0.  Signs are folded into the labels
    # by PyML, hence the abs().
    alphas = [0.0] * len(sample)
    for index, value in transpose([solver.model.svID, solver.model.alpha]):
        alphas[int(index)] = abs(value)
    return alphas
def svm_prediction(peptides, job_id, input_train="SVM_POS_NEG.fasta"):
    """
    Makes a final prediction based on SVM training files.
    This code is used for prediciton of blind datasets, based on the
    training datasets of positives and negatives.

    :param peptides: input peptides
    :param job_id: random job id assigned prior to start predicting
    :param input_train: input positive and negative examples used in training
    :return: returns SVM scores for each inputed peptide
    """
    print("Begin SVM")
    global PATH
    global TMP_PATH

    # suppress SVM output while PyML trains/tests
    devnull = open(os.devnull, 'w')
    sys.stdout, sys.stderr = devnull, devnull

    svm_scores = []
    try:
        if len(peptides) != len(svm_scores):
            # generate an SVM input file from the peptides: each peptide
            # is written twice, once per label, so the file parses as a
            # labelled dataset; only label=1 rows are kept when reading
            # the results back below.
            rand = job_id
            input_svm = "%s_svm.fasta" % rand
            output_tmp = open(os.path.join(TMP_PATH, input_svm), "w")
            count = 0
            for peptide in peptides:
                count += 1
                output_tmp.write("> %i label=%s\n%s\n" % (count, 1, peptide))
            for peptide in peptides:
                count += 1
                output_tmp.write("> %i label=%s\n%s\n" % (count, -1, peptide))
            output_tmp.close()

            model_svm = "%s_svm_model.txt" % rand

            # train on the positive/negative training dataset
            train_data = SequenceData(os.path.join(PATH, input_train),
                                      mink=1, maxk=1, maxShift=0,
                                      headerHandler=svm_process_header)
            train_data.attachKernel('cosine')
            cval = 1
            s = SVM(C=cval)
            s.train(train_data)
            s.save(os.path.join(TMP_PATH, model_svm))

            # load trained SVM
            loaded_svm = loadSVM(os.path.join(TMP_PATH, model_svm),
                                 train_data)

            # test data
            test_data = SequenceData(os.path.join(TMP_PATH, input_svm),
                                     mink=1, maxk=1, maxShift=0,
                                     headerHandler=svm_process_header)
            test_data.attachKernel('cosine')
            results = loaded_svm.test(test_data)

            # print results out
            output_svm = "%s_svm.txt" % rand
            results.toFile(os.path.join(TMP_PATH, output_svm))

            # load results, keeping only positively-labelled entries
            infile = open(os.path.join(TMP_PATH, output_svm), "r")
            inlines = infile.readlines()
            infile.close()
            scores = list()
            for line in inlines:
                fields = line.rstrip("\r\n").split("\t")
                try:
                    entry = int(fields[0])
                    score = float(fields[1])
                    label = int(fields[3])
                except (ValueError, IndexError):
                    continue  # header or malformed line
                # BUG FIX: label is an int; the old comparison against
                # the string "-1" was always true, so the duplicated
                # negative-label rows were never filtered out and twice
                # as many scores as peptides were returned.
                if label != -1:
                    scores.append([entry, score])

            # order by original entry number, keep just the scores
            svm_scores = [pair[1] for pair in
                          sorted(scores, key=lambda pair: pair[0])]

            # remove the temporary model files and results (best effort)
            for tmp_name in (input_svm, model_svm, output_svm):
                try:
                    os.remove(os.path.join(TMP_PATH, tmp_name))
                except OSError:
                    pass
    finally:
        # restore normal output even if PyML raises
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
        devnull.close()

    print("End SVM")
    return svm_scores
class SVMImpl:
    """SVM classifier for table-line detection and table decomposition."""

    def __init__(self):
        self.Features = SVMFeatures()
        self.TDFeatures = SVMTDFeatures()
        self.svminstance = SVM()

    def domaintrain(self, annotatedxmllist):
        """Train the table-line classifier from annotated XML pages.

        Each element of annotatedxmllist is (pages, fontdict); lines
        annotated SparseType.TABLELINE are labelled "S", others "NS".
        """
        datalist = []
        labelslist = []
        for annotatedxml in annotatedxmllist:
            for page in annotatedxml[0]:
                for col in page:
                    if len(col) < 2:
                        continue
                    # BUG FIX: the original called col.remove(tup) while
                    # iterating over col, which skips the element after
                    # each removal.  Rebuild the filtered list and
                    # assign it back in place instead.
                    col[:] = [tup for tup in col
                              if tup[1].text is not None
                              and tup[1].text.strip() != '']
                    for i in xrange(len(col)):
                        if int(col[i][0]) == SparseType.TABLELINE:
                            labelslist.append("S")
                        else:
                            labelslist.append("NS")
                        datalist.append(
                            self.Features.domainfindfeatureFunction(
                                i, col, annotatedxml[1]))
        self.train(datalist, labelslist)

    def domaintrainforTableDecomposition(self, tableslist):
        """Train the header/data row classifier for table decomposition."""
        labelslist = []
        datalist = []
        for table in tableslist:
            for i in xrange(len(table)):
                if int(table[i][0]) == SparseType.HEADER:
                    labelslist.append("HEADER")
                else:
                    labelslist.append("DATA")
                datalist.append(
                    self.TDFeatures.domainfindfeatureFunction(i, table, None))
        self.trainforTD(datalist, labelslist)

    def domainpredictforTableDecomposition(self, table):
        """Classify each table row as HEADER or DATA, in place.

        :return: [table, errorcount, sparseerror] where errorcount is
            the number of mispredictions against the annotated labels
            and sparseerror the subset of those predicted as HEADER.
        """
        errorcount = 0
        sparseerror = 0
        for i in xrange(len(table)):
            test_list = [
                self.TDFeatures.domainfindfeatureFunction(i, table, None)]
            if self.predict(test_list) == 'HEADER':
                predicted = SparseType.HEADER
            else:
                predicted = SparseType.DATA
            if predicted != int(table[i][0]):
                errorcount += 1
                if predicted == SparseType.HEADER:
                    sparseerror += 1
            table[i][0] = predicted
        return [table, errorcount, sparseerror]

    def domainpredict(self, col, fontdict):
        """Classify each line as table line or not, in place.

        :return: [col, errorcount, sparseerror] where errorcount counts
            mispredictions and sparseerror the subset predicted as
            NONTABLELINE (missed table lines).
        """
        errorcount = 0
        sparseerror = 0
        for i in xrange(len(col)):
            test_list = [
                self.Features.domainfindfeatureFunction(i, col, fontdict)]
            if self.predict(test_list) == 'S':
                predicted = SparseType.TABLELINE
            else:
                predicted = SparseType.NONTABLELINE
            if predicted != int(col[i][0]):
                errorcount += 1
                if predicted == SparseType.NONTABLELINE:
                    sparseerror += 1
            col[i][0] = predicted
        return [col, errorcount, sparseerror]

    def train(self, datalist, labelslist):
        """Train the table-line SVM (C=20, gaussian kernel, degree=5)."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.C = 20
        data.attachKernel('gaussian', degree=5)
        self.svminstance.train(data)

    def trainforTD(self, datalist, labelslist):
        """Train the header/data SVM with PyML's default kernel."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.train(data)

    def predict(self, datalist):
        """Return the predicted label for one feature vector."""
        data = SparseDataSet(datalist)
        results = self.svminstance.test(data)
        return results.getPredictedLabels()[0]

    def save(self, filename):
        """Persist the trained SVM model to disk."""
        self.svminstance.save(filename)