Code Example #1
# Assumes PyML's VectorDataSet and SVM, the legacy OpenCV "cv" bindings, and a
# module-level NUM_BINS constant giving the number of histogram bins.
def learn(classified, histograms):
    clf = SVM()

    # Count the training images per class (informational only; the total is
    # not used below).
    total_samples = 0
    for c in classified.keys():
        cim = classified[c]
        total_samples = total_samples + len(cim)

    # Build one row of NUM_BINS histogram values per image, labelled by its class.
    samples = []
    labels = []
    for c in classified.keys():
        cim = classified[c]
        for im in cim:
            hist = histograms[im]
            row = []
            for j in range(NUM_BINS):
                row.append(cv.QueryHistValue_1D(hist, j))
            samples.append(row)
            labels.append(c)

    data = VectorDataSet(samples, L=labels)
    print str(data)
    clf.train(data)
    return clf
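The function above only trains; it does not show how the returned classifier is applied. The sketch below is a hedged companion, not part of the original project: classify_histogram is a hypothetical helper, and it reuses only calls already seen on this page (cv.QueryHistValue_1D and NUM_BINS from the snippet above, VectorDataSet, and the test()/getPredictedLabels() pattern from Code Example #2).

def classify_histogram(clf, hist):
    # Build one feature row from the histogram, exactly as learn() does.
    row = [cv.QueryHistValue_1D(hist, j) for j in range(NUM_BINS)]
    data = VectorDataSet([row])
    # test() returns a Results object; getPredictedLabels() yields one
    # predicted class label per row of the data set.
    return clf.test(data).getPredictedLabels()[0]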
Code Example #2
# Assumes PyML's SparseDataSet and SVM plus the project's own SVMFeatures and
# SparseType helpers (Python 2: xrange).
class SVMImpl:
    def __init__(self):
        self.Features = SVMFeatures()
        self.svminstance = SVM()
        
    def domaintrain(self, annotatedxmllist):
        datalist = list()
        labelslist = list()
        for annotatedxml in annotatedxmllist:
            for page in annotatedxml[0]:
                for col in page:
                    if(len(col) < 2):
                        continue
                    
                    for i in xrange(0, len(col)):
                        if(int(col[i][0]) == SparseType.OTHERSPARSE):
                            labelslist.append("S")
                        else:
                            labelslist.append("NS")
                        datalist.append(self.Features.domainfindfeatureFunction(i, col, annotatedxml[1]))
        self.train(datalist, labelslist)
        
    def domainpredict(self, col, fontdict):
        for i in xrange(0, len(col)):
            test_list = list()
            test_list.append(self.Features.domainfindfeatureFunction(i, col, fontdict)) 
            if(self.predict(test_list) == 'S'):
                col[i][0] = SparseType.OTHERSPARSE
            else:
                col[i][0] = SparseType.NONSPARSE
        return col
        
    def train(self, datalist, labelslist):
        data = SparseDataSet(datalist, L = labelslist)
        self.svminstance.C = 20
        # Note: PyML's gaussian kernel is parameterized by gamma (see Code
        # Example #12); 'degree' belongs to the polynomial kernel.
        data.attachKernel('gaussian', degree = 5)
        self.svminstance.train(data)
        #result = self.svminstance.cv(data, 2)
        #print result.getPredictedLabels()
        
    def predict(self, datalist):
        data = SparseDataSet(datalist)
        results = self.svminstance.test(data)
        return results.getPredictedLabels()[0]
Code Example #3
   def trainClassifier(self, train_data):
      '''trains a decision tree, SVM and naive Bayes classifier'''

      # Naive Bayes classifier (not included in this excerpt)

      # Support Vector Machine
      feature_set = []
      labels = []
      for instance in train_data:
         feat = self.getFeatures(instance, train=True)
         # getFeatures returns [label, f1, f2, f3, ...]; the label is split off
         # into labels and the remaining features go into feature_set.
         labels.append(feat[0])
         feature_set.append(feat[1:])

      vector_data = VectorDataSet(feature_set, L=labels)
      svm = SVM() 
      svm.train(vector_data, saveSpace=False)
      svm.save('opinion-classifier')
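The method above keeps the trained model in memory (saveSpace=False) and also writes it to 'opinion-classifier'. The hypothetical helper below sketches how held-out data might be scored with the same feature extraction; it is an assumption, not part of the original class. It mirrors the test()/getPredictedLabels() pattern of Code Example #2, and the saved file could later be restored with loadSVM as shown in Code Example #13.

   def evaluateClassifier(self, svm, heldout_data):
      '''Hypothetical helper: scores unseen instances with a trained SVM.'''
      feature_set = []
      labels = []
      for instance in heldout_data:
         feat = self.getFeatures(instance, train=True)
         labels.append(feat[0])
         feature_set.append(feat[1:])
      test_data = VectorDataSet(feature_set, L=labels)
      results = svm.test(test_data)
      return results.getPredictedLabels()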
Code Example #4
File: experiment.py  Project: basirshariat/PAIRpred
 def run(self, **kwargs):
     self.run_parameters = kwargs
     data = self.database.get_pyml_dataset(self.features, **kwargs)
     data.attachKernel('gaussian', gamma=kwargs['gamma'], normalization='cosine')  # normalization='cosine'
     if self.classifier == Classifier.SVM:
         # svm = SVM()
         svm = SVM(optimizer='pegasos')
         training, testing = self.database.get_cv_folds(kwargs['folds'])
         self.pyml_result = cvFromFolds(svm, data, training, testing, numFolds=kwargs['folds'], verbose=False)
         self.get_rfpp()
     if kwargs['save']:
         self.__save_results()
Code Example #5
    def pyMLSVM(basePath, isMulti = False):
        """
            Run SVM on Image Training Data 
            basePath is the directory containing the training file
            isMulti is True when we are running multiclass classication
        """

        # Creating training dataset as VectorDataSet object
        cnt = 1
        trainLabels = []
        trainRawData = []
        for img in MLUtilities.readImages(basePath):
            if cnt % 1000 == 0:
                print("Creating row "+ str(cnt))
            row = None
            if isMulti:
                row = MLUtilities.createPyMLSVMRow(img)
            else:
                row = MLUtilities.createPyMLBinarySVMRow(img)
            trainLabels.append(row[0])
            trainRawData.append(row[1])
            cnt+=1
        trainData = VectorDataSet(trainRawData, L=trainLabels)

        # Changing Kernel 
        # k = ker.Polynomial(degree = 2)
        # trainData.attachKernel(k)


        r = None
        # Training SVM
        if isMulti:
            m = multi.OneAgainstRest(SVM())
            r = m.cv(trainData, numFolds=5)
        else:
            s = SVM()
            r = s.cv(trainData, numFolds=5)

        print(r.getConfusionMatrix())
        print(r.getROC())
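The commented-out lines above hint at swapping in a polynomial kernel. A minimal sketch of doing so, using the string form of attachKernel seen in Code Example #12 ('poly' with a degree parameter) rather than the ker.Polynomial object; it would slot in where the commented lines sit, before the training block:

        # Attach a degree-2 polynomial kernel and re-run the cross-validation
        # (binary case shown; the multiclass branch would wrap SVM() in
        # multi.OneAgainstRest as above).
        trainData.attachKernel('poly', degree=2)
        r = SVM().cv(trainData, numFolds=5)
        print(r.getConfusionMatrix())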
Code Example #6
def learn(classified, histograms):
    clf = SVM()

    total_samples = 0
    for c in classified.keys():
        cim = classified[c]
        total_samples = total_samples + len(cim)

    samples = []
    labels = []
    for c in classified.keys():
        cim = classified[c]
        for im in cim:
            hist = histograms[im]
            row = []
            for j in range(NUM_BINS):
                row.append(cv.QueryHistValue_1D(hist, j))
            samples.append(row)
            labels.append(c)

    data = VectorDataSet(samples, L=labels)
    print str(data)
    clf.train(data)
    return clf
Code Example #7
 def __init__(self):
     self.Features = SVMFeatures()
     self.TDFeatures = SVMTDFeatures()
     self.svminstance = SVM()
Code Example #8
class SVMImpl:
    def __init__(self):
        self.Features = SVMFeatures()
        self.TDFeatures = SVMTDFeatures()
        self.svminstance = SVM()
        
    def domaintrain(self, annotatedxmllist):
        datalist = list()
        labelslist = list()
        for annotatedxml in annotatedxmllist:
            for page in annotatedxml[0]:
                for col in page:
                    if(len(col) < 2):
                        continue
                    # Drop empty-text entries; iterate over a copy so that
                    # removing items does not skip elements.
                    for tup in list(col):
                        if(tup[1].text is None or tup[1].text.strip() == ''):
                            col.remove(tup)
                    for i in xrange(0, len(col)):
                        if(int(col[i][0]) == SparseType.TABLELINE):
                            labelslist.append("S")
                        else:
                            labelslist.append("NS")
                        datalist.append(self.Features.domainfindfeatureFunction(i, col, annotatedxml[1]))
        self.train(datalist, labelslist)
    
    def domaintrainforTableDecomposition(self, tableslist):
        labelslist = list()
        datalist = list()
        for table in tableslist:
            for i in xrange(0, len(table)):
                if(int(table[i][0]) == SparseType.HEADER):
                    labelslist.append("HEADER")
                else:
                    labelslist.append("DATA")
                datalist.append(self.TDFeatures.domainfindfeatureFunction(i, table, None))
        self.trainforTD(datalist, labelslist)
        
    def domainpredictforTableDecomposition(self, table): 
        errorcount = 0
        sparseerror = 0
        for i in xrange(0, len(table)):
            test_list = list()
            test_list.append(self.TDFeatures.domainfindfeatureFunction(i, table, None)) 
            if(self.predict(test_list) == 'HEADER'):
                predicted = SparseType.HEADER
            else:
                predicted = SparseType.DATA
            if((predicted) != int(table[i][0])):
                errorcount += 1 
                if((predicted) == SparseType.HEADER):
                    sparseerror += 1
            
            table[i][0] = predicted
            
        return [table, errorcount, sparseerror]
               
    def domainpredict(self, col, fontdict):
        errorcount = 0
        sparseerror = 0
        for i in xrange(0, len(col)):
            test_list = list()
            test_list.append(self.Features.domainfindfeatureFunction(i, col, fontdict)) 
            if(self.predict(test_list) == 'S'):
                predicted = SparseType.TABLELINE
            else:
                predicted = SparseType.NONTABLELINE
            if((predicted) != int(col[i][0])):
                errorcount += 1 
                if((predicted) == SparseType.NONTABLELINE):
                    sparseerror += 1
            col[i][0] = predicted
        
        return [col, errorcount, sparseerror]
        
    def train(self, datalist, labelslist):    
        data = SparseDataSet(datalist, L = labelslist)
        self.svminstance.C = 20
        data.attachKernel('gaussian', degree = 5)
        self.svminstance.train(data)
        #result = self.svminstance.cv(data, 5)
        #print result
        
    def trainforTD(self, datalist, labelslist):    
        data = SparseDataSet(datalist, L = labelslist)
        self.svminstance.train(data)
        #result = self.svminstance.cv(data, 6)
        #print result    
        
    def predict(self, datalist):
        data = SparseDataSet(datalist)
        results = self.svminstance.test(data)
        return results.getPredictedLabels()[0]
    
    def save(self, filename):
        self.svminstance.save(filename)
Code Example #9
from PyML import SparseDataSet, SVM

__author__ = 'basir'

data = SparseDataSet('data/heartSparse.data', labelsColumn=-1)
svm = SVM()
res = svm.cv(data, 5)
for fold in res:
    print fold
print res
# print data
# help(sequenceData.spectrum_data)
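Beyond printing each fold, the aggregate Results object returned by cv() exposes the same summary calls used in Code Example #5; a short follow-up sketch:

# Aggregate statistics over the 5 folds (same Results API as Code Example #5).
print res.getROC()
print res.getConfusionMatrix()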
Code Example #10
File: classifier.py  Project: zendesk/talon
def init():
    '''Inits classifier with optimal options.'''
    return SVM(C=10, optimization='liblinear')
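init() only constructs the classifier; training and persistence would use the same PyML calls seen elsewhere on this page. A hedged sketch (the data file and model file names are hypothetical):

from PyML import SparseDataSet

classifier = init()
train_data = SparseDataSet('training.data', labelsColumn=-1)  # hypothetical file, loaded as in Code Example #9
classifier.train(train_data)
classifier.save('talon_model')  # hypothetical file name; save() as in Code Example #3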
Code Example #11
File: pyml_test.py  Project: sluggishcrow/PAIRpred
from PyML import SparseDataSet, SVM

__author__ = 'basir'




data = SparseDataSet('data/heartSparse.data', labelsColumn=-1)
svm = SVM()
res = svm.cv(data, 5)
for fold in res:
    print fold
print res
# print data
# help(sequenceData.spectrum_data)
Code Example #12
File: solvers.py  Project: dariomalchiodi/yaplf
    def solve(self, sample, c, kernel):
        r"""
        Solve the SVM classification optimization problem corresponding
        to the supplied sample, according to the specified value of the
        tradeoff constant `C`.

        INPUT:

        - ``sample`` -- list or tuple of ``LabeledExample`` instances whose
          labels are all set either to `1` or `-1`.

        - ``c`` -- float or None, value for the tradeoff constant `C` (a float
          selects the soft-margin version of the algorithm, while ``None``
          selects the original hard-margin formulation).

        - ``kernel`` -- ``Kernel`` instance defining the kernel to be used.

        OUTPUT:

        list of float values -- optimal values for the optimization problem.

        EXAMPLES:

        Consider the following representation of the AND binary function, and a
        default instantiation for ``PyMLClassificationSolver``:

        ::

            >>> from yaplf.data import LabeledExample
            >>> and_sample = [LabeledExample((1, 1), 1),
            ... LabeledExample((0, 0), -1), LabeledExample((0, 1), -1),
            ... LabeledExample((1, 0), -1)]
            >>> from yaplf.algorithms.svm.classification.solvers \
            ... import PyMLClassificationSolver
            >>> s = PyMLClassificationSolver()

        Once the solver instance is available, it is possible to invoke its
        ``solve`` function, specifying a labeled sample such as ``and_sample``,
        a positive value for the constant `C` and a kernel instance in order to
        get the solution of the corresponding SV classification optimization
        problem:

        ::

            >>> from yaplf.models.kernel import LinearKernel
            >>> alphas = s.solve(and_sample, 2, LinearKernel()) # doctest:+ELLIPSIS
            Cpos, Cneg...
            >>> print alphas
            [2.0, 0.0, 1.0, 1.0]

        The value for `C` can be set to ``None``, in order to build and solve
        the original optimization problem rather than the soft-margin
        formulation:

        ::

            >>> alphas = s.solve(and_sample, None, LinearKernel()) # doctest:+ELLIPSIS
            Cpos, Cneg...
            >>> print alphas
            [3.984375, 0.0, 1.9921875, 1.9921875]

        Note however that this class should never be used directly. It is
        automatically used by ``SVMClassificationAlgorithm``.

        AUTHORS:

        - Dario Malchiodi (2010-04-06)

        """

        patterns = array([[float(p) for p in e.pattern] for e in sample])
        # was
        # patterns = array([map(float, e.pattern) for e in sample])
        labels = array([float(e.label) for e in sample])

        data = VectorDataSet(patterns, L=labels)
        if kernel.__class__.__name__ == "LinearKernel":
            pass
        elif kernel.__class__.__name__ == "GaussianKernel":
            data.attachKernel("gaussian", gamma=float(1.0 / (kernel.sigma ** 2)))
        elif kernel.__class__.__name__ == "PolynomialKernel":
            data.attachKernel("poly", degree=int(kernel.degree), additiveConst=float(1))
        elif kernel.__class__.__name__ == "HomogeneousPolynomialKernel":
            data.attachKernel("poly", degree=int(kernel.degree), additiveConst=float(0))
        else:
            raise NotImplementedError(str(kernel) + " not implemented in PyML")

        solver = SVM(Cmode="equal")
        # When c is None (hard-margin problem), approximate it with a very large C.
        solver.C = float(c) if c is not None else 100000000.0
        solver.train(data, saveSpace=False)
        alphas = [0.0] * len(sample)
        for index, value in transpose([solver.model.svID, solver.model.alpha]):
            alphas[int(index)] = abs(value)
        return alphas
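The doctests above exercise only LinearKernel, but solve() also maps yaplf's Gaussian and polynomial kernels onto PyML kernels via the sigma and degree attributes it reads. A hedged continuation of the doctest setup (the GaussianKernel import location and constructor signature are assumed by analogy with LinearKernel):

# Continues the docstring's setup (s and and_sample); names below are assumed.
from yaplf.models.kernel import GaussianKernel  # import location assumed
alphas = s.solve(and_sample, 2, GaussianKernel(1.0))  # sigma assumed as first argument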
Code Example #13
File: predictor.py  Project: bartongroup/FM_FunPDBe
def svm_prediction(peptides, job_id, input_train="SVM_POS_NEG.fasta"):
    """
    Makes a final prediction based on SVM training files.
    This code is used for prediction of blind datasets, based on the training
    datasets of positives and negatives.

    :param peptides: input peptides
    :param job_id: random job id assigned prior to start predicting
    :param input_train: input positive and negative examples used in training
    :return: returns SVM scores for each input peptide
    """

    print("Begin SVM")

    # from methods import load_sqlite, store_sqlite

    global PATH
    global TMP_PATH

    # suppress SVM output
    devnull = open(os.devnull, 'w')
    sys.stdout, sys.stderr = devnull, devnull

    svm_scores = []
    # query the database
    # for peptide in peptides:
    #     try:
    #         score = load_sqlite(peptide, method="SVM", unique=True)
    #         svm_scores.append(score)
    #     except:
    #         pass

    if len(peptides) == len(svm_scores):
        pass
    else:

        # generate a svm input from the peptides
        rand = job_id
        input_svm = "%s_svm.fasta" % rand
        output_tmp = open(os.path.join(TMP_PATH, input_svm), "w")

        count = 0
        for peptide in peptides:
            count += 1
            output_tmp.write("> %i label=%s\n%s\n" % (count, 1, peptide))
        for peptide in peptides:
            count += 1
            output_tmp.write("> %i label=%s\n%s\n" % (count, -1, peptide))
        output_tmp.close()

        # outputs
        model_svm = "%s_svm_model.txt" % rand

        # train data
        train_data = SequenceData(os.path.join(PATH, input_train),
                                  mink=1,
                                  maxk=1,
                                  maxShift=0,
                                  headerHandler=svm_process_header)
        train_data.attachKernel('cosine')

        cval = 1
        s = SVM(C=cval)
        s.train(train_data)
        s.save(os.path.join(TMP_PATH, model_svm))

        # load trained SVM
        loaded_svm = loadSVM(os.path.join(TMP_PATH, model_svm), train_data)

        # test data
        test_data = SequenceData(os.path.join(TMP_PATH, input_svm),
                                 mink=1,
                                 maxk=1,
                                 maxShift=0,
                                 headerHandler=svm_process_header)
        test_data.attachKernel('cosine')
        results = loaded_svm.test(test_data)

        # print results out
        output_svm = "%s_svm.txt" % rand
        results.toFile(os.path.join(TMP_PATH, output_svm))

        # load results process output (positives + negatives)
        infile = open(os.path.join(TMP_PATH, output_svm), "r")
        inlines = infile.readlines()
        infile.close()
        scores = list()
        for line in inlines:
            line = line.rstrip("\r\n")
            try:
                entry = int(line.split("\t")[0])
                score = float(line.split("\t")[1])
                label = int(line.split("\t")[3])
                if label != -1:  # keep only the entries written with label=1
                    scores.append([entry, score])
            except:
                pass

        # order list
        sorted_scores = sorted(scores, key=lambda s: s[0])

        svm_scores = list()
        for score in sorted_scores:
            svm_score = score[1]
            svm_scores.append(svm_score)

        # remove the temporary model files and results
        try:
            os.remove(os.path.join(TMP_PATH, input_svm))
            os.remove(os.path.join(TMP_PATH, model_svm))
            os.remove(os.path.join(TMP_PATH, output_svm))
        except:
            pass

        # save the peptides in db
        # for peptide, score in zip(peptides, svm_scores):
        #     store_sqlite(peptide, method="SVM", information=score, save=True)

    # restore normal output
    sys.stdout = sys.__stdout__
    sys.stderr = sys.__stderr__

    print("End SVM")
    return svm_scores
Code Example #14
 def __init__(self):
     self.Features = SVMFeatures()
     self.TDFeatures = SVMTDFeatures()
     self.svminstance = SVM()
Code Example #15
class SVMImpl:
    def __init__(self):
        self.Features = SVMFeatures()
        self.TDFeatures = SVMTDFeatures()
        self.svminstance = SVM()

    def domaintrain(self, annotatedxmllist):
        datalist = list()
        labelslist = list()
        for annotatedxml in annotatedxmllist:
            for page in annotatedxml[0]:
                for col in page:
                    if (len(col) < 2):
                        continue
                    # Drop empty-text entries; iterate over a copy so that
                    # removing items does not skip elements.
                    for tup in list(col):
                        if (tup[1].text is None or tup[1].text.strip() == ''):
                            col.remove(tup)
                    for i in xrange(0, len(col)):
                        if (int(col[i][0]) == SparseType.TABLELINE):
                            labelslist.append("S")
                        else:
                            labelslist.append("NS")
                        datalist.append(
                            self.Features.domainfindfeatureFunction(
                                i, col, annotatedxml[1]))
        self.train(datalist, labelslist)

    def domaintrainforTableDecomposition(self, tableslist):
        labelslist = list()
        datalist = list()
        for table in tableslist:
            for i in xrange(0, len(table)):
                if (int(table[i][0]) == SparseType.HEADER):
                    labelslist.append("HEADER")
                else:
                    labelslist.append("DATA")
                datalist.append(
                    self.TDFeatures.domainfindfeatureFunction(i, table, None))
        self.trainforTD(datalist, labelslist)

    def domainpredictforTableDecomposition(self, table):
        errorcount = 0
        sparseerror = 0
        for i in xrange(0, len(table)):
            test_list = list()
            test_list.append(
                self.TDFeatures.domainfindfeatureFunction(i, table, None))
            if (self.predict(test_list) == 'HEADER'):
                predicted = SparseType.HEADER
            else:
                predicted = SparseType.DATA
            if ((predicted) != int(table[i][0])):
                errorcount += 1
                if ((predicted) == SparseType.HEADER):
                    sparseerror += 1

            table[i][0] = predicted

        return [table, errorcount, sparseerror]

    def domainpredict(self, col, fontdict):
        errorcount = 0
        sparseerror = 0
        for i in xrange(0, len(col)):
            test_list = list()
            test_list.append(
                self.Features.domainfindfeatureFunction(i, col, fontdict))
            if (self.predict(test_list) == 'S'):
                predicted = SparseType.TABLELINE
            else:
                predicted = SparseType.NONTABLELINE
            if ((predicted) != int(col[i][0])):
                errorcount += 1
                if ((predicted) == SparseType.NONTABLELINE):
                    sparseerror += 1
            col[i][0] = predicted

        return [col, errorcount, sparseerror]

    def train(self, datalist, labelslist):
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.C = 20
        data.attachKernel('gaussian', degree=5)
        self.svminstance.train(data)
        #result = self.svminstance.cv(data, 5)
        #print result

    def trainforTD(self, datalist, labelslist):
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.train(data)
        #result = self.svminstance.cv(data, 6)
        #print result

    def predict(self, datalist):
        data = SparseDataSet(datalist)
        results = self.svminstance.test(data)
        return results.getPredictedLabels()[0]

    def save(self, filename):
        self.svminstance.save(filename)