def trainClassifier(self, train_data):
    """Train an SVM on ``train_data`` and save it as 'opinion-classifier'.

    Every instance is turned into a feature row by
    ``self.getFeatures(instance, train=True)``.  A row has the layout
    ``[label, f1, f2, f3, ...]``: item 0 becomes the training label and the
    remaining items become the feature vector.

    Only the SVM is trained here; the Naive Bayes section of the original
    code is an empty placeholder.

    :param train_data: iterable of instances accepted by ``getFeatures``
    """
    # Naive Bayes classifier: placeholder, nothing is trained.

    # Support Vector Machine
    rows = [self.getFeatures(instance, train=True) for instance in train_data]
    labels = [row[0] for row in rows]
    feature_set = [row[1:] for row in rows]

    vector_data = VectorDataSet(feature_set, L=labels)
    classifier = SVM()
    classifier.train(vector_data, saveSpace=False)
    classifier.save('opinion-classifier')
class SVMImpl:
    """SVM-backed classifier with two feature extractors.

    ``Features`` serves the table-line ("S") vs. non-table-line ("NS") task
    and ``TDFeatures`` serves the header ("HEADER") vs. data ("DATA") task
    used for table decomposition.

    NOTE(review): both tasks train and predict through the same
    ``self.svminstance``; presumably training one task replaces the model of
    the other, so one object should serve one task -- confirm against the
    SVM library.
    """

    def __init__(self):
        self.Features = SVMFeatures()
        self.TDFeatures = SVMTDFeatures()
        self.svminstance = SVM()

    @staticmethod
    def _drop_blank_entries(col):
        """Remove, in place, every entry of ``col`` whose text is blank.

        An entry is blank when ``entry[1].text`` is ``None`` or consists only
        of whitespace.  The list is rebuilt through slice assignment because
        calling ``col.remove()`` while iterating over ``col`` skips the
        element that follows each removal, which let consecutive blank
        entries survive.
        """
        col[:] = [entry for entry in col
                  if entry[1].text is not None
                  and entry[1].text.strip() != '']

    def domaintrain(self, annotatedxmllist):
        """Train the table-line classifier from annotated documents.

        :param annotatedxmllist: iterable of items where ``item[0]`` is an
            iterable of pages, each page an iterable of columns and each
            column a list of entries (``entry[0]`` is the label,
            ``entry[1]`` has a ``text`` attribute); ``item[1]`` is handed to
            the feature extractor unchanged.

        Columns with fewer than two entries are skipped.  Blank entries are
        removed from the remaining columns in place before features are
        extracted.
        """
        datalist = []
        labelslist = []
        for annotatedxml in annotatedxmllist:
            for page in annotatedxml[0]:
                for col in page:
                    if len(col) < 2:
                        continue
                    self._drop_blank_entries(col)
                    for i, entry in enumerate(col):
                        if int(entry[0]) == SparseType.TABLELINE:
                            labelslist.append("S")
                        else:
                            labelslist.append("NS")
                        datalist.append(
                            self.Features.domainfindfeatureFunction(
                                i, col, annotatedxml[1]))
        self.train(datalist, labelslist)

    def domaintrainforTableDecomposition(self, tableslist):
        """Train the header/data classifier from labelled tables.

        :param tableslist: iterable of tables; each table is a sequence of
            rows whose item 0 is the label.
        """
        labelslist = []
        datalist = []
        for table in tableslist:
            for i, row in enumerate(table):
                if int(row[0]) == SparseType.HEADER:
                    labelslist.append("HEADER")
                else:
                    labelslist.append("DATA")
                datalist.append(
                    self.TDFeatures.domainfindfeatureFunction(i, table, None))
        self.trainforTD(datalist, labelslist)

    def domainpredictforTableDecomposition(self, table):
        """Classify every row of ``table`` as header or data.

        Item 0 of each row is overwritten with the predicted type.

        :return: ``[table, errorcount, sparseerror]`` where ``errorcount``
            counts rows whose prediction differs from the label they carried
            on entry and ``sparseerror`` counts those mismatches that were
            predicted as ``SparseType.HEADER``.
        """
        errorcount = 0
        sparseerror = 0
        for i, row in enumerate(table):
            features = self.TDFeatures.domainfindfeatureFunction(
                i, table, None)
            if self.predict([features]) == 'HEADER':
                predicted = SparseType.HEADER
            else:
                predicted = SparseType.DATA
            if predicted != int(row[0]):
                errorcount += 1
                if predicted == SparseType.HEADER:
                    sparseerror += 1
            row[0] = predicted

        return [table, errorcount, sparseerror]

    def domainpredict(self, col, fontdict):
        """Classify every entry of ``col`` as table line or non-table line.

        Item 0 of each entry is overwritten with the predicted type.

        :param col: list of entries whose item 0 is the current label
        :param fontdict: passed through to the feature extractor
        :return: ``[col, errorcount, sparseerror]`` where ``errorcount``
            counts entries whose prediction differs from the label they
            carried on entry and ``sparseerror`` counts those mismatches
            that were predicted as ``SparseType.NONTABLELINE``.
        """
        errorcount = 0
        sparseerror = 0
        for i, entry in enumerate(col):
            features = self.Features.domainfindfeatureFunction(
                i, col, fontdict)
            if self.predict([features]) == 'S':
                predicted = SparseType.TABLELINE
            else:
                predicted = SparseType.NONTABLELINE
            if predicted != int(entry[0]):
                errorcount += 1
                if predicted == SparseType.NONTABLELINE:
                    sparseerror += 1
            entry[0] = predicted

        return [col, errorcount, sparseerror]

    def train(self, datalist, labelslist):
        """Train ``self.svminstance`` with C=20 and a gaussian kernel."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.C = 20
        # NOTE(review): ``degree`` looks like a polynomial-kernel parameter;
        # confirm that the gaussian kernel actually uses it.
        data.attachKernel('gaussian', degree=5)
        self.svminstance.train(data)

    def trainforTD(self, datalist, labelslist):
        """Train ``self.svminstance`` on the data without attaching a kernel
        or changing ``C`` here."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.train(data)

    def predict(self, datalist):
        """Return the predicted label of the first sample in ``datalist``."""
        data = SparseDataSet(datalist)
        results = self.svminstance.test(data)
        return results.getPredictedLabels()[0]

    def save(self, filename):
        """Save the underlying SVM to ``filename``."""
        self.svminstance.save(filename)
# Example #3
# 0
def svm_prediction(peptides, job_id, input_train="SVM_POS_NEG.fasta"):
    """
    Makes a final prediction based on SVM training files.

    A cosine-kernel SVM (C=1) is trained on ``input_train``, saved to and
    reloaded from ``TMP_PATH``, and then applied to a temporary FASTA file
    built from ``peptides``.  While the SVM code runs, ``sys.stdout`` and
    ``sys.stderr`` are redirected to ``os.devnull``; the streams that were
    active on entry are restored afterwards, even if an error occurs.

    Relies on the module-level ``PATH`` (directory of the training file) and
    ``TMP_PATH`` (directory for the temporary files).

    :param peptides: input peptides
    :param job_id: random job id assigned prior to start predicting; used as
        the prefix of the temporary file names
    :param input_train: input positive and negative examples used in training
    :return: list of SVM scores, one per result row whose label is not -1,
        ordered by the entry id found in the results file
    """

    print("Begin SVM")

    # NOTE: the sqlite score cache (load_sqlite / store_sqlite) is disabled,
    # so this list starts empty and every call recomputes the scores.
    svm_scores = []

    # suppress SVM output
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    devnull = open(os.devnull, 'w')
    sys.stdout, sys.stderr = devnull, devnull
    try:
        # Nothing to do when the scores already cover every peptide (with
        # the cache disabled this is only true for an empty input).
        if len(peptides) != len(svm_scores):
            input_path = os.path.join(TMP_PATH, "%s_svm.fasta" % job_id)
            model_path = os.path.join(TMP_PATH, "%s_svm_model.txt" % job_id)
            output_path = os.path.join(TMP_PATH, "%s_svm.txt" % job_id)
            try:
                # generate a svm input from the peptides: every peptide is
                # written once with label 1 and once with label -1
                with open(input_path, "w") as output_tmp:
                    count = 0
                    for label in (1, -1):
                        for peptide in peptides:
                            count += 1
                            output_tmp.write("> %i label=%s\n%s\n"
                                             % (count, label, peptide))

                # train data
                train_data = SequenceData(os.path.join(PATH, input_train),
                                          mink=1,
                                          maxk=1,
                                          maxShift=0,
                                          headerHandler=svm_process_header)
                train_data.attachKernel('cosine')

                s = SVM(C=1)
                s.train(train_data)
                s.save(model_path)

                # load trained SVM
                loaded_svm = loadSVM(model_path, train_data)

                # test data
                test_data = SequenceData(input_path,
                                         mink=1,
                                         maxk=1,
                                         maxShift=0,
                                         headerHandler=svm_process_header)
                test_data.attachKernel('cosine')
                results = loaded_svm.test(test_data)
                results.toFile(output_path)

                # load results; columns 0, 1 and 3 are read as entry id,
                # score and label
                scores = []
                with open(output_path, "r") as infile:
                    for line in infile:
                        fields = line.rstrip("\r\n").split("\t")
                        try:
                            entry = int(fields[0])
                            score = float(fields[1])
                            label = int(fields[3])
                        except (ValueError, IndexError):
                            # header, comment or malformed row
                            continue
                        # Compare as integers: the previous comparison with
                        # the string "-1" was always true, so the label -1
                        # duplicates were returned as well.
                        if label != -1:
                            scores.append((entry, score))

                # order by entry id
                scores.sort(key=lambda pair: pair[0])
                svm_scores = [pair[1] for pair in scores]
            finally:
                # remove the temporary input, model and results; each file
                # is removed independently so one failure does not leave
                # the others behind
                for tmp_path in (input_path, model_path, output_path):
                    try:
                        os.remove(tmp_path)
                    except OSError:
                        pass  # best-effort cleanup
    finally:
        # restore normal output
        sys.stdout, sys.stderr = saved_stdout, saved_stderr
        devnull.close()

    print("End SVM")
    return svm_scores
class SVMImpl:
    """SVM-backed classifier with two feature extractors.

    ``Features`` serves the table-line ("S") vs. non-table-line ("NS") task
    and ``TDFeatures`` serves the header ("HEADER") vs. data ("DATA") task
    used for table decomposition.

    NOTE(review): both tasks train and predict through the same
    ``self.svminstance``; presumably training one task replaces the model of
    the other, so one object should serve one task -- confirm against the
    SVM library.
    """

    def __init__(self):
        self.Features = SVMFeatures()
        self.TDFeatures = SVMTDFeatures()
        self.svminstance = SVM()

    @staticmethod
    def _drop_blank_entries(col):
        """Remove, in place, every entry of ``col`` whose text is blank.

        An entry is blank when ``entry[1].text`` is ``None`` or consists only
        of whitespace.  The list is rebuilt through slice assignment because
        calling ``col.remove()`` while iterating over ``col`` skips the
        element that follows each removal, which let consecutive blank
        entries survive.
        """
        col[:] = [entry for entry in col
                  if entry[1].text is not None
                  and entry[1].text.strip() != '']

    def domaintrain(self, annotatedxmllist):
        """Train the table-line classifier from annotated documents.

        :param annotatedxmllist: iterable of items where ``item[0]`` is an
            iterable of pages, each page an iterable of columns and each
            column a list of entries (``entry[0]`` is the label,
            ``entry[1]`` has a ``text`` attribute); ``item[1]`` is handed to
            the feature extractor unchanged.

        Columns with fewer than two entries are skipped.  Blank entries are
        removed from the remaining columns in place before features are
        extracted.
        """
        datalist = []
        labelslist = []
        for annotatedxml in annotatedxmllist:
            for page in annotatedxml[0]:
                for col in page:
                    if len(col) < 2:
                        continue
                    self._drop_blank_entries(col)
                    for i, entry in enumerate(col):
                        if int(entry[0]) == SparseType.TABLELINE:
                            labelslist.append("S")
                        else:
                            labelslist.append("NS")
                        datalist.append(
                            self.Features.domainfindfeatureFunction(
                                i, col, annotatedxml[1]))
        self.train(datalist, labelslist)

    def domaintrainforTableDecomposition(self, tableslist):
        """Train the header/data classifier from labelled tables.

        :param tableslist: iterable of tables; each table is a sequence of
            rows whose item 0 is the label.
        """
        labelslist = []
        datalist = []
        for table in tableslist:
            for i, row in enumerate(table):
                if int(row[0]) == SparseType.HEADER:
                    labelslist.append("HEADER")
                else:
                    labelslist.append("DATA")
                datalist.append(
                    self.TDFeatures.domainfindfeatureFunction(i, table, None))
        self.trainforTD(datalist, labelslist)

    def domainpredictforTableDecomposition(self, table):
        """Classify every row of ``table`` as header or data.

        Item 0 of each row is overwritten with the predicted type.

        :return: ``[table, errorcount, sparseerror]`` where ``errorcount``
            counts rows whose prediction differs from the label they carried
            on entry and ``sparseerror`` counts those mismatches that were
            predicted as ``SparseType.HEADER``.
        """
        errorcount = 0
        sparseerror = 0
        for i, row in enumerate(table):
            features = self.TDFeatures.domainfindfeatureFunction(
                i, table, None)
            if self.predict([features]) == 'HEADER':
                predicted = SparseType.HEADER
            else:
                predicted = SparseType.DATA
            if predicted != int(row[0]):
                errorcount += 1
                if predicted == SparseType.HEADER:
                    sparseerror += 1
            row[0] = predicted

        return [table, errorcount, sparseerror]

    def domainpredict(self, col, fontdict):
        """Classify every entry of ``col`` as table line or non-table line.

        Item 0 of each entry is overwritten with the predicted type.

        :param col: list of entries whose item 0 is the current label
        :param fontdict: passed through to the feature extractor
        :return: ``[col, errorcount, sparseerror]`` where ``errorcount``
            counts entries whose prediction differs from the label they
            carried on entry and ``sparseerror`` counts those mismatches
            that were predicted as ``SparseType.NONTABLELINE``.
        """
        errorcount = 0
        sparseerror = 0
        for i, entry in enumerate(col):
            features = self.Features.domainfindfeatureFunction(
                i, col, fontdict)
            if self.predict([features]) == 'S':
                predicted = SparseType.TABLELINE
            else:
                predicted = SparseType.NONTABLELINE
            if predicted != int(entry[0]):
                errorcount += 1
                if predicted == SparseType.NONTABLELINE:
                    sparseerror += 1
            entry[0] = predicted

        return [col, errorcount, sparseerror]

    def train(self, datalist, labelslist):
        """Train ``self.svminstance`` with C=20 and a gaussian kernel."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.C = 20
        # NOTE(review): ``degree`` looks like a polynomial-kernel parameter;
        # confirm that the gaussian kernel actually uses it.
        data.attachKernel('gaussian', degree=5)
        self.svminstance.train(data)

    def trainforTD(self, datalist, labelslist):
        """Train ``self.svminstance`` on the data without attaching a kernel
        or changing ``C`` here."""
        data = SparseDataSet(datalist, L=labelslist)
        self.svminstance.train(data)

    def predict(self, datalist):
        """Return the predicted label of the first sample in ``datalist``."""
        data = SparseDataSet(datalist)
        results = self.svminstance.test(data)
        return results.getPredictedLabels()[0]

    def save(self, filename):
        """Save the underlying SVM to ``filename``."""
        self.svminstance.save(filename)