Beispiel #1
0
 def __init__(self, lexicon, C=1, num_features=100):
     self.training_set = None
     self.classes = None 
     self.test_set = None
     self.results = None
     self.kernel = ker.Linear()
     self.C = C  
     self.feature_data = PATH + "/learning/stored/feature.data"
     self.label_data = PATH + "/learning/stored/svm_label.data"
     self.lexicon = lexicon
     self.num_features = len(self.lexicon.words.keys())
     try:
         print "Loading existing SVM..."
         features = pickle.load(open(self.feature_data))
         labels = pickle.load(open(self.label_data))
         sparsedata = SparseDataSet(features, L=labels) 
         self.svm_classifier = loadSVM(PATH + "/learning/stored/svm.classifier",sparsedata)
     except Exception as e:
         print e
         print "Existing SVM not found!"
         self.svm_classifier = svm.SVM(self.kernel)
     self.accuracy = None
     self.predicted_labels = None
     score = featsel.FeatureScore('golub')
     self.filter = featsel.Filter(score)
     self.feature_selector = FeatureSelect(self.svm_classifier, self.filter)
     self.chain = Chain([self.feature_selector, self.svm_classifier])
Beispiel #2
0
 def __init__(self, lexicon, C=1, num_features=100):
     self.training_set = None
     self.classes = None
     self.test_set = None
     self.results = None
     self.kernel = ker.Linear()
     self.C = C
     self.feature_data = PATH + "/learning/stored/feature.data"
     self.label_data = PATH + "/learning/stored/svm_label.data"
     self.lexicon = lexicon
     self.num_features = len(self.lexicon.words.keys())
     try:
         print "Loading existing SVM..."
         features = pickle.load(open(self.feature_data))
         labels = pickle.load(open(self.label_data))
         sparsedata = SparseDataSet(features, L=labels)
         self.svm_classifier = loadSVM(PATH + "/learning/stored/svm.classifier", sparsedata)
     except Exception as e:
         print e
         print "Existing SVM not found!"
         self.svm_classifier = svm.SVM(self.kernel)
     self.accuracy = None
     self.predicted_labels = None
     score = featsel.FeatureScore("golub")
     self.filter = featsel.Filter(score)
     self.feature_selector = FeatureSelect(self.svm_classifier, self.filter)
     self.chain = Chain([self.feature_selector, self.svm_classifier])
Beispiel #3
0
def cross_validate(X,y):

    # Ask what percentage of the data should be trained
    p = 0
    while p < 50 or p > 100:
        p = raw_input("Enter percentage of data to train (between 50 and 75): ")
        p = float(p);

    K = int(floor(float(p)*len(X)/100))

    X1 = X[0:K]
    y1 = y[0:K]

    #Load data into PyML's vector objects, then train set
    data = VectorDataSet(X1,L=y1)

    s = SVM()
    s.train(data)
    s.save("cross_validating")

    #Now check the other data
    X2 = X[K+1:-1]
    y2 = y[K+1:-1]

    #Load our training data
    loadedSVM = loadSVM("cross_validating",data)

    testData = VectorDataSet(X2,L=y2)
    r = loadedSVM.test(testData)
    print r

    #Delete the data now that we're done with it
    os.system("rm cross_validating")
Beispiel #4
0
    def on_load_clicked(self, widget):
        """Load a saved SVM classifier chosen by the user.

        PyML's loadSVM needs the original training data to rebuild the
        classifier, so the user is asked for both the classifier file and
        the matching training-data file.  Nothing happens if either dialog
        is cancelled.
        """
        filen = self.getFilenameToRead("Load Classifier", filter='svm')
        if filen is not None:
            datfn = self.getFilenameToRead("Open Training Data", filter='mat')
            if datfn is not None:
                data = ml.VectorDataSet(datfn, labelsColumn=0)
                self.clssfr = loadSVM(filen, data)  ## Why do I need to feed data ???

                # Only update the button state and report success when a
                # classifier was actually loaded; previously the success
                # message was shown even when the data dialog was cancelled
                # and self.clssfr was never assigned.
                self.setDisabledBtns()
                self.showMessage("The classifier has been loaded!")
Beispiel #5
0
    def on_load_clicked(self, widget):
        """Load a saved SVM classifier chosen by the user.

        PyML's loadSVM needs the original training data to rebuild the
        classifier, so the user is asked for both the classifier file and
        the matching training-data file.  Nothing happens if either dialog
        is cancelled.
        """
        filen = self.getFilenameToRead("Load Classifier", filter='svm')
        if filen is not None:
            datfn = self.getFilenameToRead("Open Training Data", filter='mat')
            if datfn is not None:
                data = ml.VectorDataSet(datfn, labelsColumn=0)
                self.clssfr = loadSVM(filen,
                                      data)  ## Why do I need to feed data ???

                # Only update the button state and report success when a
                # classifier was actually loaded; previously the success
                # message was shown even when the data dialog was cancelled
                # and self.clssfr was never assigned.
                self.setDisabledBtns()
                self.showMessage("The classifier has been loaded!")
Beispiel #6
0
def test(component="svm", **args):

    container = "SparseDataSet"
    if "container" in args:
        container = args["container"]
    try:
        DataSet = getattr(vectorDatasets, container)
    except:
        raise ValueError, "wrong container " + container

    results = {}

    comp = "general"
    if component == "all" or component == comp:
        s = svm.SVM()
        results = {}
        d = DataSet(heartdatafile, labelsColumn=0)
        s.train(d)
        s.test(d)
        s = svm.SVM()
        s.stratifiedCV(d)
        print "starting aggregate****************"
        d2 = Aggregate([d, d])
        print "end aggregate"
        r = s.stratifiedCV(d2)

        d.attachKernel("polynomial")
        s.cv(d)
        d.attachKernel("linear")
        s = svm.SVM()
        s.train(d)
        s.train(d, saveSpace=False)
        s.save("tmp")
        loaded = svm.loadSVM("tmp", datasetClass=DataSet)
        r = loaded.test(d)
        d.attachKernel("gaussian", gamma=0.01)

        s.train(d, saveSpace=False)
        s.save("tmp")
        loaded = svm.loadSVM("tmp", datasetClass=DataSet, labelsColumn=1)
        r = loaded.test(d)
        os.remove("tmp")

        d = DataSet(numpy.random.randn(100, 10))
        d = DataSet([[1, 2], [2, 3]])
        d = SequenceData(["asa", "ben", "hur"])

    comp = "svm"
    if component == "all" or component == comp:
        d = DataSet(heartdatafile, labelsColumn=0)
        results[comp] = []
        d.attachKernel("polynomial")
        s = svm.SVM()
        results[comp].append(s.cv(d, saveSpace=True))
        d.attachKernel("linear")
        results[comp].append(s.cv(d))

    comp = "kernelData"
    if component == "all" or component == comp:
        d = DataSet(heartdatafile, labelsColumn=0)
        results[comp] = []
        kdata = KernelData("heart.kernel", gistFormat=True)
        kdata.attachLabels(d.labels)
        s = svm.SVM()
        results[comp].append(s.cv(kdata))
        kdata.attachKernel("gaussian", gamma=0.1)
        results[comp].append(s.cv(kdata))

    comp = "normalization"
    if component == "all" or component == comp:
        results[comp] = []
        data = DataSet(heartdatafile, labelsColumn=0)
        data.attachKernel("polynomial", degree=4, normalization="dices")
        s = svm.SVM()
        results[comp].append(s.cv(data))

    comp = "svr"
    if component == "all" or component == comp:
        d = DataSet(heartdatafile, labelsColumn=0, numericLabels=True)
        results[comp] = []
        s = svm.SVR()
        # results[comp].append(
        #    s.cv(d, saveSpace = True))
        # results[comp].append(
        #    s.trainTest(d, range(150), range(151, 250)))
        results[comp].append(s.cv(d))

    comp = "save"
    if component == "all" or component == comp:
        results[comp] = []
        s = svm.SVM()
        data = DataSet(heartdatafile, labelsColumn=0)
        import tempfile

        tmpfile = tempfile.mktemp()
        r = s.cv(data)
        r.save(tmpfile)
        r = assess.loadResults(tmpfile)
        results["save"].append(r)

        r = s.nCV(data)
        r.save(tmpfile)
        results["save"].append(assess.loadResults(tmpfile))

        r = {}
        for i in range(10):
            r[i] = s.cv(data)

        assess.saveResultObjects(r, tmpfile)
        r = assess.loadResults(tmpfile)

    comp = "classifiers"
    if component == "all" or component == comp:
        d = DataSet(heartdatafile, labelsColumn=0)
        results[comp] = []
        cl = knn.KNN()
        results[comp].append(cl.stratifiedCV(d))
        print "testing ridge regression"
        ridge = ridgeRegression.RidgeRegression()
        results[comp].append(ridge.cv(d))

    comp = "platt"
    if component == "all" or component == "platt":
        results[comp] = []
        d = DataSet(heartdatafile, labelsColumn=0)
        p = platt.Platt2(svm.SVM())
        results[comp].append(p.stratifiedCV(d))

    comp = "multi"
    if component == "all" or component == comp:
        results[comp] = []
        d = DataSet(irisdatafile, labelsColumn=-1)

        mc = multi.OneAgainstOne(svm.SVM())
        results[comp].append(mc.cv(d))

        d = DataSet(irisdatafile, labelsColumn=-1)

        mc = multi.OneAgainstRest(svm.SVM())
        results[comp].append(mc.cv(d))

        mc = multi.OneAgainstRest(svm.SVM())
        d.attachKernel("poly")
        results[comp].append(mc.cv(d))
        d.attachKernel("linear")
        mc = multi.OneAgainstRest(svm.SVM())
        # kdata = datafunc.KernelData('iris.linear.kernel',
        #                            labelsFile = 'irisY.csv', labelsColumn = 0, gistFormat = True)
        # results[comp].append(mc.cv(kdata))

    comp = "featsel"
    if component == "all" or component == comp:
        results[comp] = []

        s = svm.SVM()
        d = DataSet(yeastdatafile, labelsColumn=0)
        d2 = labels.oneAgainstRest(d, "2")
        results[comp].append(s.stratifiedCV(d2))

        # feature selection using RFE
        m = composite.FeatureSelect(s, featsel.RFE())
        results[comp].append(m.stratifiedCV(d2, 3))

        fs = featsel.FeatureScore("golub")
        f = featsel.Filter(fs, sigma=2)
        f = featsel.Filter(fs, numFeatures=20)
        m = composite.FeatureSelect(s, f)
        results[comp].append(m.stratifiedCV(d2, 3))

        # same thing but with a Chain:
        c = composite.Chain([f, s])
        # r = c.stratifiedCV (d2)

    comp = "modelSelection"
    if component == "all" or component == comp:
        results[comp] = []
        s = svm.SVM()
        d = DataSet(heartdatafile, labelsColumn=0)
        p = modelSelection.ParamGrid(svm.SVM(ker.Polynomial()), "C", [0.1, 1, 10, 100], "kernel.degree", [2, 3, 4])
        p = modelSelection.ParamGrid(svm.SVM(ker.Gaussian()), "C", [0.1, 1, 10, 100], "kernel.gamma", [0.01, 0.1, 1])
        # p = modelSelection.Param(svm.SVM(), 'C', [0.1, 1, 10, 100])

        m = modelSelection.ModelSelector(p, measure="roc", foldsToPerform=2)
        m = modelSelection.ModelSelector(p)
        # m = modelSelection.SVMselect()
        results[comp].append(m.cv(d))

    return results
Beispiel #7
0
def test(component='svm', **args):

    container = 'SparseDataSet'
    if 'container' in args:
        container = args['container']
    try:
        DataSet = getattr(vectorDatasets, container)
    except:
        raise ValueError, 'wrong container ' + container

    results = {}

    comp = 'general'
    if component == 'all' or component == comp:
        s = svm.SVM()
        results = {}
        d = DataSet(heartdatafile, labelsColumn=0)
        s.train(d)
        s.test(d)
        s = svm.SVM()
        s.stratifiedCV(d)
        print 'starting aggregate****************'
        d2 = Aggregate([d, d])
        print 'end aggregate'
        r = s.stratifiedCV(d2)

        d.attachKernel('polynomial')
        s.cv(d)
        d.attachKernel('linear')
        s = svm.SVM()
        s.train(d)
        s.train(d, saveSpace=False)
        s.save("tmp")
        loaded = svm.loadSVM("tmp", datasetClass=DataSet)
        r = loaded.test(d)
        d.attachKernel('gaussian', gamma=0.01)

        s.train(d, saveSpace=False)
        s.save("tmp")
        loaded = svm.loadSVM("tmp", datasetClass=DataSet, labelsColumn=1)
        r = loaded.test(d)
        os.remove('tmp')

        d = DataSet(numpy.random.randn(100, 10))
        d = DataSet([[1, 2], [2, 3]])
        d = SequenceData(['asa', 'ben', 'hur'])

    comp = 'svm'
    if component == 'all' or component == comp:
        d = DataSet(heartdatafile, labelsColumn=0)
        results[comp] = []
        d.attachKernel('polynomial')
        s = svm.SVM()
        results[comp].append(s.cv(d, saveSpace=True))
        d.attachKernel('linear')
        results[comp].append(s.cv(d))

    comp = 'kernelData'
    if component == 'all' or component == comp:
        d = DataSet(heartdatafile, labelsColumn=0)
        results[comp] = []
        kdata = KernelData('heart.kernel', gistFormat=True)
        kdata.attachLabels(d.labels)
        s = svm.SVM()
        results[comp].append(s.cv(kdata))
        kdata.attachKernel('gaussian', gamma=0.1)
        results[comp].append(s.cv(kdata))

    comp = 'normalization'
    if component == 'all' or component == comp:
        results[comp] = []
        data = DataSet(heartdatafile, labelsColumn=0)
        data.attachKernel('polynomial', degree=4, normalization='dices')
        s = svm.SVM()
        results[comp].append(s.cv(data))

    comp = 'svr'
    if component == 'all' or component == comp:
        d = DataSet(heartdatafile, labelsColumn=0, numericLabels=True)
        results[comp] = []
        s = svm.SVR()
        #results[comp].append(
        #    s.cv(d, saveSpace = True))
        #results[comp].append(
        #    s.trainTest(d, range(150), range(151, 250)))
        results[comp].append(s.cv(d))

    comp = 'save'
    if component == 'all' or component == comp:
        results[comp] = []
        s = svm.SVM()
        data = DataSet(heartdatafile, labelsColumn=0)
        import tempfile
        tmpfile = tempfile.mktemp()
        r = s.cv(data)
        r.save(tmpfile)
        r = assess.loadResults(tmpfile)
        results['save'].append(r)

        r = s.nCV(data)
        r.save(tmpfile)
        results['save'].append(assess.loadResults(tmpfile))

        r = {}
        for i in range(10):
            r[i] = s.cv(data)

        assess.saveResultObjects(r, tmpfile)
        r = assess.loadResults(tmpfile)

    comp = 'classifiers'
    if component == 'all' or component == comp:
        d = DataSet(heartdatafile, labelsColumn=0)
        results[comp] = []
        cl = knn.KNN()
        results[comp].append(cl.stratifiedCV(d))
        print 'testing ridge regression'
        ridge = ridgeRegression.RidgeRegression()
        results[comp].append(ridge.cv(d))

    comp = 'platt'
    if component == 'all' or component == 'platt':
        results[comp] = []
        d = DataSet(heartdatafile, labelsColumn=0)
        p = platt.Platt2(svm.SVM())
        results[comp].append(p.stratifiedCV(d))

    comp = 'multi'
    if component == 'all' or component == comp:
        results[comp] = []
        d = DataSet(irisdatafile, labelsColumn=-1)

        mc = multi.OneAgainstOne(svm.SVM())
        results[comp].append(mc.cv(d))

        d = DataSet(irisdatafile, labelsColumn=-1)

        mc = multi.OneAgainstRest(svm.SVM())
        results[comp].append(mc.cv(d))

        mc = multi.OneAgainstRest(svm.SVM())
        d.attachKernel('poly')
        results[comp].append(mc.cv(d))
        d.attachKernel('linear')
        mc = multi.OneAgainstRest(svm.SVM())
        #kdata = datafunc.KernelData('iris.linear.kernel',
        #                            labelsFile = 'irisY.csv', labelsColumn = 0, gistFormat = True)
        #results[comp].append(mc.cv(kdata))

    comp = 'featsel'
    if component == 'all' or component == comp:
        results[comp] = []

        s = svm.SVM()
        d = DataSet(yeastdatafile, labelsColumn=0)
        d2 = labels.oneAgainstRest(d, '2')
        results[comp].append(s.stratifiedCV(d2))

        # feature selection using RFE
        m = composite.FeatureSelect(s, featsel.RFE())
        results[comp].append(m.stratifiedCV(d2, 3))

        fs = featsel.FeatureScore('golub')
        f = featsel.Filter(fs, sigma=2)
        f = featsel.Filter(fs, numFeatures=20)
        m = composite.FeatureSelect(s, f)
        results[comp].append(m.stratifiedCV(d2, 3))

        # same thing but with a Chain:
        c = composite.Chain([f, s])
        #r = c.stratifiedCV (d2)

    comp = 'modelSelection'
    if component == 'all' or component == comp:
        results[comp] = []
        s = svm.SVM()
        d = DataSet(heartdatafile, labelsColumn=0)
        p = modelSelection.ParamGrid(svm.SVM(ker.Polynomial()), 'C',
                                     [0.1, 1, 10, 100], 'kernel.degree',
                                     [2, 3, 4])
        p = modelSelection.ParamGrid(svm.SVM(ker.Gaussian()), 'C',
                                     [0.1, 1, 10, 100], 'kernel.gamma',
                                     [0.01, 0.1, 1])
        #p = modelSelection.Param(svm.SVM(), 'C', [0.1, 1, 10, 100])

        m = modelSelection.ModelSelector(p, measure='roc', foldsToPerform=2)
        m = modelSelection.ModelSelector(p)
        #m = modelSelection.SVMselect()
        results[comp].append(m.cv(d))

    return results
Beispiel #8
0
        # Normalise each label line to the literal string 'True' or 'False';
        # anything other than an exact 'True' line counts as 'False'.
        # NOTE(review): the enclosing loop begins earlier in the script and
        # is not visible here.
        if line.rstrip() == 'True':
            temp = 'True'
        else:
            temp = 'False'

        y.append(temp)

## Now it's time to load our data into PyML's vector objects

# X2/y2 are defined earlier in the script (not shown here) -- presumably
# the slice of the data reserved for training.  TODO confirm.
data = VectorDataSet(X2,L=y2)

#Create SVM object, then train our set
s = SVM()
s.train(data)
s.save("freePizza")

## Yay!

# Now to cross-validate the data; we first take the other set
# NOTE(review): 'fifth' comes from earlier in the script; this slice skips
# element 'fifth' and drops the last element -- verify that is intended.
X3 = X[fifth+1:-1];
y3 = y[fifth+1:-1];

print y3
#Load our training data
# PyML's loadSVM needs the original training data to rebuild the classifier.
from PyML.classifiers.svm import loadSVM
loadedSVM = loadSVM("freePizza",data)

testData = VectorDataSet(X3,L=y3)
r = loadedSVM.test(testData)
print r
Beispiel #9
0
def svm_prediction(peptides, job_id, input_train="SVM_POS_NEG.fasta"):
    """
    Makes a final prediction based on SVM training files.
    This code is used for prediction of blind datasets, based on the training
    datasets of positives and negatives.

    :param peptides: input peptides
    :param job_id: random job id assigned prior to start predicting
    :param input_train: input positive and negative examples used in training
    :return: returns SVM scores for each inputted peptide
    """

    print("Begin SVM")

    # suppress PyML's verbose SVM output; restored in the finally block so
    # an exception cannot leave stdout/stderr permanently redirected
    devnull = open(os.devnull, 'w')
    sys.stdout, sys.stderr = devnull, devnull

    svm_scores = []
    try:
        # svm_scores would only be pre-populated by a (currently disabled)
        # result-cache lookup; otherwise the scores are always recomputed
        if len(peptides) != len(svm_scores):
            # generate an SVM input file from the peptides; each peptide is
            # written twice (labelled +1 and -1) so PyML sees both classes
            rand = job_id
            input_svm = "%s_svm.fasta" % rand
            with open(os.path.join(TMP_PATH, input_svm), "w") as output_tmp:
                count = 0
                for peptide in peptides:
                    count += 1
                    output_tmp.write("> %i label=%s\n%s\n" % (count, 1, peptide))
                for peptide in peptides:
                    count += 1
                    output_tmp.write("> %i label=%s\n%s\n" % (count, -1, peptide))

            # outputs
            model_svm = "%s_svm_model.txt" % rand

            # train on the reference positives/negatives
            train_data = SequenceData(os.path.join(PATH, input_train),
                                      mink=1,
                                      maxk=1,
                                      maxShift=0,
                                      headerHandler=svm_process_header)
            train_data.attachKernel('cosine')

            cval = 1
            s = SVM(C=cval)
            s.train(train_data)
            s.save(os.path.join(TMP_PATH, model_svm))

            # load trained SVM (loadSVM needs the training data to rebuild it)
            loaded_svm = loadSVM(os.path.join(TMP_PATH, model_svm), train_data)

            # test data
            test_data = SequenceData(os.path.join(TMP_PATH, input_svm),
                                     mink=1,
                                     maxk=1,
                                     maxShift=0,
                                     headerHandler=svm_process_header)
            test_data.attachKernel('cosine')
            results = loaded_svm.test(test_data)

            # print results out
            output_svm = "%s_svm.txt" % rand
            results.toFile(os.path.join(TMP_PATH, output_svm))

            # load results and process the output; expected tab-separated
            # columns: entry, score, ?, label -- TODO confirm column 2
            with open(os.path.join(TMP_PATH, output_svm), "r") as infile:
                inlines = infile.readlines()
            scores = list()
            for line in inlines:
                fields = line.rstrip("\r\n").split("\t")
                try:
                    entry = int(fields[0])
                    score = float(fields[1])
                    label = int(fields[3])
                except (IndexError, ValueError):
                    # header or malformed line -- skip it
                    continue
                # label is an int, so compare against the int -1: the old
                # test `label != "-1"` was always true and let the negative
                # duplicates leak into the score list
                if label != -1:
                    scores.append([entry, score])

            # restore the original input order by entry number
            sorted_scores = sorted(scores, key=lambda pair: pair[0])
            svm_scores = [pair[1] for pair in sorted_scores]

            # remove the temporary model files and results (best effort)
            for tmp_name in (input_svm, model_svm, output_svm):
                try:
                    os.remove(os.path.join(TMP_PATH, tmp_name))
                except OSError:
                    pass
    finally:
        # restore normal output and release the devnull handle
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
        devnull.close()

    print("End SVM")
    return svm_scores