Beispiel #1
0
 def compute_train_kernel(
         self, g, m, t=20, approx=True, I=100, delta=0.025,
         skip_variance=False):
     """Build a FastSK kernel and run its training computation.

     All arguments are forwarded to the ``FastSK`` constructor under the
     same keyword, except ``I``, which is passed as ``max_iters``.
     ``compute_train`` is then called on ``self.train_seq``.

     The kernel object is local to this call: nothing is stored on
     ``self`` and nothing is returned.
     """
     fastsk_options = {
         "g": g,
         "m": m,
         "t": t,
         "approx": approx,
         "max_iters": I,  # the constructor's name for ``I``
         "delta": delta,
         "skip_variance": skip_variance,
     }
     trainer = FastSK(**fastsk_options)
     trainer.compute_train(self.train_seq)
Beispiel #2
0
    def train_and_test(
        self,
        g,
        m,
        t,
        approx,
        I=100,
        delta=0.025,
        skip_variance=False,
        C=1,
    ):
        """Compute FastSK kernels, fit a calibrated linear SVM and score it.

        The kernel arguments are forwarded to ``FastSK`` (``I`` is passed
        as ``max_iters``); ``C`` is forwarded to ``LinearSVC``.

        Side effects: sets ``self.Xtrain``, ``self.Xtest``, ``self.stdevs``
        and ``self.clf``.

        Returns:
            The ``(acc, auc)`` pair produced by ``self.evaluate_clf()``.
        """
        fastsk = FastSK(g=g, m=m, t=t, approx=approx, max_iters=I,
                        delta=delta, skip_variance=skip_variance)
        fastsk.compute_kernel(self.train_seq, self.test_seq)

        # Keep the kernel outputs on the instance for later evaluation.
        self.Xtrain = fastsk.get_train_kernel()
        self.Xtest = fastsk.get_test_kernel()
        self.stdevs = fastsk.get_stdevs()

        # Wrap the SVM in a 5-fold calibrator before fitting.
        base_svm = LinearSVC(C=C, class_weight="balanced")
        calibrator = CalibratedClassifierCV(base_svm, cv=5)
        self.clf = calibrator.fit(self.Xtrain, self.Ytrain)

        accuracy, auc_score = self.evaluate_clf()
        return accuracy, auc_score
Beispiel #3
0
def main(args):
    """Train and evaluate a calibrated linear SVM on FastSK kernel features.

    Reads the training and test FASTA files named by ``args.train`` and
    ``args.test``, computes the FastSK train/test kernels from the
    sequences, fits a 5-fold calibrated ``LinearSVC`` on the train kernel,
    and prints accuracy and AUC measured on the test kernel.

    Args:
        args: Object exposing ``train`` and ``test`` attributes, which are
            passed to ``FastaUtility.read_data``.

    Raises:
        AssertionError: If the test AUC is below 0.9.
    """
    ## Get data (each file is read exactly once; the labels from this
    ## single pass are the ones used for fitting and scoring)
    reader = FastaUtility()
    train_seqs, Ytrain = reader.read_data(args.train)
    test_seqs, Ytest = reader.read_data(args.test)
    Ytest = np.array(Ytest).reshape(-1, 1)

    ## Compute kernel matrix
    fastsk = FastSK(g=10, m=6, t=1, approx=True)
    fastsk.compute_kernel(train_seqs, test_seqs)

    # Separate names for kernels and raw sequences avoid rebinding one
    # variable to two different kinds of data.
    Xtrain = fastsk.get_train_kernel()
    Xtest = fastsk.get_test_kernel()

    ## Train a linear SVM
    svm = LinearSVC(C=1)
    clf = CalibratedClassifierCV(svm, cv=5).fit(Xtrain, Ytrain)

    ## Evaluate
    acc = clf.score(Xtest, Ytest)
    probs = clf.predict_proba(Xtest)[:, 1]  # second column of predict_proba
    auc = roc_auc_score(Ytest, probs)

    print("Linear SVM:\n\tAcc = {}, AUC = {}".format(acc, auc))
    # Acts as a regression check on kernel quality for this dataset.
    assert auc >= 0.9, "AUC is not correct. Should be >= 0.9. Received: {}".format(auc)
Beispiel #4
0
    def train_and_test(self, g, m, t, approx, I=100, delta=0.025,
                       skip_variance=False):
        """Compute FastSK kernels, fit a cross-validated Lasso and score it.

        The kernel arguments are forwarded to ``FastSK`` (``I`` is passed
        as ``max_iters``). ``t`` is also used as ``n_jobs`` for ``LassoCV``.

        Side effects: sets ``self.Xtest`` and ``self.Xtrain`` to the kernel
        outputs converted to NumPy arrays with one row per leading entry.

        Returns:
            The value of ``model.score(self.Xtest, self.Ytest)`` for the
            fitted ``LassoCV`` model.
        """
        fastsk = FastSK(g=g, m=m, t=t, approx=approx, max_iters=I,
                        delta=delta, skip_variance=skip_variance)
        fastsk.compute_kernel(self.train_seq, self.test_seq)

        # Convert each kernel to an array, keeping its leading length as
        # the row count and flattening everything else into columns.
        raw_test = fastsk.get_test_kernel()
        self.Xtest = np.array(raw_test).reshape(len(raw_test), -1)
        raw_train = fastsk.get_train_kernel()
        self.Xtrain = np.array(raw_train).reshape(len(raw_train), -1)

        # Can replace Lasso with alternative regression approaches such as SVR
        regressor = LassoCV(cv=5, n_jobs=t, random_state=293)
        fitted = regressor.fit(self.Xtrain, self.Ytrain)
        return fitted.score(self.Xtest, self.Ytest)
Beispiel #5
0
if __name__ == '__main__':
    # Script entry point: compute FastSK kernels for a train/test FASTA pair
    # and fit a calibrated linear SVM on the training kernel.

    parser = argparse.ArgumentParser()

    # Both options are optional and fall back to the EP300 dataset paths.
    parser.add_argument('--train',
                        default='../data/EP300.train.fasta',
                        help='training sequences file')
    parser.add_argument('--test',
                        default='../data/EP300.test.fasta',
                        help='test sequences file')

    args = parser.parse_args()

    ## Compute kernel matrix
    # compute_kernel is given the two file paths directly, not parsed
    # sequences.
    fastsk = FastSK(g=10, m=6, t=1, approx=True)
    fastsk.compute_kernel(args.train, args.test)

    Xtrain = fastsk.get_train_kernel()
    Xtest = fastsk.get_test_kernel()

    # The training file is read here to obtain the labels (Ytrain); Xseq is
    # not used by the statements shown in this block.
    reader = FastaUtility()
    Xseq, Ytrain = reader.read_data(args.train)

    ## Use linear SVM
    # The SVM is wrapped in a 5-fold CalibratedClassifierCV before fitting.
    svm = LinearSVC(C=1)
    clf = CalibratedClassifierCV(svm, cv=5).fit(Xtrain, Ytrain)

    ## Evaluate
    # Read the test labels (Ytest); Xseq is rebound to the test sequences.
    # NOTE(review): no scoring of clf is visible in this block - confirm
    # whether the evaluation continues beyond this excerpt.
    reader = FastaUtility()
    Xseq, Ytest = reader.read_data(args.test)
Beispiel #6
0
    args.t,
    args.approx,
    args.I,
    args.delta,
)
# NOTE(review): train_file, test_file, g, m, t, approx, I, d, C and
# evaluate_clf are not bound in this excerpt - presumably set up earlier in
# the script; confirm.
skip_variance = args.skip_variance

### Read the data
reader = FastaUtility()
Xtrain, Ytrain = reader.read_data(train_file)
Xtest, Ytest = reader.read_data(test_file)
# Reshape the test labels into a single-column 2-D array.
Ytest = np.array(Ytest).reshape(-1, 1)

### Compute the fastsk kernel
# The timed region covers both constructing FastSK and compute_kernel.
start = time.time()
fastsk = FastSK(
    g=g, m=m, t=t, approx=approx, max_iters=I, delta=d, skip_variance=skip_variance
)

fastsk.compute_kernel(Xtrain, Xtest)
end = time.time()
print("Kernel computation time: ", end - start)
# From here on Xtrain/Xtest hold the kernel outputs, no longer the values
# returned by read_data.
Xtrain = fastsk.get_train_kernel()
Xtest = fastsk.get_test_kernel()

### Use linear SVM
# The SVM is wrapped in a 5-fold CalibratedClassifierCV before fitting.
svm = LinearSVC(C=C)
clf = CalibratedClassifierCV(svm, cv=5).fit(Xtrain, Ytrain)
acc, auc = evaluate_clf(clf, Xtest, Ytest)
print("Linear SVM:\n\tAcc = {}, AUC = {}".format(acc, auc))
Beispiel #7
0
import glob
import os

from fastsk import FastSK

# Filename suffixes that distinguish the two files of one dataset.
TRAIN_SUFFIX = ".train.fasta"
TEST_SUFFIX = ".test.fasta"

# Collect the path prefix (directory + dataset name, suffix removed) of
# every training file found in the data directory.
datasets = []
for filepath in glob.iglob('../../data/*.train.fasta'):
    # Strip the suffix by length from the end: every glob match ends with
    # it, so this cannot mis-split on an earlier ".train" in the path and
    # cannot silently drop a character the way a failed find() (-1) would.
    datasets.append(filepath[:-len(TRAIN_SUFFIX)])

for s in datasets:
    # Bare dataset name without any directory component.
    name = os.path.basename(s)
    # Uncomment for confirmation for each dataset train. The reply goes in
    # its own variable so the path prefix ``s`` is not overwritten.
    # reply = input("about to train on " + name + " dataset.\ny/n?")
    # if reply.lower() != "y":
    #     continue
    fastsk = FastSK(g=7, m=3, t=16, approx=True, max_iters=75)
    # Train and test files share the same prefix and differ only in suffix.
    fastsk.compute_kernel(Xtrain=s + TRAIN_SUFFIX,
                          Xtest=s + TEST_SUFFIX)
    fastsk.fit()
    fastsk.score(metric='auc')