Beispiel #1
0
# Cross-validation driver fragment: for every mismatch value and every fold,
# split the data and create the per-fold summary file with its header row.

# boosting rounds
T = 20
# cross-validation folds
Nfold = 10
# ADT model
model = 'tree'

# run boosting for each value of mismatch
for m in range(M):
    Xt = XT[m].astype(float)
    Yt = Yt.astype(float)
    # one column of Yt per sample, so Nt is the number of samples
    # NOTE(review): assumes Yt is laid out (classes, samples) -- confirm
    Nt = Yt.shape[1]
    # one row per sample, one column per boosting round
    predicted_labels = np.zeros((Nt,T),dtype='int16')

    # split the data indices into `Nfold` random disjoint sets
    Fidx = splitdata.cv_multiclass_fold(Yt,Nfold)

    for fold in range(Nfold):
        # split the data and labels into train and test sets
        train_data, train_labels, test_data, test_labels \
            = splitdata.cv_split(Xt,Yt,Fidx[fold])

        # specify output file names
        filetag = model+'_%d_%d_%d' % (K,m,fold)
        output_file = '%s/output_%s.txt' % (data_path, filetag)
        to_write = ['round', 'kmer', 'threshold', 'train_auc',
                    'train_acc', 'test_auc', 'test_acc', 'runtime']
        # (re)create the summary file containing only the header row;
        # `with` guarantees the handle is closed even if the write fails
        with open(output_file,'w') as handle:
            handle.write('\t'.join(to_write)+'\n')
Beispiel #2
0
def main():
    """Run cross-validated boosting on one dataset from the command line.

    Parses the command-line options, loads the pickled feature matrix, the
    label dictionary and the label file of ``jobid`` from
    ``<proj_path>/data``, splits the sample indices into ``Nfolds`` folds
    with ``splitdata.cv_multiclass_fold`` and calls ``boost.adaboost`` once
    per fold.

    For each fold three files are written to the output directory:

    * a tab-separated summary file (only the header row is written here;
      the path is handed to ``boost.adaboost``),
    * a pickle of the learned model ``adt``,
    * a pickle holding ``adt_outputs`` followed by ``performance``.

    After the last fold the fold indices and the ``predicted_labels`` array
    are pickled, in that order, to one final file.
    """

    # default proj_path: the parent directory of the current working directory
    def_pp = os.sep.join(os.getcwd().split(os.sep)[:-1])

    # parse arguments
    parser = ArgumentParser()
    parser.add_argument("-J","--jobid", dest="jobid",
            type=str, help="dataset id")
    parser.add_argument("-K", "--kmerlength", dest="K",
            type=int, help="kmer feature length")
    parser.add_argument("-M", "--mismatch", dest="M",
            type=int, help="max allowed mismatch")
    parser.add_argument("-P", "--projpath", dest="proj_path",
            type=str, default=def_pp,
            help="project path (defaults to parent of cwd)")
    parser.add_argument("-T","--rounds", dest="T",
            type=int, help="number of boosting rounds")
    parser.add_argument("-N","--folds", dest="Nfolds",
            type=int, default=5, help="number of cv folds to execute")
    parser.add_argument("-R","--runid", dest="runid",
            type=str, help="run id")
    parser.add_argument("-O","--outdir",dest="outdir",
            type=str, help="output directory")
    parser.add_argument("-Z","--model", dest="model",
            type=str, default="tree", help="model type (tree or stump)")
    args = parser.parse_args()

    jobid = args.jobid
    runid = args.runid
    K = args.K
    M = args.M
    T = args.T
    Nfolds = args.Nfolds
    model = args.model
    outdir = args.outdir

    # set up paths
    proj_path = args.proj_path
    src_path = proj_path + '/src/psmkboost'
    C_path = '%s/get_new_function.c' % (src_path)
    # all results go to the directory given with --outdir
    run_path = outdir
    if not os.path.exists(run_path):
        os.makedirs(run_path)

    # single-argument print written with parentheses prints the same text
    # under the Python 2 print statement
    print('Running adaboost on %s using position specific mismatch kmer feature space' % (jobid))

    # load feature matrix
    # NOTE(review): pd.load is a legacy pandas API -- confirm the pandas
    # version in use still provides it
    XDF = pd.load('%s/data/%s.K%d.M%d.feature_matrix.pkl' % (proj_path, jobid, K, M))
    (N, P) = XDF.shape

    # load feature list: column position -> column label
    features = XDF.columns
    feat_dict = dict(zip(range(P), features))

    # load label dict: first CSV column (as int) -> second CSV column
    ldf = '%s/data/%s.labeldict.csv' % (proj_path, jobid)
    label_dict = {}
    with open(ldf,'r') as handle:
        for row in csv.reader(handle, delimiter=','):
            label_dict[int(row[0])] = row[1]
    C = len(label_dict)

    # load label matrix
    lf = '%s/data/%s.labels.csv' % (proj_path, jobid)
    Y = gen_label_matrix(lf, N, C)
    Yt = Y.T

    # holds predicted label at each round (one row per sample)
    predicted_labels = np.zeros((N,T),dtype='int')

    # split the data indices into `Nfolds` random disjoint sets
    Fidx = splitdata.cv_multiclass_fold(Yt,Nfolds)

    # parse the C code from get_new_function.c once; it is identical for
    # every fold.  Every line containing '//' is dropped entirely.
    with open(C_path,'r') as handle:
        C_code = '\n'.join([line for line in handle if '//' not in line])

    # header row of the per-fold summary table
    summary_header = ['round', 'kmer', 'threshold', 'train_auc',
                      'train_acc', 'test_auc', 'test_acc', 'runtime']

    for fold in range(Nfolds):

        print('executing fold %d'%(fold+1))
        # split the data and labels into train and test sets
        # skip this and return only indices?
        print('splitting data...')
        tt = time.time()
        train_data, train_labels, test_data, test_labels \
            = splitdata.cv_split(XDF.as_matrix().T, Yt, Fidx[fold])
        print('split data time=%.2f seconds'%(time.time()-tt))

        # specify output file names
        filetag = '%s_K%d_M%d_fold%d'%(model,K,M,fold)
        output_file = '%s/%s.%s.outputsummary_%s.txt' % (run_path, jobid, runid, filetag)
        # create the summary file with just its header row
        with open(output_file,'w') as handle:
            handle.write('\t'.join(summary_header)+'\n')

        # run Adaboost
        print('entering adaboost')
        adt, adt_outputs, performance, predicted_labels = boost.adaboost(C_code, \
            train_data, train_labels, test_data, test_labels, T, \
            output_file=output_file, kmer_dict=feat_dict, model=model, \
            predicted_labels=predicted_labels, test_indices=Fidx[fold])

        # save the learned model (binary mode keeps the pickle bytes intact
        # on platforms that translate newlines in text mode)
        model_file = '%s/%s.%s.adt_%s.pkl' % (run_path, jobid, runid, filetag)
        with open(model_file,'wb') as handle:
            cPickle.dump(adt,handle)

        # save algorithm performance (errors, runtime, etc);
        # protocol 2 is a binary format, so the file must be opened as 'wb'
        results_file = '%s/%s.%s.performance_%s.pkl' % (run_path, jobid, runid, filetag)
        with open(results_file,'wb') as handle:
            cPickle.Pickler(handle,protocol=2).dump(adt_outputs)
            cPickle.Pickler(handle,protocol=2).dump(performance)

    # output predicted labels on test data for each CV fold
    output_file = '%s/%s.%s.testsetpredictions_%d.pkl' \
        % (run_path, jobid, runid, K)
    with open(output_file,'wb') as handle:
        cPickle.Pickler(handle,protocol=2).dump(Fidx)
        cPickle.Pickler(handle,protocol=2).dump(predicted_labels)
Beispiel #3
0
    Yt = cPickle.load(f)
    kmer_dict = cPickle.load(f)
    f.close()

    # make Xt, Yt memory-efficient
    Xt = Xt.astype('int16')
    Yt = Yt.astype('int16')
    Nt = Yt.shape[1]
    T = 20
    predicted_labels = np.zeros((Nt, T), dtype='int16')

    # number of folds of cross validation
    Nfold = 10

    # split the data indices into 10 random disjoint sets
    Fidx = splitdata.cv_multiclass_fold(Yt, Nfold)

    for fold in range(Nfold):
        params = (fold, k, m, T)
        # using each set as the test set and the rest as train sets
        # split the data and run boosting
        X, Y, x, y, Idx = splitdata.cv_split(Xt, Yt, Fidx[fold])
        predicted_labels = boost.adaboost(X,
                                          Y,
                                          x,
                                          y,
                                          predicted_labels,
                                          Fidx[fold],
                                          params,
                                          kmer_dict,
                                          model='tree',
Beispiel #4
0
from nltk.classify import naivebayes as nb
import numpy as np
import cPickle
import splitdata

# Naive Bayes cross-validation script: loads a pickled data set, builds
# NLTK feature sets from the first ten rows of X and trains/tests a
# classifier per split.

# The file holds three consecutive pickles, read in this order.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
with open('../data/picorna_virii_data_8_2.pkl') as fh:
    X = cPickle.load(fh)
    Y = cPickle.load(fh)
    z = cPickle.load(fh)

# just pull out ten features from X to make sure the whole thing works
X = X[:10,:]

# feature names used as dict keys below; zip() stops at the shorter input,
# so only as many names as there are entries per sample are used
v = z.values()
split_indices = splitdata.cv_multiclass_fold(Y,10)

# class index of each sample = row of the largest entry in its column of Y
labels = np.argmax(Y,axis=0)
# one (feature dict, label) pair per column of X
labelled_featuresets = [(dict(zip(v,data)),y) for (data,y) in zip(X.T,labels)]

test_labels=[]
true_labels=[]
for i,train_indices in enumerate(split_indices):
    # NOTE(review): each entry of split_indices is used as the TRAINING set
    # and its complement as the test set -- confirm this is intended
    test_indices = list(set(range(Y.shape[1])).difference(train_indices))
    # train
    # (comprehension variable is not `i`: under Python 2 it would leak and
    # overwrite the fold counter from enumerate)
    train_features = [labelled_featuresets[idx] for idx in train_indices]
    model = nb.NaiveBayesClassifier.train(train_features)
    # test
    test_features = [labelled_featuresets[idx] for idx in test_indices]
    label = [model.classify(featureset[0]) for featureset in test_features]
    # collect
    # NOTE(review): `label` is not stored in `test_labels` within the lines
    # visible here -- check the rest of the loop
    true_labels.append(Y[:,test_indices])