def split_to_train_and_test(rfam_id=None,
                            train_to_test_split_ratio=None,
                            number_of_samples=None):
    iterable = fasta_to_sequence(rfam_url(rfam_id))
    if number_of_samples:
        iterable = islice(iterable, number_of_samples)
        logger.info('Experiment conducted with %d sample sequences' %
                    number_of_samples)
    train, test = random_bipartition_iter(
        iterable, relative_size=train_to_test_split_ratio)
    return train, test
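# Hedged usage sketch for split_to_train_and_test; 'RF00005' is a hypothetical
# Rfam family id, and rfam_url/fasta_to_sequence are assumed to resolve as above.
def _demo_split_to_train_and_test():
    train, test = split_to_train_and_test(rfam_id='RF00005',
                                          train_to_test_split_ratio=0.7,
                                          number_of_samples=200)
    # The returned objects are one-shot iterators: list() or tee() them before reuse.
    return list(train), list(test)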
def get_sequences_with_names(size=9999, rand=0):
    if rand > 0:
        sequences, boring = random_bipartition_iter(
            fasta_to_sequence("../toolsdata/%s.fa" % RFAM),
            .9,
            random_state=random.random() * rand)
        sequences = itertools.islice(sequences, size)
    else:
        sequences = itertools.islice(
            fasta_to_sequence("../toolsdata/%s.fa" % RFAM), size)
    return sequences
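# Hedged stand-in for the random_bipartition_iter contract used throughout this
# file (an assumption, not eden's implementation): consume an iterable, shuffle
# it, and return two iterators holding roughly relative_size and
# 1 - relative_size of the items. Handy for testing without eden installed.
def _bipartition_list(items, relative_size=0.9, random_state=1):
    rng = random.Random(random_state)
    items = list(items)
    rng.shuffle(items)
    cut = int(len(items) * relative_size)
    return iter(items[:cut]), iter(items[cut:])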
def batch_performance_evaluation(params, synthesizer=None, iter_train=None,
                                 iter_test=None, relative_size=None):
    """Repeat performance_evaluation on random subsamples of the train/test data."""
    n_experiment_repetitions = params['n_experiment_repetitions']
    start_time = time.time()
    e_roc_t = []
    e_apr_t = []
    e_roc_s = []
    e_apr_s = []
    for epoch in range(n_experiment_repetitions):
        logger.info('-' * 80)
        logger.info('run %d/%d' % (epoch + 1, n_experiment_repetitions))
        # Copy train and test iterables for one run.
        iter_train, iter_train_ = tee(iter_train)
        iter_test, iter_test_ = tee(iter_test)
        # Portion of train and test iterables used in one run.
        iter_train_, x = random_bipartition_iter(
            iter_train_, relative_size=relative_size)
        iter_test_, x = random_bipartition_iter(
            iter_test_, relative_size=relative_size)
        roc_t, apr_t, roc_s, apr_s = performance_evaluation(
            params,
            synthesizer=synthesizer,
            iter_train=iter_train_,
            iter_test=iter_test_)
        # Update experiment performance measures.
        e_roc_t.append(roc_t)
        e_apr_t.append(apr_t)
        e_roc_s.append(roc_s)
        e_apr_s.append(apr_s)
    elapsed_time = time.time() - start_time
    return e_roc_t, e_apr_t, e_roc_s, e_apr_s, elapsed_time
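# Sketch for summarizing the lists returned by batch_performance_evaluation;
# numpy and logger are assumed to be available at module level, and the label
# names are illustrative only.
def _summarize_batch(e_roc_t, e_apr_t, e_roc_s, e_apr_s):
    measures = [('ROC true', e_roc_t), ('APR true', e_apr_t),
                ('ROC synthesized', e_roc_s), ('APR synthesized', e_apr_s)]
    for name, values in measures:
        a = numpy.array(values)
        logger.info('%s: mean=%.3f std=%.3f' %
                    (name, numpy.mean(a), numpy.std(a)))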
def oneclasstest_fraction(fraction=0.1, repeats=2):
    # choosing some graphs,
    # having array to save results
    for i in range(repeats):
        badscores = []
        goodscores = []
        graphs = get_sequences_with_names(size=923)
        graphs, not_used = random_bipartition_iter(
            graphs, fraction, random_state=random.random() * i * 1000)
        estimator = Wrapper(nu=0.27, cv=3, n_jobs=-1)
        sampler = rna.AbstractSampler(
            radius_list=[0, 1],
            thickness_list=[2],
            min_cip_count=1,
            min_interface_count=2,
            preprocessor=rna.PreProcessor(base_thickness_list=[1],
                                          ignore_inserts=True),
            postprocessor=rna.PostProcessor(),
            estimator=estimator)
        sampler.preprocessor.set_param(sampler.vectorizer)
        graphmanagers = sampler.preprocessor.fit_transform(graphs)
        sampler.estimatorobject.fit(graphmanagers,
                                    vectorizer=sampler.vectorizer,
                                    random_state=sampler.random_state)
        # test
        for graphman in graphmanagers:
            struct = evaltools.dotbracket_to_shape(graphman.structure,
                                                   shapesversion=SHAPEVERSION)
            score = sampler.estimatorobject.score(graphman)
            if struct == "[[][][]]":
                goodscores.append(score)
            else:
                badscores.append(score)
        print "afraction=%f , instances=%f, good=%d , bad=%d" % (
            fraction, fraction * 923, len(goodscores), len(badscores))
        a = numpy.array(badscores)
        print "bad:mean/std ", numpy.mean(a, axis=0), " ", numpy.std(a, axis=0)
        a = numpy.array(goodscores)
        print "cgood:mean/std ", numpy.mean(a, axis=0), " ", numpy.std(a, axis=0)
        a = numpy.array(goodscores + badscores)
        print "dbad+good:mean/std ", numpy.mean(a, axis=0), " ", numpy.std(a, axis=0)
        print ""
def evaluate(pos_fname, neg_fname=None, size=None, percentages=None,
             n_repetitions=None, train_test_split=None):
    # initializing
    graphs_pos = get_graphs(pos_fname, size=size)
    if neg_fname is None:
        graphs_neg = get_graphs_permuted(pos_fname, size=size)
    else:
        graphs_neg = get_graphs(neg_fname, size=size)
    # train/test split
    from eden.util import random_bipartition_iter
    pos_train_global, pos_test_global = random_bipartition_iter(
        graphs_pos, train_test_split, random_state=random.random() * 1000)
    neg_train_global, neg_test_global = random_bipartition_iter(
        graphs_neg, train_test_split, random_state=random.random() * 1000)
    original_repetitions = []
    original_sample_repetitions = []
    sample_repetitions = []
    for percentage in percentages:
        originals = []
        originals_samples = []
        samples = []
        for repetition in range(n_repetitions):
            random_state = int(313379 * percentage + repetition)
            random.seed(random_state)
            pos_train_global, pos_train_global_ = tee(pos_train_global)
            neg_train_global, neg_train_global_ = tee(neg_train_global)
            pos_test_global, pos_test_global_ = tee(pos_test_global)
            neg_test_global, neg_test_global_ = tee(neg_test_global)
            # use shuffled list to create test and sample set
            pos, pos_remainder = random_bipartition_iter(
                pos_train_global_, percentage)
            pos, pos_ = tee(pos)
            neg, neg_remainder = random_bipartition_iter(
                neg_train_global_, percentage)
            neg, neg_ = tee(neg)
            # sample independently from the 2 classes
            logger.info('Positive')
            sampled_pos = fit_sample(pos_, random_state=random_state)
            logger.info('Negative')
            sampled_neg = fit_sample(neg_, random_state=random_state)
            # evaluate the predictive performance on held out test set
            start = time()
            logger.info("=" * 80)
            logger.info('repetition: %d/%d' % (repetition + 1, n_repetitions))
            logger.info('training percentage: ' + str(percentage))
            perf_orig, perf_samp, perf_orig_samp = fit_and_evaluate(
                pos, neg,
                sampled_pos, sampled_neg,
                pos_test_global_, neg_test_global_)
            logger.info('Time elapsed for full repetition: %.1f sec' %
                        (time() - start))
            originals.append(perf_orig)
            originals_samples.append(perf_orig_samp)
            samples.append(perf_samp)
        original_repetitions.append(originals)
        original_sample_repetitions.append(originals_samples)
        sample_repetitions.append(samples)
    return original_repetitions, original_sample_repetitions, sample_repetitions
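# Sketch: evaluate() returns three lists nested as [percentage][repetition];
# a hedged helper that reduces each percentage to a (mean, std) pair with numpy.
def _per_percentage_stats(repetitions):
    return [(numpy.mean(reps), numpy.std(reps)) for reps in repetitions]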
def train_dbox_model(fasta_fname=None, model_fname='eden_model_Dbox',
                     window=4, neg_size_factor=5, train_test_split=0.7,
                     n_jobs=4, n_iter=40):
    # transform sequences into a linear graph
    def pre_process_graph(iterator):
        from eden.converter.fasta import sequence_to_eden
        graphs = sequence_to_eden(iterator)
        return graphs

    # extract box sequence with annotated header information
    def extract_box(data, window=3, box_type='D'):
        import re
        from eden.converter.fasta import fasta_to_sequence
        seqs = fasta_to_sequence(data)
        for seq in seqs:
            header = seq[0].split('_')
            cbox = header[-4]
            cpos = int(header[-3])
            dbox = header[-2]
            dpos = int(header[-1])
            nts = re.sub('\n', '', seq[1])
            if box_type == 'C':
                box = nts[cpos - 1 - window:cpos + 6 + window]
                yield seq[0], box
            elif not ((len(nts) < dpos + 3 + window) or
                      (dpos - 1 - window < 0)):
                # box = nts[dpos-1-window:dpos+3+window]
                box = (nts[dpos - 1 - window:dpos - 1] + 'x' +
                       nts[dpos - 1:dpos + 3] + 'y' +
                       nts[dpos + 3:dpos + 3 + window])
                yield seq[0], box

    # Choose the vectorizer
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()
    # Choose the estimator
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle=True, average=True)

    # Generate positive samples
    seqs_d_pos = extract_box(fasta_fname, window, box_type='D')
    from itertools import tee
    seqs_d_pos, seqs_d_pos_ = tee(seqs_d_pos)
    # Generate negative samples
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    seqs_d_neg = seq_to_seq(seqs_d_pos_,
                            modifier=shuffle_modifier,
                            times=neg_size_factor,
                            order=2)
    # Split train/test
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(
        seqs_d_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(
        seqs_d_neg, relative_size=train_test_split)
    iterable_pos_train = list(iterable_pos_train)
    iterable_pos_test = list(iterable_pos_test)
    iterable_neg_train = list(iterable_neg_train)
    iterable_neg_test = list(iterable_neg_test)
    print "training pos ", len(iterable_pos_train)
    print "training neg ", len(iterable_neg_train)
    print "test pos ", len(iterable_pos_test)
    print "test neg ", len(iterable_neg_test)

    # make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(
        pre_processor=pre_process_graph,
        estimator=estimator,
        vectorizer=vectorizer,
        n_jobs=n_jobs)
    # optimize hyperparameters and fit model
    from numpy.random import randint
    from numpy.random import uniform
    vectorizer_parameters = {'complexity': [2, 3]}
    estimator_parameters = {
        'n_iter': randint(5, 100, size=n_iter),
        'penalty': ['l1', 'l2', 'elasticnet'],
        'l1_ratio': uniform(0.1, 0.9, size=n_iter),
        'loss': ['log'],  # 'hinge', 'modified_huber', 'squared_hinge', 'perceptron'
        'power_t': uniform(0.1, size=n_iter),
        'alpha': [10 ** x for x in range(-8, 0)],
        'eta0': [10 ** x for x in range(-4, -1)],
        'learning_rate': ['invscaling', 'constant', 'optimal'],
        'n_jobs': [n_jobs]}
    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_fname,
                   max_total_time=60 * 30,
                   n_iter=n_iter,
                   cv=10,
                   score_func=lambda avg_score, std_score: avg_score - std_score * 2,
                   scoring='roc_auc',
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)
    # estimate predictive performance
    print model.get_parameters()
    result, text = model.estimate(iterable_pos_test, iterable_neg_test)
    rss = 0
    i = 0
    for prob in result:
        i += 1
        print prob
        if prob[1] == 1:
            rss += (1 - prob[0][1]) ** 2
        else:
            rss += (1 - prob[0][0]) ** 2
    avg_rss = rss / float(i)
    text.append('RSS: %.2f' % rss)
    text.append('avg RSS: %.2f' % avg_rss)
    for t in text:
        print t
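# Hedged invocation sketch for train_dbox_model; 'snoRNAs.fa' is a hypothetical
# FASTA file whose headers end in the '..._cbox_cpos_dbox_dpos' fields parsed above.
def _demo_train_dbox_model():
    train_dbox_model(fasta_fname='snoRNAs.fa',
                     model_fname='eden_model_Dbox',
                     window=4,
                     neg_size_factor=5,
                     train_test_split=0.7,
                     n_jobs=4,
                     n_iter=40)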
def train_stem_finder_model(fasta, model_stem_name, window_c, model_c_name,
                            window_d, model_d_name, flank_size_l, flank_size_r,
                            train_test_split=0.7, neg_size_factor=4,
                            n_jobs=4, n_iter=40, fasta_test=None):
    # Pre processor
    def pre_process_graph(iterator, **options):
        from eden.converter.rna.rnasubopt import rnasubopt_to_eden
        graphs = rnasubopt_to_eden(iterator, **options)
        return graphs

    # Vectorizer
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()
    # Estimator
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle=True, average=True)

    def get_Cbox(seqs, window_c):
        import re
        for seq in seqs:
            header = seq[0].split('_')
            cpos = int(header[-3])
            nts = re.sub('\n', '', seq[1])
            if not ((len(nts) < cpos + 6 + window_c) or
                    (cpos - 1 - window_c < 0)):
                box = (nts[cpos - 1 - window_c:cpos - 1] + 'x' +
                       nts[cpos - 1:cpos + 6] + 'y' +
                       nts[cpos + 6:cpos + 6 + window_c])
                yield [seq, cpos], box

    def get_Dbox(seqs_c, window_d):
        import re
        for [seq, cbox], pred in seqs_c:
            header = seq[0][0].split('_')
            dpos = int(header[-1])
            nts = re.sub('\n', '', seq[0][1])
            if not ((len(nts) < dpos + 3 + window_d) or
                    (dpos - 1 - window_d < 0)):
                box = (nts[dpos - 1 - window_d:dpos - 1] + 'x' +
                       nts[dpos - 1:dpos + 3] + 'y' +
                       nts[dpos + 3:dpos + 3 + window_d])
                yield [seq, cbox, pred, dpos], box

    # Get stem
    def get_stem(seqs, window_c, model_c_name, window_d, model_d_name,
                 flank_size_l, flank_size_r):
        # 1) C box finder
        seqs_c = get_Cbox(seqs, window_c)
        # 2) submit the C box candidates to the model
        from eden.model import ActiveLearningBinaryClassificationModel
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_c_name)
        seqs_c_pred = list()
        cands_c = list()
        max_count = 0
        for seq_c in seqs_c:
            max_count += 1
            cands_c.append(seq_c)
            if max_count == 10000:
                # score candidates in chunks of 10000 to bound memory usage
                preds = model.decision_function(cands_c)
                seqs_c_pred = seqs_c_pred + zip(cands_c, preds)
                cands_c = list()
                max_count = 0
        if max_count != 0:
            preds = model.decision_function(cands_c)
            seqs_c_pred = seqs_c_pred + zip(cands_c, preds)
        # discard sequences with pred < 0
        seqs_c = list()
        for cand in seqs_c_pred:
            if cand[1] >= 0.0:
                seqs_c.append(cand)
        # D box finder
        seqs_cd = get_Dbox(seqs_c, window_d)
        # submit the D box candidates to their model
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_d_name)
        seqs_d_pred = list()
        cands_d = list()
        max_count = 0
        for seq_d in seqs_cd:
            max_count += 1
            cands_d.append(seq_d)
            if max_count == 10000:
                # same chunking as above
                preds = model.decision_function(cands_d)
                seqs_d_pred = seqs_d_pred + zip(cands_d, preds)
                cands_d = list()
                max_count = 0
        if max_count != 0:
            preds = model.decision_function(cands_d)
            seqs_d_pred = seqs_d_pred + zip(cands_d, preds)
        # Get the stem region from the sequences; item layout:
        # (([[(header, seq), pos_c], cand_c, pred_c, pos_d], cand_d), pred_d)
        stem_cands = []
        for ([[(header, nts), pos_c], cand_c, pred_c, pos_d], cand_d), pred_d in seqs_d_pred:
            if int(pos_c) - 10 < 0:
                if int(pos_d) + 10 > len(nts):
                    stem_cands.append([[header, pos_c, pos_d],
                                       nts[0:int(pos_c) + 6] + "&" +
                                       nts[int(pos_d) - 1:len(nts)]])
                else:
                    stem_cands.append([[header, pos_c, pos_d],
                                       nts[0:int(pos_c) + 6] + "&" +
                                       nts[int(pos_d) - 1:int(pos_d) + 3 + 10]])
            else:
                if int(pos_d) + 10 > len(nts):
                    stem_cands.append([[header, pos_c, pos_d],
                                       nts[int(pos_c) - 10:int(pos_c) + 6] + "&" +
                                       nts[int(pos_d) - 1:len(nts)]])
                else:
                    stem_cands.append([[header, pos_c, pos_d],
                                       nts[int(pos_c) - 10:int(pos_c) + 6] + "&" +
                                       nts[int(pos_d) - 1:int(pos_d) + 3 + 10]])
        return stem_cands

    # get positive data
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(fasta)
    train_pos = get_stem(seqs, window_c, model_c_name, window_d,
                         model_d_name, flank_size_l, flank_size_r)
    train_pos = list(train_pos)
    # Generate negative dataset
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    train_neg = seq_to_seq(train_pos,
                           modifier=shuffle_modifier,
                           times=neg_size_factor,
                           order=2)
    train_neg = list(train_neg)
    # Split the data into training and test
    if fasta_test is None:
        print "Training and test with the same dataset (different sequences)"
        from eden.util import random_bipartition_iter
        iterable_pos_train, iterable_pos_test = random_bipartition_iter(
            train_pos, relative_size=train_test_split)
        iterable_neg_train, iterable_neg_test = random_bipartition_iter(
            train_neg, relative_size=train_test_split)
        iterable_pos_train = list(iterable_pos_train)
        iterable_neg_train = list(iterable_neg_train)
        iterable_pos_test = list(iterable_pos_test)
        iterable_neg_test = list(iterable_neg_test)
    else:
        print "test dataset = ", fasta_test, "\n"
        seqs = fasta_to_sequence(fasta_test)
        test_pos = get_stem(seqs, window_c, model_c_name, window_d,
                            model_d_name, flank_size_l, flank_size_r)
        test_pos = list(test_pos)
        # Generate negative test data
        test_neg = seq_to_seq(test_pos,
                              modifier=shuffle_modifier,
                              times=neg_size_factor,
                              order=2)
        test_neg = list(test_neg)
        iterable_pos_train = list(train_pos)
        iterable_neg_train = list(train_neg)
        iterable_pos_test = list(test_pos)
        iterable_neg_test = list(test_neg)
    print "Positive training samples: ", len(iterable_pos_train)
    print "Negative training samples: ", len(iterable_neg_train)
    print "--------\nPositive test samples: ", len(iterable_pos_test)
    print "Negative test samples: ", len(iterable_neg_test)

    # make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(
        pre_processor=pre_process_graph,
        estimator=estimator,
        vectorizer=vectorizer,
        n_jobs=n_jobs)
    # optimize hyperparameters and fit model
    from numpy.random import randint
    from numpy.random import uniform
    pre_processor_parameters = {
        'energy_range': [3, 4, 5, 6, 7, 8, 9, 10],
        'max_num_subopts': randint(100, 200, size=n_iter),
        'max_num': [3, 4, 5, 6, 7, 8]}
    vectorizer_parameters = {'complexity': [2, 3]}
    estimator_parameters = {
        'n_iter': randint(5, 100, size=n_iter),
        'penalty': ['l1', 'l2', 'elasticnet'],
        'l1_ratio': uniform(0.1, 0.9, size=n_iter),
        'loss': ['log'],  # 'hinge', 'modified_huber', 'squared_hinge', 'perceptron'
        'power_t': uniform(0.1, size=n_iter),
        'alpha': [10 ** x for x in range(-8, 0)],
        'eta0': [10 ** x for x in range(-4, -1)],
        'learning_rate': ['invscaling', 'constant', 'optimal'],
        'n_jobs': [n_jobs]}
    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_stem_name,
                   max_total_time=60 * 60 * 24,
                   n_iter=n_iter,
                   n_active_learning_iterations=3,
                   cv=10,
                   score_func=lambda avg_score, std_score: avg_score - std_score * 2,
                   scoring='roc_auc',
                   pre_processor_parameters=pre_processor_parameters,
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)
    # estimate predictive performance
    print model.get_parameters()
    result, text = model.estimate(iterable_pos_test, iterable_neg_test)
    rss = 0
    i = 0
    for prob in result:
        i += 1
        if prob[1] == 1:
            rss += (1 - prob[0][1]) ** 2
        else:
            rss += (1 - prob[0][0]) ** 2
    avg_rss = rss / float(i)
    text.append('RSS: %.2f' % rss)
    text.append('avg RSS: %.2f' % avg_rss)
    for t in text:
        print t
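# Hedged invocation sketch for train_stem_finder_model; all file names and window
# sizes are illustrative, and the C box / D box models are assumed to have been
# trained and saved beforehand (e.g. via train_dbox_model above).
def _demo_train_stem_finder_model():
    train_stem_finder_model('snoRNAs.fa', 'eden_model_stem',
                            window_c=4, model_c_name='eden_model_Cbox',
                            window_d=4, model_d_name='eden_model_Dbox',
                            flank_size_l=10, flank_size_r=10,
                            train_test_split=0.7, neg_size_factor=4,
                            n_jobs=4, n_iter=40, fasta_test=None)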