def pre_processor(seqs, **args):
    """Insert positional marker characters into sequences, then build graphs.

    A '%' marker is placed at the midpoint of each sequence, '@' at the
    start and '*' at the end; the marked sequences are converted to EDeN
    graphs.
    """
    marked = seqs
    for position, mark in ((0.5, "%"), (0.0, "@"), (1.0, "*")):
        marked = seq_to_seq(marked, modifier=mark_modifier,
                            position=position, mark=mark)
    return sequence_to_eden(marked)
def fit(self, pos_seqs, neg_seqs=None, times=2, order=2):
    """Fit an estimator to discriminate the pos_seqs from the neg_seqs.

    Parameters
    ----------
    pos_seqs : iterable strings
        Input sequences.

    neg_seqs : iterable strings (default: None)
        If not None the program uses these as negative examples.
        If it is None, then negative sequences are generated as random
        shuffling of the positive sequences.

    times: int (default: 2)
        Factor between number of negatives and number of positives.

    order: int (default: 2)
        Size of the minimum block to shuffle: 1 means shuffling single
        characters, 2 means shuffling pairs of characters, etc.

    Returns
    -------
    self.
    """
    if neg_seqs is None:
        # no negatives supplied: derive them by block-shuffling the positives
        shuffled = seq_to_seq(pos_seqs,
                              modifier=shuffle_modifier,
                              times=times,
                              order=order)
        neg_seqs = list(shuffled)
    self.estimator = fit(pos_seqs,
                         neg_seqs,
                         self.vectorizer,
                         n_jobs=self.n_jobs,
                         cv=10,
                         n_iter_search=1,
                         random_state=self.random_state,
                         n_blocks=5,
                         block_size=None)
    return self
def fit(self, seqs, neg_seqs=None):
    """Find motives with SequenceMotifDecomposer."""
    if neg_seqs is None:
        # generate negatives by shuffling the positives pairwise
        from eden.modifier.seq import seq_to_seq, shuffle_modifier
        neg_seqs = list(seq_to_seq(seqs, modifier=shuffle_modifier,
                                   times=1, order=2))
    self.smd = self.smd.fit(pos_seqs=seqs, neg_seqs=neg_seqs)
    try:
        motives = self.smd.select_motives(seqs=seqs,
                                          p_value=self.p_value,
                                          similarity_th=self.similarity_th,
                                          min_score=self.min_score,
                                          min_freq=self.min_freq,
                                          min_cluster_size=self.min_cluster_size,
                                          regex_th=self.regex_th,
                                          sample_size=self.sample_size,
                                          freq_th=self.freq_th,
                                          std_th=self.std_th)
    except AttributeError:
        raise AttributeError('No motives found.')
    self.nmotifs = len(motives.keys())
    self.original_motives_list = list(self._get_motives_list(motives))
    self.aligned_motives_list = list(
        self._get_aligned_motives_list(self.original_motives_list))
    self.motives_list = list(self.adapt_motives(self.aligned_motives_list))
    # create PWMs
    super(SMoDWrapper, self).fit(motives=self.aligned_motives_list)
def _fit_predictive_model(self, seqs, neg_seqs=None):
    """Fit the discriminative estimator on positives vs negatives.

    When no negatives are given they are obtained by shuffling the
    positive sequences.
    """

    def _graphs_of(sequences):
        # shared preprocessing: sequences -> EDeN graphs, multiprocessed
        return mp_pre_process(sequences,
                              pre_processor=sequence_to_eden,
                              n_blocks=self.pre_processor_n_blocks,
                              block_size=self.pre_processor_block_size,
                              n_jobs=self.pre_processor_n_jobs)

    # duplicate iterator
    pos_stream, shuffle_source = tee(seqs)
    pos_graphs = _graphs_of(pos_stream)
    if neg_seqs is None:
        # shuffle seqs to obtain negatives
        neg_seqs = seq_to_seq(shuffle_source,
                              modifier=shuffle_modifier,
                              times=self.negative_ratio,
                              order=self.shuffle_order)
    neg_graphs = _graphs_of(neg_seqs)
    # fit discriminative estimator
    self.estimator = fit(pos_graphs, neg_graphs,
                         vectorizer=self.vectorizer,
                         n_iter_search=self.n_iter_search,
                         n_jobs=self.n_jobs,
                         n_blocks=self.n_blocks,
                         block_size=self.block_size,
                         random_state=self.random_state)
def fit(self, seqs, neg_seqs=None):
    """Find motives with SequenceMotifDecomposer."""
    if neg_seqs is None:
        # build negatives by pairwise shuffling of the input sequences
        from eden.modifier.seq import seq_to_seq, shuffle_modifier
        shuffled = seq_to_seq(seqs, modifier=shuffle_modifier,
                              times=1, order=2)
        neg_seqs = list(shuffled)
    self.smd = self.smd.fit(pos_seqs=seqs, neg_seqs=neg_seqs)
    selection_params = dict(seqs=seqs,
                            p_value=self.p_value,
                            similarity_th=self.similarity_th,
                            min_score=self.min_score,
                            min_freq=self.min_freq,
                            min_cluster_size=self.min_cluster_size,
                            regex_th=self.regex_th,
                            sample_size=self.sample_size,
                            freq_th=self.freq_th,
                            std_th=self.std_th)
    try:
        motives = self.smd.select_motives(**selection_params)
    except AttributeError:
        raise AttributeError('No motives found.')
    self.nmotifs = len(motives.keys())
    self.original_motives_list = self._get_motives_list(motives)[:]
    self.aligned_motives_list = self._get_aligned_motives_list(
        self.original_motives_list)[:]
    self.motives_list = self.adapt_motives(self.aligned_motives_list)[:]
    # create PWMs
    super(SMoDWrapper, self).fit(motives=self.aligned_motives_list)
def get_dataset(sequence_length=200, n_sequences=200, motif_length=10,
                n_motives=2, p=0.2, random_state=1):
    """Generate, preprocess and return the dataset.

    Builds an artificial motif dataset, derives negatives by shuffling the
    positives (2x as many, shuffled in blocks of 2), and splits positives
    and negatives in half for train/test.

    Returns
    -------
    tuple
        (block_size, train_pos_seqs, train_neg_seqs, test_pos_seqs,
        n_motives, true_score)
    """
    motives, pos_seqs, binary_seq = make_artificial_dataset(
        alphabet='ACGT',
        sequence_length=sequence_length,
        n_sequences=n_sequences,
        motif_length=motif_length,
        n_motives=n_motives,
        p=p,
        random_state=random_state)
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    neg_seqs = seq_to_seq(
        pos_seqs, modifier=shuffle_modifier, times=2, order=2)
    neg_seqs = list(neg_seqs)
    # Use floor division so these stay valid integer slice indices under
    # both Python 2 and Python 3 (plain '/' yields a float on Python 3);
    # for non-negative ints '//' is identical to the old '/' behavior.
    block_size = n_sequences // 8
    pos_size = len(pos_seqs)
    train_pos_seqs = pos_seqs[:pos_size // 2]
    test_pos_seqs = pos_seqs[pos_size // 2:]
    neg_size = len(neg_seqs)
    train_neg_seqs = neg_seqs[:neg_size // 2]
    # test_neg_seqs = neg_seqs[neg_size // 2:]
    true_score = [float(int(i)) for i in binary_seq]
    return (block_size, train_pos_seqs, train_neg_seqs,
            test_pos_seqs, n_motives, true_score)
def binary_classification_dataset_setup(iterable_seq=None,
                                        negative_shuffle_ratio=None,
                                        shuffle_order=None):
    """Return (positive, negative) graph iterators for binary classification.

    Negatives are produced by block-shuffling the input sequences; both
    streams are folded with RNAfold and converted to EDeN graphs.
    """
    pos_stream, shuffle_source = tee(iterable_seq)
    pos_graphs = rnafold_to_eden(pos_stream)
    shuffled = seq_to_seq(shuffle_source,
                          modifier=shuffle_modifier,
                          times=negative_shuffle_ratio,
                          order=shuffle_order)
    neg_graphs = rnafold_to_eden(shuffled)
    return pos_graphs, neg_graphs
def load_negative_data(self, args):
    """Load the input sequences and return shuffled copies as negatives."""
    positives = self.load_data(args)
    return seq_to_seq(positives,
                      modifier=shuffle_modifier,
                      times=args.negative_ratio,
                      order=args.shuffle_order)
def _binary_classification_setup(self, seqs=None,
                                 negative_shuffle_ratio=None,
                                 shuffle_order=None):
    """Build (positive, negative) graph streams; negatives are shuffled copies."""
    pos_stream, shuffle_source = tee(seqs)
    graphs = self.pre_processor.transform(pos_stream, mfe=False)
    negatives = seq_to_seq(shuffle_source,
                           modifier=shuffle_modifier,
                           times=negative_shuffle_ratio,
                           order=shuffle_order)
    graphs_neg = self.pre_processor.transform(negatives)
    return graphs, graphs_neg
def fit(self, pos_seqs, neg_seqs=None, times=2, order=2):
    """Fit an estimator to discriminate the pos_seqs from the neg_seqs.

    Parameters
    ----------
    pos_seqs : iterable strings
        Input sequences.

    neg_seqs : iterable strings (default: None)
        If not None the program uses these as negative examples.
        If it is None, then negative sequences are generated as random
        shuffling of the positive sequences.

    times: int (default: 2)
        Factor between number of negatives and number of positives.

    order: int (default: 2)
        Size of the minimum block to shuffle: 1 means shuffling single
        characters, 2 means shuffling pairs of characters, etc.

    Returns
    -------
    self.
    """
    if neg_seqs is None:
        neg_generator = seq_to_seq(pos_seqs,
                                   modifier=shuffle_modifier,
                                   times=times,
                                   order=order)
        neg_seqs = list(neg_generator)
    fitted_estimator = fit(pos_seqs,
                           neg_seqs,
                           self.vectorizer,
                           n_jobs=self.n_jobs,
                           cv=10,
                           n_iter_search=1,
                           random_state=self.random_state,
                           n_blocks=5,
                           block_size=None)
    self.estimator = fitted_estimator
    return self
def pre_processor(seqs, **args):
    """Mark the middle, start and end of each sequence and build EDeN graphs."""
    mid_marked = seq_to_seq(seqs, modifier=mark_modifier,
                            position=0.5, mark='%')
    start_marked = seq_to_seq(mid_marked, modifier=mark_modifier,
                              position=0.0, mark='@')
    fully_marked = seq_to_seq(start_marked, modifier=mark_modifier,
                              position=1.0, mark='*')
    return sequence_to_eden(fully_marked)
def load_negative_data(self, args):
    """Return a shuffled (negative) version of the loaded input sequences."""
    return seq_to_seq(self.load_data(args),
                      modifier=shuffle_modifier,
                      times=args.negative_ratio,
                      order=args.shuffle_order)
def train_dbox_model(fasta_fname=None, model_fname='eden_model_Dbox',
                     window=4, neg_size_factor=5, train_test_split=0.7,
                     n_jobs=4, n_iter=40):
    """Train and evaluate an EDeN binary model for D-box recognition.

    Positive examples are D-box windows extracted from annotated FASTA
    headers; negatives are block-shuffled copies of the positives.
    An ActiveLearningBinaryClassificationModel is optimized (saved under
    `model_fname`) and its predictive performance (ROC-AUC driven
    optimization, plus an RSS summary) is printed.

    NOTE(review): this function was recovered from whitespace-mangled
    source; the statement nesting below is a faithful reconstruction but
    the exact original indentation could not be observed.
    """
    # transform sequences in a linear graph
    def pre_process_graph(iterator):
        from eden.converter.fasta import sequence_to_eden
        graphs = sequence_to_eden(iterator)
        return graphs

    # extract box sequence with annotaded header information
    def extract_box(data, window=3, box_type='D'):
        import re
        from eden.converter.fasta import fasta_to_sequence
        seqs = fasta_to_sequence(data)
        for seq in seqs:
            # header layout (split on '_'): ... cbox, cpos, dbox, dpos
            header = seq[0].split('_')
            cbox = header[-4]
            cpos = int(header[-3])
            dbox = header[-2]
            dpos = int(header[-1])
            nts = re.sub('\n', '', seq[1])
            if box_type == 'C':
                # C box: fixed 6-nt core plus `window` flanks on each side
                box = nts[cpos - 1 - window:cpos + 6 + window]
            else:
                # D box: 4-nt core delimited by 'x'/'y' sentinels, only if
                # the window fits inside the sequence
                if (not((len(nts) < dpos + 3 + window) or (dpos - 1 - window < 0))):
                    # box = nts[dpos-1-window:dpos+3+window]
                    box = nts[dpos - 1 - window:dpos - 1] + 'x' + nts[dpos - 1:dpos + 3] + 'y' + nts[dpos + 3:dpos + 3 + window]
            # NOTE(review): if the D-box window is out of range, `box` may be
            # undefined or stale here — confirm intended behavior upstream
            yield seq[0], box

    # Choose the vectorizer
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()
    # Choose the estimator
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle=True, average=True)
    import random
    from eden import util
    # ---------------- Generate positive samples ----------------
    seqs_d_pos = extract_box(fasta_fname, window, box_type='D')
    from itertools import tee
    seqs_d_pos, seqs_d_pos_ = tee(seqs_d_pos)
    # ---------------- Generate negatives samples ---------------
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    seqs_d_neg = seq_to_seq(seqs_d_pos_,
                            modifier=shuffle_modifier,
                            times=neg_size_factor,
                            order=2)
    # ---------------- split train/test -------------------------
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(seqs_d_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(seqs_d_neg, relative_size=train_test_split)
    iterable_pos_train = list(iterable_pos_train)
    iterable_pos_test = list(iterable_pos_test)
    iterable_neg_train = list(iterable_neg_train)
    iterable_neg_test = list(iterable_neg_test)
    print "training pos ", len(iterable_pos_train)
    print "training neg ", len(iterable_neg_train)
    print "test pos ", len(iterable_pos_test)
    print "test neg ", len(iterable_neg_test)
    # make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor=pre_process_graph,
                                                    estimator=estimator,
                                                    vectorizer=vectorizer,
                                                    n_jobs=n_jobs)
    # optimize hyperparameters and fit model
    from numpy.random import randint
    from numpy.random import uniform
    vectorizer_parameters = {'complexity': [2, 3]}
    estimator_parameters = {'n_iter': randint(5, 100, size=n_iter),
                            'penalty': ['l1', 'l2', 'elasticnet'],
                            'l1_ratio': uniform(0.1, 0.9, size=n_iter),
                            'loss': ['log'],  # 'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                            'power_t': uniform(0.1, size=n_iter),
                            'alpha': [10**x for x in range(-8, 0)],
                            'eta0': [10**x for x in range(-4, -1)],
                            'learning_rate': ["invscaling", "constant", "optimal"],
                            'n_jobs': [n_jobs]}
    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_fname,
                   max_total_time=60 * 30,
                   n_iter=n_iter,
                   cv=10,
                   # penalize unstable scores: mean minus two std deviations
                   score_func=lambda avg_score, std_score: avg_score - std_score * 2,
                   scoring='roc_auc',
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)
    # estimate predictive performance
    print model.get_parameters()
    result, text = model.estimate(iterable_pos_test, iterable_neg_test)
    # residual sum of squares over the predicted class probabilities
    rss = 0
    i = 0
    for prob in result:
        i = i + 1
        print prob
        if (prob[1] == 1):
            rss = rss + ((1 - prob[0][1])**2)
        else:
            rss = rss + ((1 - prob[0][0])**2)
    avg_rss = rss / i;
    text.append('RSS: %.2f' % rss)
    text.append('avg RSS: %2f' % avg_rss)
    for t in text:
        print t
def train_stem_finder_model(fasta, model_stem_name, window_c, model_c_name,
                            window_d, model_d_name, flank_size_l, flank_size_r,
                            train_test_split=0.7, neg_size_factor=4,
                            n_jobs=4, n_iter=40, fasta_test=None):
    """Train an EDeN model that recognizes the snoRNA stem region.

    Pipeline: pre-trained C-box and D-box models (`model_c_name`,
    `model_d_name`) filter candidate sites; the flanking stem sequence
    around each surviving C/D pair becomes a positive example, negatives
    are shuffled copies. A new ActiveLearningBinaryClassificationModel is
    optimized over RNAsubopt-folded graphs and saved as `model_stem_name`,
    then evaluated (either on a split of the input or on `fasta_test`).

    NOTE(review): recovered from whitespace-mangled source; nesting is a
    faithful reconstruction but original indentation was not observable.
    `flank_size_l`/`flank_size_r` are accepted but never read here.
    """
    # ----------- Pre processor -----------
    def pre_process_graph(iterator, **options):
        from eden.converter.rna.rnasubopt import rnasubopt_to_eden
        graphs = rnasubopt_to_eden(iterator, **options)
        return graphs

    # ----------- Vectorizer --------------
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()
    # ----------- Estimator ---------------
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle=True, average=True)

    def get_Cbox(seqs, window_c):
        # yield C-box candidates (6-nt core delimited by 'x'/'y') that fit
        # entirely inside the sequence; cpos comes from the FASTA header
        import re
        for seq in seqs:
            header = seq[0].split('_')
            cpos = int(header[-3])
            nts = re.sub('\n', '', seq[1])
            if (not((len(nts) < cpos + 6 + window_c) or (cpos - 1 - window_c < 0))):
                box = nts[cpos - 1 - window_c:cpos - 1] + 'x' + nts[cpos - 1:cpos + 6] + 'y' + nts[cpos + 6:cpos + 6 + window_c]
                yield [seq, cpos], box

    def get_Dbox(seqs_c, window_d):
        # yield D-box candidates (4-nt core) for sequences that passed the
        # C-box model; keeps the C-box info and its prediction alongside
        import re
        for [seq, cbox], pred in seqs_c:
            header = seq[0][0].split('_')
            dpos = int(header[-1])
            nts = re.sub('\n', '', seq[0][1])
            if (not((len(nts) < dpos + 3 + window_d) or (dpos - 1 - window_d < 0))):
                box = nts[dpos - 1 - window_d:dpos - 1] + 'x' + nts[dpos - 1:dpos + 3] + 'y' + nts[dpos + 3:dpos + 3 + window_d]
                yield [seq, cbox, pred, dpos], box

    # ----------- Get stem ----------------
    def get_stem(seqs, window_c, model_c_name, window_d,
                 model_d_name, flank_size_l, flank_size_r):
        from itertools import izip
        import re
        from itertools import tee, islice
        # 1) c_finder
        seqs_c = get_Cbox(seqs, window_c)
        # 2) submit the Cbox candidates to the model
        from eden.model import ActiveLearningBinaryClassificationModel
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_c_name)
        seqs_c_pred = list()
        cands_c = list()
        max_count = 0
        for seq_c in seqs_c:
            max_count += 1
            cands_c.append(seq_c)
            if (max_count == 10000):
                # in order to not generate memory leak I've restricted the
                # number of samples to be submited to the model
                preds = model.decision_function(cands_c)
                seqs_c_pred = seqs_c_pred + zip(cands_c, preds)
                cands_c = list()
                max_count = 0
        if (max_count != 0):
            # flush the final partial batch
            preds = model.decision_function(cands_c)
            seqs_c_pred = seqs_c_pred + zip(cands_c, preds)
        # discard sequences with pred < 0
        seqs_c = list()
        for cand in seqs_c_pred:
            if (cand[1] >= 0.0):
                seqs_c.append(cand)
        # D_finder
        seqs_cd = get_Dbox(seqs_c, window_d)
        # submit Dboxes candidate to its model
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_d_name)
        seqs_d_pred = list()
        cands_d = list()
        max_count = 0
        for seq_d in seqs_cd:
            max_count += 1
            cands_d.append(seq_d)
            if (max_count == 10000):
                # in order to not generate memory leak I've restricted the
                # number of samples to be submited to the model
                preds = model.decision_function(cands_d)
                seqs_d_pred = seqs_d_pred + zip(cands_d, preds)
                cands_d = list()
                max_count = 0
        if (max_count != 0):
            preds = model.decision_function(cands_d)
            seqs_d_pred = seqs_d_pred + zip(cands_d, preds)
        # Get the stem region from the sequences; each entry unpacks as
        # (([[(header, seq), pos_c], cand_c, pred_c, pos_d], 'UAAxCUGAyGAU'), 77.000434164559792)
        stem_cands = []
        stem_info = []
        for ([[(header, nts), pos_c], cand_c, pred_c, pos_d], cand_d), pred_d in seqs_d_pred:
            # print header,'\t',seq,pos_c,'\t',cand_c,'\t',pred_c,'\t',cand_d,'\t',pred_d,"\n---\n"
            # join up to 10 nt upstream of the C box with up to 10 nt
            # downstream of the D box, '&'-separated, clamped at both ends
            if (int(pos_c) - 10 < 0):
                if (int(pos_d) + 10 > len(nts)):
                    stem_cands.append([[header, pos_c, pos_d], nts[0:int(pos_c) + 6] + "&" + nts[int(pos_d) - 1:len(nts)]])
                else:
                    stem_cands.append([[header, pos_c, pos_d], nts[0:int(pos_c) + 6] + "&" + nts[int(pos_d) - 1:int(pos_d) + 3 + 10]])
            else:
                if (int(pos_d) + 10 > len(nts)):
                    stem_cands.append([[header, pos_c, pos_d], nts[int(pos_c) - 10:int(pos_c) + 6] + "&" + nts[int(pos_d) - 1:len(nts)]])
                else:
                    stem_cands.append([[header, pos_c, pos_d], nts[int(pos_c) - 10:int(pos_c) + 6] + "&" + nts[int(pos_d) - 1:int(pos_d) + 3 + 10]])
        return stem_cands

    # get positive data
    pos_cds = []
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(fasta)
    train_pos = get_stem(seqs, window_c, model_c_name, window_d,
                         model_d_name, flank_size_l, flank_size_r)
    train_pos = list(train_pos)
    # for h,seq in stems_cds:
    #     print h[0][0:10],'\t',seq
    # Generate Negative Dataset
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    train_neg = seq_to_seq(train_pos,
                           modifier=shuffle_modifier,
                           times=neg_size_factor,
                           order=2)
    train_neg = list(train_neg)
    # ------- Split the data into training and test -------
    if (fasta_test == None):
        print "Training and Test with the same dataset (different sequences)"
        # split train/test
        from eden.util import random_bipartition_iter
        iterable_pos_train, iterable_pos_test = random_bipartition_iter(train_pos, relative_size=train_test_split)
        iterable_neg_train, iterable_neg_test = random_bipartition_iter(train_neg, relative_size=train_test_split)
        iterable_pos_train = list(iterable_pos_train)
        iterable_neg_train = list(iterable_neg_train)
        iterable_pos_test = list(iterable_pos_test)
        iterable_neg_test = list(iterable_neg_test)
    else:
        # dedicated held-out FASTA: run the same stem extraction on it
        print "test dataset = ", fasta_test, "\n"
        pos_test_cds = []
        neg_test_cds = []
        from eden.converter.fasta import fasta_to_sequence
        seqs = fasta_to_sequence(fasta_test)
        test_pos = get_stem(seqs, window_c, model_c_name, window_d,
                            model_d_name, flank_size_l, flank_size_r)
        test_pos = list(test_pos)
        # Generate Negative test data
        test_neg = seq_to_seq(test_pos,
                              modifier=shuffle_modifier,
                              times=neg_size_factor,
                              order=2)
        test_neg = list(test_neg)
        iterable_pos_train = list(train_pos)
        iterable_neg_train = list(train_neg)
        iterable_pos_test = list(test_pos)
        iterable_neg_test = list(test_neg)
    print "Positive training samples: ", len(iterable_pos_train)
    print "Negative training samples: ", len(iterable_neg_train)
    print "--------\nPositive test samples: ", len(iterable_pos_test)
    print "Negative test samples: ", len(iterable_neg_test)
    # make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor=pre_process_graph,
                                                    estimator=estimator,
                                                    vectorizer=vectorizer,
                                                    n_jobs=n_jobs)
    # optimize hyperparameters and fit model:
    from numpy.random import randint
    from numpy.random import uniform
    pre_processor_parameters = {'energy_range': [3, 4, 5, 6, 7, 8, 9, 10],
                                'max_num_subopts': randint(100, 200, size=n_iter),
                                'max_num': [3, 4, 5, 6, 7, 8]}
    vectorizer_parameters = {'complexity': [2, 3]}
    estimator_parameters = {'n_iter': randint(5, 100, size=n_iter),
                            'penalty': ['l1', 'l2', 'elasticnet'],
                            'l1_ratio': uniform(0.1, 0.9, size=n_iter),
                            'loss': ['log'],  # 'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                            'power_t': uniform(0.1, size=n_iter),
                            'alpha': [10**x for x in range(-8, 0)],
                            'eta0': [10**x for x in range(-4, -1)],
                            'learning_rate': ["invscaling", "constant", "optimal"],
                            'n_jobs': [n_jobs]}
    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_stem_name,
                   max_total_time=60 * 60 * 24,
                   n_iter=n_iter,
                   n_active_learning_iterations=3,
                   cv=10,
                   # penalize unstable scores: mean minus two std deviations
                   score_func=lambda avg_score, std_score: avg_score - std_score * 2,
                   scoring='roc_auc',
                   pre_processor_parameters=pre_processor_parameters,
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)
    # estimate predictive performance
    print model.get_parameters()
    result, text = model.estimate(iterable_pos_test, iterable_neg_test)
    # residual sum of squares over the predicted class probabilities
    rss = 0
    i = 0
    for prob in result:
        i = i + 1
        # print prob
        if (prob[1] == 1):
            rss = rss + ((1 - prob[0][1])**2)
        else:
            rss = rss + ((1 - prob[0][0])**2)
    avg_rss = rss / i;
    text.append('RSS: %.2f' % rss)
    text.append('avg RSS: %2f' % avg_rss)
    for t in text:
        print t