def get_sequences_with_names(size=9999, rand=0):
    """Yield up to `size` (name, sequence) pairs from the RFAM fasta file.

    When rand > 0, a random 90% bipartition (seeded from `rand`) is drawn
    first and sequences are taken from the kept part.
    """
    source = fasta_to_sequence("../toolsdata/%s.fa" % RFAM)
    if rand > 0:
        kept, _rest = random_bipartition_iter(
            source, .9, random_state=random.random() * rand)
        return itertools.islice(kept, size)
    return itertools.islice(source, size)
def get_sequences_with_names(size=9999, rand=0):
    """Return an iterator over at most `size` named RFAM sequences.

    A positive `rand` first draws a random 90% subset, seeded from `rand`.
    """
    if rand <= 0:
        return itertools.islice(
            fasta_to_sequence("../toolsdata/%s.fa" % RFAM), size)
    # draw the random subset, then cap it at `size` entries
    subset, discarded = random_bipartition_iter(
        fasta_to_sequence("../toolsdata/%s.fa" % RFAM),
        .9,
        random_state=random.random() * rand)
    return itertools.islice(subset, size)
def test_sequence_to_eden_id_attribute(self):
    """Check that the networkx graph id is set from the fasta header.

    -> header annotation won't be moved to garden
    """
    fasta_file = "test/test_fasta_to_sequence_with_center_annotation.fa"
    graph_iter = sequence_to_eden(fasta_to_sequence(fasta_file))
    first_graph = graph_iter.next()
    assert first_graph.graph["id"] == "ID0 center:25"
def get_sequences_with_names(size=9999, rand=True):
    """Return up to `size` named sequences from the RFAM fasta file.

    With rand=True, `size` records are picked in random order via a
    shuffled index selection; otherwise the first `size` records are
    returned in file order.
    """
    records = list(fasta_to_sequence("../toolsdata/%s.fa" % RFAM))
    if rand:
        # materialize the index list so random.shuffle works on both
        # Python 2 and 3 (Py3 range objects cannot be shuffled in place)
        indices = list(range(len(records)))
        random.shuffle(indices)
        return selection_iterator(records, indices[:size])
    return itertools.islice(records, size)
def split_to_train_and_test(rfam_id=None, train_to_test_split_ratio=None, number_of_samples=None):
    """Split the sequences of an RFAM family into train and test iterators.

    Downloads/reads the family `rfam_id`, optionally truncates it to
    `number_of_samples` sequences, and returns a random bipartition
    (train, test) with relative train size `train_to_test_split_ratio`.
    """
    iterable = fasta_to_sequence(rfam_url(rfam_id))
    if number_of_samples:
        iterable = islice(iterable, number_of_samples)
        # fixed typo ('cunducted'); lazy %-args avoid formatting when the
        # log level filters the message out
        logger.info('Experiment conducted with %d sample sequences',
                    number_of_samples)
    train, test = random_bipartition_iter(
        iterable, relative_size=train_to_test_split_ratio)
    return train, test
def test_fasta_to_sequence_no_normalize(self):
    """Check that normalize=False leaves the fasta sequence untouched.

    -> moved to garden doctest
    """
    fasta_path = "test/test_fasta_to_sequence.fa"
    records = fasta_to_sequence(fasta_path, normalize=False)
    assert is_iterable(records)
    header, sequence = records.next()
    # the raw fasta string must come through unmodified
    expected = "gtggcgtactcacggccaCCTTAGGACTCCGCGGACTTTATGCCCACCAAAAAAACGAGCCGTTTCTACGCGTCCTCCGTCGCCTgtgtcgataaagcaa"
    assert sequence == expected
def test_fasta_to_sequence_normalized(self):
    """Check that normalize=True uppercases and converts T to U.

    -> moved to garden doctest
    """
    fasta_path = "test/test_fasta_to_sequence.fa"
    records = fasta_to_sequence(fasta_path, normalize=True)
    assert is_iterable(records)
    header, sequence = records.next()
    # normalization: uppercase and every T replaced by U
    expected = "GUGGCGUACUCACGGCCACCUUAGGACUCCGCGGACUUUAUGCCCACCAAAAAAACGAGCCGUUUCUACGCGUCCUCCGUCGCCUGUGUCGAUAAAGCAA"
    assert sequence == expected
def test_fasta_to_sequence_default(self):
    """Check default fasta parsing: stripped header, normalized sequence.

    -> moved to garden doctest
    """
    fasta_path = "test/test_fasta_to_sequence.fa"
    records = fasta_to_sequence(fasta_path)
    assert is_iterable(records)
    header, sequence = records.next()
    # header keeps the fasta header text with the leading '>' removed
    assert header == "ID0"
    # default normalization: uppercase with T replaced by U
    expected = "GUGGCGUACUCACGGCCACCUUAGGACUCCGCGGACUUUAUGCCCACCAAAAAAACGAGCCGUUUCUACGCGUCCUCCGUCGCCUGUGUCGAUAAAGCAA"
    assert sequence == expected
def extract_box(data, window=3, box_type='C'):
    """Yield (header, annotated_box) pairs for C-box candidates.

    The header is assumed to end in '..._cbox_cpos_dbox_dpos' fields
    (underscore-separated).  For box_type 'C' the 6 nt box at `cpos` is
    cut out with `window` flanking nucleotides on each side; 'x' and 'y'
    mark the box boundaries.  Other box types yield nothing.
    """
    import re
    from eden.converter.fasta import fasta_to_sequence
    for record in fasta_to_sequence(data):
        fields = record[0].split('_')
        cbox = fields[-4]
        cpos = int(fields[-3])
        dbox = fields[-2]
        dpos = int(fields[-1])
        nts = re.sub('\n', '', record[1])
        if box_type != 'C':
            continue
        # skip candidates whose window would run past either end
        if (len(nts) < cpos + 6 + window) or (cpos - 1 - window < 0):
            continue
        box = (nts[cpos - 1 - window:cpos - 1] + 'x' +
               nts[cpos - 1:cpos + 6] + 'y' +
               nts[cpos + 6:cpos + 6 + window])
        yield record[0], box
def _fold_sequences(self):
    """Fold the RNA sequences using RNAplfold.

    Reads `self.fasta`, folds each sequence with RNAplfold via
    `rnaplfold_to_eden`, and returns the resulting graph iterator.
    Folding parameters come from instance attributes
    (`window_size`, `max_bp_span`, `avg_bp_prob_cutoff`).
    """
    if self.verbose:
        # NOTE(review): backslash-continued string literal kept as-is; the
        # printed message mirrors the RNAplfold command-line options
        print("Folding sequences using RNAplfold -W %i -L %i -c %f \
--noLP..." % (self.window_size, self.max_bp_span, self.avg_bp_prob_cutoff), end=' ')
        sys.stdout.flush()
    seqs = fasta_to_sequence(self.fasta)
    graphs = rnaplfold_to_eden(seqs,
                               window_size=self.window_size,
                               max_bp_span=self.max_bp_span,
                               avg_bp_prob_cutoff=self.avg_bp_prob_cutoff,
                               max_num_edges=1)
    if self.verbose:
        print("Done.\n")
        sys.stdout.flush()
    return graphs
def _fold_sequences(self):
    """Fold the RNA sequences using RNAplfold.

    Converts `self.fasta` into sequences and folds them with
    `rnaplfold_to_eden`, returning a (lazy) iterator of graphs.
    Parameters are taken from the instance: window_size, max_bp_span,
    avg_bp_prob_cutoff; max_num_edges is fixed at 1.
    """
    if self.verbose:
        # progress message (flushed so it appears before the long fold)
        print(
            "Folding sequences using RNAplfold -W %i -L %i -c %f \
--noLP..." % (self.window_size, self.max_bp_span,
              self.avg_bp_prob_cutoff), end=' ')
        sys.stdout.flush()
    seqs = fasta_to_sequence(self.fasta)
    graphs = rnaplfold_to_eden(seqs,
                               window_size=self.window_size,
                               max_bp_span=self.max_bp_span,
                               avg_bp_prob_cutoff=self.avg_bp_prob_cutoff,
                               max_num_edges=1)
    if self.verbose:
        print("Done.\n")
        sys.stdout.flush()
    return graphs
def get_sequences_with_names(filename='RF00005.fa'):
    """Return a (name, sequence) iterator for a fasta file in ../toolsdata/."""
    path = "../toolsdata/" + filename
    return fasta_to_sequence(path)
def fasta_to_list(fname):
    """Materialize the fasta records of `fname` into a list.

    Replaces the identity comprehension `[e for e in it]` with the
    idiomatic `list(it)` (same result, clearer and faster).
    """
    return list(fasta_to_sequence(fname))
def read_and_permute(samples='RF00005.fa'):
    """Yield (name, permuted_sequence) for every record in `samples`."""
    for name, sequence in fasta_to_sequence(samples):
        yield (name, permute_sequence(sequence))
def get_seq_tups(fname, size, sizeb):
    """Return two disjoint random samples of records from a toolsdata fasta.

    The records are shuffled in place, then the first `size` and the next
    `sizeb` entries are returned as two lists.
    """
    records = list(fasta_to_sequence("../toolsdata/" + fname))
    random.shuffle(records)
    first_batch = records[:size]
    second_batch = records[size:size + sizeb]
    return first_batch, second_batch
def get_sequences_with_names(filename='RF00005.fa'):
    """Open the named fasta file under ../toolsdata/ as a sequence iterator."""
    return fasta_to_sequence("../toolsdata/" + filename)
def test_fasta_to_sequence_graph():
    """Smoke test: fasta-derived sequences convert to eden graphs without error."""
    sequences = fasta_to_sequence("test/test_fasta_to_sequence.fa")
    sequence_to_eden(sequences)
def get_graphss(rfam_id='../toolsdata/RF00005'):
    """Return the sequence iterator for the given RFAM id (or local path)."""
    uri = rfam_uri(rfam_id)
    return fasta_to_sequence(uri)
def read_and_permute(samples='RF00005.fa'):
    """Generate (name, seq) pairs with each sequence randomly permuted."""
    for record in fasta_to_sequence(samples):
        name, seq = record
        yield name, permute_sequence(seq)
seqs = self._design(graphs) seqs = self._filter_seqs(seqs) return seqs def fit_sample(self, seqs): seqs, seqs_ = tee(seqs) seqs = self.fit(seqs).sample(seqs_) return seqs def predict(self, seqs): graphs = self.pre_processor.transform(seqs, mfe=True) predictions = self.vectorizer.predict(graphs, self.estimator) for prediction in predictions: yield prediction if __name__ == "__main__": logging.basicConfig(level=logging.INFO) logger.info('Call to RNASynthesizer module.') rfam_id = 'RF01685' iterable_seq = fasta_to_sequence( 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (rfam_id, rfam_id)) synthesizer = RNASynthesizerInitializer().synthesizer synth_seqs = synthesizer.fit_sample(iterable_seq) for header, seq in synth_seqs: print header print seq
graphs = self._filter_graphs(graphs) seqs = self._design(graphs) seqs = self._filter_seqs(seqs) return seqs def fit_sample(self, seqs): seqs, seqs_ = tee(seqs) seqs = self.fit(seqs).sample(seqs_) return seqs def predict(self, seqs): graphs = self.pre_processor.transform(seqs, mfe=True) predictions = self.vectorizer.predict(graphs, self.estimator) for prediction in predictions: yield prediction if __name__ == "__main__": logging.basicConfig(level=logging.INFO) logger.info('Call to RNASynthesizer module.') rfam_id = 'RF01685' iterable_seq = fasta_to_sequence( 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (rfam_id, rfam_id)) synthesizer = RNASynthesizerInitializer().synthesizer synth_seqs = synthesizer.fit_sample(iterable_seq) for header, seq in synth_seqs: print header print seq
def get_sequences(size=9999):
    """Return the first `size` sequence strings (headers dropped)."""
    pairs = itertools.islice(
        fasta_to_sequence("../example/RF00005.fa"), size)
    return [seq for _header, seq in pairs]
def get_sequences_with_names(size=9999):
    """Return an iterator over at most `size` (name, sequence) pairs."""
    source = fasta_to_sequence("../toolsdata/RF00005.fa")
    return itertools.islice(source, size)
def get_sequences(size=9999):
    """Collect up to `size` raw sequences, discarding their fasta headers."""
    limited = itertools.islice(
        fasta_to_sequence("../toolsdata/RF00005.fa"), size)
    return [sequence for _name, sequence in limited]
def load_data(self, args):
    """Return a sequence iterator for the file named in `args.input_file`."""
    return fasta_to_sequence(args.input_file)
def get_graphss(rfam_id="../toolsdata/RF00005"):
    """Resolve `rfam_id` to a URI and return its sequence iterator."""
    return fasta_to_sequence(rfam_uri(rfam_id))
def get_graphs(rfam_id='../example/RF00005', size=9999):
    """Return up to `size` cleaned, folded graphs for the given RFAM family."""
    sequences = fasta_to_sequence(rfam_uri(rfam_id))
    folded = rnafold_to_eden(sequences, shape_type=5, energy_range=30, max_num=3)
    return islice(clean(folded), size)
def get_graphs(rfam_id='../example/RF00005', size=9999):
    """Fold the family's sequences with RNAfold and yield the first `size` graphs."""
    seqs = fasta_to_sequence(rfam_uri(rfam_id))
    cleaned = clean(
        rnafold_to_eden(seqs, shape_type=5, energy_range=30, max_num=3))
    graphs = islice(cleaned, size)
    return graphs
import random num_seqs = 400 minlen = 74 maxlen = 90 samples = 'RF00005.fa' symbols = 'AUGC' # analyse original from eden.converter.fasta import fasta_to_sequence # count all the symbols symboldict = {symbol: 0 for symbol in symbols} for name, seq in fasta_to_sequence(samples): for symbol in symboldict.keys(): symboldict[symbol] += seq.count(symbol) def choosesymbol(total, weights, symbols): i = random.randint(0, total) for e, w in enumerate(weights): i -= w if i <= 0: return symbols[e] print 'ERRER this should not happen. this means my code sucks' def make_random_sequence(minlen, maxlen, weights, symbols, total): length = random.randint(minlen, maxlen) seq = [choosesymbol(total, weights, symbols) for i in xrange(length)] return ''.join(seq)
def get_sequences_with_names(size=9999):
    """Return up to `size` (header, sequence) records from the example file."""
    return itertools.islice(
        fasta_to_sequence("../example/RF00005.fa"),
        size)
def train_stem_finder_model(fasta, model_stem_name, window_c, model_c_name, window_d,
                            model_d_name, flank_size_l, flank_size_r,
                            train_test_split=0.7, neg_size_factor=4,
                            n_jobs=4, n_iter=40, fasta_test=None):
    """Train and evaluate a binary classifier for C/D-box stem regions.

    Pipeline: C-box candidates are extracted and scored with the model in
    `model_c_name`; survivors get D-box candidates scored with
    `model_d_name`; the stem region around accepted boxes becomes the
    positive set, with order-2 shuffled copies (x `neg_size_factor`) as
    negatives.  A graph-kernel model is then optimised, saved under
    `model_stem_name`, and its test performance is printed.

    NOTE(review): headers are assumed to end in underscore-separated
    '..._cbox_cpos_dbox_dpos' fields -- confirm with the data producer.
    NOTE(review): `flank_size_l`/`flank_size_r` are accepted but unused by
    `get_stem`, whose flanks are hard-coded to 10 nt.
    """
    ########### Pre processor ####################
    def pre_process_graph(iterator, **options):
        # fold each sequence into suboptimal-structure graphs
        from eden.converter.rna.rnasubopt import rnasubopt_to_eden
        graphs = rnasubopt_to_eden(iterator, **options)
        return graphs
    ########## Vectorizer ########################
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()
    ######### Estimator #########################
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle=True, average=True)

    def get_Cbox(seqs, window_c):
        # yield ([seq, cpos], box) for each C-box candidate whose window
        # fits inside the sequence; 'x'/'y' mark the 6 nt box boundaries
        import re
        for seq in seqs:
            header = seq[0].split('_')
            cpos = int(header[-3])
            nts = re.sub('\n', '', seq[1])
            if (not((len(nts) < cpos + 6 + window_c) or (cpos - 1 - window_c < 0))):
                box = nts[cpos - 1 - window_c:cpos - 1] + 'x' + nts[cpos - 1:cpos + 6] + 'y' + nts[cpos + 6:cpos + 6 + window_c]
                yield [seq, cpos], box

    def get_Dbox(seqs_c, window_d):
        # same idea for the 3 nt D box; the C-box prediction is carried along
        import re
        for [seq, cbox], pred in seqs_c:
            header = seq[0][0].split('_')
            dpos = int(header[-1])
            nts = re.sub('\n', '', seq[0][1])
            if (not((len(nts) < dpos + 3 + window_d) or (dpos - 1 - window_d < 0))):
                box = nts[dpos - 1 - window_d:dpos - 1] + 'x' + nts[dpos - 1:dpos + 3] + 'y' + nts[dpos + 3:dpos + 3 + window_d]
                yield [seq, cbox, pred, dpos], box

    ######### Get stem #########################
    def get_stem(seqs, window_c, model_c_name, window_d, model_d_name, flank_size_l, flank_size_r):
        from itertools import izip
        import re
        from itertools import tee, islice
        # 1) C-box finder
        seqs_c = get_Cbox(seqs, window_c)
        # 2) submit the C-box candidates to the model
        from eden.model import ActiveLearningBinaryClassificationModel
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_c_name)
        seqs_c_pred = list()
        cands_c = list()
        max_count = 0
        for seq_c in seqs_c:
            max_count += 1
            cands_c.append(seq_c)
            if (max_count == 10000):
                # score in batches of 10000 to bound memory usage
                preds = model.decision_function(cands_c)
                seqs_c_pred = seqs_c_pred + zip(cands_c, preds)
                cands_c = list()
                max_count = 0
        if (max_count != 0):
            # score the final, partial batch
            preds = model.decision_function(cands_c)
            seqs_c_pred = seqs_c_pred + zip(cands_c, preds)
        # discard sequences with pred < 0
        seqs_c = list()
        for cand in seqs_c_pred:
            if (cand[1] >= 0.0):
                seqs_c.append(cand)
        # D-box finder
        seqs_cd = get_Dbox(seqs_c, window_d)
        # submit D-box candidates to their model
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_d_name)
        seqs_d_pred = list()
        cands_d = list()
        max_count = 0
        for seq_d in seqs_cd:
            max_count += 1
            cands_d.append(seq_d)
            if (max_count == 10000):
                # same batching as above to bound memory usage
                preds = model.decision_function(cands_d)
                seqs_d_pred = seqs_d_pred + zip(cands_d, preds)
                cands_d = list()
                max_count = 0
        if (max_count != 0):
            preds = model.decision_function(cands_d)
            seqs_d_pred = seqs_d_pred + zip(cands_d, preds)
        # get the stem region from the sequences
        stem_cands = []
        stem_info = []
        # item shape: (([[(header, seq), pos_c], cand_c, pred_c, pos_d], cand_d), pred_d)
        for ([[(header, nts), pos_c], cand_c, pred_c, pos_d], cand_d), pred_d in seqs_d_pred:
            # clamp the 10 nt flanks to the sequence bounds; '&' separates
            # the 5' (C-box side) and 3' (D-box side) halves of the stem
            if ( int(pos_c) - 10 < 0):
                if (int(pos_d) + 10 > len(nts)):
                    stem_cands.append([[header, pos_c, pos_d], nts[0:int(pos_c) + 6] + "&" + nts[int(pos_d) - 1:len(nts)]])
                else:
                    stem_cands.append([[header, pos_c, pos_d], nts[0:int(pos_c) + 6] + "&" + nts[int(pos_d) - 1:int(pos_d) + 3 + 10]])
            else:
                if (int(pos_d) + 10 > len(nts)):
                    stem_cands.append([[header, pos_c, pos_d], nts[int(pos_c) - 10:int(pos_c) + 6] + "&" + nts[int(pos_d) - 1:len(nts)]])
                else:
                    stem_cands.append([[header, pos_c, pos_d], nts[int(pos_c) - 10:int(pos_c) + 6] + "&" + nts[int(pos_d) - 1:int(pos_d) + 3 + 10]])
        return stem_cands

    # get positive data
    pos_cds = []
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(fasta)
    train_pos = get_stem(seqs, window_c, model_c_name, window_d, model_d_name, flank_size_l, flank_size_r)
    train_pos = list(train_pos)
    # generate negative dataset: order-2 shuffled copies of the positives
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    train_neg = seq_to_seq(train_pos, modifier=shuffle_modifier, times=neg_size_factor, order=2)
    train_neg = list(train_neg)
    ####### Split the data into training and test
    if (fasta_test == None):
        print "Training and Test with the same dataset (different sequences)"
        # split train/test
        from eden.util import random_bipartition_iter
        iterable_pos_train, iterable_pos_test = random_bipartition_iter(train_pos, relative_size=train_test_split)
        iterable_neg_train, iterable_neg_test = random_bipartition_iter(train_neg, relative_size=train_test_split)
        iterable_pos_train = list(iterable_pos_train)
        iterable_neg_train = list(iterable_neg_train)
        iterable_pos_test = list(iterable_pos_test)
        iterable_neg_test = list(iterable_neg_test)
    else:
        print "test dataset = ", fasta_test, "\n"
        pos_test_cds = []
        neg_test_cds = []
        from eden.converter.fasta import fasta_to_sequence
        seqs = fasta_to_sequence(fasta_test)
        test_pos = get_stem(seqs, window_c, model_c_name, window_d, model_d_name, flank_size_l, flank_size_r)
        test_pos = list(test_pos)
        # generate negative test data the same way as the training negatives
        test_neg = seq_to_seq(test_pos, modifier=shuffle_modifier, times=neg_size_factor, order=2)
        test_neg = list(test_neg)
        iterable_pos_train = list(train_pos)
        iterable_neg_train = list(train_neg)
        iterable_pos_test = list(test_pos)
        iterable_neg_test = list(test_neg)
    print "Positive training samples: ", len(iterable_pos_train)
    print "Negative training samples: ", len(iterable_neg_train)
    print "--------\nPositive test samples: ", len(iterable_pos_test)
    print "Negative test samples: ", len(iterable_neg_test)
    # make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor=pre_process_graph,
                                                    estimator=estimator,
                                                    vectorizer=vectorizer,
                                                    n_jobs=n_jobs)
    # optimize hyperparameters and fit model:
    from numpy.random import randint
    from numpy.random import uniform
    pre_processor_parameters = {'energy_range': [3, 4, 5, 6, 7, 8, 9, 10],
                                'max_num_subopts': randint(100, 200, size=n_iter),
                                'max_num': [3, 4, 5, 6, 7, 8]}
    vectorizer_parameters = {'complexity': [2, 3]}
    estimator_parameters = {'n_iter': randint(5, 100, size=n_iter),
                            'penalty': ['l1', 'l2', 'elasticnet'],
                            'l1_ratio': uniform(0.1, 0.9, size=n_iter),
                            'loss': ['log'],  # alternatives: 'hinge', 'modified_huber', 'squared_hinge', 'perceptron'
                            'power_t': uniform(0.1, size=n_iter),
                            'alpha': [10**x for x in range(-8, 0)],
                            'eta0': [10**x for x in range(-4, -1)],
                            'learning_rate': ["invscaling", "constant", "optimal"],
                            'n_jobs': [n_jobs]}
    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_stem_name,
                   max_total_time=60 * 60 * 24,
                   n_iter=n_iter,
                   n_active_learning_iterations=3,
                   cv=10,
                   # penalise unstable configurations: mean minus two stddevs
                   score_func=lambda avg_score, std_score: avg_score - std_score * 2,
                   scoring='roc_auc',
                   pre_processor_parameters=pre_processor_parameters,
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)
    # estimate predictive performance
    print model.get_parameters()
    result, text = model.estimate(iterable_pos_test, iterable_neg_test)
    # residual sum of squares of the predicted class probabilities
    rss = 0
    i = 0
    for prob in result:
        i = i + 1
        if (prob[1] == 1):
            rss = rss + ((1 - prob[0][1])**2)
        else:
            rss = rss + ((1 - prob[0][0])**2)
    avg_rss = rss / i;
    text.append('RSS: %.2f' % rss)
    text.append('avg RSS: %2f' % avg_rss)
    for t in text:
        print t