Example #1
0
def main_predict(model_initializer, args):
    iterator = model_initializer.load_data(args)
    from itertools import tee
    iterator, iterator_ = tee(iterator)

    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel()
    model.load(args.model_file)
    logger.info(model.get_parameters())

    predictions = model.decision_function(iterator)
    text = []
    for p in predictions:
        text.append(str(p) + "\n")
    save_output(text=text, output_dir_path=args.output_dir_path, out_file_name='predictions.txt')

    text = []
    for p in predictions:
        if p > 0:
            prediction = 1
        else:
            prediction = -1
        text.append(str(prediction) + "\n")
    save_output(text=text, output_dir_path=args.output_dir_path, out_file_name='classifications.txt')

    text = []
    from itertools import izip
    info_iterator = model.get_info(iterator_)
    for p, info in izip(predictions, info_iterator):
        text.append("%.4f\t%s\n" % (p, info))
    save_output(text=text, output_dir_path=args.output_dir_path, out_file_name='info.txt')
Example #2
0
def main_estimate(model_initializer, args):
    pos_test_iterator = model_initializer.load_positive_data(args)
    neg_test_iterator = model_initializer.load_negative_data(args)

    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel()
    model.load(args.model_file)
    logger.info(model.get_parameters())
    apr, rocauc = model.estimate(pos_test_iterator, neg_test_iterator)
Example #3
0
def main_feature(model_initializer, args):
    iterator = model_initializer.load_data(args)

    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel()
    model.load(args.model_file)
    logger.info(model.get_parameters())
    X = model._data_matrix(iterator)
    store_matrix(matrix=X, output_dir_path=args.output_dir_path, out_file_name='data_matrix', output_format=args.output_format)
Example #4
0
def main_estimate(model_initializer, args):
    pos_test_iterator = model_initializer.load_positive_data(args)
    neg_test_iterator = model_initializer.load_negative_data(args)

    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel()
    model.load(args.model_file)
    logger.info(model.get_parameters())
    apr, rocauc = model.estimate(pos_test_iterator, neg_test_iterator)
Example #5
0
def main_matrix(model_initializer, args):
    iterator = model_initializer.load_data(args)

    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel()
    model.load(args.model_file)
    logger.info(model.get_parameters())
    X = model._data_matrix(iterator)
    K = metrics.pairwise.pairwise_kernels(X, metric='linear')
    store_matrix(matrix=K, output_dir_path=args.output_dir_path, out_file_name='Gram_matrix', output_format=args.output_format)
Example #6
0
def main_feature(model_initializer, args):
    iterator = model_initializer.load_data(args)

    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel()
    model.load(args.model_file)
    logger.info(model.get_parameters())
    data_matrix = model._data_matrix(iterator)
    store_matrix(matrix=data_matrix,
                 output_dir_path=args.output_dir_path,
                 out_file_name='data_matrix',
                 output_format=args.output_format)
Example #7
0
def main_matrix(model_initializer, args):
    iterator = model_initializer.load_data(args)

    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel()
    model.load(args.model_file)
    logger.info(model.get_parameters())
    X = model._data_matrix(iterator)
    K = metrics.pairwise.pairwise_kernels(X, metric='linear')
    store_matrix(matrix=K,
                 output_dir_path=args.output_dir_path,
                 out_file_name='Gram_matrix',
                 output_format=args.output_format)
Example #8
0
def main_predict(model_initializer, args):
    iterator = model_initializer.load_data(args)
    from itertools import tee
    iterator, iterator_ = tee(iterator)

    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel()
    model.load(args.model_file)
    logger.info(model.get_parameters())

    text = []
    for margin, graph_info in model.decision_function_info(iterator, key='id'):
        if margin > 0:
            prediction = 1
        else:
            prediction = -1
        text.append("%d\t%s\t%s\n" % (prediction, margin, graph_info))
    save_output(text=text, output_dir_path=args.output_dir_path, out_file_name='predictions.txt')
Example #9
0
def main_predict(model_initializer, args):
    iterator = model_initializer.load_data(args)
    from itertools import tee
    iterator, iterator_ = tee(iterator)

    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel()
    model.load(args.model_file)
    logger.info(model.get_parameters())

    text = []
    for margin, graph_info in model.decision_function_info(iterator, key='id'):
        if margin > 0:
            prediction = 1
        else:
            prediction = -1
        text.append("%d\t%s\t%s\n" % (prediction, margin, graph_info))
    save_output(text=text,
                output_dir_path=args.output_dir_path,
                out_file_name='predictions.txt')
Example #10
0
def main_predict(model_initializer, args):
    iterator = model_initializer.load_data(args)
    from itertools import tee
    iterator, iterator_ = tee(iterator)

    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel()
    model.load(args.model_file)
    logger.info(model.get_parameters())

    predictions = model.decision_function(iterator)
    text = []
    for p in predictions:
        text.append(str(p) + "\n")
    save_output(text=text,
                output_dir_path=args.output_dir_path,
                out_file_name='predictions.txt')

    text = []
    for p in predictions:
        if p > 0:
            prediction = 1
        else:
            prediction = -1
        text.append(str(prediction) + "\n")
    save_output(text=text,
                output_dir_path=args.output_dir_path,
                out_file_name='classifications.txt')

    text = []
    from itertools import izip
    info_iterator = model.get_info(iterator_)
    for p, info in izip(predictions, info_iterator):
        text.append("%.4f\t%s\n" % (p, info))
    save_output(text=text,
                output_dir_path=args.output_dir_path,
                out_file_name='info.txt')
def train_dbox_model(fasta_fname=None, model_fname='eden_model_Dbox',window=4, neg_size_factor=5, train_test_split=0.7, n_jobs=4, n_iter=40):
    
    #transform sequences in a linear graph
    def pre_process_graph(iterator):
        from eden.converter.fasta import sequence_to_eden
        graphs = sequence_to_eden(iterator)
        return graphs

    #extract box sequence with annotaded header information
    def extract_box(data,window=3,box_type='D'):
        import re
        from eden.converter.fasta import fasta_to_sequence
        seqs = fasta_to_sequence(data)
        
        for seq in seqs:
            header = seq[0].split('_')
            cbox = header[-4]
            cpos = int(header[-3])
            dbox = header[-2]
            dpos = int(header[-1])
            
            nts = re.sub('\n','',seq[1])
            if box_type == 'C':
                box = nts[cpos-1-window:cpos+6+window]
            else:
                if (not((len(nts) < dpos+3+window) or (dpos-1-window < 0))):
                    #box = nts[dpos-1-window:dpos+3+window]
                    box = nts[dpos-1-window:dpos-1]+'x'+nts[dpos-1:dpos+3]+'y'+nts[dpos+3:dpos+3+window]
                    yield seq[0],box
                    
    #Choose the vectorizer
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()
        
    #Choose the estimator
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle = True, average=True )
    
    import random
    from eden import util
    ################Generate positive samples###############
    seqs_d_pos = extract_box(fasta_fname,window,box_type='D')

    from itertools import tee
    seqs_d_pos,seqs_d_pos_ = tee(seqs_d_pos)
    #################Generate negatives samples##############
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    seqs_d_neg = seq_to_seq( seqs_d_pos_, modifier=shuffle_modifier, times=neg_size_factor, order=2 )
    
    #####################split train/test####################
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(seqs_d_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(seqs_d_neg, relative_size=train_test_split)
    
    iterable_pos_train = list(iterable_pos_train)
    iterable_pos_test = list(iterable_pos_test)
    iterable_neg_train = list(iterable_neg_train)
    iterable_neg_test = list(iterable_neg_test)

    print "training pos ",len(iterable_pos_train)
    print "training neg ",len(iterable_neg_train)
    print "test pos ",len(iterable_pos_test)
    print "test neg ",len(iterable_neg_test)

    
    
    #make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor=pre_process_graph, 
                                                    estimator=estimator, 
                                                    vectorizer=vectorizer,
                                                    n_jobs=n_jobs)
    #optimize hyperparameters and fit model
    from numpy.random import randint
    from numpy.random import uniform
    
    vectorizer_parameters={'complexity':[2,3]}
    
    estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                          'penalty':['l1','l2','elasticnet'],
                          'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                          'loss':['log'],#'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                          'power_t':uniform(0.1, size=n_iter),
                          'alpha': [10**x for x in range(-8,0)],
                          'eta0': [10**x for x in range(-4,-1)],
                          'learning_rate': ["invscaling", "constant", "optimal"],
                          'n_jobs':[n_jobs]}
    
    model.optimize(iterable_pos_train, iterable_neg_train, 
                   model_name=model_fname,
                   max_total_time=60*30, n_iter=n_iter, 
                   cv=10,
                   score_func=lambda avg_score,std_score : avg_score - std_score * 2,
                   scoring='roc_auc', 
                   vectorizer_parameters=vectorizer_parameters, 
                   estimator_parameters=estimator_parameters)

    #estimate predictive performance
    print model.get_parameters()
    
    
    result,text = model.estimate( iterable_pos_test, iterable_neg_test )
    
    rss=0
    i = 0

    for prob in result:
        i=i+1
        print prob
        if (prob[1] == 1):
            rss = rss + ((1 - prob[0][1])**2)
        else:
            rss = rss + ((1 - prob[0][0])**2)

    avg_rss= rss/i;
    text.append('RSS: %.2f' % rss)
    text.append('avg RSS: %2f' % avg_rss)

    for t in text:
        print t
def train_stem_finder_model(fasta,model_stem_name,window_c,model_c_name,window_d, model_d_name,flank_size_l,flank_size_r, train_test_split=0.7,neg_size_factor = 4, n_jobs=4, n_iter=40,fasta_test=None):

    ########### Pre processor ####################
    def pre_process_graph(iterator, **options):    
        from eden.converter.rna.rnasubopt import rnasubopt_to_eden
        graphs = rnasubopt_to_eden(iterator, **options)
        return graphs
    ########## Vectorizer ########################
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()

    ######### Estimator #########################
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle = True, average=True )

    
    def get_Cbox(seqs,window_c):
		import re
		for seq in seqs:
			header = seq[0].split('_')
			cpos = int(header[-3])
			nts = re.sub('\n','',seq[1])
			if (not((len(nts) < cpos+6+window_c) or (cpos-1-window_c < 0))):
				box = nts[cpos-1-window_c:cpos-1]+'x'+nts[cpos-1:cpos+6]+'y'+nts[cpos+6:cpos+6+window_c]
				yield [seq,cpos],box

    def get_Dbox(seqs_c,window_d):
	    import re 
	    for [seq,cbox],pred in seqs_c:
			header = seq[0][0].split('_')
			dpos = int(header[-1])
			nts = re.sub('\n','',seq[0][1])
			if (not((len(nts) < dpos+3+window_d) or (dpos-1-window_d < 0))):
				box = nts[dpos-1-window_d:dpos-1]+'x'+nts[dpos-1:dpos+3]+'y'+nts[dpos+3:dpos+3+window_d]
				yield [seq,cbox,pred,dpos],box
    
                    
    ######### Get stem #########################
    def get_stem(seqs,window_c,model_c_name,window_d, model_d_name,flank_size_l,flank_size_r):
        from itertools import izip 
        import re
        from itertools import tee,islice
       
        #1)c_finder
        seqs_c = get_Cbox(seqs,window_c)
		

        #2)submit the Cbox candidates to the model
        from eden.model import ActiveLearningBinaryClassificationModel
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_c_name)
        
        seqs_c_pred = list()
        cands_c = list()
        max_count = 0        
        
        for seq_c in seqs_c:
            max_count +=1
            cands_c.append(seq_c)
            if (max_count == 10000): #in order to not generate memory leak I've restricted the number of samples to be submited to the model
				preds = model.decision_function(cands_c)
				seqs_c_pred = seqs_c_pred + zip(cands_c,preds)
				cands_c = list()
				max_count = 0
        if (max_count != 0):
			preds = model.decision_function(cands_c)
			seqs_c_pred = seqs_c_pred + zip(cands_c,preds)
        
        #discard sequences with pred < 0
        seqs_c = list()
        for cand in seqs_c_pred:
			if (cand[1] >= 0.0):
				seqs_c.append(cand)
        
        
        #D_finder
        seqs_cd = get_Dbox(seqs_c,window_d)
        #submit Dboxes candidate to its model
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_d_name)
        
        seqs_d_pred = list()
        cands_d = list()
        max_count = 0        
        
        for seq_d in seqs_cd:
            max_count +=1
            cands_d.append(seq_d)
            if (max_count == 10000): #in order to not generate memory leak I've restricted the number of samples to be submited to the model
				preds = model.decision_function(cands_d)
				seqs_d_pred = seqs_d_pred + zip(cands_d,preds)
				cands_d = list()
				max_count = 0
        if (max_count != 0):
			preds = model.decision_function(cands_d)
			seqs_d_pred = seqs_d_pred + zip(cands_d,preds)
		
	#Get the stem region from the sequences
        stem_cands=[]
        stem_info =[]
        #(([[(header, seq), pos_c], cand_c, pred_c, pos_d], 'UAAxCUGAyGAU'), 77.000434164559792)

        for ([[(header,nts),pos_c],cand_c,pred_c,pos_d],cand_d),pred_d in seqs_d_pred:
			#print header,'\t',seq,pos_c,'\t',cand_c,'\t',pred_c,'\t',cand_d,'\t',pred_d,"\n---\n" 
			if ( int(pos_c) - 10 < 0):
				if (int(pos_d)+10 > len(nts)):
					stem_cands.append([[header,pos_c,pos_d],nts[0:int(pos_c)+6]+"&"+nts[int(pos_d)-1:len(nts)]])
				else:
					stem_cands.append([[header,pos_c,pos_d],nts[0:int(pos_c)+6]+"&"+nts[int(pos_d)-1:int(pos_d)+3+10]])
					
			else:
				if (int(pos_d)+10 > len(nts)):
					stem_cands.append([[header,pos_c,pos_d],nts[int(pos_c)-10:int(pos_c)+6]+"&"+nts[int(pos_d)-1:len(nts)]])
					
				else:
					stem_cands.append([[header,pos_c,pos_d],nts[int(pos_c)-10:int(pos_c)+6]+"&"+nts[int(pos_d)-1:int(pos_d)+3+10]])
					
		
        return stem_cands
            
			
    
    #get positive data
    pos_cds=[]

    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(fasta)

    train_pos = get_stem(seqs,window_c,model_c_name,window_d, model_d_name,flank_size_l,flank_size_r)
    train_pos = list(train_pos)

    

    #for h,seq in stems_cds:
	#	print h[0][0:10],'\t',seq

    #Generate Negative Dataset
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    train_neg = seq_to_seq(train_pos, modifier=shuffle_modifier, times=neg_size_factor, order=2 )
    train_neg = list(train_neg)
 
    
    #######Split the data into training and test
    if (fasta_test == None):
        print "Training and Test with the same dataset (different sequences)"
        #split train/test
        from eden.util import random_bipartition_iter
        iterable_pos_train, iterable_pos_test = random_bipartition_iter(train_pos, relative_size=train_test_split)
        iterable_neg_train, iterable_neg_test = random_bipartition_iter(train_neg, relative_size=train_test_split)
        
        
        iterable_pos_train = list(iterable_pos_train)
        iterable_neg_train = list(iterable_neg_train)
        
        iterable_pos_test = list(iterable_pos_test)
        iterable_neg_test = list(iterable_neg_test)
        
    
        
      
        

    else:        
        print "test dataset = ",fasta_test,"\n"
        pos_test_cds=[]
        neg_test_cds=[]

        from eden.converter.fasta import fasta_to_sequence
        seqs = fasta_to_sequence(fasta_test)

        test_pos = get_stem(seqs,window_c,model_c_name,window_d, model_d_name,flank_size_l,flank_size_r)
        test_pos = list(test_pos)

        #Generate Negative test data
        test_neg = seq_to_seq(test_pos, modifier=shuffle_modifier, times=neg_size_factor, order=2 )
        test_neg = list(test_neg)
        
        iterable_pos_train = list(train_pos)
        iterable_neg_train = list(train_neg)
        iterable_pos_test  = list(test_pos)
        iterable_neg_test  = list(test_neg)
        
    print "Positive training samples: ",len(iterable_pos_train)
    print "Negative training samples: ",len(iterable_neg_train)
    print "--------\nPositive test samples: ",len(iterable_pos_test)
    print "Negative test samples: ",len(iterable_neg_test)



    #make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor=pre_process_graph, 
                                                    estimator=estimator, 
                                                    vectorizer=vectorizer,
                                                    n_jobs=n_jobs)
    #optimize hyperparameters and fit model:
    from numpy.random import randint
    from numpy.random import uniform


    pre_processor_parameters={'energy_range':[3,4,5,6,7,8,9,10],
							   'max_num_subopts':randint(100,200,size=n_iter),
							   'max_num': [3,4,5,6,7,8]}


    vectorizer_parameters={'complexity':[2,3]}

    estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                          'penalty':['l1','l2','elasticnet'],
                          'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                          'loss':['log'],#'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                          'power_t':uniform(0.1, size=n_iter),
                          'alpha': [10**x for x in range(-8,0)],
                          'eta0': [10**x for x in range(-4,-1)],
                          'learning_rate': ["invscaling", "constant", "optimal"],
                          'n_jobs':[n_jobs]}

    model.optimize(iterable_pos_train, iterable_neg_train, 
                   model_name=model_stem_name,
                   max_total_time=60*60*24, n_iter=n_iter, 
                   n_active_learning_iterations=3,
                   cv=10,
                   score_func=lambda avg_score,std_score : avg_score - std_score * 2,
                   scoring='roc_auc', 
                   pre_processor_parameters = pre_processor_parameters,
		   vectorizer_parameters=vectorizer_parameters, 
                   estimator_parameters=estimator_parameters)

    #estimate predictive performance
    print model.get_parameters()

    result,text = model.estimate( iterable_pos_test, iterable_neg_test )

    rss=0
    i = 0

    for prob in result:
        i=i+1
        #print prob
        if (prob[1] == 1):
            rss = rss + ((1 - prob[0][1])**2)
        else:
            rss = rss + ((1 - prob[0][0])**2)

    avg_rss= rss/i;
    text.append('RSS: %.2f' % rss)
    text.append('avg RSS: %2f' % avg_rss)

    for t in text:
        print t