Example #1
 def pre_processor(seqs, **args):
     seqs = seq_to_seq(seqs,
                       modifier=mark_modifier,
                       position=0.5,
                       mark="%")
     seqs = seq_to_seq(seqs,
                       modifier=mark_modifier,
                       position=0.0,
                       mark="@")
     seqs = seq_to_seq(seqs,
                       modifier=mark_modifier,
                       position=1.0,
                       mark="*")
     graphs = sequence_to_eden(seqs)
     return graphs
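
The three chained seq_to_seq calls above each wrap the input in another lazy generator. As a rough sketch of what the chain produces, here is an illustrative stand-in for mark_modifier (the real one lives in eden.modifier.seq; the insertion semantics assumed here are only for illustration):

def mark_at(seqs, position=0.5, mark='%'):
    # insert `mark` at a relative `position` in each (header, sequence) pair
    for header, seq in seqs:
        i = int(round(position * len(seq)))
        yield header, seq[:i] + mark + seq[i:]

seqs = [('id0', 'ACGUACGU')]
seqs = mark_at(seqs, position=0.5, mark='%')  # middle
seqs = mark_at(seqs, position=0.0, mark='@')  # start
seqs = mark_at(seqs, position=1.0, mark='*')  # end
print(list(seqs))  # [('id0', '@ACGU%ACGU*')]
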
Example #2
    def fit(self, pos_seqs, neg_seqs=None, times=2, order=2):
        """Fit an estimator to discriminate the pos_seqs from the neg_seqs.

        Parameters
        ----------
        pos_seqs : iterable of strings
            Input sequences.

        neg_seqs : iterable of strings (default: None)
            If not None, these are used as negative examples; if None,
            negative sequences are generated by randomly shuffling the
            positive sequences.

        times : int (default: 2)
            Ratio between the number of negatives and the number of positives.

        order : int (default: 2)
            Size of the minimum block to shuffle: 1 means shuffling single
            characters, 2 means shuffling pairs of characters, etc.

        Returns
        -------
        self.
        """

        if neg_seqs is None:
            neg_seqs = list(seq_to_seq(pos_seqs, modifier=shuffle_modifier, times=times, order=order))
        self.estimator = fit(pos_seqs, neg_seqs, self.vectorizer,
                             n_jobs=self.n_jobs,
                             cv=10,
                             n_iter_search=1,
                             random_state=self.random_state,
                             n_blocks=5,
                             block_size=None)
        return self
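
The order parameter documented above controls the granularity of the shuffle. A minimal sketch of that semantics (not EDeN's actual implementation, just an illustration of block shuffling):

import random

def block_shuffle(seq, order=2):
    # split into blocks of `order` characters and permute them:
    # order=1 shuffles single characters, order=2 shuffles pairs, etc.
    blocks = [seq[i:i + order] for i in range(0, len(seq), order)]
    random.shuffle(blocks)
    return ''.join(blocks)

random.seed(1)
print(block_shuffle('AACCGGTT', order=2))  # e.g. 'CCGGTTAA'
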
Example #3
    def fit(self, seqs, neg_seqs=None):
        """Find motives with SequenceMotifDecomposer."""
        if neg_seqs is None:
            from eden.modifier.seq import seq_to_seq, shuffle_modifier
            neg_seqs = seq_to_seq(seqs, modifier=shuffle_modifier, times=1, order=2)
            neg_seqs = list(neg_seqs)

        self.smd = self.smd.fit(pos_seqs=seqs, neg_seqs=neg_seqs)
        try:
            motives = self.smd.select_motives(seqs=seqs,
                                              p_value=self.p_value,
                                              similarity_th=self.similarity_th,
                                              min_score=self.min_score,
                                              min_freq=self.min_freq,
                                              min_cluster_size=self.min_cluster_size,
                                              regex_th=self.regex_th,
                                              sample_size=self.sample_size,
                                              freq_th=self.freq_th,
                                              std_th=self.std_th)
        except AttributeError:
            raise AttributeError('No motives found.')

        self.nmotifs = len(motives.keys())
        self.original_motives_list = self._get_motives_list(motives)[:]
        self.aligned_motives_list = self._get_aligned_motives_list(
            self.original_motives_list)[:]
        self.motives_list = self.adapt_motives(
            self.aligned_motives_list)[:]

        # create PWMs
        super(SMoDWrapper, self).fit(motives=self.aligned_motives_list)
Example #4
 def _fit_predictive_model(self, seqs, neg_seqs=None):
     # duplicate iterator
     pos_seqs, pos_seqs_ = tee(seqs)
     pos_graphs = mp_pre_process(pos_seqs,
                                 pre_processor=sequence_to_eden,
                                 n_blocks=self.pre_processor_n_blocks,
                                 block_size=self.pre_processor_block_size,
                                 n_jobs=self.pre_processor_n_jobs)
     if neg_seqs is None:
         # shuffle seqs to obtain negatives
         neg_seqs = seq_to_seq(pos_seqs_,
                               modifier=shuffle_modifier,
                               times=self.negative_ratio,
                               order=self.shuffle_order)
     neg_graphs = mp_pre_process(neg_seqs,
                                 pre_processor=sequence_to_eden,
                                 n_blocks=self.pre_processor_n_blocks,
                                 block_size=self.pre_processor_block_size,
                                 n_jobs=self.pre_processor_n_jobs)
     # fit discriminative estimator
     self.estimator = fit(pos_graphs,
                          neg_graphs,
                          vectorizer=self.vectorizer,
                          n_iter_search=self.n_iter_search,
                          n_jobs=self.n_jobs,
                          n_blocks=self.n_blocks,
                          block_size=self.block_size,
                          random_state=self.random_state)
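
tee is needed in the method above because seqs may be a one-shot generator: building the positive graphs consumes the first copy, while the duplicated handle pos_seqs_ is still unconsumed when the shuffled negatives are derived from it. A small demonstration of the pattern:

from itertools import tee

def gen():
    yield 'ACGU'
    yield 'UGCA'

a, b = tee(gen())
print(list(a))  # ['ACGU', 'UGCA'] -- consumes the first copy
print(list(b))  # ['ACGU', 'UGCA'] -- the second copy is still intact
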
Example #5
    def fit(self, seqs, neg_seqs=None):
        """Find motives with SequenceMotifDecomposer."""
        if neg_seqs is None:
            from eden.modifier.seq import seq_to_seq, shuffle_modifier
            neg_seqs = seq_to_seq(seqs,
                                  modifier=shuffle_modifier,
                                  times=1,
                                  order=2)
            neg_seqs = list(neg_seqs)

        self.smd = self.smd.fit(pos_seqs=seqs, neg_seqs=neg_seqs)
        try:
            motives = self.smd.select_motives(
                seqs=seqs,
                p_value=self.p_value,
                similarity_th=self.similarity_th,
                min_score=self.min_score,
                min_freq=self.min_freq,
                min_cluster_size=self.min_cluster_size,
                regex_th=self.regex_th,
                sample_size=self.sample_size,
                freq_th=self.freq_th,
                std_th=self.std_th)
        except AttributeError:
            raise AttributeError('No motives found.')

        self.nmotifs = len(motives.keys())
        self.original_motives_list = self._get_motives_list(motives)[:]
        self.aligned_motives_list = self._get_aligned_motives_list(
            self.original_motives_list)[:]
        self.motives_list = self.adapt_motives(self.aligned_motives_list)[:]

        # create PWMs
        super(SMoDWrapper, self).fit(motives=self.aligned_motives_list)
Example #6
 def _fit_predictive_model(self, seqs, neg_seqs=None):
     # duplicate iterator
     pos_seqs, pos_seqs_ = tee(seqs)
     pos_graphs = mp_pre_process(pos_seqs, pre_processor=sequence_to_eden,
                                 n_blocks=self.pre_processor_n_blocks,
                                 block_size=self.pre_processor_block_size,
                                 n_jobs=self.pre_processor_n_jobs)
     if neg_seqs is None:
         # shuffle seqs to obtain negatives
         neg_seqs = seq_to_seq(pos_seqs_,
                               modifier=shuffle_modifier,
                               times=self.negative_ratio,
                               order=self.shuffle_order)
     neg_graphs = mp_pre_process(neg_seqs, pre_processor=sequence_to_eden,
                                 n_blocks=self.pre_processor_n_blocks,
                                 block_size=self.pre_processor_block_size,
                                 n_jobs=self.pre_processor_n_jobs)
     # fit discriminative estimator
     self.estimator = fit(pos_graphs, neg_graphs,
                          vectorizer=self.vectorizer,
                          n_iter_search=self.n_iter_search,
                          n_jobs=self.n_jobs,
                          n_blocks=self.n_blocks,
                          block_size=self.block_size,
                          random_state=self.random_state)
Example #7
def get_dataset(sequence_length=200,
                n_sequences=200,
                motif_length=10,
                n_motives=2,
                p=0.2,
                random_state=1):
    """Generate, preprocess and return the dataset."""
    motives, pos_seqs, binary_seq = make_artificial_dataset(alphabet='ACGT',
                                                            sequence_length=sequence_length,
                                                            n_sequences=n_sequences,
                                                            motif_length=motif_length,
                                                            n_motives=n_motives,
                                                            p=p,
                                                            random_state=random_state)

    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    neg_seqs = seq_to_seq(
        pos_seqs, modifier=shuffle_modifier, times=2, order=2)
    neg_seqs = list(neg_seqs)

    # floor division keeps the sizes integral (required for slicing)
    block_size = n_sequences // 8

    pos_size = len(pos_seqs)
    train_pos_seqs = pos_seqs[:pos_size // 2]
    test_pos_seqs = pos_seqs[pos_size // 2:]

    neg_size = len(neg_seqs)
    train_neg_seqs = neg_seqs[:neg_size // 2]
    # test_neg_seqs = neg_seqs[neg_size // 2:]

    true_score = [float(int(i)) for i in binary_seq]
    return (block_size, train_pos_seqs, train_neg_seqs, test_pos_seqs, n_motives, true_score)
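
A hypothetical call site for get_dataset, unpacking the returned tuple in the order of the return statement above:

(block_size, train_pos_seqs, train_neg_seqs,
 test_pos_seqs, n_motives, true_score) = get_dataset(sequence_length=200,
                                                     n_sequences=200,
                                                     motif_length=10,
                                                     n_motives=2,
                                                     p=0.2,
                                                     random_state=1)
print(len(train_pos_seqs), len(train_neg_seqs), len(test_pos_seqs))
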
Example #8
def binary_classification_dataset_setup(iterable_seq=None,
                                        negative_shuffle_ratio=None,
                                        shuffle_order=None):
    iter1, iter2 = tee(iterable_seq)
    iterable_graph = rnafold_to_eden(iter1)
    iter3 = seq_to_seq(iter2, modifier=shuffle_modifier,
                       times=negative_shuffle_ratio, order=shuffle_order)
    iterable_graph_neg = rnafold_to_eden(iter3)
    return iterable_graph, iterable_graph_neg
Example #9
 def load_negative_data(self, args):
     seqs = self.load_data(args)
     return seq_to_seq(
         seqs,
         modifier=shuffle_modifier,
         times=args.negative_ratio,
         order=args.shuffle_order,
     )
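
The args object above is presumably an argparse-style namespace; a hedged sketch of options matching the attributes it reads (flag names are assumptions):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--negative-ratio', dest='negative_ratio',
                    type=int, default=2,
                    help='number of shuffled negatives per positive')
parser.add_argument('--shuffle-order', dest='shuffle_order',
                    type=int, default=2,
                    help='size of the minimum block to shuffle')
args = parser.parse_args(['--negative-ratio', '3'])
print(args.negative_ratio, args.shuffle_order)  # 3 2
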
Example #10
 def _binary_classification_setup(self,
                                  seqs=None,
                                  negative_shuffle_ratio=None,
                                  shuffle_order=None):
     seqs, seqs_ = tee(seqs)
     graphs = self.pre_processor.transform(seqs, mfe=False)
     seqs_neg = seq_to_seq(seqs_,
                           modifier=shuffle_modifier,
                           times=negative_shuffle_ratio,
                           order=shuffle_order)
     graphs_neg = self.pre_processor.transform(seqs_neg)
     return graphs, graphs_neg
Example #12
    def fit(self, pos_seqs, neg_seqs=None, times=2, order=2):
        """Fit an estimator to discriminate the pos_seqs from the neg_seqs.

        Parameters
        ----------
        pos_seqs : iterable of strings
            Input sequences.

        neg_seqs : iterable of strings (default: None)
            If not None, these are used as negative examples; if None,
            negative sequences are generated by randomly shuffling the
            positive sequences.

        times : int (default: 2)
            Ratio between the number of negatives and the number of positives.

        order : int (default: 2)
            Size of the minimum block to shuffle: 1 means shuffling single
            characters, 2 means shuffling pairs of characters, etc.

        Returns
        -------
        self.
        """

        if neg_seqs is None:
            neg_seqs = list(
                seq_to_seq(pos_seqs,
                           modifier=shuffle_modifier,
                           times=times,
                           order=order))
        self.estimator = fit(pos_seqs,
                             neg_seqs,
                             self.vectorizer,
                             n_jobs=self.n_jobs,
                             cv=10,
                             n_iter_search=1,
                             random_state=self.random_state,
                             n_blocks=5,
                             block_size=None)
        return self
Example #13
 def pre_processor(seqs, **args):
     seqs = seq_to_seq(seqs, modifier=mark_modifier, position=0.5, mark='%')
     seqs = seq_to_seq(seqs, modifier=mark_modifier, position=0.0, mark='@')
     seqs = seq_to_seq(seqs, modifier=mark_modifier, position=1.0, mark='*')
     graphs = sequence_to_eden(seqs)
     return graphs
Example #14
 def load_negative_data(self, args):
     seqs = self.load_data(args)
     return seq_to_seq(seqs,
                       modifier=shuffle_modifier,
                       times=args.negative_ratio,
                       order=args.shuffle_order)
Example #15
def train_dbox_model(fasta_fname=None,
                     model_fname='eden_model_Dbox',
                     window=4,
                     neg_size_factor=5,
                     train_test_split=0.7,
                     n_jobs=4,
                     n_iter=40):

    # transform each sequence into a linear graph
    def pre_process_graph(iterator):
        from eden.converter.fasta import sequence_to_eden
        graphs = sequence_to_eden(iterator)
        return graphs

    # extract the box sequence using annotated header information
    def extract_box(data, window=3, box_type='D'):
        import re
        from eden.converter.fasta import fasta_to_sequence
        seqs = fasta_to_sequence(data)
        
        for seq in seqs:
            header = seq[0].split('_')
            cbox = header[-4]
            cpos = int(header[-3])
            dbox = header[-2]
            dpos = int(header[-1])

            nts = re.sub('\n', '', seq[1])
            if box_type == 'C':
                box = nts[cpos - 1 - window:cpos + 6 + window]
                yield seq[0], box
            elif dpos - 1 - window >= 0 and len(nts) >= dpos + 3 + window:
                # mark the D-box boundaries with 'x' and 'y'
                box = (nts[dpos - 1 - window:dpos - 1] + 'x' +
                       nts[dpos - 1:dpos + 3] + 'y' +
                       nts[dpos + 3:dpos + 3 + window])
                yield seq[0], box
                    
    #Choose the vectorizer
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()
        
    #Choose the estimator
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle=True, average=True)
    
    ################ Generate positive samples ###############
    seqs_d_pos = extract_box(fasta_fname, window, box_type='D')

    from itertools import tee
    seqs_d_pos, seqs_d_pos_ = tee(seqs_d_pos)
    ################ Generate negative samples ################
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    seqs_d_neg = seq_to_seq(seqs_d_pos_, modifier=shuffle_modifier,
                            times=neg_size_factor, order=2)
    
    #####################split train/test####################
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(seqs_d_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(seqs_d_neg, relative_size=train_test_split)
    
    iterable_pos_train = list(iterable_pos_train)
    iterable_pos_test = list(iterable_pos_test)
    iterable_neg_train = list(iterable_neg_train)
    iterable_neg_test = list(iterable_neg_test)

    print "training pos ",len(iterable_pos_train)
    print "training neg ",len(iterable_neg_train)
    print "test pos ",len(iterable_pos_test)
    print "test neg ",len(iterable_neg_test)

    # make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor=pre_process_graph, 
                                                    estimator=estimator, 
                                                    vectorizer=vectorizer,
                                                    n_jobs=n_jobs)
    #optimize hyperparameters and fit model
    from numpy.random import randint
    from numpy.random import uniform
    
    vectorizer_parameters = {'complexity': [2, 3]}
    
    estimator_parameters = {'n_iter': randint(5, 100, size=n_iter),
                            'penalty': ['l1', 'l2', 'elasticnet'],
                            'l1_ratio': uniform(0.1, 0.9, size=n_iter),
                            # alternative losses: 'hinge', 'modified_huber',
                            # 'squared_hinge', 'perceptron'
                            'loss': ['log'],
                            'power_t': uniform(0.1, size=n_iter),
                            'alpha': [10 ** x for x in range(-8, 0)],
                            'eta0': [10 ** x for x in range(-4, -1)],
                            'learning_rate': ['invscaling', 'constant', 'optimal'],
                            'n_jobs': [n_jobs]}
    
    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_fname,
                   max_total_time=60 * 30, n_iter=n_iter,
                   cv=10,
                   score_func=lambda avg_score, std_score: avg_score - std_score * 2,
                   scoring='roc_auc',
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)

    # estimate predictive performance
    print model.get_parameters()

    result, text = model.estimate(iterable_pos_test, iterable_neg_test)
    
    rss = 0.0
    i = 0
    for prob in result:
        i += 1
        print prob
        if prob[1] == 1:
            rss += (1 - prob[0][1]) ** 2
        else:
            rss += (1 - prob[0][0]) ** 2

    avg_rss = rss / i
    text.append('RSS: %.2f' % rss)
    text.append('avg RSS: %.2f' % avg_rss)

    for t in text:
        print t
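
A hypothetical invocation of train_dbox_model ('snoRNAs.fa' is a placeholder; the FASTA headers must end with the _cbox_cpos_dbox_dpos fields that extract_box parses):

train_dbox_model(fasta_fname='snoRNAs.fa',
                 model_fname='eden_model_Dbox',
                 window=4,
                 neg_size_factor=5,
                 train_test_split=0.7,
                 n_jobs=4,
                 n_iter=40)
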
Example #16
def train_stem_finder_model(fasta,
                            model_stem_name,
                            window_c, model_c_name,
                            window_d, model_d_name,
                            flank_size_l, flank_size_r,
                            train_test_split=0.7,
                            neg_size_factor=4,
                            n_jobs=4, n_iter=40,
                            fasta_test=None):

    ########### Pre processor ####################
    def pre_process_graph(iterator, **options):    
        from eden.converter.rna.rnasubopt import rnasubopt_to_eden
        graphs = rnasubopt_to_eden(iterator, **options)
        return graphs
    ########## Vectorizer ########################
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()

    ######### Estimator #########################
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle=True, average=True)

    
    def get_Cbox(seqs, window_c):
        import re
        for seq in seqs:
            header = seq[0].split('_')
            cpos = int(header[-3])
            nts = re.sub('\n', '', seq[1])
            if cpos - 1 - window_c >= 0 and len(nts) >= cpos + 6 + window_c:
                # mark the C-box boundaries with 'x' and 'y'
                box = (nts[cpos - 1 - window_c:cpos - 1] + 'x' +
                       nts[cpos - 1:cpos + 6] + 'y' +
                       nts[cpos + 6:cpos + 6 + window_c])
                yield [seq, cpos], box

    def get_Dbox(seqs_c, window_d):
        import re
        for [seq, cbox], pred in seqs_c:
            header = seq[0][0].split('_')
            dpos = int(header[-1])
            nts = re.sub('\n', '', seq[0][1])
            if dpos - 1 - window_d >= 0 and len(nts) >= dpos + 3 + window_d:
                # mark the D-box boundaries with 'x' and 'y'
                box = (nts[dpos - 1 - window_d:dpos - 1] + 'x' +
                       nts[dpos - 1:dpos + 3] + 'y' +
                       nts[dpos + 3:dpos + 3 + window_d])
                yield [seq, cbox, pred, dpos], box

    ######### Get stem #########################
    def get_stem(seqs, window_c, model_c_name,
                 window_d, model_d_name,
                 flank_size_l, flank_size_r):
        # 1) C-box finder
        seqs_c = get_Cbox(seqs, window_c)

        # 2) submit the C-box candidates to the model
        from eden.model import ActiveLearningBinaryClassificationModel
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_c_name)

        seqs_c_pred = list()
        cands_c = list()
        max_count = 0

        for seq_c in seqs_c:
            max_count += 1
            cands_c.append(seq_c)
            # score candidates in chunks of 10000 to limit memory usage
            if max_count == 10000:
                preds = model.decision_function(cands_c)
                seqs_c_pred = seqs_c_pred + zip(cands_c, preds)
                cands_c = list()
                max_count = 0
        if max_count != 0:
            preds = model.decision_function(cands_c)
            seqs_c_pred = seqs_c_pred + zip(cands_c, preds)

        # discard sequences with pred < 0
        seqs_c = list()
        for cand in seqs_c_pred:
            if cand[1] >= 0.0:
                seqs_c.append(cand)

        # D-box finder
        seqs_cd = get_Dbox(seqs_c, window_d)
        # submit the D-box candidates to their model
        model = ActiveLearningBinaryClassificationModel()
        model.load(model_d_name)

        seqs_d_pred = list()
        cands_d = list()
        max_count = 0

        for seq_d in seqs_cd:
            max_count += 1
            cands_d.append(seq_d)
            # score candidates in chunks of 10000 to limit memory usage
            if max_count == 10000:
                preds = model.decision_function(cands_d)
                seqs_d_pred = seqs_d_pred + zip(cands_d, preds)
                cands_d = list()
                max_count = 0
        if max_count != 0:
            preds = model.decision_function(cands_d)
            seqs_d_pred = seqs_d_pred + zip(cands_d, preds)
        # get the stem region from the sequences
        stem_cands = []
        # each item: (([[(header, seq), pos_c], cand_c, pred_c, pos_d], 'UAAxCUGAyGAU'), 77.000434164559792)
        for ([[(header, nts), pos_c], cand_c, pred_c, pos_d], cand_d), pred_d in seqs_d_pred:
            # clamp the C-box flank at the sequence start (flank sizes are
            # hard-coded to 10 here); Python slicing already truncates the
            # D-box flank at the sequence end
            start = max(int(pos_c) - 10, 0)
            stem = (nts[start:int(pos_c) + 6] + '&' +
                    nts[int(pos_d) - 1:int(pos_d) + 3 + 10])
            stem_cands.append([[header, pos_c, pos_d], stem])

        return stem_cands

    # get positive data
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(fasta)

    train_pos = get_stem(seqs, window_c, model_c_name,
                         window_d, model_d_name,
                         flank_size_l, flank_size_r)
    train_pos = list(train_pos)
    # generate the negative dataset
    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    train_neg = seq_to_seq(train_pos, modifier=shuffle_modifier,
                           times=neg_size_factor, order=2)
    train_neg = list(train_neg)
 
    
    ####### Split the data into training and test
    if fasta_test is None:
        print "Training and test with the same dataset (different sequences)"
        # split train/test
        from eden.util import random_bipartition_iter
        iterable_pos_train, iterable_pos_test = random_bipartition_iter(
            train_pos, relative_size=train_test_split)
        iterable_neg_train, iterable_neg_test = random_bipartition_iter(
            train_neg, relative_size=train_test_split)

        iterable_pos_train = list(iterable_pos_train)
        iterable_neg_train = list(iterable_neg_train)
        iterable_pos_test = list(iterable_pos_test)
        iterable_neg_test = list(iterable_neg_test)

    else:
        print "test dataset = ", fasta_test, "\n"

        from eden.converter.fasta import fasta_to_sequence
        seqs = fasta_to_sequence(fasta_test)

        test_pos = get_stem(seqs, window_c, model_c_name,
                            window_d, model_d_name,
                            flank_size_l, flank_size_r)
        test_pos = list(test_pos)

        # generate negative test data
        test_neg = seq_to_seq(test_pos, modifier=shuffle_modifier,
                              times=neg_size_factor, order=2)
        test_neg = list(test_neg)
        
        iterable_pos_train = list(train_pos)
        iterable_neg_train = list(train_neg)
        iterable_pos_test  = list(test_pos)
        iterable_neg_test  = list(test_neg)
        
    print "Positive training samples: ",len(iterable_pos_train)
    print "Negative training samples: ",len(iterable_neg_train)
    print "--------\nPositive test samples: ",len(iterable_pos_test)
    print "Negative test samples: ",len(iterable_neg_test)

    #make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor=pre_process_graph, 
                                                    estimator=estimator, 
                                                    vectorizer=vectorizer,
                                                    n_jobs=n_jobs)
    #optimize hyperparameters and fit model:
    from numpy.random import randint
    from numpy.random import uniform

    pre_processor_parameters = {'energy_range': [3, 4, 5, 6, 7, 8, 9, 10],
                                'max_num_subopts': randint(100, 200, size=n_iter),
                                'max_num': [3, 4, 5, 6, 7, 8]}

    vectorizer_parameters = {'complexity': [2, 3]}

    estimator_parameters = {'n_iter': randint(5, 100, size=n_iter),
                            'penalty': ['l1', 'l2', 'elasticnet'],
                            'l1_ratio': uniform(0.1, 0.9, size=n_iter),
                            # alternative losses: 'hinge', 'modified_huber',
                            # 'squared_hinge', 'perceptron'
                            'loss': ['log'],
                            'power_t': uniform(0.1, size=n_iter),
                            'alpha': [10 ** x for x in range(-8, 0)],
                            'eta0': [10 ** x for x in range(-4, -1)],
                            'learning_rate': ['invscaling', 'constant', 'optimal'],
                            'n_jobs': [n_jobs]}

    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_stem_name,
                   max_total_time=60 * 60 * 24, n_iter=n_iter,
                   n_active_learning_iterations=3,
                   cv=10,
                   score_func=lambda avg_score, std_score: avg_score - std_score * 2,
                   scoring='roc_auc',
                   pre_processor_parameters=pre_processor_parameters,
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)

    # estimate predictive performance
    print model.get_parameters()

    result, text = model.estimate(iterable_pos_test, iterable_neg_test)

    rss = 0.0
    i = 0
    for prob in result:
        i += 1
        # print prob
        if prob[1] == 1:
            rss += (1 - prob[0][1]) ** 2
        else:
            rss += (1 - prob[0][0]) ** 2

    avg_rss = rss / i
    text.append('RSS: %.2f' % rss)
    text.append('avg RSS: %.2f' % avg_rss)

    for t in text:
        print t
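
And a matching hypothetical call for the stem finder, reusing a C-box and a D-box model trained as above (all file names are placeholders):

train_stem_finder_model(fasta='snoRNAs.fa',
                        model_stem_name='eden_model_stem',
                        window_c=4, model_c_name='eden_model_Cbox',
                        window_d=4, model_d_name='eden_model_Dbox',
                        flank_size_l=10, flank_size_r=10,
                        train_test_split=0.7,
                        neg_size_factor=4,
                        n_jobs=4, n_iter=40)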