def _fit_predictive_model(self, seqs, neg_seqs=None):
     """Fit ``self.estimator`` on graphs built from positive and negative sequences.

     Parameters
     ----------
     seqs : iterable
         Positive sequences. The iterable is duplicated with ``tee`` so it
         can feed both the graph conversion and the negative generation.

     neg_seqs : iterable (default: None)
         Negative sequences. If None, negatives are produced by calling
         ``seq_to_seq`` on the positives with ``shuffle_modifier``,
         ``self.negative_ratio`` and ``self.shuffle_order``.

     Returns
     -------
     None. The fitted estimator is stored in ``self.estimator``.
     """
     def to_graphs(sequences):
         # Positives and negatives go through the very same conversion.
         return mp_pre_process(sequences,
                               pre_processor=sequence_to_eden,
                               n_blocks=self.pre_processor_n_blocks,
                               block_size=self.pre_processor_block_size,
                               n_jobs=self.pre_processor_n_jobs)

     # One copy of the input is converted to graphs, the other is kept as
     # raw material for generating negatives.
     positives, shuffle_source = tee(seqs)
     pos_graphs = to_graphs(positives)

     if neg_seqs is None:
         # No negatives supplied: derive them from the positives.
         neg_seqs = seq_to_seq(shuffle_source,
                               modifier=shuffle_modifier,
                               times=self.negative_ratio,
                               order=self.shuffle_order)
     neg_graphs = to_graphs(neg_seqs)

     # Fit the discriminative estimator on the two graph sets.
     self.estimator = fit(pos_graphs, neg_graphs,
                          vectorizer=self.vectorizer,
                          n_iter_search=self.n_iter_search,
                          n_jobs=self.n_jobs,
                          n_blocks=self.n_blocks,
                          block_size=self.block_size,
                          random_state=self.random_state)
    def fit(self, pos_seqs, neg_seqs=None, times=2, order=2):
        """Fit an estimator to discriminate the pos_seqs from the neg_seqs.

        Parameters
        ----------
        pos_seqs : iterable strings
            Input sequences. May be a one-shot iterator (e.g. a generator):
            when negatives have to be generated, the sequences are first
            materialized into a list so that they can be read twice.

        neg_seqs : iterable strings (default: None)
            If not None the program uses these as negative examples. If
            it is None, then negative sequences are generated as random
            shuffling of the positive sequences.

        times: int (default: 2)
            Factor between number of negatives and number of positives.

        order: int (default: 2)
            Size of the minimum block to shuffle: 1 means shuffling single characters,
            2 means shuffling pairs of characters, etc.

        Returns
        -------
        self.
        """
        if neg_seqs is None:
            # The positives are consumed twice in this branch: once to build
            # the shuffled negatives and once by the estimator fit below.
            # Without materializing them, a one-shot iterator would already
            # be exhausted when it reaches the estimator fit.
            pos_seqs = list(pos_seqs)
            neg_seqs = list(seq_to_seq(pos_seqs,
                                       modifier=shuffle_modifier,
                                       times=times,
                                       order=order))
        self.estimator = fit(pos_seqs, neg_seqs, self.vectorizer,
                             n_jobs=self.n_jobs,
                             cv=10,
                             n_iter_search=1,
                             random_state=self.random_state,
                             n_blocks=5,
                             block_size=None)
        return self
Beispiel #3
0
 def _fit_predictive_model(self, seqs, neg_seqs=None):
     """Build positive/negative graph sets and fit ``self.estimator`` on them.

     ``seqs`` are the positive sequences. ``neg_seqs`` are the negative
     sequences; when it is None the negatives are obtained by passing a
     second copy of ``seqs`` through ``seq_to_seq`` with
     ``shuffle_modifier``. Both sets are converted with ``mp_pre_process``
     (pre-processor ``sequence_to_eden``) before fitting. Nothing is
     returned; the fitted estimator is stored in ``self.estimator``.
     """
     def as_graphs(sequence_stream):
         # Single place for the conversion settings used by both classes.
         return mp_pre_process(sequence_stream,
                               pre_processor=sequence_to_eden,
                               n_blocks=self.pre_processor_n_blocks,
                               block_size=self.pre_processor_block_size,
                               n_jobs=self.pre_processor_n_jobs)

     # Duplicate the input iterator: one branch becomes the positive
     # graphs, the other may be needed to generate negatives.
     pos_stream, pos_stream_for_negatives = tee(seqs)
     pos_graphs = as_graphs(pos_stream)

     if neg_seqs is not None:
         negatives = neg_seqs
     else:
         # Shuffle the positives to obtain negatives.
         negatives = seq_to_seq(pos_stream_for_negatives,
                                modifier=shuffle_modifier,
                                times=self.negative_ratio,
                                order=self.shuffle_order)
     neg_graphs = as_graphs(negatives)

     # Fit the discriminative estimator.
     self.estimator = fit(pos_graphs,
                          neg_graphs,
                          vectorizer=self.vectorizer,
                          n_iter_search=self.n_iter_search,
                          n_jobs=self.n_jobs,
                          n_blocks=self.n_blocks,
                          block_size=self.block_size,
                          random_state=self.random_state)
Beispiel #4
0
def generate_negatives_and_fit(iterable=None,
                               negative_shuffle_ratio=None,
                               shuffle_order=None,
                               vectorizer_complexity=None):
    """Set up a binary classification dataset from ``iterable`` and fit a model.

    Parameters
    ----------
    iterable : iterable (default: None)
        Forwarded to ``binary_classification_dataset_setup`` as
        ``iterable_seq``.

    negative_shuffle_ratio, shuffle_order : (default: None)
        Forwarded unchanged to ``binary_classification_dataset_setup``.
        NOTE(review): presumably they control how negatives are shuffled
        from the positives -- confirm against that helper.

    vectorizer_complexity : (default: None)
        Passed as ``complexity`` to the ``Vectorizer`` used for fitting.

    Returns
    -------
    The object returned by ``fit``.
    """
    vectorizer = Vectorizer(complexity=vectorizer_complexity)
    positives, negatives = binary_classification_dataset_setup(
        iterable_seq=iterable,
        negative_shuffle_ratio=negative_shuffle_ratio,
        shuffle_order=shuffle_order)
    return fit(positives, negatives, vectorizer,
               n_jobs=-1, cv=3, n_iter_search=1)
def fit_and_evaluate(pos_original,
                     neg_original,
                     pos_sampled,
                     neg_sampled,
                     pos_test,
                     neg_test,
                     random_state=42):
    """Fit on three training sets and score each one on the test set.

    The training sets are, in this order: the original data, the sampled
    data, and the original and sampled data chained together.

    Parameters
    ----------
    pos_original, neg_original : iterables
        Original positive / negative training instances.

    pos_sampled, neg_sampled : iterables
        Sampled positive / negative training instances.

    pos_test, neg_test : iterables
        Positive / negative test instances, reused for every training set.

    random_state : int (default: 42)
        Accepted but not used by this function.

    Returns
    -------
    list with one entry per training set: the second value returned by
    ``estimate`` (``roc``), or 0 when the positive or the negative
    training set is empty.
    """
    # Every source is consumed twice: once on its own and once as part of
    # the augmented (original + sampled) training set.
    pos_orig, pos_orig_copy = tee(pos_original)
    neg_orig, neg_orig_copy = tee(neg_original)
    pos_sampled, pos_sampled_copy = tee(pos_sampled)
    neg_sampled, neg_sampled_copy = tee(neg_sampled)

    training_sets = (
        ('original', pos_orig, neg_orig),
        ('sample', pos_sampled, neg_sampled),
        ('original+sample',
         chain(pos_orig_copy, pos_sampled_copy),
         chain(neg_orig_copy, neg_sampled_copy)),
    )

    scores = []
    for desc, pos_train, neg_train in training_sets:
        # Count the instances on a duplicate so the training iterators
        # stay intact for the fit.
        pos_train, pos_counter = tee(pos_train)
        neg_train, neg_counter = tee(neg_train)
        pos_size = sum(1 for _ in pos_counter)
        neg_size = sum(1 for _ in neg_counter)

        logger.info("-" * 80)
        logger.info('working on %s' % (desc))
        logger.info('training set sizes: #pos: %d #neg: %d' %
                    (pos_size, neg_size))

        if pos_size == 0 or neg_size == 0:
            logger.info('WARNING: empty dataset')
            scores.append(0)
            continue

        start = time()
        # Keep one branch of the test iterators for later training sets.
        pos_test, pos_test_copy = tee(pos_test)
        neg_test, neg_test_copy = tee(neg_test)

        local_estimator = fit(pos_train,
                              neg_train,
                              Vectorizer(4),
                              n_jobs=-1,
                              n_iter_search=1)
        _, roc = estimate(pos_test_copy, neg_test_copy, local_estimator,
                          Vectorizer(4))
        scores.append(roc)
        logger.info('elapsed: %.1f sec' % (time() - start))
    return scores
def fit_and_evaluate(pos_original, neg_original,
                     pos_sampled, neg_sampled,
                     pos_test, neg_test,
                     random_state=42):
    """Train on original, sampled and combined data; score each on the test set.

    For each of the three training sets ('original', 'sample',
    'original+sample') an estimator is fitted with ``fit`` and evaluated
    with ``estimate`` on ``pos_test`` / ``neg_test``. The returned list
    holds, in that order, the second value returned by ``estimate``
    (``roc``) for each training set, or 0 for a training set whose
    positive or negative part is empty. ``random_state`` is accepted but
    not used by this function.
    """
    def keep_and_count(stream):
        # Duplicate the stream and count one copy, so the other copy is
        # still available for training.
        kept, counted = tee(stream)
        return kept, sum(1 for _ in counted)

    # Each input is needed both alone and inside the augmented set.
    pos_orig, pos_orig_dup = tee(pos_original)
    neg_orig, neg_orig_dup = tee(neg_original)
    pos_sampled, pos_sampled_dup = tee(pos_sampled)
    neg_sampled, neg_sampled_dup = tee(neg_sampled)

    pos_augmented = chain(pos_orig_dup, pos_sampled_dup)
    neg_augmented = chain(neg_orig_dup, neg_sampled_dup)

    experiments = [('original', pos_orig, neg_orig),
                   ('sample', pos_sampled, neg_sampled),
                   ('original+sample', pos_augmented, neg_augmented)]

    results = []
    for desc, pos_train, neg_train in experiments:
        pos_train, pos_size = keep_and_count(pos_train)
        neg_train, neg_size = keep_and_count(neg_train)

        logger.info("-" * 80)
        logger.info('working on %s' % (desc))
        logger.info('training set sizes: #pos: %d #neg: %d' % (pos_size, neg_size))

        if pos_size and neg_size:
            start = time()
            # Re-duplicate the test data so it survives for the next run.
            pos_test, pos_test_dup = tee(pos_test)
            neg_test, neg_test_dup = tee(neg_test)

            local_estimator = fit(pos_train, neg_train, Vectorizer(4),
                                  n_jobs=-1, n_iter_search=1)
            _, roc = estimate(pos_test_dup, neg_test_dup,
                              local_estimator, Vectorizer(4))
            results.append(roc)
            logger.info('elapsed: %.1f sec' % (time() - start))
        else:
            logger.info('WARNING: empty dataset')
            results.append(0)
    return results
Beispiel #7
0
    def fit(self, pos_seqs, neg_seqs=None, times=2, order=2):
        """Fit an estimator to discriminate the pos_seqs from the neg_seqs.

        Parameters
        ----------
        pos_seqs : iterable strings
            Input sequences. A one-shot iterator (e.g. a generator) is
            accepted: if negatives must be generated from it, it is
            turned into a list first so it can be traversed twice.

        neg_seqs : iterable strings (default: None)
            If not None the program uses these as negative examples. If
            it is None, then negative sequences are generated as random
            shuffling of the positive sequences.

        times: int (default: 2)
            Factor between number of negatives and number of positives.

        order: int (default: 2)
            Size of the minimum block to shuffle: 1 means shuffling single characters,
            2 means shuffling pairs of characters, etc.

        Returns
        -------
        self.
        """
        if neg_seqs is None:
            # Generating negatives reads every positive sequence, and the
            # estimator fit below reads them again. Materialize the
            # positives so that an iterator input is not already exhausted
            # by the time the estimator is fitted.
            pos_seqs = list(pos_seqs)
            neg_seqs = list(
                seq_to_seq(pos_seqs,
                           modifier=shuffle_modifier,
                           times=times,
                           order=order))
        self.estimator = fit(pos_seqs,
                             neg_seqs,
                             self.vectorizer,
                             n_jobs=self.n_jobs,
                             cv=10,
                             n_iter_search=1,
                             random_state=self.random_state,
                             n_blocks=5,
                             block_size=None)
        return self