def _fit_predictive_model(self, seqs, neg_seqs=None):
    """Fit the discriminative estimator on graphs built from sequences.

    Both the positive and the negative sequences are converted with
    ``mp_pre_process`` (using ``sequence_to_eden`` as pre-processor and the
    ``pre_processor_*`` settings stored on the instance); the resulting
    graphs are handed to ``fit`` and the fitted estimator is stored in
    ``self.estimator``.

    Parameters
    ----------
    seqs : iterable
        Positive sequences. May be a one-shot iterator.

    neg_seqs : iterable (default: None)
        Negative sequences. If None, negatives are obtained by passing the
        positive sequences through ``seq_to_seq`` with ``shuffle_modifier``,
        using ``self.negative_ratio`` as ``times`` and ``self.shuffle_order``
        as ``order``.

    Returns
    -------
    None. The result is stored in ``self.estimator``.
    """
    pre_process_options = dict(pre_processor=sequence_to_eden,
                               n_blocks=self.pre_processor_n_blocks,
                               block_size=self.pre_processor_block_size,
                               n_jobs=self.pre_processor_n_jobs)

    generate_negatives = neg_seqs is None
    if generate_negatives:
        # The positives are needed twice (graph conversion and shuffling),
        # so the iterator has to be duplicated.
        pos_seqs, pos_seqs_ = tee(seqs)
    else:
        # No duplicate needed: an unconsumed tee branch would keep every
        # positive sequence buffered in memory until this method returns.
        pos_seqs = iter(seqs)

    pos_graphs = mp_pre_process(pos_seqs, **pre_process_options)

    if generate_negatives:
        # shuffle seqs to obtain negatives
        neg_seqs = seq_to_seq(pos_seqs_,
                              modifier=shuffle_modifier,
                              times=self.negative_ratio,
                              order=self.shuffle_order)
    neg_graphs = mp_pre_process(neg_seqs, **pre_process_options)

    # fit discriminative estimator
    self.estimator = fit(pos_graphs, neg_graphs,
                         vectorizer=self.vectorizer,
                         n_iter_search=self.n_iter_search,
                         n_jobs=self.n_jobs,
                         n_blocks=self.n_blocks,
                         block_size=self.block_size,
                         random_state=self.random_state)
def fit(self, pos_seqs, neg_seqs=None, times=2, order=2):
    """Fit an estimator to discriminate the pos_seqs from the neg_seqs.

    The estimator is fitted with fixed settings (``cv=10``,
    ``n_iter_search=1``, ``n_blocks=5``, ``block_size=None``) and stored in
    ``self.estimator``.

    Parameters
    ----------
    pos_seqs : iterable strings
        Input sequences. One-shot iterators are supported: when negatives
        have to be generated the iterator is duplicated first.

    neg_seqs : iterable strings (default: None)
        If not None the program uses these as negative examples. If
        it is None, then negative sequences are generated as random
        shuffling of the positive sequences.

    times: int (default: 2)
        Factor between number of negatives and number of positives.

    order: int (default: 2)
        Size of the minimum block to shuffle: 1 means shuffling single
        characters, 2 means shuffling pairs of characters, etc.

    Returns
    -------
    self.
    """
    if neg_seqs is None:
        from itertools import tee

        # Generating the negatives consumes the positives; without a
        # duplicate, a one-shot iterator would reach the estimator empty.
        pos_seqs, pos_seqs_ = tee(pos_seqs)
        neg_seqs = list(seq_to_seq(pos_seqs_,
                                   modifier=shuffle_modifier,
                                   times=times,
                                   order=order))
    self.estimator = fit(pos_seqs, neg_seqs, self.vectorizer,
                         n_jobs=self.n_jobs,
                         cv=10,
                         n_iter_search=1,
                         random_state=self.random_state,
                         n_blocks=5,
                         block_size=None)
    return self
def generate_negatives_and_fit(iterable=None,
                               negative_shuffle_ratio=None,
                               shuffle_order=None,
                               vectorizer_complexity=None):
    """Set up a binary classification dataset from ``iterable`` and fit a model.

    A ``Vectorizer`` is created with the requested complexity, then
    ``binary_classification_dataset_setup`` is asked for a pair of
    iterables (used here as the positive and the negative training sets)
    and both are passed to ``fit`` with ``n_jobs=-1``, ``cv=3`` and
    ``n_iter_search=1``.

    Parameters
    ----------
    iterable : iterable (default: None)
        Input sequences, forwarded as ``iterable_seq``.

    negative_shuffle_ratio : (default: None)
        Forwarded unchanged to ``binary_classification_dataset_setup``.

    shuffle_order : (default: None)
        Forwarded unchanged to ``binary_classification_dataset_setup``.

    vectorizer_complexity : (default: None)
        Passed as ``complexity`` to ``Vectorizer``.

    Returns
    -------
    The value returned by ``fit``.
    """
    # The vectorizer is built before the dataset, as in the original flow.
    model_vectorizer = Vectorizer(complexity=vectorizer_complexity)
    dataset_options = dict(iterable_seq=iterable,
                           negative_shuffle_ratio=negative_shuffle_ratio,
                           shuffle_order=shuffle_order)
    positives, negatives = binary_classification_dataset_setup(**dataset_options)
    return fit(positives, negatives, model_vectorizer,
               n_jobs=-1, cv=3, n_iter_search=1)
def fit_and_evaluate(pos_original, neg_original,
                     pos_sampled, neg_sampled,
                     pos_test, neg_test,
                     random_state=42):
    '''
    Fit and evaluate an estimator on three training sets.

    pos + neg orig+sampled testsets -> orig_roc , sampled_roc, augmented_roc

    The training sets are, in order: 'original', 'sample' and
    'original+sample' (the chain of the two). For each one an estimator is
    fitted with ``fit`` and scored on the test data with ``estimate``.

    Parameters
    ----------
    pos_original, neg_original : iterables
        Original positive / negative training instances.
    pos_sampled, neg_sampled : iterables
        Sampled positive / negative training instances.
    pos_test, neg_test : iterables
        Positive / negative test instances, reused for every training set.
    random_state : int (default: 42)
        Forwarded to ``fit`` so that the caller controls the seed.

    Returns
    -------
    list with one entry per training set, in the order given above: the
    second value returned by ``estimate`` (the roc), or 0 when the positive
    or the negative training set is empty.
    '''
    # create graph sets...orig augmented and sampled
    # Every input is duplicated so it can feed both its own training set and
    # the augmented one.
    pos_orig, pos_orig_ = tee(pos_original)
    neg_orig, neg_orig_ = tee(neg_original)
    pos_sampled, pos_sampled_ = tee(pos_sampled)
    neg_sampled, neg_sampled_ = tee(neg_sampled)
    pos_augmented = chain(pos_orig_, pos_sampled_)
    neg_augmented = chain(neg_orig_, neg_sampled_)

    training_sets = [('original', pos_orig, neg_orig),
                     ('sample', pos_sampled, neg_sampled),
                     ('original+sample', pos_augmented, neg_augmented)]

    predictive_performances = []
    for desc, pos_train, neg_train in training_sets:
        # Count on a duplicate so the training iterator itself stays intact.
        pos_train, pos_train_ = tee(pos_train)
        neg_train, neg_train_ = tee(neg_train)
        pos_size = sum(1 for _ in pos_train_)
        neg_size = sum(1 for _ in neg_train_)

        logger.info("-" * 80)
        logger.info('working on %s', desc)
        logger.info('training set sizes: #pos: %d #neg: %d',
                    pos_size, neg_size)

        if pos_size == 0 or neg_size == 0:
            logger.info('WARNING: empty dataset')
            predictive_performances.append(0)
            continue

        start = time()
        # The test data is needed again by later training sets: keep one
        # branch for the next round and evaluate on the other.
        pos_test, pos_test_ = tee(pos_test)
        neg_test, neg_test_ = tee(neg_test)
        local_estimator = fit(pos_train, neg_train, Vectorizer(4),
                              n_jobs=-1,
                              n_iter_search=1,
                              random_state=random_state)
        _apr, roc = estimate(pos_test_, neg_test_,
                             local_estimator, Vectorizer(4))
        predictive_performances.append(roc)
        logger.info('elapsed: %.1f sec', time() - start)
    return predictive_performances
def fit_and_evaluate(pos_original, neg_original,
                     pos_sampled, neg_sampled,
                     pos_test, neg_test,
                     random_state=42):
    '''
    Train on original, sampled and augmented data and score each model.

    pos + neg orig+sampled testsets -> orig_roc , sampled_roc, augmented_roc

    Parameters
    ----------
    pos_original, neg_original : iterables
        Original positive / negative training instances.
    pos_sampled, neg_sampled : iterables
        Sampled positive / negative training instances.
    pos_test, neg_test : iterables
        Positive / negative test instances, shared by all three evaluations.
    random_state : int (default: 42)
        Seed passed on to ``fit``.

    Returns
    -------
    list of three values, for the training sets 'original', 'sample' and
    'original+sample' respectively: the roc reported by ``estimate``, or 0
    if the positive or the negative part of that training set is empty.
    '''
    # create graph sets...orig augmented and sampled
    pos_orig, pos_orig_ = tee(pos_original)
    neg_orig, neg_orig_ = tee(neg_original)
    pos_sampled, pos_sampled_ = tee(pos_sampled)
    neg_sampled, neg_sampled_ = tee(neg_sampled)
    # The augmented sets are the original instances followed by the sampled
    # ones, read from the spare branches created above.
    pos_augmented = chain(pos_orig_, pos_sampled_)
    neg_augmented = chain(neg_orig_, neg_sampled_)

    predictive_performances = []
    for desc, pos_train, neg_train in [('original', pos_orig, neg_orig),
                                       ('sample', pos_sampled, neg_sampled),
                                       ('original+sample', pos_augmented, neg_augmented)]:
        # Sizes are measured on a spare branch; the other one is used to fit.
        pos_train, pos_train_ = tee(pos_train)
        neg_train, neg_train_ = tee(neg_train)
        pos_size = sum(1 for _ in pos_train_)
        neg_size = sum(1 for _ in neg_train_)

        logger.info("-" * 80)
        logger.info('working on %s', desc)
        logger.info('training set sizes: #pos: %d #neg: %d',
                    pos_size, neg_size)

        if pos_size == 0 or neg_size == 0:
            logger.info('WARNING: empty dataset')
            predictive_performances.append(0)
        else:
            start = time()
            # Keep one branch of the test data for the following rounds.
            pos_test, pos_test_ = tee(pos_test)
            neg_test, neg_test_ = tee(neg_test)
            # random_state used to be accepted but ignored; it is now
            # honoured by forwarding it to fit.
            local_estimator = fit(pos_train, neg_train, Vectorizer(4),
                                  n_jobs=-1,
                                  n_iter_search=1,
                                  random_state=random_state)
            _apr, roc = estimate(pos_test_, neg_test_,
                                 local_estimator, Vectorizer(4))
            predictive_performances.append(roc)
            logger.info('elapsed: %.1f sec', time() - start)
    return predictive_performances
def fit(self, pos_seqs, neg_seqs=None, times=2, order=2):
    """Fit an estimator to discriminate the pos_seqs from the neg_seqs.

    The fitted estimator is stored in ``self.estimator``. Fitting always
    uses ``cv=10``, ``n_iter_search=1``, ``n_blocks=5`` and
    ``block_size=None``.

    Parameters
    ----------
    pos_seqs : iterable strings
        Input sequences. May be a generator or other one-shot iterator.

    neg_seqs : iterable strings (default: None)
        If not None the program uses these as negative examples. If
        it is None, then negative sequences are generated as random
        shuffling of the positive sequences.

    times: int (default: 2)
        Factor between number of negatives and number of positives.

    order: int (default: 2)
        Size of the minimum block to shuffle: 1 means shuffling single
        characters, 2 means shuffling pairs of characters, etc.

    Returns
    -------
    self.
    """
    if neg_seqs is None:
        from itertools import tee

        # Building the shuffled negatives reads pos_seqs to the end, so it
        # works on a duplicate; otherwise a one-shot iterator would be
        # exhausted before the estimator sees any positive example.
        pos_seqs, shuffle_source = tee(pos_seqs)
        neg_seqs = list(
            seq_to_seq(shuffle_source,
                       modifier=shuffle_modifier,
                       times=times,
                       order=order))
    self.estimator = fit(pos_seqs, neg_seqs, self.vectorizer,
                         n_jobs=self.n_jobs,
                         cv=10,
                         n_iter_search=1,
                         random_state=self.random_state,
                         n_blocks=5,
                         block_size=None)
    return self