def _generate(self, input_seq, n_seqs=1, show_score=False):
    """Generate variants of one input sequence and keep the top n_seqs.

    Candidate sequences are produced by ``self._replace`` and scored with
    the estimator's decision function. When ``self.enhance`` is true the
    highest scoring candidates are kept (highest first); otherwise the
    lowest scoring ones are kept (lowest first).

    Parameters
    ----------
    input_seq : pair
        A (header, seq) pair; only ``seq`` is used here.

    n_seqs : int (default: 1)
        Number of generated sequences to keep. 0 yields an empty result.

    show_score : bool (default: False)
        If True return (score, sequence) pairs instead of sequences.

    Returns
    -------
    sequences : numpy array or zip of pairs
        The selected generated sequences, or the result of zipping their
        scores with them when show_score is True.
    """
    # NOTE(review): an identical `_generate` definition follows this one in
    # the file; if both live in the same class body only the later one is
    # bound -- confirm and remove the duplicate.
    _, seq = input_seq
    # find best/worst n_differences positions
    seq_items, n_differences_ids = self._find_key_positions(seq)
    # replace all possible kmers of size n_differences
    gen_seqs = list(self._replace(seq_items, n_differences_ids))
    # keep the best/worst
    preds = predict(iterable=gen_seqs,
                    estimator=self.estimator,
                    vectorizer=self.vectorizer,
                    mode='decision_function',
                    n_blocks=5,
                    block_size=None,
                    n_jobs=self.n_jobs)
    # argsort is ascending: lowest scores first, highest scores last
    sorted_pred_ids = np.argsort(preds)
    if self.enhance:
        # Reverse first, then slice: the former `[-n_seqs:]` slice returned
        # *every* candidate when n_seqs == 0, because -0 == 0.
        n_seqs_ids = sorted_pred_ids[::-1][:n_seqs]
    else:
        n_seqs_ids = sorted_pred_ids[:n_seqs]
    selected_seqs = np.array(gen_seqs)[n_seqs_ids]
    if show_score:
        return zip(np.array(preds)[n_seqs_ids], selected_seqs)
    return selected_seqs
def _generate(self, input_seq, n_seqs=1, show_score=False):
    """Generate variants of one input sequence and keep the top n_seqs.

    Candidate sequences are produced by ``self._replace`` and scored with
    the estimator's decision function. When ``self.enhance`` is true the
    highest scoring candidates are kept (highest first); otherwise the
    lowest scoring ones are kept (lowest first).

    Parameters
    ----------
    input_seq : pair
        A (header, seq) pair; only ``seq`` is used here.

    n_seqs : int (default: 1)
        Number of generated sequences to keep. 0 yields an empty result.

    show_score : bool (default: False)
        If True return (score, sequence) pairs instead of sequences.

    Returns
    -------
    sequences : numpy array or zip of pairs
        The selected generated sequences, or the result of zipping their
        scores with them when show_score is True.
    """
    _, seq = input_seq
    # find best/worst n_differences positions
    seq_items, n_differences_ids = self._find_key_positions(seq)
    # replace all possible kmers of size n_differences
    gen_seqs = list(self._replace(seq_items, n_differences_ids))
    # keep the best/worst
    preds = predict(iterable=gen_seqs,
                    estimator=self.estimator,
                    vectorizer=self.vectorizer,
                    mode='decision_function',
                    n_blocks=5,
                    block_size=None,
                    n_jobs=self.n_jobs)
    # argsort is ascending: lowest scores first, highest scores last
    sorted_pred_ids = np.argsort(preds)
    if self.enhance:
        # Reverse first, then slice: the former `[-n_seqs:]` slice returned
        # *every* candidate when n_seqs == 0, because -0 == 0.
        n_seqs_ids = sorted_pred_ids[::-1][:n_seqs]
    else:
        n_seqs_ids = sorted_pred_ids[:n_seqs]
    selected_seqs = np.array(gen_seqs)[n_seqs_ids]
    if show_score:
        return zip(np.array(preds)[n_seqs_ids], selected_seqs)
    return selected_seqs
def sample(self, seqs, n_seqs=1, show_score=False,
           enhance=None, n_differences=None):
    """Lazily yield sequences derived from each input sequence.

    For every input sequence, ``self._generate`` is asked for ``n_seqs``
    variants, which are yielded one by one. The variants score higher
    than the alternatives when ``self.enhance`` is true and lower
    otherwise.

    Parameters
    ----------
    seqs : iterable
        Input sequences. Each item is indexed as (header, sequence).

    n_seqs : int (default: 1)
        Number of sequences to generate from each input sequence.

    show_score : bool (default: False)
        If True each yielded item is a (score, sequence) pair and the
        score of the input sequence itself is logged at debug level.
        If False each yielded item is a sequence.

    enhance : bool (default: None)
        If True the generated sequences are the highest scoring ones,
        if False the lowest scoring ones. A non-None value is stored on
        the instance; None leaves the current setting untouched.

    n_differences : int (default: None)
        Number of characters in which a generated sequence differs from
        its input sequence. A non-None value is stored on the instance;
        None leaves the current setting untouched.

    Returns
    -------
    sequences : generator
        Sequences, or (score, sequence) pairs if show_score is True.
    """
    # Explicit overrides replace the instance settings (and stay in place
    # after this call); this runs on first iteration of the generator.
    if enhance is not None:
        self.enhance = enhance
    if n_differences is not None:
        self.n_differences = n_differences

    for input_seq in seqs:
        if show_score:
            # Score the unmodified input so the log shows a baseline.
            input_scores = predict(
                iterable=[input_seq],
                estimator=self.estimator,
                vectorizer=self.vectorizer,
                mode='decision_function',
                n_blocks=5,
                block_size=None,
                n_jobs=self.n_jobs)
            log_fields = (input_seq[0], input_scores[0], input_seq[1])
            logger.debug('%s\n%+.3f %s' % log_fields)
        candidates = self._generate(
            input_seq, n_seqs=n_seqs, show_score=show_score)
        for candidate in candidates:
            yield candidate