def eva(esti, ne, po):
    """Count correct predictions of `esti` on negative and positive sets.

    Parameters
    ----------
    esti : fitted estimator with a ``predict`` method
        Expected to output -1 for negatives and +1 for positives.
    ne : iterable
        Negative instances (true label -1).
    po : iterable
        Positive instances (true label +1).

    Returns
    -------
    int
        Total number of correctly classified instances.
    """
    # NOTE(review): a fresh, unfitted Vectorizer is used here — presumably
    # its transform() does not require fit(); confirm against Vectorizer.
    v = Vectorizer()
    matrix = v.transform(ne)
    # generator expressions avoid building throwaway lists inside sum()
    correct = sum(1 for res in esti.predict(matrix) if res == -1)
    matrix2 = v.transform(po)
    correct += sum(1 for res in esti.predict(matrix2) if res == 1)
    return correct
def fit_and_evaluate(pos_original, neg_original, pos_sampled, neg_sampled, pos_test, neg_test, random_state=42): ''' pos + neg orig+sampled testsets -> orig_roc , sampled_roc, augmented_roc ''' # create graph sets...orig augmented and sampled pos_orig, pos_orig_ = tee(pos_original) neg_orig, neg_orig_ = tee(neg_original) pos_sampled, pos_sampled_ = tee(pos_sampled) neg_sampled, neg_sampled_ = tee(neg_sampled) pos_augmented = chain(pos_orig_, pos_sampled_) neg_augmented = chain(neg_orig_, neg_sampled_) predictive_performances = [] for desc, pos_train, neg_train in [('original', pos_orig, neg_orig), ('sample', pos_sampled, neg_sampled), ('original+sample', pos_augmented, neg_augmented)]: pos_train, pos_train_ = tee(pos_train) neg_train, neg_train_ = tee(neg_train) pos_size = sum(1 for x in pos_train_) neg_size = sum(1 for x in neg_train_) logger.info("-" * 80) logger.info('working on %s' % (desc)) logger.info('training set sizes: #pos: %d #neg: %d' % (pos_size, neg_size)) if pos_size == 0 or neg_size == 0: logger.info('WARNING: empty dataset') predictive_performances.append(0) else: start = time() pos_test, pos_test_ = tee(pos_test) neg_test, neg_test_ = tee(neg_test) local_estimator = fit(pos_train, neg_train, Vectorizer(4), n_jobs=-1, n_iter_search=1) apr, roc = estimate(pos_test_, neg_test_, local_estimator, Vectorizer(4)) predictive_performances.append(roc) logger.info('elapsed: %.1f sec' % (time() - start)) return predictive_performances
def __init__(self, n_differences=1, enhance=True, vectorizer=None, n_jobs=-1, random_state=1):
    """Generate sequences starting from input sequences that are 'better'
    if enhance is set to True ('worse' otherwise) given the set of sequences
    used in the fit phase.

    Parameters
    ----------
    n_differences : int (default 1)
        Number of characters that differ for the generated sequence from
        the original input sequence.

    enhance : bool (default True)
        If set to True then the score computed by the estimator will be
        higher for the sequences generated than for the input sequences. If
        False than the score will be lower.

    vectorizer : EDeN sequence vectorizer (default None)
        The vectorizer to map sequences to sparse vectors. If None, a
        fresh ``Vectorizer(complexity=3)`` is created for this instance.

    n_jobs : int (default -1)
        The number of cores to use in parallel. -1 indicates all available.

    random_state: int (default 1)
        The random seed.
    """
    self.random_state = random_state
    self.n_jobs = n_jobs
    self.n_differences = n_differences
    self.enhance = enhance
    # BUG FIX: the previous default `vectorizer=Vectorizer(complexity=3)` was
    # evaluated once at `def` time and the same instance was shared by every
    # object created without an explicit vectorizer (mutable-default pitfall).
    # Build a fresh one per instance instead.
    self.vectorizer = vectorizer if vectorizer is not None else Vectorizer(complexity=3)
    self.estimator = None
def __init__(self, complexity=None, nbits=20, sequence_vectorizer_complexity=3, graph_vectorizer_complexity=2, n_neighbors=5, sampling_prob=.5, n_iter=5, min_energy=-5, random_state=1):
    """Configure the sequence/graph vectorizers and the neighbor model.

    A single ``complexity`` value, when given, overrides both the sequence
    and the graph vectorizer complexities.
    """
    # NOTE(review): this seeds the *global* random module, affecting any
    # other user of `random` in the process.
    random.seed(random_state)
    if complexity is not None:
        sequence_vectorizer_complexity = graph_vectorizer_complexity = complexity
    # scalar settings
    self.n_neighbors = n_neighbors
    self.sampling_prob = sampling_prob
    self.n_iter = n_iter
    self.min_energy = min_energy
    # component models
    self.sequence_vectorizer = SeqVectorizer(
        complexity=sequence_vectorizer_complexity,
        nbits=nbits,
        normalization=False,
        inner_normalization=False)
    self.graph_vectorizer = GraphVectorizer(
        complexity=graph_vectorizer_complexity,
        nbits=nbits)
    self.nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors)
def train_esti(neg, pos):
    """Fit an SGD classifier on negative (label -1) and positive (label +1) instances.

    Parameters
    ----------
    neg : list
        Negative instances (labeled -1).
    pos : list
        Positive instances (labeled +1).

    Returns
    -------
    SGDClassifier
        The fitted classifier.
    """
    vectorizer = Vectorizer()
    # vectorize negatives followed by positives in a single pass
    data = vectorizer.transform(neg + pos)
    # labels aligned with the concatenation order above
    labels = np.asarray([-1] * len(neg) + [1] * len(pos))
    classifier = SGDClassifier(shuffle=True)
    classifier.fit(data, labels)
    return classifier
class Vectorizer(object):
    """Vectorize RNA sequences via neighbor-informed structure prediction.

    Each input sequence is aligned (muscle) with its nearest neighbors from
    the fit set, a consensus structure is folded (RNAalifold, with RNAfold
    as fallback), the result is turned into a graph, and graphs are
    vectorized with an EDeN graph vectorizer.
    """

    def __init__(self, complexity=None, nbits=20, sequence_vectorizer_complexity=3,
                 graph_vectorizer_complexity=2, n_neighbors=5, sampling_prob=.5,
                 n_iter=5, min_energy=-5, random_state=1):
        """Set up component vectorizers and the nearest-neighbor model.

        `complexity`, when given, overrides both component complexities.
        """
        # NOTE(review): seeds the *global* random module
        random.seed(random_state)
        if complexity is not None:
            sequence_vectorizer_complexity = complexity
            graph_vectorizer_complexity = complexity
        self.sequence_vectorizer = SeqVectorizer(complexity=sequence_vectorizer_complexity,
                                                 nbits=nbits,
                                                 normalization=False,
                                                 inner_normalization=False)
        self.graph_vectorizer = GraphVectorizer(complexity=graph_vectorizer_complexity,
                                                nbits=nbits)
        self.n_neighbors = n_neighbors
        self.sampling_prob = sampling_prob
        self.n_iter = n_iter
        self.min_energy = min_energy
        self.nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors)

    def fit(self, seqs):
        """Store normalized sequences and fit the nearest-neighbor index on them."""
        # store seqs
        self.seqs = list(normalize_seqs(seqs))
        data_matrix = self.sequence_vectorizer.transform(self.seqs)
        # fit nearest_neighbors model
        self.nearest_neighbors.fit(data_matrix)
        return self

    def fit_transform(self, seqs, sampling_prob=None, n_iter=None):
        """Fit on `seqs` and then transform them (iterator-safe via tee)."""
        seqs, seqs_ = tee(seqs)
        return self.fit(seqs_).transform(seqs, sampling_prob=sampling_prob, n_iter=n_iter)

    def transform(self, seqs, sampling_prob=None, n_iter=None):
        """Map sequences to a sparse data matrix of structure-graph features."""
        seqs = list(normalize_seqs(seqs))
        # BUG FIX: sampling_prob and n_iter were accepted but silently dropped;
        # forward them to graphs() as fit_transform's signature implies.
        graphs_ = self.graphs(seqs, sampling_prob=sampling_prob, n_iter=n_iter)
        data_matrix = self.graph_vectorizer.transform(graphs_)
        return data_matrix

    def graphs(self, seqs, sampling_prob=None, n_iter=None):
        """Yield one structure-annotated graph per input sequence.

        NOTE: non-None sampling_prob/n_iter overwrite the instance settings.
        """
        seqs = list(normalize_seqs(seqs))
        if n_iter is not None:
            self.n_iter = n_iter
        if sampling_prob is not None:
            self.sampling_prob = sampling_prob
        for seq, neighs in self._compute_neighbors(seqs):
            if self.n_iter > 1:
                # multiple stochastic neighbor samplings; keep the most
                # representative structure
                header, sequence, struct, energy = self._optimize_struct(seq, neighs)
            else:
                header, sequence, struct, energy = self._align_sequence_structure(seq, neighs)
            graph = self._seq_to_eden(header, sequence, struct, energy)
            yield graph

    def _optimize_struct(self, seq, neighs):
        """Fold `seq` n_iter times with resampled neighbors; return the densest result."""
        structs = []
        results = []
        for i in range(self.n_iter):
            new_neighs = self._sample_neighbors(neighs)
            header, sequence, struct, energy = self._align_sequence_structure(seq, new_neighs)
            results.append((header, sequence, struct, energy))
            structs.append(struct)
        instance_id = self._most_representative(structs)
        selected = results[instance_id]
        return selected

    def _most_representative(self, structs):
        """Return the index of the structure with the highest average similarity."""
        # compute kernel matrix with sequence_vectorizer
        data_matrix = self.sequence_vectorizer.transform(structs)
        kernel_matrix = pairwise_kernels(data_matrix, metric='rbf', gamma=1)
        # instance density = mean kernel value against all instances
        density = np.sum(kernel_matrix, 0) / data_matrix.shape[0]
        max_id = np.argsort(-density)[0]
        return max_id

    def _sample_neighbors(self, neighs):
        """Return one guaranteed random neighbor plus each neighbor kept with
        probability `sampling_prob` (the guaranteed one may thus repeat)."""
        out_neighs = []
        # always keep at least one element, chosen at random
        out_neighs.append(random.choice(neighs))
        # keep each remaining element independently with prob sampling_prob
        for neigh in neighs:
            if random.random() < self.sampling_prob:
                out_neighs.append(neigh)
        return out_neighs

    def _align_sequence_structure(self, seq, neighs, structure_deletions=False):
        """Align `seq` with `neighs` and fold a consensus structure.

        Falls back to single-sequence RNAfold when there are no neighbors or
        the alifold energy is above `min_energy`.
        Returns (header, clean_seq, clean_struct, energy).
        """
        header = seq[0]
        if len(neighs) < 1:
            clean_seq, clean_struct = rnafold.RNAfold_wrapper(seq[1])
            energy = 0
            logger.debug('Warning: no alignment for: %s' % seq)
        else:
            str_out = convert_seq_to_fasta_str(seq)
            for neigh in neighs:
                str_out += convert_seq_to_fasta_str(neigh)
            # SECURITY(review): shell=True with string-interpolated sequence
            # data; fine for trusted input, but consider a list argv + pipes.
            cmd = 'echo "%s" | muscle -clwstrict -quiet' % (str_out)
            out = sp.check_output(cmd, shell=True)
            seed = extract_aligned_seed(header, out)
            cmd = 'echo "%s" | RNAalifold --noPS 2>/dev/null' % (out)
            out = sp.check_output(cmd, shell=True)
            struct, energy = extract_struct_energy(out)
            if energy > self.min_energy:
                # consensus too weak: use min free energy structure instead
                clean_seq, clean_struct = rnafold.RNAfold_wrapper(seq[1])
            else:
                clean_seq, clean_struct = make_seq_struct(seed, struct)
            if structure_deletions:
                clean_struct = self._clean_structure(clean_seq, clean_struct)
        return header, clean_seq, clean_struct, energy

    def _clean_structure(self, seq, stru):
        """Make `stru` consistent with the deletions ('-') present in `seq`.

        Parameters
        ----------
        seq : basestring
            rna sequence (aligned; may contain '-')
        stru : basestring
            dotbracket string

        Returns
        -------
        basestring
            The structure with deleted columns removed and orphaned
            brackets opened up.
        """
        # find deletions in sequence
        ids = []
        for i, c in enumerate(seq):
            if c == '-':
                ids.append(i)
        # remove brackets that dont have a partner anymore
        stru = list(stru)
        pairdict = self._pairs(stru)
        for i in ids:
            # BUG FIX: only paired positions appear in pairdict; a deleted
            # unpaired column previously raised KeyError here.
            if i in pairdict:
                stru[pairdict[i]] = '.'
        # delete deletions in structure (right-to-left so indices stay valid)
        ids.reverse()
        for i in ids:
            del stru[i]
        stru = ''.join(stru)
        # removing obvious mistakes (degenerate tiny helices)
        stru = stru.replace("(())", "....")
        stru = stru.replace("(.)", "...")
        stru = stru.replace("(..)", "....")
        return stru

    def _pairs(self, struct):
        """Map each paired position in a dotbracket string to its partner.

        Parameters
        ----------
        struct : basestring

        Returns
        -------
        dict
            position -> partner position, for both directions of each pair.
            NOTE(review): an unbalanced ')' would pop an empty list and raise
            IndexError — input is assumed to be balanced.
        """
        unpaired = []
        pairs = {}
        for i, c in enumerate(struct):
            if c == '(':
                unpaired.append(i)
            if c == ')':
                partner = unpaired.pop()
                pairs[i] = partner
                pairs[partner] = i
        return pairs

    def _compute_neighbors(self, seqs):
        """Yield (seq, neighbor_seqs) using the fitted nearest-neighbor index."""
        seqs = list(seqs)
        data_matrix = self.sequence_vectorizer.transform(seqs)
        # find neighbors
        distances, neighbors = self.nearest_neighbors.kneighbors(data_matrix)
        # for each seq, resolve neighbor indices against the fit-time seqs
        for seq, neighs in zip(seqs, neighbors):
            neighbor_seqs = [self.seqs[neigh] for neigh in neighs]
            yield seq, neighbor_seqs

    def _seq_to_eden(self, header, sequence, struct, energy):
        """Build an annotated networkx graph from a sequence + dotbracket structure."""
        graph = sequence_dotbracket_to_graph(seq_info=sequence, seq_struct=struct)
        if graph.number_of_nodes() < 2:
            # degenerate structure: fall back to a plain sequence graph
            graph = seq_to_networkx(header, sequence)
        graph.graph['id'] = header
        graph.graph['info'] = 'muscle+RNAalifold energy=%.3f' % (energy)
        graph.graph['energy'] = energy
        graph.graph['sequence'] = sequence
        return graph