def eva(esti, ne, po):
    """Count how many instances the estimator labels correctly.

    The instances in `ne` count as correct when predicted as -1, those in
    `po` when predicted as 1. Both sets are vectorized with the same freshly
    built `Vectorizer`.

    Returns the number of correct predictions over both sets (a count, not
    a ratio).
    """
    vectorizer = Vectorizer()
    correct = 0
    # negatives are processed first, then positives
    for instances, expected_label in ((ne, -1), (po, 1)):
        predictions = esti.predict(vectorizer.transform(instances))
        correct += sum(1 for label in predictions if label == expected_label)
    return correct
def fit_and_evaluate(pos_original,
                     neg_original,
                     pos_sampled,
                     neg_sampled,
                     pos_test,
                     neg_test,
                     random_state=42):
    '''
    Train one estimator per training set and score each on the test data.

    Three training sets are used, in this order: the original instances, the
    sampled instances, and the concatenation of both ('original+sample').

    Returns a list with one entry per training set: the second value
    returned by `estimate` (named roc here), or 0 when the positive or the
    negative part of that training set is empty.

    Note: `random_state` is accepted but not used by this function.
    '''
    # every input stream is duplicated so it can feed both its own training
    # set and the augmented one
    pos_orig, pos_orig_copy = tee(pos_original)
    neg_orig, neg_orig_copy = tee(neg_original)
    pos_sampled, pos_sampled_copy = tee(pos_sampled)
    neg_sampled, neg_sampled_copy = tee(neg_sampled)

    training_sets = [
        ('original', pos_orig, neg_orig),
        ('sample', pos_sampled, neg_sampled),
        ('original+sample',
         chain(pos_orig_copy, pos_sampled_copy),
         chain(neg_orig_copy, neg_sampled_copy)),
    ]

    scores = []
    for desc, pos_train, neg_train in training_sets:
        # counting consumes an iterator, so count on a duplicate
        pos_train, pos_counter = tee(pos_train)
        neg_train, neg_counter = tee(neg_train)
        pos_size = sum(1 for _ in pos_counter)
        neg_size = sum(1 for _ in neg_counter)

        logger.info("-" * 80)
        logger.info('working on %s' % (desc))
        logger.info('training set sizes: #pos: %d #neg: %d' %
                    (pos_size, neg_size))

        if pos_size == 0 or neg_size == 0:
            logger.info('WARNING: empty dataset')
            scores.append(0)
            continue

        start = time()
        # keep one untouched copy of the test streams for the next round
        pos_test, pos_eval = tee(pos_test)
        neg_test, neg_eval = tee(neg_test)

        local_estimator = fit(pos_train,
                              neg_train,
                              Vectorizer(4),
                              n_jobs=-1,
                              n_iter_search=1)
        _, roc = estimate(pos_eval, neg_eval, local_estimator,
                          Vectorizer(4))
        scores.append(roc)
        logger.info('elapsed: %.1f sec' % (time() - start))
    return scores
Example #3
0
    def __init__(self,
                 n_differences=1,
                 enhance=True,
                 vectorizer=None,
                 n_jobs=-1,
                 random_state=1):
        """Generate sequences starting from input sequences that are 'better' if enhance is set to True
        ('worse' otherwise) given the set of sequences used in the fit phase.

        Parameters
        ----------
        n_differences : int (default 1)
            Number of characters that differ for the generated sequence from the original input sequence.

        enhance : bool (default True)
            If set to True then the score computed by the estimator will be higher for the sequences
            generated than for the input sequences. If False then the score will be lower.

        vectorizer : EDeN sequence vectorizer (default None)
            The vectorizer to map sequences to sparse vectors. When None, a new
            Vectorizer(complexity=3) is created for this instance.

        n_jobs : int (default -1)
            The number of cores to use in parallel. -1 indicates all available.

        random_state: int (default 1)
            The random seed.
        """
        if vectorizer is None:
            # Build the default here rather than in the signature: a default
            # expression is evaluated once at definition time, so every
            # instance would otherwise share one Vectorizer object.
            vectorizer = Vectorizer(complexity=3)

        self.random_state = random_state
        self.n_jobs = n_jobs
        self.n_differences = n_differences
        self.enhance = enhance
        self.vectorizer = vectorizer
        # no estimator is available until one is assigned later
        self.estimator = None
Example #4
0
File: RNA.py Project: teresa-m/EDeN
    def __init__(self,
                 complexity=None,
                 nbits=20,
                 sequence_vectorizer_complexity=3,
                 graph_vectorizer_complexity=2,
                 n_neighbors=5,
                 sampling_prob=.5,
                 n_iter=5,
                 min_energy=-5,
                 random_state=1):
        '''
        Store the settings and build the helper objects.

        Parameters
        ----------
        complexity : int or None
            when not None it replaces both sequence_vectorizer_complexity
            and graph_vectorizer_complexity
        nbits : int
            passed to both the sequence and the graph vectorizer
        sequence_vectorizer_complexity : int
            complexity of the sequence vectorizer
        graph_vectorizer_complexity : int
            complexity of the graph vectorizer
        n_neighbors : int
            passed to NearestNeighbors
        sampling_prob, n_iter, min_energy :
            stored unchanged on the instance
        random_state :
            seed given to the module-level `random` generator
        '''
        # note: this seeds the shared `random` module, not a private generator
        random.seed(random_state)

        # a single `complexity` value overrides both specific settings
        override = complexity is not None
        seq_complexity = complexity if override else sequence_vectorizer_complexity
        graph_complexity = complexity if override else graph_vectorizer_complexity

        # plain settings
        self.n_neighbors = n_neighbors
        self.sampling_prob = sampling_prob
        self.n_iter = n_iter
        self.min_energy = min_energy

        # helper objects
        self.sequence_vectorizer = SeqVectorizer(complexity=seq_complexity,
                                                 nbits=nbits,
                                                 normalization=False,
                                                 inner_normalization=False)
        self.graph_vectorizer = GraphVectorizer(complexity=graph_complexity, nbits=nbits)
        self.nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors)
def train_esti(neg, pos):
    """Fit an SGDClassifier that separates `neg` (label -1) from `pos` (label 1).

    Both sets are concatenated, negatives first, and vectorized with a
    freshly built `Vectorizer`. Returns the fitted classifier.
    """
    vectorizer = Vectorizer()
    features = vectorizer.transform(neg + pos)
    classifier = SGDClassifier(shuffle=True)
    # label order follows the concatenation above: negatives, then positives
    labels = np.asarray([-1] * len(neg) + [1] * len(pos))
    classifier.fit(features, labels)
    return classifier
Example #6
0
File: RNA.py Project: teresa-m/EDeN
class Vectorizer(object):

    def __init__(self,
                 complexity=None,
                 nbits=20,
                 sequence_vectorizer_complexity=3,
                 graph_vectorizer_complexity=2,
                 n_neighbors=5,
                 sampling_prob=.5,
                 n_iter=5,
                 min_energy=-5,
                 random_state=1):
        '''
        Parameters
        ----------
        complexity : int or None
            if given, used as the complexity of both the sequence and the
            graph vectorizer, overriding the two specific parameters
        nbits : int
            passed to both vectorizers
        sequence_vectorizer_complexity : int
        graph_vectorizer_complexity : int
        n_neighbors : int
            number of neighbors for the NearestNeighbors model
        sampling_prob : float
            probability used when sub-sampling neighbors
        n_iter : int
            number of sampling rounds; values above 1 enable the
            structure optimization in `graphs`
        min_energy : number
            energy threshold used when aligning sequence and structure
        random_state :
            seed for the module-level `random` generator
        '''
        # seeding happens on the shared `random` module
        random.seed(random_state)

        if complexity is None:
            seq_complexity = sequence_vectorizer_complexity
            graph_complexity = graph_vectorizer_complexity
        else:
            # one value for both vectorizers
            seq_complexity = graph_complexity = complexity

        self.n_neighbors = n_neighbors
        self.sampling_prob = sampling_prob
        self.n_iter = n_iter
        self.min_energy = min_energy

        self.sequence_vectorizer = SeqVectorizer(complexity=seq_complexity,
                                                 nbits=nbits,
                                                 normalization=False,
                                                 inner_normalization=False)
        self.graph_vectorizer = GraphVectorizer(complexity=graph_complexity, nbits=nbits)
        self.nearest_neighbors = NearestNeighbors(n_neighbors=n_neighbors)

    def fit(self, seqs):
        '''
        Remember the normalized sequences and fit the nearest neighbors
        model on their sequence-vectorizer representation.

        Returns
        -------
        self
        '''
        normalized = list(normalize_seqs(seqs))
        # kept so that neighbor indices can be mapped back to sequences
        self.seqs = normalized
        self.nearest_neighbors.fit(self.sequence_vectorizer.transform(normalized))
        return self

    def fit_transform(self, seqs, sampling_prob=None, n_iter=None):
        '''
        Fit on `seqs`, then transform the same sequences.

        `sampling_prob` and `n_iter` are handed on to `transform`.
        '''
        # the input may be a one-shot iterator: duplicate it for the two passes
        for_transform, for_fit = tee(seqs)
        fitted = self.fit(for_fit)
        return fitted.transform(for_transform, sampling_prob=sampling_prob, n_iter=n_iter)

    def transform(self, seqs, sampling_prob=None, n_iter=None):
        '''
        Parameters
        ----------
        seqs : iterable of sequences, normalized with normalize_seqs
        sampling_prob : float or None
            forwarded to `graphs`; None keeps the stored value
        n_iter : int or None
            forwarded to `graphs`; None keeps the stored value

        Returns
        -------
        the output of the graph vectorizer applied to the graphs built
        from the sequences
        '''
        seqs = list(normalize_seqs(seqs))
        # forward the per-call settings: they used to be accepted here (and
        # passed in by fit_transform) but were silently dropped
        graphs_ = self.graphs(seqs, sampling_prob=sampling_prob, n_iter=n_iter)
        data_matrix = self.graph_vectorizer.transform(graphs_)
        return data_matrix

    def graphs(self, seqs, sampling_prob=None, n_iter=None):
        '''
        Generator: yield one graph per input sequence.

        Parameters
        ----------
        seqs : iterable of sequences, normalized with normalize_seqs
        sampling_prob : float or None
            when not None it replaces self.sampling_prob (and stays stored)
        n_iter : int or None
            when not None it replaces self.n_iter (and stays stored)
        '''
        normalized = list(normalize_seqs(seqs))
        if sampling_prob is not None:
            self.sampling_prob = sampling_prob
        if n_iter is not None:
            self.n_iter = n_iter
        for seq, neighs in self._compute_neighbors(normalized):
            # several rounds: pick the most representative sampled structure;
            # otherwise align once against all neighbors
            if self.n_iter > 1:
                build = self._optimize_struct
            else:
                build = self._align_sequence_structure
            header, sequence, struct, energy = build(seq, neighs)
            yield self._seq_to_eden(header, sequence, struct, energy)

    def _optimize_struct(self, seq, neighs):
        '''
        Run self.n_iter rounds of neighbor sampling plus alignment and
        return the (header, sequence, struct, energy) result whose
        structure is selected by `_most_representative`.
        '''
        candidates = []
        for _ in range(self.n_iter):
            sampled = self._sample_neighbors(neighs)
            header, sequence, struct, energy = self._align_sequence_structure(seq, sampled)
            candidates.append((header, sequence, struct, energy))
        # the structure is the third field of every candidate
        structs = [candidate[2] for candidate in candidates]
        return candidates[self._most_representative(structs)]

    def _most_representative(self, structs):
        '''
        Parameters
        ----------
        structs : list of structure strings

        Returns
        -------
        index of the structure with the highest mean rbf kernel value
        against all given structures
        '''
        # the structures are embedded with the sequence vectorizer
        features = self.sequence_vectorizer.transform(structs)
        similarity = pairwise_kernels(features, metric='rbf', gamma=1)
        # column sums divided by the number of instances: mean similarity
        mean_similarity = np.sum(similarity, 0) / features.shape[0]
        # sort by decreasing mean similarity and take the first entry
        ranking = np.argsort(-mean_similarity)
        return ranking[0]

    def _sample_neighbors(self, neighs):
        out_neighs = []
        # insert one element at random
        out_neighs.append(random.choice(neighs))
        # add other elements sampling without replacement
        for neigh in neighs:
            if random.random() < self.sampling_prob:
                out_neighs.append(neigh)
        return out_neighs

    def _align_sequence_structure(self, seq, neighs, structure_deletions=False):
        '''
        Parameters
        ----------
        seq : pair (header, sequence)
        neighs : list of neighbor sequences, in the same format as seq
        structure_deletions : bool
            if True, the structure obtained from the alignment is passed
            through _clean_structure

        Returns
        -------
        header, clean_seq, clean_struct, energy

        Without neighbors the sequence is folded on its own with RNAfold and
        the energy is reported as 0. Otherwise seq and its neighbors are
        aligned with muscle and the alignment is folded with RNAalifold; if
        the resulting energy is above self.min_energy the single-sequence
        RNAfold structure is used instead of the consensus one.
        '''
        header = seq[0]
        if len(neighs) < 1:
            clean_seq, clean_struct = rnafold.RNAfold_wrapper(seq[1])
            energy = 0
            # seq is indexed like a pair; wrap it in a 1-tuple so that '%'
            # formats it as one value even when it is a tuple
            logger.debug('Warning: no alignment for: %s' % (seq,))
        else:
            str_out = convert_seq_to_fasta_str(seq)
            for neigh in neighs:
                str_out += convert_seq_to_fasta_str(neigh)
            # NOTE(review): sequence text is interpolated into a shell
            # command line; headers containing quotes or other shell
            # metacharacters would break or alter the command
            cmd = 'echo "%s" | muscle -clwstrict -quiet' % (str_out)
            out = sp.check_output(cmd, shell=True)
            seed = extract_aligned_seed(header, out)
            cmd = 'echo "%s" | RNAalifold --noPS 2>/dev/null' % (out)
            out = sp.check_output(cmd, shell=True)
            struct, energy = extract_struct_energy(out)
            if energy > self.min_energy:
                # use min free energy structure
                clean_seq, clean_struct = rnafold.RNAfold_wrapper(seq[1])
            else:
                clean_seq, clean_struct = make_seq_struct(seed, struct)
            if structure_deletions:
                clean_struct = self._clean_structure(clean_seq, clean_struct)

        return header, clean_seq, clean_struct, energy

    def _clean_structure(self, seq, stru):
        '''
        Parameters
        ----------
        seq : basestring
            rna sequence
        stru : basestring
            dotbracket string

        Returns
        -------
        the structure given may not respect deletions in the sequence.
        we transform the structure to one that does
        '''

        # find  deletions in sequence
        ids = []
        for i, c in enumerate(seq):
            if c == '-':
                ids.append(i)
        # remove brackets that dont have a partner anymore
        stru = list(stru)
        pairdict = self._pairs(stru)
        for i in ids:
            stru[pairdict[i]] = '.'
        # delete deletions in structure
        ids.reverse()
        for i in ids:
            del stru[i]
        stru = ''.join(stru)

        # removing obvious mistakes
        stru = stru.replace("(())", "....")
        stru = stru.replace("(.)", "...")
        stru = stru.replace("(..)", "....")

        return stru

    def _pairs(self, struct):
        '''
        Parameters
        ----------
        struct : basestring

        Returns
        -------
        dictionary of ids in the struct, that are bond pairs
        '''
        unpaired = []
        pairs = {}
        for i, c in enumerate(struct):
            if c == '(':
                unpaired.append(i)
            if c == ')':
                partner = unpaired.pop()
                pairs[i] = partner
                pairs[partner] = i
        return pairs

    def _compute_neighbors(self, seqs):
        seqs = list(seqs)
        data_matrix = self.sequence_vectorizer.transform(seqs)
        # find neighbors
        distances, neighbors = self.nearest_neighbors.kneighbors(data_matrix)
        # for each seq
        for seq, neighs in zip(seqs, neighbors):
            neighbor_seqs = [self.seqs[neigh] for neigh in neighs]
            yield seq, neighbor_seqs

    def _seq_to_eden(self, header, sequence, struct, energy):
        '''
        Build the graph for a sequence and its dotbracket structure and
        annotate it with 'id', 'info', 'energy' and 'sequence'.

        When the structure graph has fewer than 2 nodes, the graph is built
        from the header and the sequence alone.
        '''
        result = sequence_dotbracket_to_graph(seq_info=sequence, seq_struct=struct)
        if result.number_of_nodes() < 2:
            # fall back to the plain sequence graph
            result = seq_to_networkx(header, sequence)
        attributes = result.graph
        attributes['id'] = header
        attributes['info'] = 'muscle+RNAalifold energy=%.3f' % (energy)
        attributes['energy'] = energy
        attributes['sequence'] = sequence
        return result