Exemple #1
0
def rnaplfold_to_eden(iterable,
                      max_num_edges=1,
                      window_size=150,
                      max_bp_span=100,
                      hard_threshold=0.5,
                      avg_bp_prob_cutoff=0.2,
                      no_lonely_bps=True,
                      nesting=True):
    """Yield one graph per ``(header, seq)`` pair of ``iterable``.

    Each pair is converted with ``_rnaplfold_to_eden``. If the conversion
    raises, or the header or the sequence is empty, the error is logged at
    debug level and the result of ``sequence_fold`` is yielded instead.

    Parameters
    ----------
    iterable : iterable of (header, seq) pairs

    max_num_edges, window_size, max_bp_span, hard_threshold,
    avg_bp_prob_cutoff, no_lonely_bps, nesting
        Forwarded unchanged (positionally, in this order) to
        ``_rnaplfold_to_eden``.

    Yields
    ------
    graph
        The value returned by ``_rnaplfold_to_eden`` or, on failure, by
        ``sequence_fold``.
    """
    assert (is_iterable(iterable)), 'Not iterable'
    for header, seq in iterable:
        try:
            # Empty fields are routed through the same fallback path as
            # conversion errors because the asserts live inside the try.
            assert (header), 'Empty header'
            assert (seq), 'Empty seq'
            sequence = (header, seq)
            graph = _rnaplfold_to_eden(sequence, max_num_edges, window_size,
                                       max_bp_span, hard_threshold,
                                       avg_bp_prob_cutoff, no_lonely_bps,
                                       nesting)
        except Exception as e:
            logger.debug(e.__doc__)
            # Log the exception itself: ``e.message`` was deprecated in
            # Python 2.6, is empty for multi-argument exceptions and no
            # longer exists in Python 3.
            logger.debug('%s', e)
            logger.debug('Error in: %s', seq)
            graph = sequence_fold((header, seq))
        yield graph
Exemple #2
0
 def annotate(self, iterable):
     """Pre-process ``iterable`` and annotate the resulting graphs.

     The input is passed through ``mp_pre_process`` using the instance's
     pre-processor settings; the outcome is handed to
     ``self.vectorizer.annotate`` together with ``self.estimator`` and
     that call's result is returned.
     """
     assert is_iterable(iterable), 'Not iterable'
     pre_processed = mp_pre_process(
         iterable,
         pre_processor=self.pre_processor,
         pre_processor_args=self.pre_processor_args,
         n_blocks=self.pre_processor_n_blocks,
         block_size=self.pre_processor_block_size,
         n_jobs=self.pre_processor_n_jobs,
     )
     return self.vectorizer.annotate(pre_processed, self.estimator)
Exemple #3
0
 def annotate(self, iterable):
     """Return ``self.vectorizer.annotate`` applied to the pre-processed input.

     ``iterable`` is first run through ``mp_pre_process`` configured from
     the ``pre_processor*`` attributes of the instance; ``self.estimator``
     is supplied as the second argument of the annotation call.
     """
     assert is_iterable(iterable), 'Not iterable'
     graph_stream = mp_pre_process(
         iterable,
         pre_processor=self.pre_processor,
         pre_processor_args=self.pre_processor_args,
         n_blocks=self.pre_processor_n_blocks,
         block_size=self.pre_processor_block_size,
         n_jobs=self.pre_processor_n_jobs)
     annotated = self.vectorizer.annotate(graph_stream, self.estimator)
     return annotated
Exemple #4
0
def rnafold_to_eden(iterable=None, **options):
    """Yield one graph per ``(header, seq)`` pair of ``iterable``.

    Each pair is converted with ``string_to_networkx``; when that raises,
    the error is printed and the result of ``seq_to_networkx`` is yielded
    instead. ``options`` are forwarded unchanged to both helpers.
    """
    assert(is_iterable(iterable)), 'Not iterable'
    for header, seq in iterable:
        try:
            graph = string_to_networkx(header, seq, **options)
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # Print the exception itself: ``e.message`` was deprecated in
            # Python 2.6 and removed in Python 3.
            print(e)
            print('Error in: %s' % seq)
            graph = seq_to_networkx(header, seq, **options)
        yield graph
Exemple #5
0
def rnafold_to_eden(iterable=None, **options):
    """Lazily convert ``(header, seq)`` pairs to graphs.

    ``string_to_networkx`` is tried first for every pair. If it raises,
    diagnostics are printed and ``seq_to_networkx`` supplies the graph.
    Both helpers receive ``options`` unchanged.
    """
    assert (is_iterable(iterable)), 'Not iterable'
    for header, seq in iterable:
        try:
            graph = string_to_networkx(header, seq, **options)
        except Exception as e:
            # print(...) with one argument is valid on Python 2 and 3 alike.
            print(e.__doc__)
            # ``e.message`` is gone in Python 3 (deprecated since 2.6);
            # printing the exception gives its string form instead.
            print(e)
            print('Error in: %s' % seq)
            graph = seq_to_networkx(header, seq, **options)
        yield graph
def rnasubopt_to_eden(iterable, **options):
    """Yield graphs for every ``(header, seq, const)`` triple of ``iterable``.

    All graphs produced by ``string_to_networkx(header, seq, const,
    **options)`` are yielded. If an exception occurs, diagnostics are
    printed and one graph from ``seq_to_networkx(header, seq, **options)``
    is yielded; graphs already yielded for that triple are not retracted.
    """
    assert(is_iterable(iterable)), 'Not iterable'
    for header, seq, const in iterable:
        try:
            for graph in string_to_networkx(header, seq, const, **options):
                yield graph
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # ``e.message`` was deprecated in Python 2.6 and removed in 3.
            print(e)
            print('Error in: %s' % seq)
            graph = seq_to_networkx(header, seq, **options)
            yield graph
Exemple #7
0
def rnashapes_struct_to_eden(iterable, **options):
    """Yield graphs for every ``(header, seq)`` pair of ``iterable``.

    All graphs produced by ``string_to_networkx(header, seq, **options)``
    are yielded. If an exception occurs, diagnostics are printed and one
    graph from ``seq_to_networkx(header, seq, **options)`` is yielded;
    graphs already yielded for that pair are not retracted.
    """
    assert(is_iterable(iterable)), 'Not iterable'
    for header, seq in iterable:
        try:
            for graph in string_to_networkx(header, seq, **options):
                yield graph
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # ``e.message`` was deprecated in Python 2.6 and removed in 3.
            print(e)
            print('Error in: %s %s' % (header, seq))
            graph = seq_to_networkx(header, seq, **options)
            yield graph
def rnashapes_struct_to_eden(iterable, **options):
    """Lazily convert ``(header, seq)`` pairs into graphs.

    For each pair, every graph generated by ``string_to_networkx`` is
    yielded. On any exception the error is printed and a single graph
    built by ``seq_to_networkx`` is yielded in addition to whatever was
    already produced. ``options`` go unchanged to both helpers.
    """
    assert(is_iterable(iterable)), 'Not iterable'
    for header, seq in iterable:
        try:
            for graph in string_to_networkx(header, seq, **options):
                yield graph
        except Exception as e:
            # print(...) with one argument is valid on Python 2 and 3 alike.
            print(e.__doc__)
            # ``e.message`` is gone in Python 3 (deprecated since 2.6).
            print(e)
            print('Error in: %s %s' % (header, seq))
            graph = seq_to_networkx(header, seq, **options)
            yield graph
Exemple #9
0
    def test_fasta_to_sequence_no_normalize(self):
        """Test fasta_to_sequence with normalize=False. -> moved to garden doctest"""

        fa_fn = "test/test_fasta_to_sequence.fa"
        seq = fasta_to_sequence(fa_fn, normalize=False)
        assert (is_iterable(seq))
        # next(seq) works on Python 2.6+ and 3; seq.next() is Python 2 only.
        # The header is not checked in this test.
        (_, sequence) = next(seq)
        # sequence should correspond to the unmodified fasta string
        assert (
            sequence ==
            "gtggcgtactcacggccaCCTTAGGACTCCGCGGACTTTATGCCCACCAAAAAAACGAGCCGTTTCTACGCGTCCTCCGTCGCCTgtgtcgataaagcaa"
        )
Exemple #10
0
    def test_fasta_to_sequence_normalized(self):
        """Test fasta_to_sequence with normalize=True. -> moved to garden doctest"""

        fa_fn = "test/test_fasta_to_sequence.fa"
        seq = fasta_to_sequence(fa_fn, normalize=True)
        assert (is_iterable(seq))
        # next(seq) works on Python 2.6+ and 3; seq.next() is Python 2 only.
        # The header is not checked in this test.
        (_, sequence) = next(seq)
        # sequence should be uppercased and all Ts should be replaced by Us
        assert (
            sequence ==
            "GUGGCGUACUCACGGCCACCUUAGGACUCCGCGGACUUUAUGCCCACCAAAAAAACGAGCCGUUUCUACGCGUCCUCCGUCGCCUGUGUCGAUAAAGCAA"
        )
def rnasubopt_to_eden(iterable, **options):
    """Yield graphs for every ``(header, seq)`` pair of ``iterable``.

    All graphs produced by ``string_to_networkx(header, seq, **options)``
    are yielded. If an exception occurs, diagnostics are printed and one
    graph from ``seq_to_networkx(header, seq, **options)`` is yielded;
    graphs already yielded for that pair are not retracted.
    """
    assert (is_iterable(iterable)), 'Not iterable'
    for header, seq in iterable:
        try:
            for graph in string_to_networkx(header, seq, **options):
                yield graph
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # ``e.message`` was deprecated in Python 2.6 and removed in 3.
            print(e)
            print('Error in: %s' % seq)
            graph = seq_to_networkx(header, seq, **options)
            yield graph
Exemple #12
0
def sequence_to_eden(iterable, **options):
    """Lazily turn sequences into graphs via ``seq_to_networkx``.

    When the ``no_header`` option is exactly ``True`` every item of
    ``iterable`` is treated as a bare sequence and the header ``'NONE'``
    is used; otherwise items are unpacked as ``(header, seq)`` pairs.
    All ``options`` (including ``no_header``) are forwarded unchanged.
    """
    headerless = options.get('no_header', False)
    assert is_iterable(iterable), 'Not iterable'
    if headerless is not True:
        for header, seq in iterable:
            yield seq_to_networkx(header, seq, **options)
        return
    for seq in iterable:
        yield seq_to_networkx('NONE', seq, **options)
Exemple #13
0
def sequence_to_eden(iterable, **options):
    """Generate one ``seq_to_networkx`` graph per input item.

    Items are ``(header, seq)`` pairs unless ``options['no_header']`` is
    ``True``, in which case each item is the sequence itself and
    ``'NONE'`` stands in for the header. The full ``options`` dict is
    passed on to ``seq_to_networkx``.
    """
    without_header = options.get('no_header', False) is True
    assert is_iterable(iterable), 'Not iterable'
    if without_header:
        for bare_seq in iterable:
            yield seq_to_networkx('NONE', bare_seq, **options)
    else:
        for name, residues in iterable:
            yield seq_to_networkx(name, residues, **options)
Exemple #14
0
def sample_parameters_uniformly_at_random(parameters_priors):
    """Draw one concrete value for every entry of ``parameters_priors``.

    Entries whose prior satisfies ``is_iterable`` are sampled with
    ``random.choice``; any other prior is used as the value itself.
    Returns the resulting dict, or ``None`` when ``parameters_priors`` is
    falsy.
    """
    if not parameters_priors:
        return None
    sampled = {}
    for name in parameters_priors:
        if is_iterable(parameters_priors[name]):
            sampled[name] = random.choice(parameters_priors[name])
        else:
            sampled[name] = parameters_priors[name]
    return sampled
Exemple #15
0
def sample_parameters_uniformly_at_random(parameters_priors):
    """Return a dict with one sampled value per key of ``parameters_priors``.

    A prior for which ``is_iterable`` is true contributes a
    ``random.choice`` of itself; every other prior is copied through
    unchanged. A falsy ``parameters_priors`` yields ``None``.
    """
    if parameters_priors:
        chosen = {}
        for key in parameters_priors:
            draw_from_prior = is_iterable(parameters_priors[key])
            chosen[key] = (random.choice(parameters_priors[key])
                           if draw_from_prior else parameters_priors[key])
        return chosen
    return None
Exemple #16
0
 def _data_matrix(self, iterable, fit_vectorizer=False):
     """Pre-process ``iterable`` and return the result of ``vectorize``.

     The input goes through ``mp_pre_process`` (configured from the
     ``pre_processor*`` attributes), the vectorizer parameters are set
     from ``self.vectorizer_args`` and, when ``fit_vectorizer`` is true,
     the vectorizer is fitted on a second copy of the graph stream
     before vectorization.
     """
     assert is_iterable(iterable), 'Not iterable'
     pre_processed = mp_pre_process(
         iterable,
         pre_processor=self.pre_processor,
         pre_processor_args=self.pre_processor_args,
         n_blocks=self.pre_processor_n_blocks,
         block_size=self.pre_processor_block_size,
         n_jobs=self.pre_processor_n_jobs,
     )
     # Two independent iterators: one to vectorize, one for optional fitting.
     to_vectorize, to_fit = tee(pre_processed)
     self.vectorizer.set_params(**self.vectorizer_args)
     if fit_vectorizer:
         self.vectorizer.fit(to_fit)
     return vectorize(
         to_vectorize,
         vectorizer=self.vectorizer,
         n_jobs=self.n_jobs,
         n_blocks=self.n_blocks,
     )
Exemple #17
0
    def test_fasta_to_sequence_default(self):
        """Test fasta_to_sequence with default parameters. -> moved to garden doctest"""

        fa_fn = "test/test_fasta_to_sequence.fa"
        seq = fasta_to_sequence(fa_fn)
        assert (is_iterable(seq))
        # next(seq) works on Python 2.6+ and 3; seq.next() is Python 2 only.
        (header, sequence) = next(seq)
        # header should contain the fasta header with '>' removed
        assert (header == "ID0")
        # sequence should be uppercased and all Ts should be replaced by Us
        assert (
            sequence ==
            "GUGGCGUACUCACGGCCACCUUAGGACUCCGCGGACUUUAUGCCCACCAAAAAAACGAGCCGUUUCUACGCGUCCUCCGUCGCCUGUGUCGAUAAAGCAA"
        )
Exemple #18
0
def rnaplfold_to_eden(iterable, **options):
    """Yield one graph per ``(header, seq)`` pair of ``iterable``.

    Each pair is converted with ``string_to_networkx``; when that raises,
    a diagnostic block is printed and the result of ``seq_to_networkx``
    is yielded instead. ``options`` are forwarded unchanged to both
    helpers.
    """
    assert(is_iterable(iterable)), 'Not iterable'
    for header, seq in iterable:
        try:
            graph = string_to_networkx(header, seq, **options)
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3;
            # print('') emits the blank separator line on both.
            print('')
            print('-' * 80)
            # Print the exception itself: ``e.message`` was deprecated in
            # Python 2.6 and removed in Python 3.
            print(e)
            print('Error in: %s %s' % (header, seq))
            print('Reverting to path graph from sequence')
            graph = seq_to_networkx(header, seq, **options)
        yield graph
Exemple #19
0
def rnaplfold_to_eden(iterable, **options):
    """Lazily convert ``(header, seq)`` pairs to graphs.

    ``string_to_networkx`` is tried first for every pair. If it raises,
    a diagnostic block is printed and ``seq_to_networkx`` supplies the
    graph. Both helpers receive ``options`` unchanged.
    """
    assert (is_iterable(iterable)), 'Not iterable'
    for header, seq in iterable:
        try:
            graph = string_to_networkx(header, seq, **options)
        except Exception as e:
            # print(...) with one argument is valid on Python 2 and 3 alike;
            # print('') produces the blank separator line on both.
            print('')
            print('-' * 80)
            # ``e.message`` is gone in Python 3 (deprecated since 2.6);
            # printing the exception gives its string form instead.
            print(e)
            print('Error in: %s %s' % (header, seq))
            print('Reverting to path graph from sequence')
            graph = seq_to_networkx(header, seq, **options)
        yield graph
Exemple #20
0
def rnafold_to_eden(iterable=None, **options):
    """Yield one graph per ``(header, seq)`` pair of ``iterable``.

    Parameters
    ----------
    iterable: over (header_string, sequence_string)

    options
        Forwarded unchanged to ``string_to_networkx`` and, on failure, to
        ``seq_to_networkx``.

    Returns
    -------
        nx.graph generator
    """
    assert (is_iterable(iterable)), 'Not iterable'
    for header, seq in iterable:
        try:
            graph = string_to_networkx(header, seq, **options)
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # Print the exception itself: ``e.message`` was deprecated in
            # Python 2.6 and removed in Python 3.
            print(e)
            print('Error in: %s' % seq)
            # Fall back to the graph built by seq_to_networkx.
            graph = seq_to_networkx(header, seq, **options)
        yield graph
Exemple #21
0
def rnafold_to_eden(iterable=None, **options):
    """Fold RNA seq with RNAfold.

    Parameters
    ----------
    iterable: over (header_string, sequence_string)

    options
        Forwarded unchanged to ``_string_to_networkx`` and, on failure,
        to ``sequence_fold``.

    Returns
    -------
        nx.graph generator
    """
    assert (is_iterable(iterable)), 'Not iterable'
    for header, seq in iterable:
        try:
            graph = _string_to_networkx(header, seq, **options)
        except Exception as e:
            logger.debug(e.__doc__)
            # Log the exception itself: ``e.message`` was deprecated in
            # Python 2.6, is empty for multi-argument exceptions and no
            # longer exists in Python 3.
            logger.debug('%s', e)
            logger.debug('Error in: %s', seq)
            graph = sequence_fold(header, seq, **options)
        yield graph
Exemple #22
0
def rnafold_to_eden(iterable=None, **options):
    """Lazily convert ``(header, seq)`` pairs to graphs.

    Parameters
    ----------
    iterable: over (header_string, sequence_string)

    options
        Passed through to ``string_to_networkx``; also passed to
        ``seq_to_networkx`` when the former raises.

    Returns
    -------
        nx.graph generator
    """
    assert (is_iterable(iterable)), 'Not iterable'
    for header, seq in iterable:
        try:
            graph = string_to_networkx(header, seq, **options)
        except Exception as e:
            # print(...) with one argument is valid on Python 2 and 3 alike.
            print(e.__doc__)
            # ``e.message`` is gone in Python 3 (deprecated since 2.6);
            # printing the exception gives its string form instead.
            print(e)
            print('Error in: %s' % seq)
            graph = seq_to_networkx(header, seq, **options)
        yield graph
Exemple #23
0
def rnashapes_to_eden(iterable, **options):
    """Transforms sequences to graphs that encode secondary structure information
    according to the RNAShapes algorithm.

    For every (header, sequence) pair all graphs generated by
    ``string_to_networkx`` are yielded. If an exception occurs, diagnostics
    are printed and one graph built by ``seq_to_networkx`` is yielded.

    Parameters
    ----------
    iterable : iterable
        iterable pairs of header and sequence strings

    The remaining parameters are passed as keyword ``options``:

    rnashapes_version : int (default 2)
        The version of RNAshapes that is in the path.
        2   e.g. RNAshapes version 2.1.6
        3   e.g. RNAshapes version 3.3.0

    shape_type : int (default 5)
        Is the level of abstraction or dissimilarity which defines a different shape.
        In general, helical regions are depicted by a pair of opening and closing brackets
        and unpaired regions are represented as a single underscore. The differences of the
        shape types are due to whether a structural element (bulge loop, internal loop, multiloop,
        hairpin loop, stacking region and external loop) contributes to the shape representation:
        Five types are implemented.
        1   Most accurate - all loops and all unpaired  [_[_[]]_[_[]_]]_
        2   Nesting pattern for all loop types and unpaired regions in external loop
        and multiloop [[_[]][_[]_]]
        3   Nesting pattern for all loop types but no unpaired regions [[[]][[]]]
        4   Helix nesting pattern in external loop and multiloop [[][[]]]
        5   Most abstract - helix nesting pattern and no unpaired regions [[][]]

    energy_range : float (default 10)
        Sets the energy range as percentage value of the minimum free energy.
        For example, when relative deviation is specified as 5.0, and the minimum free energy
        is -10.0 kcal/mol, the energy range is set to -9.5 to -10.0 kcal/mol.
        Relative deviation must be a positive floating point number; by default it is set to 10 %.

    max_num : int (default 3)
        Is the maximum number of structures that are generated.

    split_components : bool (default False)
        If True each structure is yielded as an independent graph. Otherwise all structures
        are part of the same graph that has therefore several disconnected components.

    example: transform a simple sequence using RNAshapes version 3+
        >>> graphs = rnashapes_to_eden([("ID", "CCCCCGGGGG")], rnashapes_version=3)
        >>> g = next(graphs)
        >>> # extract sequence from graph nodes
        >>> "".join([ value["label"] for (key, value) in g.nodes(data=True)])
        'CCCCCGGGGG'
        >>> # get edge types
        >>> [(start, end, g.edge[start][end]["type"]) for start, end in g.edges()]
        [(0, 8, 'basepair'), (0, 1, 'backbone'), (1, 2, 'backbone'), (1, 7, 'basepair'), (2, 3, 'backbone'), (2, 6, 'basepair'), (3, 4, 'backbone'), (4, 5, 'backbone'), (5, 6, 'backbone'), (6, 7, 'backbone'), (7, 8, 'backbone'), (8, 9, 'backbone')]

    example: transform a simple sequence using RNAshapes version 3+, splitting components
        >>> graphs = rnashapes_to_eden([("ID", "CCCCCGGGGG")], split_components=True, rnashapes_version=3)
        >>> g = next(graphs)
        >>> # extract sequence from graph nodes
        >>> "".join([ value["label"] for (key, value) in g.nodes(data=True)])
        'CCCCCGGGGG'
        >>> # get dotbracket structure annotation
        >>> g.graph["structure"]
        '(((...))).'
        >>> # get edge types
        >>> [ (start, end, g.edge[start][end]["type"]) for start, end in g.edges()]
        [(0, 8, 'basepair'), (0, 1, 'backbone'), (1, 2, 'backbone'), (1, 7, 'basepair'), (2, 3, 'backbone'), (2, 6, 'basepair'), (3, 4, 'backbone'), (4, 5, 'backbone'), (5, 6, 'backbone'), (6, 7, 'backbone'), (7, 8, 'backbone'), (8, 9, 'backbone')]

    test max_num parameter with RNAshapes version 3+
        >>> seq = "CGUCGUCGCAUCGUACGCAUGACUCAGCAUCAGACUACGUACGCAUACGUCAGCAUCAGUCAGCAUCAGCAUGCAUCACUAGCAUGCACCCCCGGGGGCACAUCGUACGUACGCUCAGUACACUGCAUGACUACGU"
        >>> graphs = rnashapes_to_eden([("ID", seq)], split_components=True, max_num=2, rnashapes_version=3)
        >>> g = next(graphs)
        >>> # get dotbracket structure annotations
        >>> len([g.graph["structure"] for g in graphs])
        2
    """

    assert(is_iterable(iterable)), 'Not iterable'
    for header, seq in iterable:
        try:
            for graph in string_to_networkx(header, seq, **options):
                yield graph
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # Print the exception itself: ``e.message`` was deprecated in
            # Python 2.6 and removed in Python 3.
            print(e)
            print('Error in: %s' % seq)
            graph = seq_to_networkx(header, seq, **options)
            yield graph
Exemple #24
0
def rnashapes_to_eden(iterable, **options):
    """Transform sequences to graphs with RNAShapes.

    For every (header, sequence) pair all graphs generated by
    ``_string_to_networkx`` are yielded. If an exception occurs, it is
    logged at debug level and one graph built by ``sequence_fold`` is
    yielded.

    Parameters
    ----------
    iterable : iterable
        iterable pairs of header and sequence strings

    The remaining parameters are passed as keyword ``options``:

    rnashapes_version : int (default 2)
        The version of RNAshapes that is in the path.
        2   e.g. RNAshapes version 2.1.6
        3   e.g. RNAshapes version 3.3.0

    shape_type : int (default 5)
        Is the level of abstraction or dissimilarity which defines a different
        shape.
        In general, helical regions are depicted by a pair of opening and
        closing brackets and unpaired regions are represented as a single
        underscore. The differences of the shape types are due to whether a
        structural element (bulge loop, internal loop, multiloop, hairpin loop,
        stacking region and external loop) contributes to the shape
        representation:
        Five types are implemented.
        1   Most accurate - all loops and all unpaired  [_[_[]]_[_[]_]]_
        2   Nesting pattern for all loop types and unpaired regions in external
        loop and multiloop [[_[]][_[]_]]
        3   Nesting pattern for all loop types but no unpaired
        regions [[[]][[]]]
        4   Helix nesting pattern in external loop and multiloop [[][[]]]
        5   Most abstract - helix nesting pattern and no unpaired
        regions [[][]]

    energy_range : float (default 10)
        Sets the energy range as percentage value of the minimum free energy.
        For example, when relative deviation is specified as 5.0, and the
        minimum free energy
        is -10.0 kcal/mol, the energy range is set to -9.5 to -10.0 kcal/mol.
        Relative deviation must be a positive floating point number;
        by default it is set to 10 %.

    max_num : int (default 3)
        Is the maximum number of structures that are generated.

    split_components : bool (default False)
        If True each structure is yielded as an independent graph.
        Otherwise all structures are part of the same graph that has
        therefore several disconnected components.

    example: transform a simple sequence using RNAshapes version 3+
        >>> graphs = rnashapes_to_eden([("ID", "CCCCCGGGGG")], rnashapes_version=3)
        >>> g = next(graphs)
        >>> # extract sequence from graph nodes
        >>> "".join([ value["label"] for (key, value) in g.nodes(data=True)])
        'CCCCCGGGGG'
        >>> # get edge types
        >>> [(start, end, g.edge[start][end]["type"]) for start, end in g.edges()]
        [(0, 8, 'basepair'), (0, 1, 'backbone'), (1, 2, 'backbone'), (1, 7, 'basepair'), (2, 3, 'backbone'), (2, 6, 'basepair'), (3, 4, 'backbone'), (4, 5, 'backbone'), (5, 6, 'backbone'), (6, 7, 'backbone'), (7, 8, 'backbone'), (8, 9, 'backbone')]

    example: transform a simple sequence using RNAshapes version 3+, splitting components
        >>> graphs = rnashapes_to_eden([("ID", "CCCCCGGGGG")], split_components=True, rnashapes_version=3)
        >>> g = next(graphs)
        >>> # extract sequence from graph nodes
        >>> "".join([ value["label"] for (key, value) in g.nodes(data=True)])
        'CCCCCGGGGG'
        >>> # get dotbracket structure annotation
        >>> g.graph["structure"]
        '(((...))).'
        >>> # get edge types
        >>> [ (start, end, g.edge[start][end]["type"]) for start, end in g.edges()]
        [(0, 8, 'basepair'), (0, 1, 'backbone'), (1, 2, 'backbone'), (1, 7, 'basepair'), (2, 3, 'backbone'), (2, 6, 'basepair'), (3, 4, 'backbone'), (4, 5, 'backbone'), (5, 6, 'backbone'), (6, 7, 'backbone'), (7, 8, 'backbone'), (8, 9, 'backbone')]

    test max_num parameter with RNAshapes version 3+
        >>> seq = "CGUCGUCGCAUCGUACGCAUGACUCAGCAUCAGACUACGUACGCAUACGUCAGCAUCAGUCAGCAUCAGCAUGCAUCACUAGCAUGCACCCCCGGGGGCACAUCGUACGUACGCUCAGUACACUGCAUGACUACGU"
        >>> graphs = rnashapes_to_eden([("ID", seq)], split_components=True, max_num=2, rnashapes_version=3)
        >>> g = next(graphs)
        >>> # get dotbracket structure annotations
        >>> len([g.graph["structure"] for g in graphs])
        2
    """
    assert (is_iterable(iterable)), 'Not iterable'
    for header, seq in iterable:
        try:
            for graph in _string_to_networkx(header, seq, **options):
                yield graph
        except Exception as e:
            logger.debug(e.__doc__)
            # Log the exception itself: ``e.message`` was deprecated in
            # Python 2.6 and removed in Python 3.
            logger.debug('%s', e)
            logger.debug('Error in: %s', seq)
            graph = sequence_fold(header, seq)
            # Yield the fallback only on failure. An unconditional yield
            # after the try block would re-emit the last successfully
            # produced graph, or hit an unbound/stale ``graph`` when the
            # helper produced nothing.
            yield graph
Exemple #25
0
def sequence_to_eden(iterable, **options):
    """Yield ``seq_to_networkx(header, seq, **options)`` for each input pair."""
    assert is_iterable(iterable), 'Not iterable'
    for pair_header, pair_seq in iterable:
        yield seq_to_networkx(pair_header, pair_seq, **options)
Exemple #26
0
def sequence_to_eden(iterable, **options):
    """Lazily map ``(header, seq)`` pairs to graphs.

    Every pair is handed to ``seq_to_networkx`` together with the
    unchanged ``options``; the returned graph is yielded.
    """
    assert is_iterable(iterable), 'Not iterable'
    for name, residues in iterable:
        converted = seq_to_networkx(name, residues, **options)
        yield converted