def rnaplfold_to_eden(iterable,
                      max_num_edges=1,
                      window_size=150,
                      max_bp_span=100,
                      hard_threshold=0.5,
                      avg_bp_prob_cutoff=0.2,
                      no_lonely_bps=True,
                      nesting=True):
    """Convert (header, seq) pairs to graphs via ``_rnaplfold_to_eden``.

    Parameters
    ----------
    iterable : iterable of (header, seq) pairs
    max_num_edges, window_size, max_bp_span, hard_threshold,
    avg_bp_prob_cutoff, no_lonely_bps, nesting :
        Forwarded unchanged, in this order, to ``_rnaplfold_to_eden``.

    Yields
    ------
    One graph per input pair. When the header or sequence is empty, or
    ``_rnaplfold_to_eden`` raises, the failure is logged at debug level and
    the result of ``sequence_fold((header, seq))`` is yielded instead.

    Notes
    -----
    This is a generator, so the iterable check only runs once iteration
    starts.
    """
    assert is_iterable(iterable), 'Not iterable'
    for header, seq in iterable:
        try:
            # The asserts sit inside the try on purpose: an empty header or
            # sequence takes the same fallback path as a folding failure.
            assert header, 'Empty header'
            assert seq, 'Empty seq'
            sequence = (header, seq)
            graph = _rnaplfold_to_eden(sequence,
                                       max_num_edges,
                                       window_size,
                                       max_bp_span,
                                       hard_threshold,
                                       avg_bp_prob_cutoff,
                                       no_lonely_bps,
                                       nesting)
        except Exception as e:
            logger.debug(e.__doc__)
            # ``e.message`` no longer exists on Python 3; reading it there
            # would raise inside this handler and skip the fallback.
            logger.debug(getattr(e, 'message', e))
            logger.debug('Error in: %s', seq)
            graph = sequence_fold((header, seq))
        yield graph
def annotate(self, iterable):
    """Pre-process ``iterable`` into graphs and annotate them.

    The input is run through ``mp_pre_process`` using the pre-processor
    settings stored on the instance; the resulting graphs are handed to
    ``self.vectorizer.annotate`` together with ``self.estimator`` and that
    call's result is returned.
    """
    assert is_iterable(iterable), 'Not iterable'
    preprocessed = mp_pre_process(
        iterable,
        pre_processor=self.pre_processor,
        pre_processor_args=self.pre_processor_args,
        n_blocks=self.pre_processor_n_blocks,
        block_size=self.pre_processor_block_size,
        n_jobs=self.pre_processor_n_jobs)
    annotated = self.vectorizer.annotate(preprocessed, self.estimator)
    return annotated
def annotate(self, iterable):
    """Pre-process ``iterable`` into graphs and annotate them.

    The input is run through ``mp_pre_process`` using the pre-processor
    settings stored on the instance; the resulting graphs are handed to
    ``self.vectorizer.annotate`` together with ``self.estimator`` and that
    call's result is returned.
    """
    assert is_iterable(iterable), 'Not iterable'
    preprocessed = mp_pre_process(
        iterable,
        pre_processor=self.pre_processor,
        pre_processor_args=self.pre_processor_args,
        n_blocks=self.pre_processor_n_blocks,
        block_size=self.pre_processor_block_size,
        n_jobs=self.pre_processor_n_jobs)
    annotated = self.vectorizer.annotate(preprocessed, self.estimator)
    return annotated
def rnafold_to_eden(iterable=None, **options):
    """Yield one graph per (header, seq) pair.

    Parameters
    ----------
    iterable : iterable of (header, seq) pairs
    **options :
        Forwarded to ``string_to_networkx`` and, on failure, to
        ``seq_to_networkx``.

    Yields
    ------
    The result of ``string_to_networkx(header, seq, **options)``; if that
    call raises, diagnostics are printed and the result of
    ``seq_to_networkx(header, seq, **options)`` is yielded instead.

    Notes
    -----
    This is a generator, so the iterable check only runs once iteration
    starts.
    """
    assert is_iterable(iterable), 'Not iterable'
    for header, seq in iterable:
        try:
            graph = string_to_networkx(header, seq, **options)
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # ``e.message`` no longer exists on Python 3; reading it there
            # would raise inside this handler and skip the fallback.
            print(getattr(e, 'message', e))
            print('Error in: %s' % seq)
            graph = seq_to_networkx(header, seq, **options)
        yield graph
def rnafold_to_eden(iterable=None, **options):
    """Yield one graph per (header, seq) pair.

    Parameters
    ----------
    iterable : iterable of (header, seq) pairs
    **options :
        Forwarded to ``string_to_networkx`` and, on failure, to
        ``seq_to_networkx``.

    Yields
    ------
    The result of ``string_to_networkx(header, seq, **options)``; if that
    call raises, diagnostics are printed and the result of
    ``seq_to_networkx(header, seq, **options)`` is yielded instead.

    Notes
    -----
    This is a generator, so the iterable check only runs once iteration
    starts.
    """
    assert is_iterable(iterable), 'Not iterable'
    for header, seq in iterable:
        try:
            graph = string_to_networkx(header, seq, **options)
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # ``e.message`` no longer exists on Python 3; reading it there
            # would raise inside this handler and skip the fallback.
            print(getattr(e, 'message', e))
            print('Error in: %s' % seq)
            graph = seq_to_networkx(header, seq, **options)
        yield graph
def rnasubopt_to_eden(iterable, **options):
    """Yield graphs for each (header, seq, const) triple.

    Parameters
    ----------
    iterable : iterable of (header, seq, const) triples
        ``const`` is passed straight to ``string_to_networkx``
        (presumably a folding constraint -- confirm against that helper).
    **options :
        Forwarded to ``string_to_networkx`` and, on failure, to
        ``seq_to_networkx``.

    Yields
    ------
    Every graph produced by ``string_to_networkx``. If an exception is
    raised, diagnostics are printed and one extra graph from
    ``seq_to_networkx(header, seq, **options)`` is yielded; graphs already
    yielded for that triple are not retracted.
    """
    assert is_iterable(iterable), 'Not iterable'
    for header, seq, const in iterable:
        try:
            for graph in string_to_networkx(header, seq, const, **options):
                yield graph
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # ``e.message`` no longer exists on Python 3; reading it there
            # would raise inside this handler and skip the fallback.
            print(getattr(e, 'message', e))
            print('Error in: %s' % seq)
            graph = seq_to_networkx(header, seq, **options)
            yield graph
def rnashapes_struct_to_eden(iterable, **options):
    """Yield graphs for each (header, seq) pair.

    Parameters
    ----------
    iterable : iterable of (header, seq) pairs
    **options :
        Forwarded to ``string_to_networkx`` and, on failure, to
        ``seq_to_networkx``.

    Yields
    ------
    Every graph produced by ``string_to_networkx``. If an exception is
    raised, diagnostics are printed and one extra graph from
    ``seq_to_networkx(header, seq, **options)`` is yielded; graphs already
    yielded for that pair are not retracted.
    """
    assert is_iterable(iterable), 'Not iterable'
    for header, seq in iterable:
        try:
            for graph in string_to_networkx(header, seq, **options):
                yield graph
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # ``e.message`` no longer exists on Python 3; reading it there
            # would raise inside this handler and skip the fallback.
            print(getattr(e, 'message', e))
            print('Error in: %s %s' % (header, seq))
            graph = seq_to_networkx(header, seq, **options)
            yield graph
def rnashapes_struct_to_eden(iterable, **options):
    """Yield graphs for each (header, seq) pair.

    Parameters
    ----------
    iterable : iterable of (header, seq) pairs
    **options :
        Forwarded to ``string_to_networkx`` and, on failure, to
        ``seq_to_networkx``.

    Yields
    ------
    Every graph produced by ``string_to_networkx``. If an exception is
    raised, diagnostics are printed and one extra graph from
    ``seq_to_networkx(header, seq, **options)`` is yielded; graphs already
    yielded for that pair are not retracted.
    """
    assert is_iterable(iterable), 'Not iterable'
    for header, seq in iterable:
        try:
            for graph in string_to_networkx(header, seq, **options):
                yield graph
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # ``e.message`` no longer exists on Python 3; reading it there
            # would raise inside this handler and skip the fallback.
            print(getattr(e, 'message', e))
            print('Error in: %s %s' % (header, seq))
            graph = seq_to_networkx(header, seq, **options)
            yield graph
def test_fasta_to_sequence_no_normalize(self):
    """Test fasta_to_sequence with normalize=False.

    -> moved to garden doctest
    """
    fa_fn = "test/test_fasta_to_sequence.fa"
    seq = fasta_to_sequence(fa_fn, normalize=False)
    assert is_iterable(seq)
    # built-in next() works on Python 2.6+ and 3, unlike seq.next()
    header, sequence = next(seq)
    # sequence should correspond to the unmodified fasta string
    assert sequence == (
        "gtggcgtactcacggccaCCTTAGGACTCCGCGGACTTTATGCCCACCAAAAAAACGAGCCGTTTCTACGCGTCCTCCGTCGCCTgtgtcgataaagcaa"
    )
def test_fasta_to_sequence_normalized(self):
    """Test fasta_to_sequence with normalize=True.

    -> moved to garden doctest
    """
    fa_fn = "test/test_fasta_to_sequence.fa"
    seq = fasta_to_sequence(fa_fn, normalize=True)
    assert is_iterable(seq)
    # built-in next() works on Python 2.6+ and 3, unlike seq.next()
    header, sequence = next(seq)
    # sequence should be uppercased and all Ts should be replaced by Us
    assert sequence == (
        "GUGGCGUACUCACGGCCACCUUAGGACUCCGCGGACUUUAUGCCCACCAAAAAAACGAGCCGUUUCUACGCGUCCUCCGUCGCCUGUGUCGAUAAAGCAA"
    )
def rnasubopt_to_eden(iterable, **options):
    """Yield graphs for each (header, seq) pair.

    Parameters
    ----------
    iterable : iterable of (header, seq) pairs
    **options :
        Forwarded to ``string_to_networkx`` and, on failure, to
        ``seq_to_networkx``.

    Yields
    ------
    Every graph produced by ``string_to_networkx``. If an exception is
    raised, diagnostics are printed and one extra graph from
    ``seq_to_networkx(header, seq, **options)`` is yielded; graphs already
    yielded for that pair are not retracted.
    """
    assert is_iterable(iterable), 'Not iterable'
    for header, seq in iterable:
        try:
            for graph in string_to_networkx(header, seq, **options):
                yield graph
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # ``e.message`` no longer exists on Python 3; reading it there
            # would raise inside this handler and skip the fallback.
            print(getattr(e, 'message', e))
            print('Error in: %s' % seq)
            graph = seq_to_networkx(header, seq, **options)
            yield graph
def sequence_to_eden(iterable, **options):
    """Turn sequences into graphs with ``seq_to_networkx``.

    When the ``no_header`` option is exactly ``True`` every item of
    ``iterable`` is treated as a bare sequence and the header 'NONE' is
    used; otherwise every item is unpacked as a (header, seq) pair. All
    options (including ``no_header``) are passed on to ``seq_to_networkx``.
    """
    headerless = options.get('no_header', False)
    assert is_iterable(iterable), 'Not iterable'
    if headerless is True:
        for seq in iterable:
            yield seq_to_networkx('NONE', seq, **options)
        return
    for item in iterable:
        header, seq = item
        yield seq_to_networkx(header, seq, **options)
def sequence_to_eden(iterable, **options):
    """Turn sequences into graphs with ``seq_to_networkx``.

    When the ``no_header`` option is exactly ``True`` every item of
    ``iterable`` is treated as a bare sequence and the header 'NONE' is
    used; otherwise every item is unpacked as a (header, seq) pair. All
    options (including ``no_header``) are passed on to ``seq_to_networkx``.
    """
    headerless = options.get('no_header', False)
    assert is_iterable(iterable), 'Not iterable'
    if headerless is True:
        for seq in iterable:
            yield seq_to_networkx('NONE', seq, **options)
        return
    for item in iterable:
        header, seq = item
        yield seq_to_networkx(header, seq, **options)
def sample_parameters_uniformly_at_random(parameters_priors):
    """Draw one concrete value per parameter from its prior.

    For every key of ``parameters_priors``, a value that ``is_iterable``
    accepts is replaced by one element picked with ``random.choice``; any
    other value is copied as is. Returns the resulting dict, or ``None``
    when ``parameters_priors`` is empty or otherwise falsy.
    """
    if not parameters_priors:
        return None
    sampled = {}
    for name in parameters_priors:
        prior = parameters_priors[name]
        sampled[name] = random.choice(prior) if is_iterable(prior) else prior
    return sampled
def _data_matrix(self, iterable, fit_vectorizer=False):
    """Pre-process ``iterable`` and vectorize the resulting graphs.

    The graphs come from ``mp_pre_process`` with the instance's
    pre-processor settings. The vectorizer is configured from
    ``self.vectorizer_args`` and, when ``fit_vectorizer`` is true, fitted
    on the graphs before they are vectorized. Returns whatever
    ``vectorize`` returns.
    """
    assert is_iterable(iterable), 'Not iterable'
    preprocessed = mp_pre_process(
        iterable,
        pre_processor=self.pre_processor,
        pre_processor_args=self.pre_processor_args,
        n_blocks=self.pre_processor_n_blocks,
        block_size=self.pre_processor_block_size,
        n_jobs=self.pre_processor_n_jobs)
    # Two independent iterators over the same graphs: one feeds vectorize,
    # the other the optional fit.
    to_vectorize, to_fit = tee(preprocessed)
    self.vectorizer.set_params(**self.vectorizer_args)
    if fit_vectorizer:
        self.vectorizer.fit(to_fit)
    return vectorize(to_vectorize,
                     vectorizer=self.vectorizer,
                     n_jobs=self.n_jobs,
                     n_blocks=self.n_blocks)
def test_fasta_to_sequence_default(self):
    """Test fasta_to_sequence with default parameters.

    -> moved to garden doctest
    """
    fa_fn = "test/test_fasta_to_sequence.fa"
    seq = fasta_to_sequence(fa_fn)
    assert is_iterable(seq)
    # built-in next() works on Python 2.6+ and 3, unlike seq.next()
    header, sequence = next(seq)
    # header should contain the fasta header with '>' removed
    assert header == "ID0"
    # sequence should be uppercased and all Ts should be replaced by Us
    assert sequence == (
        "GUGGCGUACUCACGGCCACCUUAGGACUCCGCGGACUUUAUGCCCACCAAAAAAACGAGCCGUUUCUACGCGUCCUCCGUCGCCUGUGUCGAUAAAGCAA"
    )
def rnaplfold_to_eden(iterable, **options):
    """Yield one graph per (header, seq) pair.

    Parameters
    ----------
    iterable : iterable of (header, seq) pairs
    **options :
        Forwarded to ``string_to_networkx`` and, on failure, to
        ``seq_to_networkx``.

    Yields
    ------
    The result of ``string_to_networkx(header, seq, **options)``; if that
    call raises, a diagnostic block is printed and the result of
    ``seq_to_networkx(header, seq, **options)`` is yielded instead.
    """
    assert is_iterable(iterable), 'Not iterable'
    for header, seq in iterable:
        try:
            graph = string_to_networkx(header, seq, **options)
        except Exception as e:
            # print('') emits the same blank line as a bare Python 2
            # ``print`` and also works on Python 3.
            print('')
            print('-' * 80)
            # ``e.message`` no longer exists on Python 3; reading it there
            # would raise inside this handler and skip the fallback.
            print(getattr(e, 'message', e))
            print('Error in: %s %s' % (header, seq))
            print('Reverting to path graph from sequence')
            graph = seq_to_networkx(header, seq, **options)
        yield graph
def rnaplfold_to_eden(iterable, **options):
    """Yield one graph per (header, seq) pair.

    Parameters
    ----------
    iterable : iterable of (header, seq) pairs
    **options :
        Forwarded to ``string_to_networkx`` and, on failure, to
        ``seq_to_networkx``.

    Yields
    ------
    The result of ``string_to_networkx(header, seq, **options)``; if that
    call raises, a diagnostic block is printed and the result of
    ``seq_to_networkx(header, seq, **options)`` is yielded instead.
    """
    assert is_iterable(iterable), 'Not iterable'
    for header, seq in iterable:
        try:
            graph = string_to_networkx(header, seq, **options)
        except Exception as e:
            # print('') emits the same blank line as a bare Python 2
            # ``print`` and also works on Python 3.
            print('')
            print('-' * 80)
            # ``e.message`` no longer exists on Python 3; reading it there
            # would raise inside this handler and skip the fallback.
            print(getattr(e, 'message', e))
            print('Error in: %s %s' % (header, seq))
            print('Reverting to path graph from sequence')
            graph = seq_to_networkx(header, seq, **options)
        yield graph
def rnafold_to_eden(iterable=None, **options):
    """Yield one graph per (header, seq) pair.

    Parameters
    ----------
    iterable : iterable over (header_string, sequence_string)
    **options :
        Forwarded to ``string_to_networkx`` and, on failure, to
        ``seq_to_networkx``.

    Returns
    -------
    nx.graph generator
        If ``string_to_networkx`` raises for a pair, diagnostics are
        printed and the result of ``seq_to_networkx`` is yielded instead.
    """
    assert is_iterable(iterable), 'Not iterable'
    for header, seq in iterable:
        try:
            graph = string_to_networkx(header, seq, **options)
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # ``e.message`` no longer exists on Python 3; reading it there
            # would raise inside this handler and skip the fallback.
            print(getattr(e, 'message', e))
            print('Error in: %s' % seq)
            graph = seq_to_networkx(header, seq, **options)
        yield graph
def rnafold_to_eden(iterable=None, **options):
    """Fold RNA seq with RNAfold.

    Parameters
    ----------
    iterable : iterable over (header_string, sequence_string)
    **options :
        Forwarded to ``_string_to_networkx`` and, on failure, to
        ``sequence_fold``.

    Returns
    -------
    nx.graph generator
        If ``_string_to_networkx`` raises for a pair, the failure is logged
        at debug level and the result of
        ``sequence_fold(header, seq, **options)`` is yielded instead.
    """
    assert is_iterable(iterable), 'Not iterable'
    for header, seq in iterable:
        try:
            graph = _string_to_networkx(header, seq, **options)
        except Exception as e:
            logger.debug(e.__doc__)
            # ``e.message`` no longer exists on Python 3; reading it there
            # would raise inside this handler and skip the fallback.
            logger.debug(getattr(e, 'message', e))
            logger.debug('Error in: %s', seq)
            graph = sequence_fold(header, seq, **options)
        yield graph
def rnashapes_to_eden(iterable, **options):
    """Transform sequences to graphs with RNAShapes.

    The graphs encode secondary structure information according to the
    RNAShapes algorithm. All keyword options are forwarded to
    ``string_to_networkx`` (and to ``seq_to_networkx`` on failure).

    Parameters
    ----------
    iterable : iterable
        iterable pairs of header and sequence strings

    rnashapes_version : int (default 2)
        The version of RNAshapes that is in the path.
        2   e.g. RNAshapes version 2.1.6
        3   e.g. RNAshapes version 3.3.0

    shape_type : int (default 5)
        Is the level of abstraction or dissimilarity which defines a
        different shape. In general, helical regions are depicted by a pair
        of opening and closing brackets and unpaired regions are represented
        as a single underscore. The differences of the shape types are due
        to whether a structural element (bulge loop, internal loop,
        multiloop, hairpin loop, stacking region and external loop)
        contributes to the shape representation: Five types are implemented.
        1   Most accurate - all loops and all unpaired
            [_[_[]]_[_[]_]]_
        2   Nesting pattern for all loop types and unpaired regions in
            external loop and multiloop
            [[_[]][_[]_]]
        3   Nesting pattern for all loop types but no unpaired regions
            [[[]][[]]]
        4   Helix nesting pattern in external loop and multiloop
            [[][[]]]
        5   Most abstract - helix nesting pattern and no unpaired regions
            [[][]]

    energy_range : float (default 10)
        Sets the energy range as percentage value of the minimum free
        energy. For example, when relative deviation is specified as 5.0,
        and the minimum free energy is -10.0 kcal/mol, the energy range is
        set to -9.5 to -10.0 kcal/mol. Relative deviation must be a positive
        floating point number; by default it is set to 10 %.

    max_num : int (default 3)
        Is the maximum number of structures that are generated.

    split_components : bool (default False)
        If True each structure is yielded as an independent graph. Otherwise
        all structures are part of the same graph that has therefore several
        disconnected components.

    Yields
    ------
    Every graph produced by ``string_to_networkx`` for a pair. If an
    exception is raised, diagnostics are printed and one graph from
    ``seq_to_networkx(header, seq, **options)`` is yielded.

    example: transform a simple sequence using RNAshapes version 3+

    >>> graphs = rnashapes_to_eden([("ID", "CCCCCGGGGG")], rnashapes_version=3)
    >>> g = next(graphs)
    >>> # extract sequence from graph nodes
    >>> "".join([ value["label"] for (key, value) in g.nodes(data=True)])
    'CCCCCGGGGG'
    >>> # get edge types
    >>> [(start, end, g.edge[start][end]["type"]) for start, end in g.edges()]
    [(0, 8, 'basepair'), (0, 1, 'backbone'), (1, 2, 'backbone'), (1, 7, 'basepair'), (2, 3, 'backbone'), (2, 6, 'basepair'), (3, 4, 'backbone'), (4, 5, 'backbone'), (5, 6, 'backbone'), (6, 7, 'backbone'), (7, 8, 'backbone'), (8, 9, 'backbone')]

    example: transform a simple sequence using RNAshapes version 3+, splitting components

    >>> graphs = rnashapes_to_eden([("ID", "CCCCCGGGGG")], split_components=True, rnashapes_version=3)
    >>> g = next(graphs)
    >>> # extract sequence from graph nodes
    >>> "".join([ value["label"] for (key, value) in g.nodes(data=True)])
    'CCCCCGGGGG'
    >>> # get dotbracket structure annotation
    >>> g.graph["structure"]
    '(((...))).'
    >>> # get edge types
    >>> [ (start, end, g.edge[start][end]["type"]) for start, end in g.edges()]
    [(0, 8, 'basepair'), (0, 1, 'backbone'), (1, 2, 'backbone'), (1, 7, 'basepair'), (2, 3, 'backbone'), (2, 6, 'basepair'), (3, 4, 'backbone'), (4, 5, 'backbone'), (5, 6, 'backbone'), (6, 7, 'backbone'), (7, 8, 'backbone'), (8, 9, 'backbone')]

    test max_num parameter with RNAshapes version 3+

    >>> seq = "CGUCGUCGCAUCGUACGCAUGACUCAGCAUCAGACUACGUACGCAUACGUCAGCAUCAGUCAGCAUCAGCAUGCAUCACUAGCAUGCACCCCCGGGGGCACAUCGUACGUACGCUCAGUACACUGCAUGACUACGU"
    >>> graphs = rnashapes_to_eden([("ID", seq)], split_components=True, max_num=2, rnashapes_version=3)
    >>> g = next(graphs)
    >>> # get dotbracket structure annotations
    >>> len([g.graph["structure"] for g in graphs])
    2
    """
    assert is_iterable(iterable), 'Not iterable'
    for header, seq in iterable:
        try:
            for graph in string_to_networkx(header, seq, **options):
                yield graph
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(e.__doc__)
            # ``e.message`` no longer exists on Python 3; reading it there
            # would raise inside this handler and skip the fallback.
            print(getattr(e, 'message', e))
            print('Error in: %s' % seq)
            graph = seq_to_networkx(header, seq, **options)
            yield graph
def rnashapes_to_eden(iterable, **options):
    """Transform sequences to graphs with RNAShapes.

    All keyword options are forwarded to ``_string_to_networkx``.

    Parameters
    ----------
    iterable : iterable
        iterable pairs of header and sequence strings

    rnashapes_version : int (default 2)
        The version of RNAshapes that is in the path.
        2   e.g. RNAshapes version 2.1.6
        3   e.g. RNAshapes version 3.3.0

    shape_type : int (default 5)
        Is the level of abstraction or dissimilarity which defines a
        different shape. In general, helical regions are depicted by a pair
        of opening and closing brackets and unpaired regions are represented
        as a single underscore. The differences of the shape types are due
        to whether a structural element (bulge loop, internal loop,
        multiloop, hairpin loop, stacking region and external loop)
        contributes to the shape representation: Five types are implemented.
        1   Most accurate - all loops and all unpaired
            [_[_[]]_[_[]_]]_
        2   Nesting pattern for all loop types and unpaired regions in
            external loop and multiloop
            [[_[]][_[]_]]
        3   Nesting pattern for all loop types but no unpaired regions
            [[[]][[]]]
        4   Helix nesting pattern in external loop and multiloop
            [[][[]]]
        5   Most abstract - helix nesting pattern and no unpaired regions
            [[][]]

    energy_range : float (default 10)
        Sets the energy range as percentage value of the minimum free
        energy. For example, when relative deviation is specified as 5.0,
        and the minimum free energy is -10.0 kcal/mol, the energy range is
        set to -9.5 to -10.0 kcal/mol. Relative deviation must be a positive
        floating point number; by default it is set to 10 %.

    max_num : int (default 3)
        Is the maximum number of structures that are generated.

    split_components : bool (default False)
        If True each structure is yielded as an independent graph. Otherwise
        all structures are part of the same graph that has therefore several
        disconnected components.

    Yields
    ------
    Every graph produced by ``_string_to_networkx`` for a pair. If an
    exception is raised, it is logged at debug level and one graph from
    ``sequence_fold(header, seq)`` is yielded (the options are not passed
    to that fallback).

    example: transform a simple sequence using RNAshapes version 3+

    >>> graphs = rnashapes_to_eden([("ID", "CCCCCGGGGG")], rnashapes_version=3)
    >>> g = next(graphs)
    >>> # extract sequence from graph nodes
    >>> "".join([ value["label"] for (key, value) in g.nodes(data=True)])
    'CCCCCGGGGG'
    >>> # get edge types
    >>> [(start, end, g.edge[start][end]["type"]) for start, end in g.edges()]
    [(0, 8, 'basepair'), (0, 1, 'backbone'), (1, 2, 'backbone'), (1, 7, 'basepair'), (2, 3, 'backbone'), (2, 6, 'basepair'), (3, 4, 'backbone'), (4, 5, 'backbone'), (5, 6, 'backbone'), (6, 7, 'backbone'), (7, 8, 'backbone'), (8, 9, 'backbone')]

    example: transform a simple sequence using RNAshapes version 3+, splitting components

    >>> graphs = rnashapes_to_eden([("ID", "CCCCCGGGGG")], split_components=True, rnashapes_version=3)
    >>> g = next(graphs)
    >>> # extract sequence from graph nodes
    >>> "".join([ value["label"] for (key, value) in g.nodes(data=True)])
    'CCCCCGGGGG'
    >>> # get dotbracket structure annotation
    >>> g.graph["structure"]
    '(((...))).'
    >>> # get edge types
    >>> [ (start, end, g.edge[start][end]["type"]) for start, end in g.edges()]
    [(0, 8, 'basepair'), (0, 1, 'backbone'), (1, 2, 'backbone'), (1, 7, 'basepair'), (2, 3, 'backbone'), (2, 6, 'basepair'), (3, 4, 'backbone'), (4, 5, 'backbone'), (5, 6, 'backbone'), (6, 7, 'backbone'), (7, 8, 'backbone'), (8, 9, 'backbone')]

    test max_num parameter with RNAshapes version 3+

    >>> seq = "CGUCGUCGCAUCGUACGCAUGACUCAGCAUCAGACUACGUACGCAUACGUCAGCAUCAGUCAGCAUCAGCAUGCAUCACUAGCAUGCACCCCCGGGGGCACAUCGUACGUACGCUCAGUACACUGCAUGACUACGU"
    >>> graphs = rnashapes_to_eden([("ID", seq)], split_components=True, max_num=2, rnashapes_version=3)
    >>> g = next(graphs)
    >>> # get dotbracket structure annotations
    >>> len([g.graph["structure"] for g in graphs])
    2
    """
    assert is_iterable(iterable), 'Not iterable'
    for header, seq in iterable:
        try:
            for graph in _string_to_networkx(header, seq, **options):
                yield graph
        except Exception as e:
            logger.debug(e.__doc__)
            # ``e.message`` no longer exists on Python 3; reading it there
            # would raise inside this handler and skip the fallback.
            logger.debug(getattr(e, 'message', e))
            logger.debug('Error in: %s', seq)
            graph = sequence_fold(header, seq)
            yield graph
def sequence_to_eden(iterable, **options):
    """Yield one ``seq_to_networkx`` graph per (header, seq) pair.

    All keyword options are passed on to ``seq_to_networkx``.
    """
    assert is_iterable(iterable), 'Not iterable'
    for item in iterable:
        header, seq = item
        yield seq_to_networkx(header, seq, **options)
def sequence_to_eden(iterable, **options):
    """Yield one ``seq_to_networkx`` graph per (header, seq) pair.

    All keyword options are passed on to ``seq_to_networkx``.
    """
    assert is_iterable(iterable), 'Not iterable'
    for item in iterable:
        header, seq = item
        yield seq_to_networkx(header, seq, **options)