def _traverse_a_branch_from_queue(self):
    """Traverse the next queued branch and merge it into ``self.graph``.

    The left-most setup is popped from ``self.branch_queue``, the branch
    traverser registered for its color is run from the setup's start string,
    and the resulting branch graph is composed into ``self.graph``. The
    branch is then connected to the parent graph and its neighbor traversals
    are linked/queued via the corresponding private helpers.
    """
    branch_setup = self.branch_queue.popleft()
    traverser_for_color = self.branch_traverser[branch_setup.traversal_color]
    traversed_branch = traverser_for_color.traverse_from(
        branch_setup.start_string,
        orientation=branch_setup.orientation,
        parent_graph=self.graph,
    )

    graph_interactor = Interactor.from_graph(self.graph)
    graph_interactor.compose_in_graph(traversed_branch.graph)

    self._connect_branch_to_parent_graph(traversed_branch, branch_setup)
    self._link_branch_and_queue_neighbor_traversals(traversed_branch)
def _traverse_from_each_kmer_in(self, kmer_generator): for start_kmer in kmer_generator: try: Interactor.from_graph(self.graph) \ .compose_in_graph(self._traverse_from(start_kmer).graph) self.log_graph_size() except KeyError: pass if self.max_nodes and len(self.graph) > self.max_nodes: raise Exception( ("Terminating contig traversal after kmer {}" " because max node limit is reached").format(start_kmer)) return self
def test_in_y_graph_finds_two_paths_of_revcomp(self):
    """A Y-shaped graph made consistent with 'AAG' yields AAGCC and AAGCG."""
    # given
    graph_builder = get_cortex_builder()
    for kmer_record in ('CGC 1 .......T',
                        'AGC 1 a....CG.',
                        'AAG 1 .....C..',
                        'GCC 1 a.......'):
        graph_builder.with_kmer(kmer_record)
    raw_graph = graph_builder.build()
    consistent_graph = Interactor(raw_graph).make_graph_nodes_consistent(['AAG']).graph

    # when
    found_paths = list(Interactor(consistent_graph).all_simple_paths())

    # then
    found_sequences = sorted(str(path.seq) for path in found_paths)
    assert ['AAGCC', 'AAGCG'] == found_sequences
def test_two_linked_kmers_are_jsonifiable(self):
    """A two-color graph of two linked k-mers serializes to parseable JSON."""
    # given
    colors = (0, 1)
    color_names = ['samp1', 'samp2']
    graph_builder = (
        builder.Graph()
        .with_kmer_size(3)
        .with_num_colors(2)
        .with_color_names(*color_names)
        .with_kmer('AAA 1 1 .....C.. ........')
        .with_kmer('AAC 1 0 a....... ........')
    )
    loaded_graph = load_cortex_graph(graph_builder.build())
    consistent_graph = Interactor(loaded_graph).make_graph_nodes_consistent(
        seed_kmer_strings=['GTT']
    ).graph
    serializer = cortexpy.graph.serializer.serializer.Serializer(consistent_graph)
    kmer_json = serializer.to_json()

    # when
    expect = expectation.JsonGraph.from_string(kmer_json)

    # then
    kmer_data = json.loads(kmer_json)  # does not raise
    graph_section = kmer_data['graph']
    assert graph_section['colors'] == list(colors)
    assert graph_section['sample_names'] == color_names

    expect.has_n_nodes(2)
    expect.has_n_edges(1)
def build(self):
    """Build the graph, make it consistent, and wrap it in a ``UnitigFinder``.

    When ``self.seed_kmers`` is ``None`` it is set (and kept on ``self``) to a
    one-element list holding the first node yielded by iterating the graph.

    Returns:
        The result of ``UnitigFinder.from_graph`` for the consistent graph,
        using the builder's colors and ``self.test_coverage``.
    """
    raw_graph = self.builder.build()
    if self.seed_kmers is None:
        # No seeds supplied: fall back to whichever node iterates first.
        first_node = next(iter(raw_graph))
        self.seed_kmers = [first_node]

    consistent_interactor = Interactor.from_graph(raw_graph) \
        .make_graph_nodes_consistent(self.seed_kmers)
    return UnitigFinder.from_graph(
        consistent_interactor.graph,
        colors=list(self.builder.colors),
        test_coverage=self.test_coverage,
    )
def assemble(argv):
    """Run the ``cortexpy assemble`` sub-command.

    Traverses the cortex graph from every k-mer of the start-sequences FASTA,
    makes the traversed graph consistent with those k-mers, and writes all
    simple paths as FASTA records to the selected output.

    Args:
        argv: command-line arguments for this sub-command (without the
            program/sub-command name), as accepted by ``ArgumentParser.parse_args``.

    Files opened here (the output file when it is not ``-`` and the graph
    file) are closed before returning, including on error.
    """
    import argparse
    from cortexpy.command.shared import get_shared_argparse
    shared_parser = get_shared_argparse()
    parser = argparse.ArgumentParser(prog='cortexpy assemble', parents=[shared_parser],
                                     description="""
    Assemble all possible transcripts in <graph> from all k-mers in <start-sequences> and print the
    resulting transcripts as a FASTA to stdout. All specified colors are traversed and collapsed
    before output.
    """)
    parser.add_argument('graph', help='cortex graph')
    parser.add_argument('start_sequences_fasta', help='FASTA file with sequences to start from')
    parser.add_argument('--color', type=int, help='Restrict view to single color')
    parser.add_argument('--max-nodes', type=int, default=1000,
                        help='Maximum number of nodes to traverse [default: %(default)s]')
    args = parser.parse_args(argv)

    from cortexpy.logging_config import configure_logging_from_args_and_get_logger
    # The returned logger is not used in this command; the call is kept
    # because, judging by its name, it also configures logging from args.
    configure_logging_from_args_and_get_logger(args, 'cortexpy.assemble')

    import sys
    from contextlib import ExitStack
    from Bio import SeqIO
    from cortexpy.utils import kmerize_fasta
    from cortexpy.graph.interactor import Interactor
    from cortexpy.graph.parser.random_access import RandomAccess
    from cortexpy.constants import EngineTraversalOrientation
    from cortexpy.graph.traversal.engine import Engine

    with ExitStack() as stack:
        # Only files opened here are registered for closing; sys.stdout is
        # deliberately left open.
        if args.out == '-':
            output = sys.stdout
        else:
            output = stack.enter_context(open(args.out, 'wt'))

        graph_handle = stack.enter_context(open(args.graph, 'rb'))
        random_access = RandomAccess(graph_handle)
        if args.color is None:
            colors = list(range(random_access.num_colors))
        else:
            colors = [args.color]
        traverser = Engine(
            random_access,
            traversal_colors=colors,
            orientation=EngineTraversalOrientation.both,
            max_nodes=args.max_nodes,
        )
        traverser.traverse_from_each_kmer_in_fasta(args.start_sequences_fasta)
        kmers = kmerize_fasta(args.start_sequences_fasta, traverser.ra_parser.kmer_size)
        interactor = Interactor.from_graph(traverser.graph).make_graph_nodes_consistent(
            seed_kmer_strings=kmers)

        # The path generator is consumed inside the ``with`` block so the
        # graph file is still open while paths are produced and written.
        seq_record_generator = interactor.all_simple_paths()
        SeqIO.write(seq_record_generator, output, 'fasta')
def run(self):
    """Execute whichever command was configured and return its graph.

    * ``self.retrieve``: build a ``ContigRetriever`` (stored on
      ``self.retriever``) and return the k-mer graph for
      ``self.contig_to_retrieve``.
    * ``self.traverse``: traverse from ``self.traversal_start_kmer`` and
      return the traversed graph made consistent with that k-mer.

    Raises:
        Exception: if neither ``self.retrieve`` nor ``self.traverse`` is set.
    """
    if self.retrieve:
        self.retriever = ContigRetriever(self.graph_builder.build())
        return self.retriever.get_kmer_graph(self.contig_to_retrieve)

    if self.traverse:
        random_access = RandomAccess(self.graph_builder.build())
        engine = Engine(random_access, traversal_colors=self.traversal_colors)
        traversed_graph = engine.traverse_from(self.traversal_start_kmer).graph
        consistent = Interactor(traversed_graph).make_graph_nodes_consistent(
            [self.traversal_start_kmer]
        )
        return consistent.graph

    raise Exception("Need to load a command")
def test_in_y_graph_finds_two_paths(self):
    """Two paths converging on 'AAA' produce the sequences CAAA and TAAA."""
    # given
    graph_builder = CortexGraphBuilder()
    for first_kmer in ('CAA', 'TAA'):
        graph_builder.add_path(first_kmer, 'AAA')
    graph_builder.make_consistent('AAA')
    graph = graph_builder.build()

    # when
    found_paths = list(Interactor(graph).all_simple_paths())

    # then
    found_sequences = {str(path.seq) for path in found_paths}
    assert {'CAAA', 'TAAA'} == found_sequences
def test_emits_one_single_color_unitig(self):
    """A single color-0 edge AAA -> AAT yields exactly one path, AAAT."""
    # given
    graph_builder = CortexGraphBuilder()
    graph_builder.with_colors(0)
    graph_builder.add_edge('AAA', 'AAT', color=0)
    graph_builder.make_consistent('AAA')
    single_color_graph = graph_builder.build()

    # when
    found_paths = list(Interactor(single_color_graph).all_simple_paths())

    # then
    found_sequences = [str(path.seq) for path in found_paths]
    assert ['AAAT'] == found_sequences
def test_revcomps_a_kmer():
    """Seeding with 'TTT' turns the single stored k-mer 'AAA' into node 'TTT'."""
    # given
    graph_builder = get_cortex_builder()
    graph_builder.with_kmer('AAA 1 ........')
    raw_graph = graph_builder.build()

    # when
    consistent_graph = Interactor(raw_graph).make_graph_nodes_consistent({'TTT'}).graph
    expect = KmerGraphExpectation(consistent_graph)

    # then
    expect.has_node('TTT')
    expect.has_n_nodes(1)
def test_follows_two_colors_with_no_color_specified(self):
    """Edges in colors 0 and 1 are both followed, giving the one path AAATA."""
    # given
    graph_builder = CortexGraphBuilder()
    graph_builder.with_colors(0, 1)
    colored_edges = [('AAA', 'AAT', 0), ('AAT', 'ATA', 1)]
    for source, target, edge_color in colored_edges:
        graph_builder.add_edge(source, target, color=edge_color)
    graph_builder.make_consistent('AAA')
    two_color_graph = graph_builder.build()

    # when
    found_paths = list(Interactor(two_color_graph).all_simple_paths())

    # then
    found_sequences = {str(path.seq) for path in found_paths}
    assert {'AAATA'} == found_sequences
def test_with_link_for_y_graph_emits_one_path(self):
    """With a link on 'AAA' choosing 'C', the Y graph yields only AAAC."""
    # given
    graph_builder = CortexGraphBuilder()
    graph_builder.with_kmer_size(3)
    for branch_end in ('AAC', 'AAT'):
        graph_builder.add_path('AAA', branch_end)
    graph_builder.make_consistent('AAA')
    y_graph = graph_builder.build()

    links_builder = LinksBuilder().with_link_for_kmer('F 1 1 C', 'AAA')
    links = links_builder.build()

    # when
    found_paths = list(Interactor(y_graph).all_simple_paths(links=links))

    # then
    found_sequences = [str(path.seq) for path in found_paths]
    assert ['AAAC'] == found_sequences
def test_revcomps_path():
    """Each seed orients the two-k-mer graph to the node pair containing that seed."""
    # given
    graph_builder = get_cortex_builder()
    graph_builder.with_kmer('CGC 1 .......T')
    graph_builder.with_kmer('AGC 1 ......G.')
    raw_graph = graph_builder.build()

    forward_nodes = ['CGC', 'GCT']
    revcomp_nodes = ['AGC', 'GCG']
    cases = [
        ('CGC', forward_nodes),
        ('GCT', forward_nodes),
        ('AGC', revcomp_nodes),
        ('GCG', revcomp_nodes),
    ]
    for seed, expected_nodes in cases:
        # when
        consistent_graph = Interactor(raw_graph).make_graph_nodes_consistent([seed]).graph
        expect = KmerGraphExpectation(consistent_graph)

        # then
        expect.has_nodes(*expected_nodes)
        expect.has_n_nodes(2)
def test_bubble_and_y_with_two_links_returns_two_transcripts(self):
    """A bubble followed by a Y, traversed with two links, yields AAACCCA and AAACCCT."""
    # given
    links_builder = LinksBuilder()
    links_builder = links_builder.with_link_for_kmer('F 2 1 CT', 'AAA')
    links_builder = links_builder.with_link_for_kmer('F 1 1 A', 'CCC')
    links = links_builder.build()

    graph_builder = CortexGraphBuilder()
    graph_builder.with_kmer_size(3)
    graph_builder.add_path('AAA', 'AAC', 'ACC', 'CCC', 'CCA')
    graph_builder.add_path('AAA', 'AAG', 'AGC', 'GCC', 'CCC', 'CCT')
    graph_builder.make_consistent('AAA')
    bubble_graph = graph_builder.build()

    # when
    found_paths = list(Interactor(bubble_graph).all_simple_paths(links=links))

    # then
    found_sequences = sorted(str(path.seq) for path in found_paths)
    assert ['AAACCCA', 'AAACCCT'] == found_sequences
def prune(argv):
    """Run the ``cortexpy prune`` sub-command.

    Loads a cortex graph (from a file, or stdin when the graph argument is
    ``-``), prunes tips shorter than ``--remove-tips``, and dumps the pruned
    graph in cortex format to the selected output.

    Args:
        argv: command-line arguments for this sub-command, as accepted by
            ``ArgumentParser.parse_args``.

    Returns:
        ``1`` if ``--remove-tips`` is less than 2; otherwise ``None``.

    Files opened here are closed before returning, including on error; the
    standard streams are never closed.
    """
    import argparse
    from .shared import get_shared_argparse
    shared_parser = get_shared_argparse()
    parser = argparse.ArgumentParser('cortexpy prune', parents=[shared_parser])
    parser.add_argument('-t', '--remove-tips', required=True, type=int,
                        help='Remove tips shorter than this number')
    parser.add_argument('graph', help="Input cortexpy graph.  '-' reads from stdin")
    args = parser.parse_args(argv)

    from cortexpy.logging_config import configure_logging_from_args_and_get_logger
    logger = configure_logging_from_args_and_get_logger(args, 'cortexpy.prune')

    if args.remove_tips < 2:
        logger.error('--remove-tips (%s) needs to be greater than 1', args.remove_tips)
        return 1

    from cortexpy.graph.interactor import Interactor
    from cortexpy.graph.parser.streaming import load_cortex_graph
    from cortexpy.graph.serializer.kmer import dump_colored_de_bruijn_graph_to_cortex

    import sys
    from contextlib import ExitStack

    with ExitStack() as stack:
        # The output is opened before the graph is loaded, as before, so an
        # unwritable output path fails early.
        if args.out == '-':
            output = sys.stdout.buffer
        else:
            output = stack.enter_context(open(args.out, 'wb'))

        logger.info('Loading de Bruijn graph')
        if args.graph == '-':
            graph_stream = sys.stdin.buffer
        else:
            graph_stream = stack.enter_context(open(args.graph, 'rb'))
        graph = load_cortex_graph(graph_stream)

        logger.info('Loaded %s kmers', len(graph))
        graph = Interactor(graph).prune_tips_less_than(args.remove_tips).graph
        # Written inside the ``with`` block so the input stream is still open
        # in case the graph reads from it lazily.
        dump_colored_de_bruijn_graph_to_cortex(graph, output)
def test_keys_y_graph():
    """Seeding the Y graph with any node of an orientation yields that orientation's nodes."""
    # given
    graph_builder = get_cortex_builder()
    for kmer_record in ('CGC 1 .......T',
                        'AGC 1 a....CG.',
                        'AAG 1 .....C..',
                        'GCC 1 a.......'):
        graph_builder.with_kmer(kmer_record)

    orientation_one = ['CGC', 'GCT', 'CTT', 'GGC']
    orientation_two = ['AAG', 'AGC', 'GCG', 'GCC']
    for expected_nodes in (orientation_one, orientation_two):
        for seed in expected_nodes:
            # A fresh graph is built for every seed.
            raw_graph = graph_builder.build()

            # when
            consistent_graph = Interactor(raw_graph).make_graph_nodes_consistent([seed]).graph
            expect = KmerGraphExpectation(consistent_graph)

            # then
            expect.has_nodes(*expected_nodes)
def test_revcomps_many_kmers(data, num_kmers, kmer_size):
    """Every drawn k-mer string appears as a node after seeding with the drawn strings."""
    # given
    # Maps lexlo(k-mer) -> drawn k-mer; a later draw with the same key
    # replaces the earlier one.
    drawn_by_lexlo = {}
    for _ in range(num_kmers):
        drawn = data.draw(kmer_strings(min_size=kmer_size, max_size=kmer_size))
        drawn_by_lexlo[lexlo(drawn)] = drawn

    record_template = '{} 1 ........'
    graph_builder = get_cortex_builder()
    for lexlo_kmer in drawn_by_lexlo:
        graph_builder.with_kmer(record_template.format(lexlo_kmer))
    raw_graph = graph_builder.build()

    # when
    seeds = set(drawn_by_lexlo.values())
    consistent_graph = Interactor(raw_graph).make_graph_nodes_consistent(seeds).graph
    expect = KmerGraphExpectation(consistent_graph)

    # then
    for drawn in drawn_by_lexlo.values():
        expect.has_node(drawn)
    expect.has_n_nodes(len(drawn_by_lexlo))
def test_single_kmer_revcomp_seed(self, seed):
    """Check the seed's in/out edges for seed 'AAA' versus any other seed value."""
    # given
    graph_builder = get_cortex_builder()
    graph_builder.with_kmer('AAA 1 ......G.')
    graph_builder.with_kmer('AAG 1 a.......')
    raw_graph = graph_builder.build()

    # when
    graph = Interactor(raw_graph).make_graph_nodes_consistent([seed]).graph

    # then
    if seed == 'AAA':
        expected_in_edges, expected_out_edges = [], [('AAA', 'AAG')]
    else:
        expected_in_edges, expected_out_edges = [('CTT', 'TTT')], []
    assert expected_in_edges == list(graph.in_edges(seed))
    assert expected_out_edges == list(graph.out_edges(seed))
def test_gets_correct_neighbors_of_kmer(self):
    """After seeding with 'AAC', nodes 'ACT' and 'CTG' expose the expected neighbors."""
    # given
    graph_builder = get_cortex_builder()
    for kmer_record in ('AAC 1 .......T',
                        'ACT 1 a.....G.',
                        'CAG 1 .......T'):
        graph_builder.with_kmer(kmer_record)
    raw_graph = graph_builder.build()
    seed = 'AAC'

    # when
    graph = Interactor(raw_graph).make_graph_nodes_consistent([seed]).graph

    # then
    middle, last = 'ACT', 'CTG'
    middle_to_last = (middle, last)

    # middle node: one predecessor, one successor
    assert [last] == list(graph[middle])
    assert [last] == list(graph.succ[middle])
    assert ['AAC'] == list(graph.pred[middle])
    assert [middle_to_last] == list(graph.out_edges(middle))
    assert [('AAC', 'ACT')] == list(graph.in_edges(middle))

    # last node: one predecessor, no successors
    assert [] == list(graph[last])
    assert [] == list(graph.succ[last])
    assert [middle] == list(graph.pred[last])
    assert [] == list(graph.out_edges(last))
    assert [middle_to_last] == list(graph.in_edges(last))
def traverse(argv):
    """Run the ``cortexpy traverse`` sub-command.

    Loads a cortex graph (from a file, or stdin when the graph argument is
    ``-``), makes it consistent (with the k-mers of ``--seed-strings`` when
    given), and either prints a JSON representation (``--to-json``) or writes
    all simple paths as FASTA records to the selected output.

    Args:
        argv: command-line arguments for this sub-command, as accepted by
            ``ArgumentParser.parse_args``.

    Returns:
        ``None`` on success (including the ``--to-json`` path); ``1`` if
        ``--extra-start-kmer`` is not found in the loaded graph; the
        ``MAX_PATH_EXCEEDED`` exit code from the exit-code YAML file if more
        than ``--max-paths`` paths are produced.

    Files opened here (exit-code YAML, output file, graph file, links file)
    are closed before returning, including on error; the standard streams
    are never closed.
    """
    import argparse
    from cortexpy.command.shared import get_shared_argparse
    shared_parser = get_shared_argparse()
    parser = argparse.ArgumentParser(prog='cortexpy traverse', parents=[shared_parser],
                                     description="""
    Traverse all simple paths between all sources and targets of an input graph.
    Input is a cortex graph. Output is a FASTA.

    This tool also allows the creation of a JSON representation of a CORTEX graph that is consistent
    with seed strings by using the --to-json and --seed-strings arguments.

    If a links file is supplied, then branches consistent with the links will be preferred in the traversal.
    """)
    parser.add_argument('graph', help="cortex graph.  Slurp graph from stdin is '-'.")
    parser.add_argument('--to-json', action='store_true')
    parser.add_argument(
        '--seed-strings', nargs='*', default=[],
        help="Strings with seed kmers from which to start contig traversal. "
             "Multiple strings can be specified.")
    parser.add_argument('--color', type=int, help='Restrict view to single color')
    parser.add_argument('--max-paths', type=int, default=0,
                        help='Return exit status 64 if more than this '
                             'number of paths are encountered. '
                             '0 turns off this check.')
    parser.add_argument(
        '--graph-index', type=int, default=0,
        help='Graph index to be added to description of all output paths')
    parser.add_argument(
        '--extra-start-kmer',
        help='Disconnect this k-mer from incoming k-mers before '
             'candidate transcript creation. '
             'This argument may fail if not used together with --seed-strings.')
    parser.add_argument('--links-file', help='gzipped Mccortex-style links file for graph')
    args = parser.parse_args(argv)

    from cortexpy.logging_config import configure_logging_from_args_and_get_logger
    logger = configure_logging_from_args_and_get_logger(args, 'cortexpy.traverse')

    import sys
    import gzip
    from contextlib import ExitStack
    from cortexpy.graph.interactor import Interactor
    from cortexpy.graph.serializer.serializer import Serializer
    from cortexpy.graph.parser.streaming import load_cortex_graph
    from cortexpy.links import Links
    from . import get_exit_code_yaml_path
    import yaml

    # The exit-code table is fully parsed here, so its file can be closed
    # immediately.
    with open(get_exit_code_yaml_path(), 'rt') as exit_code_file:
        exit_codes = yaml.load(exit_code_file, Loader=yaml.FullLoader)

    with ExitStack() as stack:
        # Every file opened below is registered on the stack and closed when
        # the block exits, whichever return path is taken.
        if args.out == '-':
            output = sys.stdout
        else:
            output = stack.enter_context(open(args.out, 'wt'))

        logger.info('Loading graph: %s', args.graph)
        if args.graph == '-':
            graph_stream = sys.stdin.buffer
        else:
            graph_stream = stack.enter_context(open(args.graph, 'rb'))
        graph = load_cortex_graph(graph_stream)
        logger.info('Loaded %s kmers', len(graph))

        consistent_graph = None
        if args.seed_strings:
            seed_kmer_strings = strings_to_kmer_strings(args.seed_strings,
                                                        graph.graph['kmer_size'])
            logger.info('Making graph consistent with %s kmers from --seed-strings',
                        len(seed_kmer_strings))
            consistent_graph = Interactor(graph) \
                .make_graph_nodes_consistent(seed_kmer_strings) \
                .graph

        # NOTE(review): the checks on ``consistent_graph`` below use
        # truthiness, not ``is None``; an empty graph is presumably falsy and
        # would be treated like "not computed" -- kept as in the original.
        if args.to_json:
            logger.info('Writing JSON representation of graph to STDOUT')
            if consistent_graph:
                graph = consistent_graph
            # JSON always goes to stdout via print, not to ``output``.
            print(Serializer(graph).to_json())
            return

        if not consistent_graph:
            logger.info('Making graph consistent')
            consistent_graph = Interactor.from_graph(graph) \
                .make_graph_nodes_consistent() \
                .graph

        if args.extra_start_kmer:
            # Membership is tested against the loaded graph, not the
            # consistent one.
            if args.extra_start_kmer not in graph:
                logger.error('Could not find extra start kmer (%s) in graph',
                             args.extra_start_kmer)
                return 1

        links = None
        if args.links_file is not None:
            logger.info('Loading links file %s', args.links_file)
            links_stream = stack.enter_context(gzip.open(args.links_file, 'rb'))
            links = Links.from_binary_stream(links_stream)

        seq_record_generator = Interactor(consistent_graph) \
            .all_simple_paths(args.extra_start_kmer, links=links)
        seq_record_generator = annotated_seq_records(seq_record_generator,
                                                     graph_idx=args.graph_index)
        if args.max_paths > 0:
            logger.info('Exiting after element %s', args.max_paths)
            seq_record_generator = raise_after_nth_element(seq_record_generator,
                                                           args.max_paths)

        logger.info('Writing seq records to %s', args.out)
        try:
            for record in seq_record_generator:
                output.write(record.format('fasta'))
        except IndexError:
            # An IndexError during iteration is treated as the max-paths
            # limit being exceeded.
            logger.error('Max paths (%s) exceeded', args.max_paths)
            return exit_codes['MAX_PATH_EXCEEDED']
def build(self):
    """Return ``self.graph``, first making it consistent if seeds were given.

    When ``self.consistent_seeds`` is truthy, ``self.graph`` is replaced by
    the graph made consistent with those seeds; otherwise it is returned
    unchanged.
    """
    seeds = self.consistent_seeds
    if not seeds:
        return self.graph

    consistent = Interactor.from_graph(self.graph).make_graph_nodes_consistent(seeds)
    self.graph = consistent.graph
    return self.graph
def _make_kmer_graph_consistent(self):
    """Make ``self.graph`` consistent in place when it is a ``CortexDiGraph``.

    Graphs of any other type are left untouched.
    """
    if not isinstance(self.graph, CortexDiGraph):
        return
    consistent = Interactor(self.graph).make_graph_nodes_consistent()
    self.graph = consistent.graph