def _traverse_a_branch_from_queue(self):
    """Process the next entry of ``self.branch_queue``.

    The left-most setup is popped from the queue and traversed with the
    branch traverser registered in ``self.branch_traverser`` under the
    setup's ``traversal_color``.  The resulting branch graph is handed to
    ``Interactor.compose_in_graph`` for ``self.graph``, after which the
    branch is passed to ``_connect_branch_to_parent_graph`` and
    ``_link_branch_and_queue_neighbor_traversals``.

    Returns nothing.  Whatever ``popleft()`` raises on an empty queue
    propagates to the caller.
    """
    queued = self.branch_queue.popleft()
    # One traverser per color; pick the one matching this queue entry.
    new_branch = self.branch_traverser[queued.traversal_color].traverse_from(
        queued.start_string,
        orientation=queued.orientation,
        parent_graph=self.graph,
    )
    parent_interactor = Interactor.from_graph(self.graph)
    parent_interactor.compose_in_graph(new_branch.graph)
    self._connect_branch_to_parent_graph(new_branch, queued)
    self._link_branch_and_queue_neighbor_traversals(new_branch)
def _traverse_from_each_kmer_in(self, kmer_generator):
    """Traverse from every k-mer yielded by *kmer_generator*.

    For each k-mer the graph returned by ``self._traverse_from`` is handed
    to ``Interactor.compose_in_graph`` for ``self.graph`` and the graph
    size is logged.  A ``KeyError`` raised during that step is ignored and
    the loop moves on to the size check.

    After each k-mer, if ``self.max_nodes`` is truthy and ``self.graph``
    holds more than ``self.max_nodes`` nodes, an ``Exception`` naming the
    current k-mer is raised.

    Returns ``self``.
    """
    for seed in kmer_generator:
        try:
            target = Interactor.from_graph(self.graph)
            traversed_graph = self._traverse_from(seed).graph
            target.compose_in_graph(traversed_graph)
            self.log_graph_size()
        except KeyError:
            pass
        # A falsy max_nodes disables the size limit.
        if not self.max_nodes or len(self.graph) <= self.max_nodes:
            continue
        raise Exception(
            ("Terminating contig traversal after kmer {}"
             " because max node limit is reached").format(seed))
    return self
def build(self):
    """Build a ``UnitigFinder`` from the graph produced by ``self.builder``.

    When ``self.seed_kmers`` is ``None`` it is set (and stored on the
    instance) to a one-element list holding the first item yielded by
    iterating the built graph.  The graph is then passed through
    ``Interactor.make_graph_nodes_consistent`` with those seeds.

    Returns the result of ``UnitigFinder.from_graph`` called with the
    consistent graph, the builder's colors as a list, and
    ``self.test_coverage``.
    """
    built_graph = self.builder.build()
    if self.seed_kmers is None:
        first_node = next(iter(built_graph))
        self.seed_kmers = [first_node]
    interactor = Interactor.from_graph(built_graph)
    consistent_graph = interactor.make_graph_nodes_consistent(self.seed_kmers).graph
    return UnitigFinder.from_graph(
        consistent_graph,
        colors=list(self.builder.colors),
        test_coverage=self.test_coverage,
    )
def assemble(argv):
    """Entry point for ``cortexpy assemble``.

    Parses *argv*, traverses the cortex graph from every k-mer of the start
    sequences FASTA, makes the traversed graph consistent with those k-mers
    and writes all simple paths as FASTA records.

    Args:
        argv: Command-line arguments (without the program name) handed to
            ``argparse``.

    Output goes to ``sys.stdout`` when ``args.out`` is ``'-'`` and to the
    file named by ``args.out`` otherwise.  Files opened here (the output
    file and the graph file) are closed when the function exits, whether
    normally or through an exception; ``sys.stdout`` is never closed.
    """
    import argparse
    import contextlib
    from cortexpy.command.shared import get_shared_argparse
    shared_parser = get_shared_argparse()

    parser = argparse.ArgumentParser(prog='cortexpy assemble', parents=[shared_parser],
                                     description="""
    Assemble all possible transcripts in <graph> from all k-mers in <start-sequences> and print the
    resulting transcripts as a FASTA to stdout. All specified colors are traversed and collapsed
    before output.
    """)
    parser.add_argument('graph', help='cortex graph')
    parser.add_argument('start_sequences_fasta', help='FASTA file with sequences to start from')
    parser.add_argument('--color', type=int, help='Restrict view to single color')
    parser.add_argument('--max-nodes', type=int, default=1000,
                        help='Maximum number of nodes to traverse [default: %(default)s]')
    args = parser.parse_args(argv)

    from cortexpy.logging_config import configure_logging_from_args_and_get_logger
    # The returned logger is not used in this command; the call is kept
    # because (judging by its name) it also configures logging from args.
    configure_logging_from_args_and_get_logger(args, 'cortexpy.assemble')

    import sys
    from Bio import SeqIO
    from cortexpy.utils import kmerize_fasta
    from cortexpy.graph.interactor import Interactor
    from cortexpy.graph.parser.random_access import RandomAccess
    from cortexpy.constants import EngineTraversalOrientation
    from cortexpy.graph.traversal.engine import Engine

    # ExitStack owns every file opened below so that none of them leak,
    # while leaving sys.stdout untouched when it is the output target.
    with contextlib.ExitStack() as stack:
        if args.out == '-':
            output = sys.stdout
        else:
            output = stack.enter_context(open(args.out, 'wt'))

        # The graph file must stay open for as long as RandomAccess is used,
        # i.e. until the records have been written.
        random_access = RandomAccess(stack.enter_context(open(args.graph, 'rb')))
        if args.color is None:
            colors = list(range(random_access.num_colors))
        else:
            colors = [args.color]
        traverser = Engine(
            random_access,
            traversal_colors=colors,
            orientation=EngineTraversalOrientation.both,
            max_nodes=args.max_nodes,
        )
        traverser.traverse_from_each_kmer_in_fasta(args.start_sequences_fasta)
        kmers = kmerize_fasta(args.start_sequences_fasta, traverser.ra_parser.kmer_size)
        interactor = Interactor.from_graph(traverser.graph).make_graph_nodes_consistent(
            seed_kmer_strings=kmers)

        seq_record_generator = interactor.all_simple_paths()

        SeqIO.write(seq_record_generator, output, 'fasta')
def traverse(argv):
    """Entry point for ``cortexpy traverse``.

    Loads a cortex graph (from a file, or from stdin when the graph argument
    is ``'-'``), optionally makes it consistent with ``--seed-strings``, and
    either prints a JSON representation (``--to-json``) or writes all simple
    paths as FASTA records to the output.

    Args:
        argv: Command-line arguments (without the program name) handed to
            ``argparse``.

    Returns:
        ``None`` after a normal run (including the ``--to-json`` path),
        ``1`` when ``--extra-start-kmer`` is not found in the loaded graph,
        or ``EXIT_CODES['MAX_PATH_EXCEEDED']`` (read from the exit-code YAML
        file) when an ``IndexError`` is raised while writing records.

    Every file opened here is closed when the function exits; ``sys.stdout``
    and ``sys.stdin`` are never closed.
    """
    import argparse
    import contextlib
    from cortexpy.command.shared import get_shared_argparse
    shared_parser = get_shared_argparse()

    parser = argparse.ArgumentParser(prog='cortexpy traverse', parents=[shared_parser],
                                     description="""
    Traverse all simple paths between all sources and targets of an input graph.
    Input is a cortex graph. Output is a FASTA.

    This tool also allows the creation of a JSON representation of a CORTEX graph that is consistent
    with seed strings by using the --to-json and --seed-strings arguments.

    If a links file is supplied, then branches consistent with the links will be preferred in the
    traversal.
    """)
    parser.add_argument('graph', help="cortex graph.  Slurp graph from stdin is '-'.")
    parser.add_argument('--to-json', action='store_true')
    parser.add_argument(
        '--seed-strings', nargs='*', default=[],
        help="Strings with seed kmers from which to start contig traversal. "
             "Multiple strings can be specified.")
    parser.add_argument('--color', type=int, help='Restrict view to single color')
    parser.add_argument('--max-paths', type=int, default=0,
                        help='Return exit status 64 if more than this '
                             'number of paths are encountered. '
                             '0 turns off this check.')
    parser.add_argument(
        '--graph-index', type=int, default=0,
        help='Graph index to be added to description of all output paths')
    parser.add_argument(
        '--extra-start-kmer',
        help='Disconnect this k-mer from incoming k-mers before '
             'candidate transcript creation. '
             'This argument may fail if not used together with --seed-strings.')
    parser.add_argument('--links-file', help='gzipped Mccortex-style links file for graph')
    args = parser.parse_args(argv)

    from cortexpy.logging_config import configure_logging_from_args_and_get_logger
    logger = configure_logging_from_args_and_get_logger(
        args, 'cortexpy.traverse')

    import sys
    import gzip
    from cortexpy.graph.interactor import Interactor
    from cortexpy.graph.serializer.serializer import Serializer
    from cortexpy.graph.parser.streaming import load_cortex_graph
    from cortexpy.links import Links
    from . import get_exit_code_yaml_path
    import yaml

    # The YAML is fully parsed by yaml.load, so the handle can be closed
    # right away.
    with open(get_exit_code_yaml_path(), 'rt') as exit_code_fh:
        EXIT_CODES = yaml.load(exit_code_fh, Loader=yaml.FullLoader)

    # ExitStack owns the output, graph and links handles.  They stay open
    # until the function exits because this code cannot tell whether the
    # loaders read their streams eagerly.
    with contextlib.ExitStack() as stack:
        if args.out == '-':
            output = sys.stdout
        else:
            output = stack.enter_context(open(args.out, 'wt'))

        logger.info('Loading graph: %s', args.graph)
        if args.graph == '-':
            graph = load_cortex_graph(sys.stdin.buffer)
        else:
            graph = load_cortex_graph(stack.enter_context(open(args.graph, 'rb')))
        logger.info(f'Loaded {len(graph)} kmers')

        # None means "not built yet".  The checks below compare against None
        # explicitly so that an empty (falsy) consistent graph is not mistaken
        # for a missing one.
        consistent_graph = None
        if args.seed_strings:
            seed_kmer_strings = strings_to_kmer_strings(args.seed_strings,
                                                        graph.graph['kmer_size'])
            logger.info(
                f'Making graph consistent with {len(seed_kmer_strings)} kmers from --seed-strings'
            )
            consistent_graph = Interactor(graph) \
                .make_graph_nodes_consistent(seed_kmer_strings) \
                .graph

        if args.to_json:
            logger.info('Writing JSON representation of graph to STDOUT')
            if consistent_graph is not None:
                graph = consistent_graph
            print(Serializer(graph).to_json())
            return

        if consistent_graph is None:
            logger.info('Making graph consistent')
            consistent_graph = Interactor.from_graph(graph) \
                .make_graph_nodes_consistent() \
                .graph

        if args.extra_start_kmer:
            if args.extra_start_kmer not in graph:
                logger.error(
                    f'Could not find extra start kmer ({args.extra_start_kmer}) in graph'
                )
                return 1

        links = None
        if args.links_file is not None:
            logger.info(f'Loading links file {args.links_file}')
            links = Links.from_binary_stream(
                stack.enter_context(gzip.open(args.links_file, 'rb')))

        seq_record_generator = Interactor(consistent_graph) \
            .all_simple_paths(args.extra_start_kmer, links=links)
        seq_record_generator = annotated_seq_records(seq_record_generator,
                                                     graph_idx=args.graph_index)
        if args.max_paths > 0:
            logger.info('Exiting after element %s', args.max_paths)
            seq_record_generator = raise_after_nth_element(seq_record_generator, args.max_paths)

        logger.info('Writing seq records to %s', args.out)
        try:
            for record in seq_record_generator:
                output.write(record.format('fasta'))
        except IndexError:
            # An IndexError raised while consuming the generator is reported
            # as the max-paths limit being exceeded.
            logger.error('Max paths (%s) exceeded', args.max_paths)
            return EXIT_CODES['MAX_PATH_EXCEEDED']
def build(self):
    """Return ``self.graph``, made consistent with the configured seeds.

    When ``self.consistent_seeds`` is truthy, ``self.graph`` is replaced by
    the graph obtained from ``Interactor.make_graph_nodes_consistent`` called
    with those seeds.  Otherwise ``self.graph`` is returned untouched.
    """
    if not self.consistent_seeds:
        return self.graph
    interactor = Interactor.from_graph(self.graph)
    self.graph = interactor.make_graph_nodes_consistent(self.consistent_seeds).graph
    return self.graph