def parse_spades_output(input_dirpath, output_dirpath, min_edge_len):
    """Parse a SPAdes output folder into graph edges, contig paths and an edges file.

    NOTE(review): another ``parse_spades_output`` is defined later in this file;
    at import time the later ``def`` rebinds the name, so one of the two copies
    should eventually be removed.

    :param input_dirpath: folder searched for the assembly graph GFA file
    :param output_dirpath: folder passed to ``get_edges_from_gfa``
    :param min_edge_len: edge length threshold forwarded to the parsers
    :return: tuple ``(dict_edges, contig_edges, edges_fpath)``
    :raises SystemExit: when no assembly graph file is found in ``input_dirpath``
    """
    # Try the plain graph first and fall back to the scaffold-aware one,
    # mirroring the lookup order of the later definition in this file.
    gfa_fpath = None
    for gfa_name in ("assembly_graph.gfa", "assembly_graph_with_scaffolds.gfa"):
        gfa_fpath = find_file_by_pattern(input_dirpath, gfa_name)
        if gfa_fpath:
            break

    # Fail with a readable message instead of handing a falsy path to parse_gfa.
    if not gfa_fpath:
        print("ERROR! Assembly graph is not found in %s! "
              "Please check the folder or specify the file with assembly graph using --graph option"
              % input_dirpath)
        sys.exit(1)

    dict_edges = parse_gfa(gfa_fpath, min_edge_len, input_dirpath, assembler="spades")
    contig_edges = parse_spades_paths(input_dirpath, dict_edges)
    edges_fpath = get_edges_from_gfa(gfa_fpath, output_dirpath, min_edge_len)
    return dict_edges, contig_edges, edges_fpath
def parse_assembler_output(assembler_name, input_dirpath, input_fpath, output_dirpath,
                           input_fasta_fpath, min_edge_len):
    """Build the edge dictionary from a graph file or from an assembler output folder.

    When ``input_fpath`` is a non-empty file it is parsed according to its
    extension: FASTG is first converted with ``fastg_to_gfa``, GFA/GFA2 is read
    with ``parse_gfa``, and DOT/GV is read with ``parse_abyss_dot`` (for ABySS)
    or ``parse_flye_dot``. Otherwise the output folder ``input_dirpath`` is
    parsed with the Canu, Flye or SPAdes specific parser chosen by
    ``assembler_name``.

    :param assembler_name: assembler name, used to pick the parser
    :param input_dirpath: assembler output folder (used when no graph file is given)
    :param input_fpath: path to the assembly graph file
    :param output_dirpath: folder passed to the helpers that write derived files
    :param input_fasta_fpath: file with edge sequences, used for DOT/GV graphs
    :param min_edge_len: edge length threshold forwarded to the parsers
    :return: tuple ``(dict_edges, contig_edges, edges_fpath)``; ``contig_edges``
        is an empty list when a graph file was parsed directly
    :raises SystemExit: on a failed conversion/parse, an unsupported graph file
        format, or an unsupported assembler
    """
    edges_fpath = None
    if not is_empty_file(input_fpath):
        contig_edges = []
        graph_fpath = input_fpath
        if graph_fpath.endswith("fastg"):
            graph_fpath = fastg_to_gfa(graph_fpath, output_dirpath, assembler_name)
        if not graph_fpath:
            # Report the file the user supplied: the converted path is falsy here
            # (possibly None), so it must not be used to build the message.
            sys.exit("ERROR! Failed parsing " + input_fpath + " file.")

        if graph_fpath.endswith(("gfa", "gfa2")):
            dict_edges = parse_gfa(graph_fpath, min_edge_len)
            edges_fpath = get_edges_from_gfa(graph_fpath, output_dirpath, min_edge_len)
        elif graph_fpath.endswith(("dot", "gv")):
            edges_fpath = format_edges_file(input_fasta_fpath, output_dirpath)
            dict_edges = dict()
            if is_abyss(assembler_name):
                dict_edges = parse_abyss_dot(graph_fpath, min_edge_len)
            # Fall back to the Flye DOT parser when nothing was parsed so far.
            if not dict_edges:
                try:
                    dict_edges = parse_flye_dot(graph_fpath, min_edge_len)
                except Exception as e:
                    sys.exit(
                        "ERROR! Failed parsing " + graph_fpath + " file.\n"
                        "During parsing the following error has occured: " + str(e) +
                        "\nPlease make sure that you correctly specified the assembler name using -a option. "
                        "DOT files produced by different assemblers can have very different formats.\n"
                        "Examples of input data can be found here https://github.com/almiheenko/AGB/tree/master/test_data"
                    )
        else:
            # Without this branch dict_edges stays unbound and the loop below
            # fails with UnboundLocalError instead of a readable message.
            sys.exit("ERROR! Failed parsing " + graph_fpath + " file. "
                     "The assembly graph should be in GFA/FASTG/GraphViz formats.")
    elif is_canu(assembler_name):
        dict_edges, contig_edges, edges_fpath = parse_canu_output(
            input_dirpath, output_dirpath, min_edge_len)
    elif is_flye(assembler_name):
        dict_edges, contig_edges, edges_fpath = parse_flye_output(
            input_dirpath, output_dirpath, min_edge_len)
    elif is_spades(assembler_name):
        dict_edges, contig_edges, edges_fpath = parse_spades_output(
            input_dirpath, output_dirpath, min_edge_len)
    else:
        sys.exit(
            "Output folder of %s assembler can not be parsed! Supported assemblers: %s. "
            "More assemblers will be added in the next release.\n"
            "To visualize the assembly graph produced by this assembler, "
            "you should manually specify the assembly graph file in GFA/FASTG/GraphViz formats using --graph option "
            "and (optionally) file with edge sequences using --fasta option"
            % (assembler_name, ', '.join(SUPPORTED_ASSEMBLERS)))

    # Store both edge endpoints as strings, whatever type the parser produced.
    for edge in dict_edges.values():
        edge.start, edge.end = str(edge.start), str(edge.end)
    return dict_edges, contig_edges, edges_fpath
def parse_spades_output(input_dirpath, output_dirpath, min_edge_len):
    """Parse a SPAdes output folder.

    Looks for ``assembly_graph.gfa`` and, if that is not found, for
    ``assembly_graph_with_scaffolds.gfa`` in ``input_dirpath``. Prints an error
    and exits with status 1 when neither is found.

    :return: tuple ``(dict_edges, contig_edges, edges_fpath)``
    """
    graph_path = None
    # Candidates are tried in order; the first one found wins.
    for candidate in ("assembly_graph.gfa", "assembly_graph_with_scaffolds.gfa"):
        graph_path = find_file_by_pattern(input_dirpath, candidate)
        if graph_path:
            break

    if not graph_path:
        message = (
            "ERROR! Assembly graph is not found in %s! "
            "Please check the folder or specify the file with assembly graph using --graph option"
        )
        print(message % (input_dirpath))
        sys.exit(1)

    edges = parse_gfa(graph_path, min_edge_len, input_dirpath, assembler="spades")
    contig_paths = parse_spades_paths(input_dirpath, edges)
    edges_file = get_edges_from_gfa(graph_path, output_dirpath, min_edge_len)
    return edges, contig_paths, edges_file
def parse_canu_output(input_dirpath, output_dirpath, min_edge_len):
    """Parse a Canu output folder into graph edges, contig paths and an edges file.

    The ``.unitigs.gfa`` file found in ``input_dirpath`` is copied into
    ``output_dirpath`` through ``sed``, which rewrites the first match of
    ``bogart.edges`` on line 1 to ``1.0`` (presumably the GFA header version tag
    - TODO confirm). The copy is parsed by ``parse_gfa``; the edges file is
    produced from the raw file.

    :param input_dirpath: Canu output folder
    :param output_dirpath: folder receiving the rewritten GFA copy
    :param min_edge_len: edge length threshold forwarded to the parsers
    :return: tuple ``(dict_edges, contig_edges, edges_fpath)``
    :raises SystemExit: when the GFA file is missing or ``sed`` fails
    """
    raw_gfa_fpath = find_file_by_pattern(input_dirpath, ".unitigs.gfa")
    if not raw_gfa_fpath:
        print("ERROR! GFA file is not found in %s! Please check the options" % abspath(input_dirpath))
        sys.exit(1)

    edges_fpath = get_edges_from_gfa(raw_gfa_fpath, output_dirpath, min_edge_len)

    gfa_fpath = join(output_dirpath, basename(raw_gfa_fpath))
    if is_empty_file(gfa_fpath) or not can_reuse(gfa_fpath, files_to_check=[raw_gfa_fpath]):
        # Pass the arguments as a list so paths with spaces or quotes survive
        # intact, and close the output handle as soon as sed has finished.
        sed_cmd = ["sed", "1s/bogart.edges/1.0/", raw_gfa_fpath]
        with open(gfa_fpath, "w") as gfa_file:
            return_code = subprocess.call(sed_cmd, stdout=gfa_file)
        if return_code != 0:
            # A failed sed leaves a truncated copy; stop instead of parsing it.
            print("ERROR! Failed to process %s (sed exited with code %d)" % (raw_gfa_fpath, return_code))
            sys.exit(1)

    dict_edges = parse_gfa(gfa_fpath, min_edge_len, input_dirpath, assembler="canu")
    contig_edges = parse_canu_assembly_info(input_dirpath, dict_edges)
    return dict_edges, contig_edges, edges_fpath