def fastg_to_gfa(input_fpath, output_dirpath, assembler_name):
    """Convert a FASTG graph file to GFA using the bundled gfatools script.

    The conversion runs ``<k8> gfatools.js <assembler>2gfa <input>`` and
    captures the tool's standard output into a file inside
    ``output_dirpath``; the tool's standard error is discarded.

    Args:
        input_fpath: path to the FASTG file.
        output_dirpath: directory where the GFA file is written.
        assembler_name: assembler that produced the FASTG file; selects the
            gfatools sub-command.

    Returns:
        Path to the generated GFA file, or ``None`` when the output file is
        reported empty by ``is_empty_file``.

    Exits:
        Calls ``sys.exit`` when the assembler has no FASTG converter.
    """
    k8_exec = join(TOOLS_DIR, "k8-darwin" if is_osx() else "k8-linux")
    gfatools_exec = join(TOOLS_DIR, "gfatools.js")
    if not (gfatools_exec and k8_exec):
        return None

    output_fpath = join(output_dirpath, basename(input_fpath).replace("fastg", "gfa"))

    # Ordered (predicate, gfatools sub-command) pairs; the first match wins.
    converters = (
        (is_abyss, "abyss2gfa"),
        (is_spades, "spades2gfa"),
        (is_sga, "sga2gfa"),
        (is_soap, "soap2gfa"),
        (is_velvet, "velvet2gfa"),
    )
    cmd = next((subcommand for matches, subcommand in converters
                if matches(assembler_name)), None)
    if not cmd:
        sys.exit("FASTG files produced by " + str(assembler_name) +
                 " are not supported. Supported assemblers: " +
                 ' '.join([ABYSS_NAME, SGA_NAME, SOAP_NAME, SPADES_NAME, VELVET_NAME]) +
                 " or use files in GFA format.")

    cmdline = [k8_exec, gfatools_exec, cmd, input_fpath]
    # subprocess needs real file objects (not a path string) for redirection;
    # the context manager also guarantees both handles are closed.
    with open(output_fpath, "w") as gfa_out, open("/dev/null", "w") as devnull:
        subprocess.call(cmdline, stdout=gfa_out, stderr=devnull)

    if not is_empty_file(output_fpath):
        return output_fpath
    return None
def parse_assembler_output(assembler_name, input_dirpath, input_fpath, output_dirpath, input_fasta_fpath, min_edge_len):
    """Load an assembly graph from a graph file or from an assembler output folder.

    When ``input_fpath`` is a non-empty file it is parsed according to its
    extension (FASTG is first converted to GFA; GFA/GFA2 and DOT/GV are
    parsed directly). Otherwise the assembler's output folder
    ``input_dirpath`` is parsed with the assembler-specific parser.

    Args:
        assembler_name: name of the assembler that produced the data.
        input_dirpath: assembler output folder (used when no graph file is given).
        input_fpath: path to the assembly graph file, may be empty/missing.
        output_dirpath: directory for generated files.
        input_fasta_fpath: file with edge sequences (used for DOT/GV input).
        min_edge_len: minimum edge length passed through to the parsers.

    Returns:
        Tuple ``(dict_edges, contig_edges, edges_fpath)``. The ``start`` and
        ``end`` attributes of every edge are converted to strings.

    Exits:
        Calls ``sys.exit`` with an explanatory message when the input cannot
        be parsed or the format/assembler is not supported.
    """
    edges_fpath = None
    if not is_empty_file(input_fpath):
        contig_edges = []
        graph_fpath = input_fpath
        if graph_fpath.endswith("fastg"):
            graph_fpath = fastg_to_gfa(input_fpath, output_dirpath, assembler_name)
            if not graph_fpath:
                # Report the original path: the conversion result is None here.
                sys.exit("ERROR! Failed parsing " + input_fpath + " file.")

        if graph_fpath.endswith(("gfa", "gfa2")):
            dict_edges = parse_gfa(graph_fpath, min_edge_len)
            edges_fpath = get_edges_from_gfa(graph_fpath, output_dirpath, min_edge_len)
        elif graph_fpath.endswith(("dot", "gv")):
            edges_fpath = format_edges_file(input_fasta_fpath, output_dirpath)
            dict_edges = dict()
            if is_abyss(assembler_name):
                dict_edges = parse_abyss_dot(graph_fpath, min_edge_len)
            if not dict_edges:
                # Fall back to the Flye DOT parser when the ABySS parser was
                # not applicable or produced nothing.
                try:
                    dict_edges = parse_flye_dot(graph_fpath, min_edge_len)
                except Exception as e:
                    sys.exit(
                        "ERROR! Failed parsing " + graph_fpath + " file.\n"
                        "During parsing the following error has occured: " + str(e) +
                        "\nPlease make sure that you correctly specified the assembler name using -a option. "
                        "DOT files produced by different assemblers can have very different formats.\n"
                        "Examples of input data can be found here https://github.com/almiheenko/AGB/tree/master/test_data")
        else:
            # Without this branch dict_edges would be unbound below.
            sys.exit("ERROR! Failed parsing " + graph_fpath + " file: unsupported file extension. "
                     "The assembly graph should be in GFA/FASTG/GraphViz formats.")
    else:
        if is_canu(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_canu_output(
                input_dirpath, output_dirpath, min_edge_len)
        elif is_flye(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_flye_output(
                input_dirpath, output_dirpath, min_edge_len)
        elif is_spades(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_spades_output(
                input_dirpath, output_dirpath, min_edge_len)
        else:
            sys.exit("Output folder of %s assembler can not be parsed! Supported assemblers: %s. "
                     "More assemblers will be added in the next release.\n"
                     "To visualize the assembly graph produced by this assembler, "
                     "you should manually specify the assembly graph file in GFA/FASTG/GraphViz formats using --graph option "
                     "and (optionally) file with edge sequences using --fasta option" %
                     (assembler_name, ', '.join(SUPPORTED_ASSEMBLERS)))

    # Node identifiers are normalised to strings regardless of which parser
    # produced the edges.
    for edge in dict_edges.values():
        edge.start, edge.end = str(edge.start), str(edge.end)
    return dict_edges, contig_edges, edges_fpath
def parse_gfa(gfa_fpath, min_edge_len, input_dirpath=None, assembler=None):
    """Parse segments and links of a GFA file into a dictionary of edges.

    Segment (``S``) records at least ``min_edge_len`` long become two
    ``Edge`` objects: one for the segment and one created from the negated
    edge number. Link records (``L``, and ``E`` as written by GFA2) are
    collected into predecessor/successor maps that are handed to
    ``construct_graph`` together with the edges.

    Args:
        gfa_fpath: path to the GFA file.
        min_edge_len: segments shorter than this are not added.
        input_dirpath: assembler output folder; only used to load extra
            unitig information when ``assembler`` is ``"canu"``.
        assembler: assembler name. For SPAdes and ABySS the mirrored link
            (between the matching edges, in the opposite direction) is added
            for every link read from the file.

    Returns:
        The value returned by ``construct_graph`` for the parsed edges.
    """
    dict_edges = dict()
    predecessors = defaultdict(list)
    successors = defaultdict(list)

    print("Parsing " + gfa_fpath + "...")
    links = []  # (edge1, edge2) pairs with orientation already applied
    edge_overlaps = defaultdict(dict)
    with open(gfa_fpath) as gfa_file:
        for line in gfa_file:
            record_type = line[0]
            if record_type == 'S':
                # NOTE(review): columns are read in GFA1 order (name, sequence).
                # GFA2 segment records carry the length before the sequence —
                # confirm whether GFA2 segments are expected here.
                fs = line.split()
                name, seq = fs[1], fs[2]
                seq_len = None if seq == '*' else len(seq)
                add_info = dict((tag.split(':')[0].lower(), tag.split(':')[-1]) for tag in fs[3:])
                # Resolve the length first: with a '*' sequence the LN tag is
                # the only source, and k-mer coverage needs it.
                if "ln" in add_info:
                    seq_len = int(add_info["ln"])  ## sequence length
                cov = 1
                if "dp" in add_info:
                    cov = float(add_info["dp"])  ## coverage depth
                elif "kc" in add_info and seq_len:
                    cov = max(1, int(add_info["kc"]) / seq_len)  ## k-mer count / edge length

                if seq_len and seq_len >= min_edge_len:
                    edge_num = get_edge_num(name)
                    edge_id = get_edge_agv_id(edge_num)
                    dict_edges[edge_id] = Edge(edge_id, edge_num, seq_len, cov, element_id=edge_id)
                    # NOTE(review): overlaps are only known here for links that
                    # appear in the file before this segment record.
                    for overlapped_edge, overlap in edge_overlaps[edge_id].items():
                        dict_edges[edge_id].overlaps.append(
                            (edge_id_to_name(overlapped_edge), overlapped_edge, overlap))

                    rc_edge_id = get_edge_agv_id(-edge_num)
                    dict_edges[rc_edge_id] = Edge(rc_edge_id, -edge_num, seq_len, cov, element_id=rc_edge_id)
                    # NOTE(review): overlaps of the rc edge are appended to the
                    # forward edge (kept as in the original) — confirm this is
                    # intended and not a copy-paste slip.
                    for overlapped_edge, overlap in edge_overlaps[rc_edge_id].items():
                        dict_edges[edge_id].overlaps.append(
                            (edge_id_to_name(overlapped_edge), overlapped_edge, overlap))
                continue

            if record_type != 'L' and record_type != 'E':
                continue

            fields = line.split()
            if record_type == 'L':
                # L <from> <from_orient> <to> <to_orient> <overlap> [tags]
                from_name, from_orient, to_name, to_orient = fields[1:5]
                overlap_column = 5
            else:
                # E * 2+ 65397+ 21 68$ 0 47 47M
                from_name, to_name = fields[2], fields[3]
                from_orient, to_orient = from_name[-1], to_name[-1]
                from_name, to_name = from_name[:-1], to_name[:-1]
                overlap_column = 8
            # Optional tags may follow the overlap column, so do not rely on
            # it being the last field unless the line is shorter than expected.
            cigar = fields[overlap_column] if len(fields) > overlap_column else fields[-1]

            edge1 = get_edge_agv_id(get_edge_num(from_name))
            edge2 = get_edge_agv_id(get_edge_num(to_name))
            if from_orient == '-':
                edge1 = get_match_edge_id(edge1)
            if to_orient == '-':
                edge2 = get_match_edge_id(edge2)

            overlap = _gfa_overlap_len(cigar)
            links.append((edge1, edge2))
            if overlap:
                edge_overlaps[edge1][edge2] = overlap
                edge_overlaps[edge2][edge1] = overlap

    ### gfa retains only canonical links
    add_mirrored_links = bool(links) and (is_spades(assembler) or is_abyss(assembler))
    for edge1, edge2 in links:
        if edge1 != edge2:
            predecessors[edge2].append(edge1)
            successors[edge1].append(edge2)
        if add_mirrored_links:
            edge1, edge2 = get_match_edge_id(edge2), get_match_edge_id(edge1)
            if edge1 != edge2:
                predecessors[edge2].append(edge1)
                successors[edge1].append(edge2)

    if assembler == "canu" and input_dirpath:
        dict_edges = parse_canu_unitigs_info(input_dirpath, dict_edges)
    dict_edges = construct_graph(dict_edges, predecessors, successors)
    print("Finish parsing.")
    return dict_edges


def _gfa_overlap_len(cigar):
    """Return the summed length of the M and I operations of a CIGAR string."""
    # re.split with a capturing group alternates non-digit and digit pieces:
    # '10M2I' -> ['', '10', 'M', '2', 'I'].
    pieces = re.split(r'(\d+)', cigar.strip())
    overlap = 0
    for i in range(len(pieces) - 1):
        if not pieces[i]:
            continue
        if pieces[i + 1] == 'M' or pieces[i + 1] == 'I':
            overlap += int(pieces[i])
    return overlap