def get_edges_from_gfa(gfa_fpath, output_dirpath, min_edge_len):
    """Produce a FASTA file with the edge sequences of a GFA graph.

    The output file is placed in ``output_dirpath`` and named after the GFA
    file with a ``.fasta`` extension. Sequences are taken from the ``S``
    records of the GFA file; if that yields an empty file and a FASTA file
    with the same base name exists next to the GFA file, that FASTA file is
    copied instead with its headers renamed.

    :param gfa_fpath: path to the GFA file; a falsy value yields ``None``.
    :param output_dirpath: directory where the FASTA file is written.
    :param min_edge_len: sequences shorter than this are not written.
    :return: path to the FASTA file, or ``None`` if ``gfa_fpath`` is falsy.
    """
    if not gfa_fpath:
        return None

    input_edges_fpath = join(dirname(gfa_fpath), get_filename(gfa_fpath) + ".fasta")
    edges_fpath = join(output_dirpath, basename(input_edges_fpath))

    needs_extraction = (not is_empty_file(gfa_fpath)
                        and not can_reuse(edges_fpath, files_to_check=[gfa_fpath]))
    if needs_extraction:
        print("Extracting edge sequences from " + gfa_fpath + "...")
        with open(edges_fpath, "w") as fasta_out, open(gfa_fpath) as gfa_in:
            for record in gfa_in:
                if not record.startswith('S'):
                    continue
                fields = record.strip().split()
                segment_name = fields[1]
                # The sequence is expected in the third column; the fourth
                # column is tried when the third is not an ACGT string.
                if is_acgt_seq(fields[2]):
                    sequence = fields[2]
                elif len(fields) >= 4 and is_acgt_seq(fields[3]):
                    sequence = fields[3]
                else:
                    sequence = None
                if not sequence or len(sequence) < min_edge_len:
                    continue
                fasta_out.write(">%s\n" % get_edge_agv_id(get_edge_num(segment_name)))
                fasta_out.write(sequence)
                fasta_out.write("\n")

    # Fall back to the FASTA file lying next to the GFA file.
    if is_empty_file(edges_fpath) and not is_empty_file(input_edges_fpath):
        with open(edges_fpath, "w") as fasta_out, open(input_edges_fpath) as fasta_in:
            for record in fasta_in:
                if record.startswith('>'):
                    segment_name = record.strip().split()[0][1:]
                    fasta_out.write(">%s\n" % get_edge_agv_id(get_edge_num(segment_name)))
                else:
                    fasta_out.write(record)
    return edges_fpath
def map_edges_to_ref(input_fpath, output_dirpath, reference_fpath, threads):
    """Align edge sequences to the reference with minimap2.

    The alignment is written to ``mapping.paf`` in ``output_dirpath`` and
    minimap2's stderr goes to ``minimap.log`` in the same directory. Nothing
    is run when no reference is given, when an up-to-date mapping can be
    reused, or when the file with edge sequences is empty.

    :param input_fpath: FASTA file with edge sequences.
    :param output_dirpath: directory for ``mapping.paf`` and ``minimap.log``.
    :param reference_fpath: reference FASTA file; falsy to skip alignment.
    :param threads: number of threads passed to minimap2 via ``-t``.
    :return: path to ``mapping.paf`` (returned whether or not it was created).
    """
    mapping_fpath = join(output_dirpath, "mapping.paf")
    if not reference_fpath:
        return mapping_fpath
    if can_reuse(mapping_fpath, files_to_check=[input_fpath, reference_fpath]):
        return mapping_fpath
    if is_empty_file(input_fpath):
        print("Warning! File with edge sequences was not found, failed aligning edges to the reference")
        return mapping_fpath

    print("Aligning graph edges to the reference...")
    cmdline = [
        "minimap2", "-x", "asm20", "--score-N", "0", "-E", "1,0",
        "-N", "200", "-p", "0.5", "-f", "200", "-t", str(threads),
        reference_fpath, input_fpath,
    ]
    log_fpath = join(output_dirpath, "minimap.log")
    # Open both outputs in a context manager so the handles are closed (and
    # flushed) before the result file is inspected.
    with open(mapping_fpath, "w") as paf_out, open(log_fpath, "w") as log_out:
        return_code = subprocess.call(cmdline, stdout=paf_out, stderr=log_out)
    if return_code != 0 or is_empty_file(mapping_fpath):
        print("Warning! Minimap2 failed aligning edges to the reference")
    return mapping_fpath
def parse_canu_unitigs_info(input_dirpath, dict_edges):
    """Annotate edges with coverage and repeat flags from a tigInfo file.

    Looks for a file matching ``.unitigs.layout.tigInfo`` in
    ``input_dirpath``. The first line is treated as a header and must contain
    the ``sugRept`` and ``coverage`` columns, otherwise the file is ignored.
    For every unitig whose forward edge id is in ``dict_edges``, the ``cov``
    attribute (and ``repetitive`` when the ``sugRept`` value is ``"yes"``) is
    set on both the forward and the reverse-complement edge.

    :param input_dirpath: directory searched for the tigInfo file.
    :param dict_edges: mapping of edge id to edge object; updated in place.
    :return: the same ``dict_edges`` mapping.
    """
    tiginfo_fpath = find_file_by_pattern(input_dirpath, ".unitigs.layout.tigInfo")
    if is_empty_file(tiginfo_fpath):
        return dict_edges

    with open(tiginfo_fpath) as tiginfo:
        for line_no, row in enumerate(tiginfo):
            if line_no == 0:
                header = row.strip().split()
                repeat_col = header.index("sugRept") if "sugRept" in header else None
                cov_col = header.index("coverage") if "coverage" in header else None
                if repeat_col is None or cov_col is None:
                    break
                continue

            columns = row.strip().split()
            edge_id = get_edge_agv_id(get_edge_num(columns[0]))
            rc_edge_id = get_edge_agv_id(-get_edge_num(columns[0]))
            if edge_id not in dict_edges:
                continue

            coverage = int(float(columns[cov_col]))
            for strand_id in (edge_id, rc_edge_id):
                dict_edges[strand_id].cov = coverage
            if columns[repeat_col] == "yes":
                for strand_id in (edge_id, rc_edge_id):
                    dict_edges[strand_id].repetitive = True
    return dict_edges
def fastg_to_gfa(input_fpath, output_dirpath, assembler_name):
    """Convert a FASTG assembly graph to GFA using the bundled gfatools.js.

    The converter sub-command is chosen from the assembler name. The GFA
    file is written to ``output_dirpath`` under the input's base name with
    ``fastg`` replaced by ``gfa``.

    :param input_fpath: path to the FASTG file.
    :param output_dirpath: directory where the GFA file is written.
    :param assembler_name: assembler that produced the FASTG file.
    :return: path to the GFA file, or ``None`` if the result is empty.
    :raises SystemExit: if the assembler's FASTG flavour is not supported.
    """
    k8_exec = join(TOOLS_DIR, "k8-darwin") if is_osx() else join(TOOLS_DIR, "k8-linux")
    gfatools_exec = join(TOOLS_DIR, "gfatools.js")
    if not (gfatools_exec and k8_exec):
        return None

    output_fpath = join(output_dirpath, basename(input_fpath).replace("fastg", "gfa"))

    # (predicate, gfatools sub-command) pairs, checked in this order.
    converters = (
        (is_abyss, "abyss2gfa"),
        (is_spades, "spades2gfa"),
        (is_sga, "sga2gfa"),
        (is_soap, "soap2gfa"),
        (is_velvet, "velvet2gfa"),
    )
    cmd = None
    for is_assembler, converter in converters:
        if is_assembler(assembler_name):
            cmd = converter
            break
    if not cmd:
        sys.exit("FASTG files produced by " + assembler_name +
                 " are not supported. Supported assemblers: " +
                 ' '.join([ABYSS_NAME, SGA_NAME, SOAP_NAME, SPADES_NAME, VELVET_NAME]) +
                 " or use files in GFA format.")

    cmdline = [k8_exec, gfatools_exec, cmd, input_fpath]
    # subprocess needs a file object (or descriptor) for stdout, not a path
    # string; open the output file explicitly and close both handles after.
    with open(output_fpath, "w") as gfa_out, open("/dev/null", "w") as devnull:
        subprocess.call(cmdline, stdout=gfa_out, stderr=devnull)
    if not is_empty_file(output_fpath):
        return output_fpath
    return None
def parse_flye_assembly_info(input_dirpath, dict_edges):
    """Map Flye contigs to the graph edges they traverse.

    Reads ``assembly_info.txt`` from ``input_dirpath``. The first line is
    skipped as a header; in every other line the first column is the contig
    name and the last column is a comma-separated edge path. Edges found in
    ``dict_edges`` are laid out one after another along the contig using
    their ``length`` attribute.

    :param input_dirpath: Flye output directory.
    :param dict_edges: mapping of edge id to edge object (needs ``length``).
    :return: ``defaultdict(list)`` of contig name to a list of
        ``(start, end, edge_id)`` tuples with ``start``/``end`` as strings;
        empty if the info file is missing or empty.
    """
    contig_edges = defaultdict(list)
    info_fpath = join(input_dirpath, "assembly_info.txt")
    if is_empty_file(info_fpath):
        print("Warning! Assembly_info.txt is not found, information about contigs will not be provided")
        # Nothing to parse: do not try to open the file we just reported missing.
        return contig_edges

    with open(info_fpath) as f:
        for i, line in enumerate(f):
            if i == 0:  # header
                continue
            fs = line.strip().split()
            if not fs:  # blank line
                continue
            contig = fs[0]
            path = fs[-1]
            start = 0
            for edge_name in path.split(','):
                edge_id = get_edge_agv_id(edge_name)
                if edge_id in dict_edges:
                    edge_len = dict_edges[edge_id].length
                    contig_edges[contig].append((str(start), str(start + edge_len), edge_id))
                    # Advance only by edges whose length is known; an unknown
                    # edge must not reuse the previous edge's length.
                    start += edge_len
    return contig_edges
def parse_spades_paths(input_dirpath, dict_edges):
    """Map SPAdes scaffolds to the graph edges they traverse.

    Reads ``scaffolds.paths`` from ``input_dirpath``. Example record::

        NODE_1_length_8242890_cov_19.815448
        1893359+,1801779-,1893273-,400678-,1892977+

    A line starting with ``NODE`` opens a scaffold; records whose header ends
    with ``'`` are skipped. The following lines list edges as
    ``<number><+|->``, where ``-`` negates the edge number. Edges found in
    ``dict_edges`` are laid out consecutively using their ``length``, and 10
    positions are added after every path line (the ``NNNNNNNNNN`` spacer).

    :param input_dirpath: SPAdes output directory.
    :param dict_edges: mapping of edge id to edge object (needs ``length``).
    :return: ``defaultdict(list)`` of scaffold name to a list of
        ``(start, end, edge_id)`` tuples with ``start``/``end`` as strings;
        empty if the paths file is missing or empty.
    """
    contig_edges = defaultdict(list)
    paths_fpath = join(input_dirpath, "scaffolds.paths")
    if is_empty_file(paths_fpath):
        print("Warning! %s is not found, information about scaffold paths will not be provided" % paths_fpath)
        # Nothing to parse: do not try to open the file we just reported missing.
        return contig_edges

    with open(paths_fpath) as f:
        contig = None
        start = 0
        for line in f:
            if line.strip().endswith("'"):
                contig = None
            elif line.startswith("NODE"):
                contig = line.strip()
                start = 0
            elif contig:
                edges = line.strip().replace(';', '').split(',')
                for edge_name in edges:
                    if not edge_name:
                        # Blank line or stray separator: int('') would raise.
                        continue
                    edge_num = int(edge_name[:-1])
                    if edge_name[-1] == '-':
                        edge_num *= -1
                    edge_id = get_edge_agv_id(edge_num)
                    if edge_id in dict_edges:
                        edge_len = dict_edges[edge_id].length
                        contig_edges[contig].append((str(start), str(start + edge_len), edge_id))
                        # Advance only by edges whose length is known.
                        start += edge_len
                start += 10  # NNNNNNNNNN
    return contig_edges
def parse_canu_contigs_info(input_dirpath):
    """Collect length, coverage and edge lists for Canu contigs.

    Two files are searched for in ``input_dirpath``: one matching
    ``unitigs.bed`` (contig, start, end, unitig in the first four columns,
    strand in the last) and one matching ``contigs.layout.tigInfo`` (a header
    line that must contain ``tigLen`` and ``coverage``). Only contigs present
    in both files are reported.

    :param input_dirpath: Canu output directory.
    :return: dict of contig id to
        ``{'length': int, 'cov': int, 'mult': 1, 'edges': [str, ...]}``.
    """
    contig_info = dict()
    edges_by_contig = defaultdict(list)

    unitigs_info_fpath = find_file_by_pattern(input_dirpath, "unitigs.bed")
    if input_dirpath and not is_empty_file(unitigs_info_fpath):
        with open(unitigs_info_fpath) as f:
            for line in f:
                fs = line.strip().split()
                contig, start, end, unitig = fs[:4]
                strand = fs[-1]
                # Reverse-strand unitigs are stored with a negated number.
                edge_name = get_edge_num(unitig) if strand == "+" else -get_edge_num(unitig)
                contig_id = get_canu_id(contig)
                edges_by_contig[contig_id].append(str(edge_name))

    contigs_info_fpath = find_file_by_pattern(input_dirpath, "contigs.layout.tigInfo")
    if input_dirpath and not is_empty_file(contigs_info_fpath):
        len_col = None
        cov_col = None
        with open(contigs_info_fpath) as f:
            for i, line in enumerate(f):
                if i == 0:
                    header = line.strip().split()
                    len_col = header.index("tigLen") if "tigLen" in header else None
                    cov_col = header.index("coverage") if "coverage" in header else None
                    if len_col is None or cov_col is None:
                        break
                    continue
                fs = line.strip().split()
                length = int(float(fs[len_col]))
                coverage = int(float(fs[cov_col]))
                contig_id = get_canu_id(fs[0])
                if contig_id in edges_by_contig:
                    contig_info[contig_id] = {'length': length, 'cov': coverage, 'mult': 1}

    for contig_id, edges in edges_by_contig.items():
        # A contig listed in unitigs.bed may be missing from tigInfo (or the
        # tigInfo file may be absent or unusable); skip it instead of raising
        # KeyError.
        if contig_id in contig_info:
            contig_info[contig_id]['edges'] = edges
    return contig_info
def parse_assembler_output(assembler_name, input_dirpath, input_fpath, output_dirpath, input_fasta_fpath,
                           min_edge_len):
    """Parse an assembly graph file or an assembler output directory.

    When ``input_fpath`` is a non-empty file it is parsed by extension:
    FASTG is first converted to GFA, GFA/GFA2 is parsed directly, and
    DOT/GV is parsed as ABySS output (if the assembler is ABySS) or as Flye
    output. Otherwise ``input_dirpath`` is parsed according to the assembler
    name (Canu, Flye or SPAdes). Finally the ``start`` and ``end`` attributes
    of every edge are converted to strings.

    :param assembler_name: name of the assembler that produced the data.
    :param input_dirpath: assembler output directory (used without a graph file).
    :param input_fpath: assembly graph file, may be empty or missing.
    :param output_dirpath: directory for generated files.
    :param input_fasta_fpath: FASTA file with edge sequences (DOT/GV input).
    :param min_edge_len: minimum edge length passed to the parsers.
    :return: tuple ``(dict_edges, contig_edges, edges_fpath)``.
    :raises SystemExit: on conversion or parsing failure, unsupported graph
        file format, or unsupported assembler.
    """
    edges_fpath = None
    if not is_empty_file(input_fpath):
        contig_edges = []
        # Keep the user-supplied path: the FASTG conversion may return None
        # and the error message still has to name the file.
        graph_fpath = input_fpath
        if input_fpath.endswith("fastg"):
            input_fpath = fastg_to_gfa(input_fpath, output_dirpath, assembler_name)
        if not input_fpath:
            sys.exit("ERROR! Failed parsing " + graph_fpath + " file.")

        if input_fpath.endswith(("gfa", "gfa2")):
            dict_edges = parse_gfa(input_fpath, min_edge_len)
            edges_fpath = get_edges_from_gfa(input_fpath, output_dirpath, min_edge_len)
        elif input_fpath.endswith(("dot", "gv")):
            edges_fpath = format_edges_file(input_fasta_fpath, output_dirpath)
            dict_edges = dict()
            if is_abyss(assembler_name):
                dict_edges = parse_abyss_dot(input_fpath, min_edge_len)
            if not dict_edges:
                try:
                    dict_edges = parse_flye_dot(input_fpath, min_edge_len)
                except Exception as e:
                    sys.exit(
                        "ERROR! Failed parsing " + input_fpath + " file.\n"
                        "During parsing the following error has occured: " + str(e) +
                        "\nPlease make sure that you correctly specified the assembler name using -a option. "
                        "DOT files produced by different assemblers can have very different formats.\n"
                        "Examples of input data can be found here https://github.com/almiheenko/AGB/tree/master/test_data"
                    )
        else:
            # Previously fell through with dict_edges unbound.
            sys.exit("ERROR! Unsupported format of the assembly graph file " + input_fpath +
                     ". Supported formats: GFA, FASTG, GraphViz (DOT).")
    else:
        if is_canu(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_canu_output(input_dirpath, output_dirpath, min_edge_len)
        elif is_flye(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_flye_output(input_dirpath, output_dirpath, min_edge_len)
        elif is_spades(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_spades_output(input_dirpath, output_dirpath, min_edge_len)
        else:
            sys.exit(
                "Output folder of %s assembler can not be parsed! Supported assemblers: %s. "
                "More assemblers will be added in the next release.\n"
                "To visualize the assembly graph produced by this assembler, "
                "you should manually specify the assembly graph file in GFA/FASTG/GraphViz formats using --graph option "
                "and (optionally) file with edge sequences using --fasta option"
                % (assembler_name, ', '.join(SUPPORTED_ASSEMBLERS)))

    for edge in dict_edges.values():
        edge.start, edge.end = str(edge.start), str(edge.end)
    return dict_edges, contig_edges, edges_fpath
def format_edges_file(input_fpath, output_dirpath):
    """Copy a FASTA file to ``edges.fasta`` with renamed headers.

    Each header line (starting with ``>``) is replaced by the id built from
    ``get_edge_agv_id(get_edge_num(...))`` of the text after ``>``; all other
    lines are copied unchanged. An existing reusable output is left as is.

    :param input_fpath: FASTA file with edge sequences.
    :param output_dirpath: directory where ``edges.fasta`` is written.
    :return: path to ``edges.fasta``, or ``None`` if the input file is empty.
    """
    if is_empty_file(input_fpath):
        return None

    edges_fpath = join(output_dirpath, "edges.fasta")
    if can_reuse(edges_fpath, files_to_check=[input_fpath]):
        return edges_fpath

    with open(input_fpath) as fasta_in, open(edges_fpath, "w") as fasta_out:
        for record in fasta_in:
            if record.startswith('>'):
                record = ">%s\n" % get_edge_agv_id(get_edge_num(record[1:]))
            fasta_out.write(record)
    return edges_fpath
def parse_canu_assembly_info(input_dirpath, dict_edges):
    """Map Canu contigs to the unitigs (edges) placed on them.

    Reads the file matching ``.unitigs.bed`` in ``input_dirpath``; the first
    four columns of every line are contig, start, end and unitig. Unitigs
    whose edge id is in ``dict_edges`` are recorded for their contig.

    :param input_dirpath: Canu output directory.
    :param dict_edges: mapping of edge id to edge object.
    :return: ``defaultdict(list)`` of contig id to a list of
        ``(start, end, edge_id)`` tuples (``start``/``end`` kept as read from
        the file); empty if the BED file is missing or empty.
    """
    contig_edges = defaultdict(list)
    unitigs_fpath = find_file_by_pattern(input_dirpath, ".unitigs.bed")
    if is_empty_file(unitigs_fpath):
        print("Warning! Unitigs.bed is not found, information about contigs will not be provided")
        # Nothing to parse: do not try to open the file we just reported missing.
        return contig_edges

    with open(unitigs_fpath) as f:
        for line in f:
            fs = line.strip().split()
            contig, start, end, unitig = fs[:4]
            edge_id = get_edge_agv_id(get_edge_num(unitig))
            if edge_id in dict_edges:
                contig_id = get_canu_id(contig)
                contig_edges[contig_id].append((start, end, edge_id))
    return contig_edges
def run(input_fpath, reference_fpath, out_fpath, output_dirpath, threads, is_meta):
    """Run QUAST on the input sequences unless its report can be reused.

    QUAST is started with ``--fast --agb --min-contig 0``; ``--large`` is
    added for inputs over 10 MiB or metagenomic data, and
    ``--min-identity 90`` for metagenomic data. Its stdout and stderr are
    discarded.

    :param input_fpath: FASTA file to evaluate.
    :param reference_fpath: reference FASTA file.
    :param out_fpath: report file expected to be produced by QUAST.
    :param output_dirpath: QUAST output directory (created if missing).
    :param threads: number of threads passed to QUAST via ``-t``.
    :param is_meta: whether the data is metagenomic.
    :return: ``out_fpath`` if it exists, is non-empty and up to date;
        ``None`` otherwise (including when QUAST is not found).
    """
    if not exists(output_dirpath):
        os.makedirs(output_dirpath)

    if not can_reuse(out_fpath, files_to_check=[input_fpath, reference_fpath]):
        quast_exec_path = get_path_to_program("quast.py")
        if not quast_exec_path:
            print("QUAST is not found!")
            return None

        cmdline = [quast_exec_path, "--fast", "--agb", input_fpath, "-r", reference_fpath,
                   "-t", str(threads), "-o", output_dirpath, "--min-contig", "0"]
        if getsize(input_fpath) > 10 * 1024 * 1024 or is_meta:
            cmdline.append("--large")
        if is_meta:
            cmdline.extend(["--min-identity", "90"])

        # One handle for both streams, closed as soon as QUAST finishes.
        with open("/dev/null", "w") as devnull:
            subprocess.call(cmdline, stdout=devnull, stderr=devnull)

    if is_empty_file(out_fpath) or not can_reuse(out_fpath, files_to_check=[input_fpath, reference_fpath]):
        return None
    return out_fpath
def parse_canu_output(input_dirpath, output_dirpath, min_edge_len):
    """Parse the unitig graph from a Canu output directory.

    Finds the file matching ``.unitigs.gfa``, extracts edge sequences from
    it, writes a copy to ``output_dirpath`` in which ``bogart.edges`` on the
    first line is replaced by ``1.0`` (via ``sed``), and parses that copy.

    :param input_dirpath: Canu output directory.
    :param output_dirpath: directory for generated files.
    :param min_edge_len: minimum edge length passed to the parsers.
    :return: tuple ``(dict_edges, contig_edges, edges_fpath)``.
    :raises SystemExit: if no GFA file is found in ``input_dirpath``.
    """
    raw_gfa_fpath = find_file_by_pattern(input_dirpath, ".unitigs.gfa")
    if not raw_gfa_fpath:
        print("ERROR! GFA file is not found in %s! Please check the options" % abspath(input_dirpath))
        sys.exit(1)

    edges_fpath = get_edges_from_gfa(raw_gfa_fpath, output_dirpath, min_edge_len)
    gfa_fpath = join(output_dirpath, basename(raw_gfa_fpath))
    if is_empty_file(gfa_fpath) or not can_reuse(gfa_fpath, files_to_check=[raw_gfa_fpath]):
        # Pass the arguments as a list: splitting a command string would break
        # input paths that contain whitespace.
        cmdline = ["sed", "1s/bogart.edges/1.0/", raw_gfa_fpath]
        with open(gfa_fpath, 'w') as gfa_out:
            subprocess.call(cmdline, stdout=gfa_out)

    dict_edges = parse_gfa(gfa_fpath, min_edge_len, input_dirpath, assembler="canu")
    contig_edges = parse_canu_assembly_info(input_dirpath, dict_edges)
    return dict_edges, contig_edges, edges_fpath
def parse_flye_contigs_info(input_dirpath):
    """Read per-contig information from Flye's ``assembly_info.txt``.

    The first line is skipped as a header. In every other line the first
    column is the contig name, the second and third are parsed as integer
    length and coverage, the sixth is kept as the multiplicity string and
    the last is a comma-separated edge path.

    :param input_dirpath: Flye output directory; a falsy value yields ``{}``.
    :return: dict of contig name to
        ``{'edges': [...], 'length': int, 'cov': int, 'mult': str}``; empty
        if the info file is missing or empty.
    """
    contig_info = dict()
    if not input_dirpath:
        return contig_info
    if is_empty_file(join(input_dirpath, 'assembly_info.txt')):
        return contig_info

    with open(join(input_dirpath, 'assembly_info.txt')) as info_file:
        for line_no, row in enumerate(info_file):
            if line_no == 0:  # header
                continue
            fields = row.strip().split()
            contig_name = fields[0]
            edge_path = fields[-1]
            contig_len = int(fields[1])
            contig_cov = int(fields[2])
            contig_info[contig_name] = {
                'mult': fields[5],
                'edges': edge_path.split(','),
                'length': contig_len,
                'cov': contig_cov,
            }
    return contig_info
def run_quast_analysis(input_fpath, reference_fpath, output_dirpath, json_output_dirpath, threads, contig_edges,
                       dict_edges=None, is_meta=False):
    """Run QUAST against the reference and collect misassembly information.

    QUAST output goes to ``quast_edge_output`` under ``output_dirpath`` when
    ``dict_edges`` is given and to ``quast_output`` otherwise. If no report
    is available, placeholder ``reference.json`` and ``errors.json`` files
    are written to ``json_output_dirpath``. Otherwise every
    ``Extensive misassembly`` line of the report is attributed to the
    sequence named on the most recent other line: appended to the edge's
    ``errors`` list when ``dict_edges`` is given, or written to
    ``errors.json`` for contigs. For edges, alignments are parsed and the
    edges are mapped to the reference.

    :return: tuple ``(mapping_info, chrom_names, edge_by_chrom, dict_edges)``;
        the first three are ``None`` unless edges were mapped.
    """
    def _write_placeholder_json():
        # Empty data files written when there are no reference mappings.
        with open(join(json_output_dirpath, "reference.json"), 'w') as handle:
            handle.write("chrom_lengths=" + json.dumps([]) + ";\n")
            handle.write("edgeMappingInfo=" + json.dumps([]) + ";\n")
            handle.write("chromGaps=" + json.dumps([]) + ";\n")
            handle.write("chromAligns=" + json.dumps([]) + ";\n")
        with open(join(json_output_dirpath, 'errors.json'), 'w') as handle:
            handle.write("misassembledContigs=[];\n")

    ms_out_fpath = None
    quast_dirname = "quast_edge_output" if dict_edges else "quast_output"
    quast_output_dir = join(output_dirpath, quast_dirname)
    if not is_empty_file(input_fpath) and not is_empty_file(reference_fpath):
        ms_out_fpath = get_mis_report_fpath(quast_output_dir, input_fpath)
        ms_out_fpath = run(input_fpath, reference_fpath, ms_out_fpath, quast_output_dir, threads, is_meta)

    if not ms_out_fpath:
        if not is_empty_file(input_fpath) and not is_empty_file(reference_fpath):
            print("QUAST failed! Make sure you are using the latest version of QUAST")
        seq_kind = "edge" if dict_edges else "contig"
        print("No information about %s mappings to the reference genome" % seq_kind)
        _write_placeholder_json()
        return None, None, None, dict_edges

    # Search for misassemblies and store them for each edge or contig.
    misassembled_seqs = defaultdict(list)
    with open(ms_out_fpath) as report:
        seq_id = ''
        for row in report:
            if not row.startswith("Extensive misassembly"):
                # Any other line names the sequence the next records belong to.
                seq_id = row.strip()
                continue
            match = re.search(align_pattern, row)
            if not match or len(match.groups()) < 4:
                continue
            breakpoint_coords = match.group('start1', 'end1', 'start2', 'end2')
            if dict_edges:
                edge_id = get_edge_agv_id(get_edge_num(seq_id))
                dict_edges[edge_id].errors.append(breakpoint_coords)
            else:
                misassembled_seqs[seq_id].append(breakpoint_coords)

    if not dict_edges:
        with open(join(json_output_dirpath, 'errors.json'), 'w') as handle:
            handle.write("misassembledContigs='" + json.dumps(misassembled_seqs) + "';\n")
        return None, None, None, dict_edges

    parse_alignments(get_alignments_fpath(quast_output_dir, input_fpath), json_output_dirpath)
    mapping_fpath = map_edges_to_ref(input_fpath, output_dirpath, reference_fpath, threads)
    mapping_info, chrom_names, edge_by_chrom = parse_mapping_info(mapping_fpath, json_output_dirpath, dict_edges)
    return mapping_info, chrom_names, edge_by_chrom, dict_edges