Esempio n. 1
0
def parse_spades_output(input_dirpath, output_dirpath, min_edge_len):
    """Parse the assembly graph found in a SPAdes output folder.

    The graph file is looked up in ``input_dirpath`` by the pattern
    "assembly_graph.gfa" and, if that lookup yields nothing, by
    "assembly_graph_with_scaffolds.gfa".

    Args:
        input_dirpath: folder that is searched for the GFA file; also passed
            on to ``parse_gfa`` and ``parse_spades_paths``.
        output_dirpath: folder passed to ``get_edges_from_gfa``.
        min_edge_len: threshold passed to ``parse_gfa`` and
            ``get_edges_from_gfa``.

    Returns:
        A ``(dict_edges, contig_edges, edges_fpath)`` tuple holding the results
        of ``parse_gfa``, ``parse_spades_paths`` and ``get_edges_from_gfa``.

    Raises:
        SystemExit: if no assembly graph file is found in ``input_dirpath``.
    """
    gfa_fpath = find_file_by_pattern(input_dirpath, "assembly_graph.gfa") or \
        find_file_by_pattern(input_dirpath, "assembly_graph_with_scaffolds.gfa")
    if not gfa_fpath:
        # Stop here with a readable message instead of handing a missing path
        # to the GFA parsers below.
        sys.exit(
            "ERROR! Assembly graph is not found in %s! "
            "Please check the folder or specify the file with assembly graph using --graph option"
            % input_dirpath)

    dict_edges = parse_gfa(gfa_fpath,
                           min_edge_len,
                           input_dirpath,
                           assembler="spades")
    contig_edges = parse_spades_paths(input_dirpath, dict_edges)
    edges_fpath = get_edges_from_gfa(gfa_fpath, output_dirpath, min_edge_len)
    return dict_edges, contig_edges, edges_fpath
Esempio n. 2
0
def parse_assembler_output(assembler_name, input_dirpath, input_fpath,
                           output_dirpath, input_fasta_fpath, min_edge_len):
    """Parse an assembly graph given either as a file or as an output folder.

    When ``input_fpath`` is a non-empty file it is parsed according to its
    extension: FASTG is first converted with ``fastg_to_gfa``, GFA/GFA2 goes
    through ``parse_gfa``, and DOT/GV goes through ``parse_abyss_dot`` (only
    for ABySS) with ``parse_flye_dot`` as the fallback. Otherwise the
    assembler output folder ``input_dirpath`` is parsed with the
    assembler-specific routine (Canu, Flye or SPAdes).

    Args:
        assembler_name: assembler name, used to pick the parser.
        input_dirpath: assembler output folder (used when no graph file is
            given).
        input_fpath: path to the assembly graph file.
        output_dirpath: folder passed to the helpers that write files.
        input_fasta_fpath: file with edge sequences, used for DOT/GV input.
        min_edge_len: threshold passed to the parsers.

    Returns:
        A ``(dict_edges, contig_edges, edges_fpath)`` tuple. ``start`` and
        ``end`` of every edge in ``dict_edges`` are converted to ``str``.

    Raises:
        SystemExit: if the graph file cannot be converted or parsed, has an
            unsupported extension, or the assembler output folder is not
            supported.
    """
    edges_fpath = None
    if not is_empty_file(input_fpath):
        contig_edges = []
        # Keep the user-supplied path in ``input_fpath`` so that it can still
        # be reported if the FASTG conversion returns nothing.
        graph_fpath = input_fpath
        if graph_fpath.endswith("fastg"):
            graph_fpath = fastg_to_gfa(input_fpath, output_dirpath,
                                       assembler_name)
        if not graph_fpath:
            sys.exit("ERROR! Failed parsing " + str(input_fpath) + " file.")
        if graph_fpath.endswith(("gfa", "gfa2")):
            dict_edges = parse_gfa(graph_fpath, min_edge_len)
            edges_fpath = get_edges_from_gfa(graph_fpath, output_dirpath,
                                             min_edge_len)
        elif graph_fpath.endswith(("dot", "gv")):
            edges_fpath = format_edges_file(input_fasta_fpath, output_dirpath)
            dict_edges = dict()
            if is_abyss(assembler_name):
                dict_edges = parse_abyss_dot(graph_fpath, min_edge_len)
            # Fall back to the Flye DOT parser when the ABySS parser was not
            # used or produced nothing.
            if not dict_edges:
                try:
                    dict_edges = parse_flye_dot(graph_fpath, min_edge_len)
                except Exception as e:
                    sys.exit(
                        "ERROR! Failed parsing " + graph_fpath + " file.\n"
                        "During parsing the following error has occured: " +
                        str(e) +
                        "\nPlease make sure that you correctly specified the assembler name using -a option. "
                        "DOT files produced by different assemblers can have very different formats.\n"
                        "Examples of input data can be found here https://github.com/almiheenko/AGB/tree/master/test_data"
                    )
        else:
            # Without this branch ``dict_edges`` would stay unbound and the
            # loop below would fail with UnboundLocalError.
            sys.exit(
                "ERROR! Failed parsing " + graph_fpath + " file.\n"
                "The assembly graph file should be in GFA/FASTG/GraphViz format "
                "(.gfa, .gfa2, .fastg, .dot or .gv extension).")
    else:
        if is_canu(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_canu_output(
                input_dirpath, output_dirpath, min_edge_len)
        elif is_flye(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_flye_output(
                input_dirpath, output_dirpath, min_edge_len)
        elif is_spades(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_spades_output(
                input_dirpath, output_dirpath, min_edge_len)
        else:
            sys.exit(
                "Output folder of %s assembler can not be parsed! Supported assemblers: %s. "
                "More assemblers will be added in the next release.\n"
                "To visualize the assembly graph produced by this assembler, "
                "you should manually specify the assembly graph file in GFA/FASTG/GraphViz formats using --graph option "
                "and (optionally) file with edge sequences using --fasta option"
                % (assembler_name, ', '.join(SUPPORTED_ASSEMBLERS)))
    # Node identifiers are normalised to strings regardless of the parser used.
    for edge in dict_edges.values():
        edge.start, edge.end = str(edge.start), str(edge.end)
    return dict_edges, contig_edges, edges_fpath
Esempio n. 3
0
def parse_spades_output(input_dirpath, output_dirpath, min_edge_len):
    """Load the assembly graph from a SPAdes output folder.

    The folder is searched for "assembly_graph.gfa" first and for
    "assembly_graph_with_scaffolds.gfa" second; the first lookup that returns
    something is used. If neither does, an error is printed and the process
    exits with status 1.

    Returns the tuple ``(dict_edges, contig_edges, edges_fpath)`` produced by
    ``parse_gfa``, ``parse_spades_paths`` and ``get_edges_from_gfa``.
    """
    graph_path = None
    for candidate in ("assembly_graph.gfa",
                      "assembly_graph_with_scaffolds.gfa"):
        graph_path = find_file_by_pattern(input_dirpath, candidate)
        if graph_path:
            break

    if not graph_path:
        not_found_msg = (
            "ERROR! Assembly graph is not found in %s! "
            "Please check the folder or specify the file with assembly graph using --graph option"
        )
        print(not_found_msg % (input_dirpath))
        sys.exit(1)

    edges_by_id = parse_gfa(graph_path, min_edge_len, input_dirpath,
                            assembler="spades")
    edges_per_contig = parse_spades_paths(input_dirpath, edges_by_id)
    edge_seqs_path = get_edges_from_gfa(graph_path, output_dirpath,
                                        min_edge_len)
    return edges_by_id, edges_per_contig, edge_seqs_path
Esempio n. 4
0
def parse_canu_output(input_dirpath, output_dirpath, min_edge_len):
    """Parse the unitig graph found in a Canu output folder.

    The raw ``*.unitigs.gfa`` file is copied into ``output_dirpath`` through
    ``sed``, which rewrites the first match of "bogart.edges" on line 1 to
    "1.0"; the copy is then given to ``parse_gfa``. The copy is regenerated
    only when it is empty or ``can_reuse`` rejects it.

    Args:
        input_dirpath: folder that is searched for the ".unitigs.gfa" file.
        output_dirpath: folder where the patched GFA copy is written; also
            passed to ``get_edges_from_gfa``.
        min_edge_len: threshold passed to ``parse_gfa`` and
            ``get_edges_from_gfa``.

    Returns:
        A ``(dict_edges, contig_edges, edges_fpath)`` tuple.

    Raises:
        SystemExit: if no ".unitigs.gfa" file is found in ``input_dirpath``.
    """
    raw_gfa_fpath = find_file_by_pattern(input_dirpath, ".unitigs.gfa")
    if not raw_gfa_fpath:
        print("ERROR! GFA file is not found in %s! Please check the options" %
              abspath(input_dirpath))
        sys.exit(1)
    edges_fpath = get_edges_from_gfa(raw_gfa_fpath, output_dirpath,
                                     min_edge_len)
    gfa_fpath = join(output_dirpath, basename(raw_gfa_fpath))
    if is_empty_file(gfa_fpath) or not can_reuse(
            gfa_fpath, files_to_check=[raw_gfa_fpath]):
        # NOTE(review): presumably this turns the Canu-specific header tag
        # into a plain GFA version so that parse_gfa accepts it - confirm.
        # The command is passed as an argument list so that paths containing
        # spaces or quotes reach sed unchanged.
        sed_cmd = ["sed", "1s/bogart.edges/1.0/", raw_gfa_fpath]
        # The context manager closes the output file once sed has finished.
        with open(gfa_fpath, 'w') as patched_gfa:
            subprocess.call(sed_cmd, stdout=patched_gfa)
    dict_edges = parse_gfa(gfa_fpath,
                           min_edge_len,
                           input_dirpath,
                           assembler="canu")
    contig_edges = parse_canu_assembly_info(input_dirpath, dict_edges)
    return dict_edges, contig_edges, edges_fpath