예제 #1
0
def fastg_to_gfa(input_fpath, output_dirpath, assembler_name):
    """Convert an assembler-specific FASTG file to GFA using gfatools.js.

    The converter is run as ``<k8> gfatools.js <assembler>2gfa <input>`` with
    both executables taken from TOOLS_DIR; its standard output is written to a
    file in ``output_dirpath`` whose name is the input basename with "fastg"
    replaced by "gfa".

    Args:
        input_fpath: path to the FASTG file.
        output_dirpath: directory that receives the converted GFA file.
        assembler_name: assembler that produced the FASTG file; selects the
            gfatools.js sub-command.

    Returns:
        The path of the GFA file, or None when the produced file is empty
        (as reported by is_empty_file).

    Raises:
        SystemExit: if the assembler has no FASTG converter.
    """
    k8_exec = join(TOOLS_DIR, "k8-darwin" if is_osx() else "k8-linux")
    gfatools_exec = join(TOOLS_DIR, "gfatools.js")
    output_fpath = join(output_dirpath,
                        basename(input_fpath).replace("fastg", "gfa"))

    # The first matching predicate wins, so the order of this table matters.
    converters = (
        (is_abyss, "abyss2gfa"),
        (is_spades, "spades2gfa"),
        (is_sga, "sga2gfa"),
        (is_soap, "soap2gfa"),
        (is_velvet, "velvet2gfa"),
    )
    cmd = next((subcommand for matches, subcommand in converters
                if matches(assembler_name)), None)
    if not cmd:
        sys.exit(
            "FASTG files produced by " + str(assembler_name) +
            " are not supported. Supported assemblers: " + ' '.join([
                ABYSS_NAME, SGA_NAME, SOAP_NAME, SPADES_NAME, VELVET_NAME
            ]) + " or use files in GFA format.")

    cmdline = [k8_exec, gfatools_exec, cmd, input_fpath]
    # subprocess needs real file objects for stdout/stderr (a path string is
    # rejected), and the context manager closes both handles afterwards.
    with open(output_fpath, "w") as gfa_out, open("/dev/null", "w") as devnull:
        subprocess.call(cmdline, stdout=gfa_out, stderr=devnull)

    if not is_empty_file(output_fpath):
        return output_fpath
    return None
예제 #2
0
def parse_assembler_output(assembler_name, input_dirpath, input_fpath,
                           output_dirpath, input_fasta_fpath, min_edge_len):
    """Parse an assembly graph from a graph file or an assembler output folder.

    When ``input_fpath`` is a non-empty file it is parsed according to its
    extension: FASTG is first converted to GFA, GFA/GFA2 goes through
    parse_gfa, and DOT/GV goes through the ABySS or Flye DOT parsers.
    Otherwise the output folder ``input_dirpath`` is parsed with the
    Canu, Flye or SPAdes specific parser.

    Args:
        assembler_name: name of the assembler that produced the data.
        input_dirpath: assembler output folder (used when no graph file
            is available).
        input_fpath: path to the assembly graph file.
        output_dirpath: directory for generated files.
        input_fasta_fpath: file with edge sequences, used for DOT input.
        min_edge_len: minimum edge length passed on to the parsers.

    Returns:
        A tuple ``(dict_edges, contig_edges, edges_fpath)``. The ``start`` and
        ``end`` attributes of every edge are converted to strings.

    Raises:
        SystemExit: if the input cannot be parsed or is not supported.
    """
    edges_fpath = None
    if not is_empty_file(input_fpath):
        contig_edges = []
        # Remember the user-supplied path: the conversion below may return
        # None, and the error message still has to name the file.
        source_fpath = input_fpath
        if input_fpath.endswith("fastg"):
            input_fpath = fastg_to_gfa(input_fpath, output_dirpath,
                                       assembler_name)
        if not input_fpath:
            sys.exit("ERROR! Failed parsing " + source_fpath + " file.")
        if input_fpath.endswith(("gfa", "gfa2")):
            # NOTE(review): the assembler name is not forwarded to parse_gfa
            # here - confirm that this is intended.
            dict_edges = parse_gfa(input_fpath, min_edge_len)
            edges_fpath = get_edges_from_gfa(input_fpath, output_dirpath,
                                             min_edge_len)
        elif input_fpath.endswith(("dot", "gv")):
            edges_fpath = format_edges_file(input_fasta_fpath, output_dirpath)
            dict_edges = dict()
            if is_abyss(assembler_name):
                dict_edges = parse_abyss_dot(input_fpath, min_edge_len)
            # Fall back to the Flye DOT parser when nothing was parsed so far.
            if not dict_edges:
                try:
                    dict_edges = parse_flye_dot(input_fpath, min_edge_len)
                except Exception as e:
                    sys.exit(
                        "ERROR! Failed parsing " + input_fpath + " file.\n"
                        "During parsing the following error has occured: " +
                        str(e) +
                        "\nPlease make sure that you correctly specified the assembler name using -a option. "
                        "DOT files produced by different assemblers can have very different formats.\n"
                        "Examples of input data can be found here https://github.com/almiheenko/AGB/tree/master/test_data"
                    )
        else:
            # Without this branch dict_edges would be unbound and the loop
            # at the end of the function would fail with an obscure error.
            sys.exit(
                "ERROR! Failed parsing " + input_fpath + " file. "
                "Supported assembly graph formats: GFA, FASTG, GraphViz.")
    else:
        if is_canu(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_canu_output(
                input_dirpath, output_dirpath, min_edge_len)
        elif is_flye(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_flye_output(
                input_dirpath, output_dirpath, min_edge_len)
        elif is_spades(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_spades_output(
                input_dirpath, output_dirpath, min_edge_len)
        else:
            sys.exit(
                "Output folder of %s assembler can not be parsed! Supported assemblers: %s. "
                "More assemblers will be added in the next release.\n"
                "To visualize the assembly graph produced by this assembler, "
                "you should manually specify the assembly graph file in GFA/FASTG/GraphViz formats using --graph option "
                "and (optionally) file with edge sequences using --fasta option"
                % (assembler_name, ', '.join(SUPPORTED_ASSEMBLERS)))
    # Normalize the edge endpoints to strings.
    for edge in dict_edges.values():
        edge.start, edge.end = str(edge.start), str(edge.end)
    return dict_edges, contig_edges, edges_fpath
예제 #3
0
def parse_gfa(gfa_fpath, min_edge_len, input_dirpath=None, assembler=None):
    """Parse a GFA file into a dictionary of graph edges.

    ``S`` records become a pair of Edge objects (forward and reverse
    complement) when the segment is at least ``min_edge_len`` long.
    ``L`` (GFA1) and ``E`` (GFA2) records become predecessor/successor
    relations that are handed to construct_graph.

    Args:
        gfa_fpath: path to the GFA file.
        min_edge_len: segments shorter than this are skipped.
        input_dirpath: assembler output folder; only used for Canu to add
            unitig information.
        assembler: assembler name; for SPAdes and ABySS the reverse-complement
            counterpart of every link is added as well.

    Returns:
        The edge dictionary returned by construct_graph.
    """
    dict_edges = dict()
    predecessors = defaultdict(list)
    successors = defaultdict(list)

    print("Parsing " + gfa_fpath + "...")
    links = []
    edge_overlaps = defaultdict(dict)
    with open(gfa_fpath) as gfa_file:
        for line in gfa_file:
            record_type = line[0]
            if record_type not in ('S', 'L', 'E'):
                continue
            fields = line.split()

            if record_type == 'S':
                name, sequence = fields[1], fields[2]
                seq_len = None if sequence == '*' else len(sequence)
                add_info = {tag.split(':')[0].lower(): tag.split(':')[-1]
                            for tag in fields[3:]}
                # Read the LN tag before computing coverage: with a '*'
                # sequence it is the only source of the segment length.
                if "ln" in add_info:
                    seq_len = int(add_info["ln"])  ## sequence length
                cov = 1
                if "dp" in add_info:
                    cov = float(add_info["dp"])  ## coverage depth
                elif "kc" in add_info and seq_len:
                    cov = max(1, int(add_info["kc"]) / seq_len)  ## k-mer count / edge length
                if seq_len and seq_len >= min_edge_len:
                    edge_num = get_edge_num(name)
                    # Forward edge first, then its reverse complement.
                    for signed_num in (edge_num, -edge_num):
                        oriented_id = get_edge_agv_id(signed_num)
                        edge = Edge(oriented_id, signed_num, seq_len, cov,
                                    element_id=oriented_id)
                        dict_edges[oriented_id] = edge
                        # NOTE(review): only links read before this S record
                        # are attached here; links that appear later in the
                        # file are not added to edge.overlaps - confirm.
                        for overlapped_edge, overlap in edge_overlaps[oriented_id].items():
                            edge.overlaps.append((edge_id_to_name(overlapped_edge), overlapped_edge, overlap))
                continue

            if record_type == 'L':
                _, from_name, from_orient, to_name, to_orient = fields[:5]
                cigar_column = 5
            else:
                # E       *       2+      65397+  21      68$     0       47      47M
                from_name, to_name = fields[2], fields[3]
                from_orient, to_orient = from_name[-1], to_name[-1]
                from_name, to_name = from_name[:-1], to_name[:-1]
                cigar_column = 8
            # Take the overlap from its own column so that optional trailing
            # tags are not mistaken for it; fall back to the last field when
            # the record is shorter than expected.
            cigar = fields[cigar_column] if len(fields) > cigar_column else fields[-1]

            edge1 = get_edge_agv_id(get_edge_num(from_name))
            edge2 = get_edge_agv_id(get_edge_num(to_name))
            if from_orient == '-':
                edge1 = get_match_edge_id(edge1)
            if to_orient == '-':
                edge2 = get_match_edge_id(edge2)

            # Splitting on the captured digit runs yields
            # [text, count, op, count, op, ...]; only M and I operations
            # contribute to the overlap length.
            chunks = re.split(r'(\d+)', cigar.strip())
            overlap = sum(int(count)
                          for count, op in zip(chunks[1::2], chunks[2::2])
                          if op in ('M', 'I'))
            links.append((edge1, edge2))
            if overlap:
                edge_overlaps[edge1][edge2] = overlap
                edge_overlaps[edge2][edge1] = overlap

    ### gfa retains only canonical links
    for edge1, edge2 in links:
        if edge1 != edge2:
            predecessors[edge2].append(edge1)
            successors[edge1].append(edge2)
        if is_spades(assembler) or is_abyss(assembler):
            # Add the same link as seen from the opposite strand.
            rc_from, rc_to = get_match_edge_id(edge2), get_match_edge_id(edge1)
            if rc_from != rc_to:
                predecessors[rc_to].append(rc_from)
                successors[rc_from].append(rc_to)

    if assembler == "canu" and input_dirpath:
        dict_edges = parse_canu_unitigs_info(input_dirpath, dict_edges)
    dict_edges = construct_graph(dict_edges, predecessors, successors)
    print("Finish parsing.")
    return dict_edges