Beispiel #1
0
def _write_contig_and_edge_info(output_dirpath, contig_info, edge_info,
                                dict_edges):
    """Write contig and edge summaries as ``name=<json>;`` assignments.

    ``contigInfo=...;`` is appended to ``contig_info.json`` (the file is opened
    in append mode, so existing content is kept), while ``edgeInfo=...;`` and
    ``medianCov=...;`` overwrite ``edges_base_info.json``.
    """
    with open(join(output_dirpath, 'contig_info.json'), 'a') as handle:
        handle.write("contigInfo=" + json.dumps(contig_info) + ";\n")

    with open(join(output_dirpath, 'edges_base_info.json'), 'w') as handle:
        handle.write("edgeInfo=" + json.dumps(edge_info) + ";")
        handle.write("medianCov=" +
                     json.dumps(calculate_median_cov(dict_edges)) + ";\n")


def create_contig_info(dict_edges, input_dirpath, output_dirpath, contig_edges,
                       edges_by_component, edges_by_repeat_component,
                       edges_by_ref_component, assembler):
    """Collect per-contig information and write it next to the edge summary.

    Contig information is parsed from ``input_dirpath`` with the parser that
    matches ``assembler`` (Canu, Flye or SPAdes). Each contig record is then
    annotated in place with:

    * ``'g'`` / ``'rep_g'`` / ``'ref_g'`` - the value stored for the first of
      its edges found in ``edges_by_component`` /
      ``edges_by_repeat_component`` / ``edges_by_ref_component``;
    * ``'num_edges'`` - the number of entries in its ``'edges'`` list,
      as a string.

    In parallel a mapping ``edge id -> list of contigs`` is built for every
    edge (and its matching edge, see ``get_match_edge_id``) present in
    ``dict_edges``.

    When no contig information is available (unsupported assembler or an empty
    parse result), empty lists are written instead.

    Args:
        dict_edges: mapping of edge ids to edge objects.
        input_dirpath: assembler output directory handed to the parsers.
        output_dirpath: directory where the two output files are written.
        contig_edges: per-contig edges; only forwarded to the SPAdes parser.
        edges_by_component: lookup ``edge id -> value`` stored under ``'g'``.
        edges_by_repeat_component: same, stored under ``'rep_g'``.
        edges_by_ref_component: same, stored under ``'ref_g'``.
        assembler: assembler name used to select the parser.

    Returns:
        None. The results are written to ``contig_info.json`` and
        ``edges_base_info.json`` in ``output_dirpath``.
    """
    if is_canu(assembler):
        contig_info = parse_canu_contigs_info(input_dirpath)
    elif is_flye(assembler):
        contig_info = parse_flye_contigs_info(input_dirpath)
    elif is_spades(assembler):
        contig_info = parse_spades_contigs_info(input_dirpath, contig_edges)
    else:
        contig_info = None

    if not contig_info:
        _write_contig_and_edge_info(output_dirpath, [], [], dict_edges)
        return

    edge_contigs = defaultdict(set)
    for contig, data in contig_info.items():
        # Each component reference is assigned once per contig; the flags make
        # the "first match wins" guard effective.
        has_subgraph = False
        has_repeat_subgraph = False
        has_ref_subgraph = False
        edges = data['edges']
        for edge_name in set(edges):
            edge_id = get_edge_agv_id(edge_name)
            if edge_id in dict_edges:
                edge_contigs[edge_id].add(contig)
                match_edge_id = get_match_edge_id(edge_id)
                if match_edge_id in dict_edges:
                    edge_contigs[match_edge_id].add(contig)
            if not has_subgraph and edge_id in edges_by_component:
                data['g'] = edges_by_component[edge_id]
                has_subgraph = True
            if not has_repeat_subgraph and edge_id in edges_by_repeat_component:
                data['rep_g'] = edges_by_repeat_component[edge_id]
                has_repeat_subgraph = True
            if not has_ref_subgraph and edge_id in edges_by_ref_component:
                data['ref_g'] = edges_by_ref_component[edge_id]
                has_ref_subgraph = True

        data['num_edges'] = str(len(edges))
        contig_info[contig] = data

    # Sets are not JSON-serializable, so convert them to lists before dumping.
    edge_info = {edge_id: list(contigs)
                 for edge_id, contigs in edge_contigs.items()}
    _write_contig_and_edge_info(output_dirpath, contig_info, edge_info,
                                dict_edges)
    return
Beispiel #2
0
def parse_assembler_output(assembler_name, input_dirpath, input_fpath,
                           output_dirpath, input_fasta_fpath, min_edge_len):
    """Parse an assembly graph from a graph file or an assembler output folder.

    If ``input_fpath`` is a non-empty file (as reported by ``is_empty_file``),
    it is parsed according to its extension:

    * ``fastg`` - converted with ``fastg_to_gfa`` and then handled as GFA;
    * ``gfa`` / ``gfa2`` - parsed with ``parse_gfa``;
    * ``dot`` / ``gv`` - parsed with ``parse_abyss_dot`` for ABySS, falling
      back to ``parse_flye_dot`` when that yields nothing or for any other
      assembler.

    Otherwise the assembler output folder ``input_dirpath`` is parsed with the
    Canu, Flye or SPAdes parser selected by ``assembler_name``.

    Args:
        assembler_name: name of the assembler that produced the data.
        input_dirpath: assembler output directory (used when no graph file).
        input_fpath: path to the assembly graph file.
        output_dirpath: directory for generated files.
        input_fasta_fpath: file with edge sequences, used for DOT/GV graphs.
        min_edge_len: minimum edge length forwarded to the parsers.

    Returns:
        Tuple ``(dict_edges, contig_edges, edges_fpath)``. The ``start`` and
        ``end`` attributes of every edge in ``dict_edges`` are converted to
        strings.

    Raises:
        SystemExit: if the graph file cannot be converted or parsed, has an
            unsupported extension, or the assembler output folder is not
            supported.
    """
    edges_fpath = None
    if not is_empty_file(input_fpath):
        contig_edges = []
        # Keep the user-supplied path: fastg_to_gfa may return a falsy value on
        # failure and the error message must still name the offending file.
        graph_fpath = input_fpath
        if input_fpath.endswith("fastg"):
            input_fpath = fastg_to_gfa(input_fpath, output_dirpath,
                                       assembler_name)
        if not input_fpath:
            sys.exit("ERROR! Failed parsing " + str(graph_fpath) + " file.")
        if input_fpath.endswith(("gfa", "gfa2")):
            dict_edges = parse_gfa(input_fpath, min_edge_len)
            edges_fpath = get_edges_from_gfa(input_fpath, output_dirpath,
                                             min_edge_len)
        elif input_fpath.endswith(("dot", "gv")):
            edges_fpath = format_edges_file(input_fasta_fpath, output_dirpath)
            dict_edges = dict()
            if is_abyss(assembler_name):
                dict_edges = parse_abyss_dot(input_fpath, min_edge_len)
            if not dict_edges:
                try:
                    dict_edges = parse_flye_dot(input_fpath, min_edge_len)
                except Exception as e:
                    sys.exit(
                        "ERROR! Failed parsing " + input_fpath + " file.\n"
                        "During parsing the following error has occured: " +
                        str(e) +
                        "\nPlease make sure that you correctly specified the assembler name using -a option. "
                        "DOT files produced by different assemblers can have very different formats.\n"
                        "Examples of input data can be found here https://github.com/almiheenko/AGB/tree/master/test_data"
                    )
        else:
            # Without this branch dict_edges would stay unbound and the
            # function would crash with UnboundLocalError further below.
            sys.exit("ERROR! Failed parsing " + input_fpath + " file. "
                     "The assembly graph should be in GFA/FASTG/GraphViz format.")
    else:
        if is_canu(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_canu_output(
                input_dirpath, output_dirpath, min_edge_len)
        elif is_flye(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_flye_output(
                input_dirpath, output_dirpath, min_edge_len)
        elif is_spades(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_spades_output(
                input_dirpath, output_dirpath, min_edge_len)
        else:
            sys.exit(
                "Output folder of %s assembler can not be parsed! Supported assemblers: %s. "
                "More assemblers will be added in the next release.\n"
                "To visualize the assembly graph produced by this assembler, "
                "you should manually specify the assembly graph file in GFA/FASTG/GraphViz formats using --graph option "
                "and (optionally) file with edge sequences using --fasta option"
                % (assembler_name, ', '.join(SUPPORTED_ASSEMBLERS)))
    # Node identifiers are normalized to strings for the downstream code.
    for edge in dict_edges.values():
        edge.start, edge.end = str(edge.start), str(edge.end)
    return dict_edges, contig_edges, edges_fpath
Beispiel #3
0
def process_graph(g, undirected_g, dict_edges, edges_by_nodes, two_way_edges, output_dirpath, suffix, assembler,
                  base_graph=None, contig_edges=None, chrom_names=None, edge_by_chrom=None, mapping_info=None):
    """Split the assembly graph into viewer parts for one mode and save them.

    The mode is selected by ``suffix``:

    * ``"ref"`` - for every chromosome in ``natural_sort(chrom_names)`` a
      directed graph is built from the edges listed in ``edge_by_chrom`` and
      split with ``split_graph``; the chromosome name of every resulting part
      is appended to ``reference.json`` as ``chromosomes=...;``.
    * ``"contig"`` (only if ``contig_edges`` is non-empty) - for every contig a
      directed graph is built from its edges and their matching edges (see
      ``get_match_edge_id``) that are present in ``dict_edges``; the contig
      name of every resulting part is written to ``contig_info.json`` as
      ``contigs=...;``.
    * ``"repeat"`` / ``"def"`` - ``g`` is split into weakly connected
      components and each of them is passed to ``split_graph``. For Flye,
      temporary "fake" edges are added to ``g`` before the split so that an
      edge and its matching edge end up in the same component; they are
      removed from ``g`` right after the components are extracted.

    Any other ``suffix`` produces an empty graph that is still passed to
    ``save_graph``.

    Args:
        g: directed graph (networkx) of the assembly.
        undirected_g: undirected counterpart of ``g``, used for neighbor
            lookups and forwarded to ``split_graph``.
        dict_edges: mapping of edge ids to edge objects with ``start``,
            ``end`` and ``repetitive`` attributes.
        edges_by_nodes: forwarded to ``split_graph``.
        two_way_edges: forwarded to ``split_graph``.
        output_dirpath: directory where JSON files are written.
        suffix: mode name, see above.
        assembler: assembler name; fake edges are added only for Flye.
        base_graph: graph passed to ``split_graph`` in ``"repeat"``/``"def"``
            mode; falls back to ``g`` when falsy.
        contig_edges: mapping ``contig -> iterable of 3-tuples`` whose last
            item is an edge id.
        chrom_names: chromosome names for ``"ref"`` mode.
        edge_by_chrom: mapping ``chromosome -> edge ids`` for ``"ref"`` mode.
        mapping_info: forwarded to ``split_graph`` and ``save_graph``.

    Returns:
        The value returned by ``save_graph``.
    """
    last_idx = 0
    parts_info = dict()
    graph = []
    modified_dict_edges = dict()
    loop_edges = defaultdict(set)
    hanging_nodes = []
    connected_nodes = []
    enters = []
    exits = []
    base_graph = base_graph or g

    chrom_list = []
    contig_list = []
    complex_component = False
    if suffix == "ref":
        if chrom_names:
            ## create graph for reference-based mode
            for chrom in natural_sort(chrom_names):
                graph_component = nx.DiGraph()
                # use only edges mapped to the chromosome
                for edge_id in set(edge_by_chrom[chrom]):
                    edge = dict_edges[edge_id]
                    graph_component.add_edge(edge.start, edge.end)
                viewer_data, last_idx, sub_complex_component = \
                    split_graph(graph_component, g, undirected_g, dict_edges, modified_dict_edges, loop_edges, edges_by_nodes,
                                two_way_edges, last_idx, parts_info, mapping_info=mapping_info, chrom=chrom)
                parts_info = viewer_data.parts_info
                graph.extend(viewer_data.g)
                # one chromosome label per produced graph part
                chrom_list.extend([chrom] * len(viewer_data.g))
                complex_component = complex_component or sub_complex_component
        with open(join(output_dirpath, 'reference.json'), 'a') as handle:
            handle.write("chromosomes=" + json.dumps(chrom_list) + ";\n")
    elif contig_edges and suffix == "contig":
        ## create graph for contig-focused mode
        for contig, edges in contig_edges.items():
            graph_component = nx.DiGraph()
            edge_ids = set()
            for _, _, edge_id in edges:
                edge_ids.add(edge_id)
                edge_ids.add(get_match_edge_id(edge_id))
            filtered_edge_ids = set()
            for edge_id in edge_ids:
                if edge_id in dict_edges:
                    edge = dict_edges[edge_id]
                    graph_component.add_edge(edge.start, edge.end)
                    filtered_edge_ids.add(edge_id)
            viewer_data, last_idx, sub_complex_component = \
                split_graph(graph_component, g, undirected_g, dict_edges, modified_dict_edges, loop_edges, edges_by_nodes,
                            two_way_edges, last_idx, parts_info, contig_edges=filtered_edge_ids)
            parts_info = viewer_data.parts_info
            # one contig label per produced graph part
            contig_list.extend([contig] * len(viewer_data.g))
            graph.extend(viewer_data.g)
        with open(join(output_dirpath, 'contig_info.json'), 'w') as handle:
            handle.write("contigs=" + json.dumps(contig_list) + ";\n")
    elif suffix == "repeat" or suffix == "def":
        fake_edges = []
        if is_flye(assembler):
            ## add fake edges to keep forward and reverse complement components of an edge together
            for edge_id, edge in dict_edges.items():
                if edge_id.startswith("rc"):
                    continue
                if suffix == "repeat" and not edge.repetitive:
                    continue
                match_edge_id = get_match_edge_id(edge_id)
                if match_edge_id not in dict_edges:
                    continue
                match_edge = dict_edges[match_edge_id]
                match_edge_nodes = [match_edge.start, match_edge.end]
                # link the two strands only if they are not already adjacent
                if not any(e in undirected_g.neighbors(edge.start) for e in match_edge_nodes) and \
                        not any(e in undirected_g.neighbors(edge.end) for e in match_edge_nodes):
                    g.add_edge(edge.end, match_edge.start)
                    g.add_edge(edge.start, match_edge.end)
                    fake_edges.append((edge.start, match_edge.end))
                    fake_edges.append((edge.end, match_edge.start))
        # split graph into connected components;
        # nx.weakly_connected_component_subgraphs was removed in networkx 2.4,
        # so build the subgraphs explicitly. Copies (not views) are required
        # because the fake edges are removed from g right below and the
        # components must keep them.
        connected_components = [g.subgraph(nodes).copy()
                                for nodes in nx.weakly_connected_components(g)]
        if fake_edges:
            g.remove_edges_from(fake_edges)
        for graph_component in connected_components:
            viewer_data, last_idx, sub_complex_component = \
                split_graph(graph_component, base_graph, undirected_g, dict_edges, modified_dict_edges, loop_edges, edges_by_nodes,
                            two_way_edges, last_idx, parts_info,
                            fake_edges=fake_edges, find_hanging_nodes=suffix == "def", is_repeat_graph=suffix == "repeat")
            parts_info = viewer_data.parts_info
            graph.extend(viewer_data.g)
            hanging_nodes.extend(viewer_data.hanging_nodes)
            connected_nodes.extend(viewer_data.connected_nodes)
            enters.extend(viewer_data.enters)
            exits.extend(viewer_data.exits)
    edges_by_component = save_graph(graph, hanging_nodes, connected_nodes, enters, exits, dict_edges, modified_dict_edges,
                                    loop_edges, parts_info, output_dirpath, suffix,
                                    complex_component=complex_component,
                                    mapping_info=mapping_info, chrom_list=chrom_list, contig_list=contig_list)
    return edges_by_component