Example #1
def create_contig_info(dict_edges, input_dirpath, output_dirpath, contig_edges,
                       edges_by_component, edges_by_repeat_component,
                       edges_by_ref_component, assembler):
    contig_info = None
    if is_canu(assembler):
        contig_info = parse_canu_contigs_info(input_dirpath)
    elif is_flye(assembler):
        contig_info = parse_flye_contigs_info(input_dirpath)
    elif is_spades(assembler):
        contig_info = parse_spades_contigs_info(input_dirpath, contig_edges)
    if not contig_info:
        with open(join(output_dirpath, 'contig_info.json'), 'a') as handle:
            handle.write("contigInfo=" + json.dumps([]) + ";\n")

        with open(join(output_dirpath, 'edges_base_info.json'), 'w') as handle:
            handle.write("edgeInfo=" + json.dumps([]) + ";")
            handle.write("medianCov=" +
                         json.dumps(calculate_median_cov(dict_edges)) + ";\n")
        return

    edge_contigs = defaultdict(set)
    for contig, data in contig_info.items():
        subgraph = None
        repeat_subgraph = None
        ref_subgraph = None
        edges = data['edges']
        for edge_name in set(edges):
            edge_id = get_edge_agv_id(edge_name)
            if edge_id in dict_edges:
                edge_contigs[edge_id].add(contig)
                match_edge_id = get_match_edge_id(edge_id)
                if match_edge_id in dict_edges:
                    edge_contigs[match_edge_id].add(contig)
            if not subgraph and edge_id in edges_by_component:
                # remember only the first component found for this contig
                subgraph = edges_by_component[edge_id]
                data['g'] = subgraph
            if not repeat_subgraph and edge_id in edges_by_repeat_component:
                repeat_subgraph = edges_by_repeat_component[edge_id]
                data['rep_g'] = repeat_subgraph
            if not ref_subgraph and edge_id in edges_by_ref_component:
                ref_subgraph = edges_by_ref_component[edge_id]
                data['ref_g'] = ref_subgraph

        data['num_edges'] = str(len(edges))
        contig_info[contig] = data

    for edge_id in edge_contigs:
        edge_contigs[edge_id] = list(edge_contigs[edge_id])

    with open(join(output_dirpath, 'contig_info.json'), 'a') as handle:
        handle.write("contigInfo=" + json.dumps(contig_info) + ";\n")

    with open(join(output_dirpath, 'edges_base_info.json'), 'w') as handle:
        handle.write("edgeInfo=" + json.dumps(edge_contigs) + ";")
        handle.write("medianCov=" +
                     json.dumps(calculate_median_cov(dict_edges)) + ";\n")
    return
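calculate_median_cov is called in both branches above but defined elsewhere in the project. A minimal sketch of what it could look like, assuming each Edge object in dict_edges exposes its coverage as a cov attribute (the attribute name and the empty-graph fallback are assumptions, not the project's actual implementation):

import statistics

def calculate_median_cov(dict_edges):
    # median coverage over all edges in the graph; 0 for an empty graph (assumed fallback)
    coverages = [edge.cov for edge in dict_edges.values()]
    return statistics.median(coverages) if coverages else 0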
Example #2
def parse_alignments(alignments_fpath, json_output_dirpath):
    gaps_info = defaultdict(list)
    chrom_alignments = defaultdict(list)
    ms_info = defaultdict(list)
    aligns_by_chroms = defaultdict(list)
    # S1      E1      S2      E2      Reference       Contig  IDY     Ambiguous       Best_group
    with open(alignments_fpath) as f:
        for i, line in enumerate(f):
            if i == 0:
                continue
            fs = line.split('\t')
            if len(fs) > 5:
                start, end, start2, end2, chrom, edge_id = fs[:6]
                start, end = int(start), int(end)
                if int(start2) > int(end2):
                    edge_id = get_match_edge_id(edge_id)
                chrom_alignments[chrom].append((start, end, edge_id))
            elif line.startswith("relocation") or line.startswith(
                    "transloc") or line.startswith("invers"):
                ms_info[(chrom, start, end)].append(line.strip())
    for chrom, alignments in chrom_alignments.items():
        alignments.sort(key=lambda x: (x[0], x[1]))
        prev_end = 0
        for start, end, edge_id in alignments:
            if start - prev_end > GAP_THRESHOLD:
                gaps_info[chrom].append((prev_end, start - 1))
            prev_end = max(prev_end, end)
            align = {
                's': start,
                'e': end,
                'edge': edge_id,
                'ms': ';'.join(ms_info[(chrom, start, end)])
            }
            aligns_by_chroms[chrom].append(align)
    with open(join(json_output_dirpath, 'reference.json'), 'w') as handle:
        handle.write("chromGaps=" + json.dumps(gaps_info) + ";\n")
        handle.write("chromAligns=" + json.dumps(aligns_by_chroms) + ";\n")
Example #3
def parse_gfa(gfa_fpath, min_edge_len, input_dirpath=None, assembler=None):
    dict_edges = dict()
    predecessors = defaultdict(list)
    successors = defaultdict(list)
    g = nx.DiGraph()

    print("Parsing " + gfa_fpath + "...")
    # gfa = gfapy.Gfa.from_file(gfa_fpath, vlevel = 0)
    links = []
    edge_overlaps = defaultdict(dict)
    with open(gfa_fpath) as f:
        for line in f:
            record_type = line[0]
            if record_type == 'S':
                fs = line.split()
                name, seq_len = fs[1], len(fs[2])
                if fs[2] == '*':
                    seq_len = None
                add_fields = fs[3:] if len(fs) > 3 else []
                add_info = dict((f.split(':')[0].lower(), f.split(':')[-1]) for f in add_fields)
                if "ln" in add_info:
                    seq_len = int(add_info["ln"])  ## sequence length tag (needed when the sequence itself is '*')
                cov = 1
                if "dp" in add_info:
                    cov = float(add_info["dp"])  ## coverage depth
                elif "kc" in add_info and seq_len:
                    cov = max(1, int(add_info["kc"]) / seq_len)  ## k-mer count / edge length
                if seq_len and seq_len >= min_edge_len:
                    edge_id = get_edge_agv_id(get_edge_num(name))
                    edge = Edge(edge_id, get_edge_num(name), seq_len, cov, element_id=edge_id)
                    dict_edges[edge_id] = edge
                    for overlapped_edge, overlap in edge_overlaps[edge_id].items():
                        dict_edges[edge_id].overlaps.append((edge_id_to_name(overlapped_edge), overlapped_edge, overlap))
                    rc_edge_id = get_edge_agv_id(-get_edge_num(name))
                    rc_edge = Edge(rc_edge_id, -get_edge_num(name), seq_len, cov, element_id=rc_edge_id)
                    dict_edges[rc_edge_id] = rc_edge
                    for overlapped_edge, overlap in edge_overlaps[rc_edge_id].items():
                        dict_edges[rc_edge_id].overlaps.append((edge_id_to_name(overlapped_edge), overlapped_edge, overlap))

            if record_type != 'L' and record_type != 'E':
                continue
            if record_type == 'L':
                _, from_name, from_orient, to_name, to_orient = line.split()[:5]
            else:
                # E       *       2+      65397+  21      68$     0       47      47M
                from_name, to_name = line.split()[2], line.split()[3]
                from_orient, to_orient = from_name[-1], to_name[-1]
                from_name, to_name = from_name[:-1], to_name[:-1]
            edge1 = get_edge_agv_id(get_edge_num(from_name))
            edge2 = get_edge_agv_id(get_edge_num(to_name))
            if from_orient == '-': edge1 = get_match_edge_id(edge1)
            if to_orient == '-': edge2 = get_match_edge_id(edge2)
            overlap = 0
            overlap_operations = re.split(r'(\d+)', line.split()[-1].strip())
            for i in range(0, len(overlap_operations) - 1, 1):
                if not overlap_operations[i]:
                    continue
                if overlap_operations[i+1] == 'M' or overlap_operations[i+1] == 'I':
                    overlap += int(overlap_operations[i])
            links.append((from_name, from_orient, to_name, to_orient, overlap))
            if overlap:
                edge_overlaps[edge1][edge2] = overlap
                edge_overlaps[edge2][edge1] = overlap

    ### GFA stores only canonical links; rebuild graph connectivity from the link list
    ### (and add the complement links for SPAdes/ABySS graphs)
    for link in links:
        from_name, from_orient, to_name, to_orient, overlap = link
        edge1 = get_edge_agv_id(get_edge_num(from_name))
        edge2 = get_edge_agv_id(get_edge_num(to_name))
        if from_orient == '-': edge1 = get_match_edge_id(edge1)
        if to_orient == '-': edge2 = get_match_edge_id(edge2)
        if edge1 != edge2:
            predecessors[edge2].append(edge1)
            successors[edge1].append(edge2)
        g.add_edge(edge1, edge2)
        if is_spades(assembler) or is_abyss(assembler):
            edge1, edge2 = get_match_edge_id(edge2), get_match_edge_id(edge1)
            if edge1 != edge2:
                predecessors[edge2].append(edge1)
                successors[edge1].append(edge2)
            g.add_edge(edge1, edge2)

    if assembler == "canu" and input_dirpath:
        dict_edges = parse_canu_unitigs_info(input_dirpath, dict_edges)
    dict_edges = construct_graph(dict_edges, predecessors, successors)
    print("Finish parsing.")
    return dict_edges
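The helpers get_edge_num, get_edge_agv_id and get_match_edge_id are defined elsewhere in the project. Judging by their use here and by the rc/e replacement in Example #5, edge IDs appear to follow an "e<num>" / "rc<num>" convention, with negative edge numbers denoting reverse complements. A hedged sketch under that assumption (not the project's actual implementation):

def get_edge_agv_id(edge_num):
    # assumed convention: positive numbers -> "e<num>", negative -> "rc<num>"
    return ("rc%d" % -edge_num) if edge_num < 0 else ("e%d" % edge_num)

def get_match_edge_id(edge_id):
    # flip an edge ID to its reverse-complement counterpart
    # (assumed to mirror the replace-based logic visible in Example #5)
    return edge_id.replace("rc", "e", 1) if edge_id.startswith("rc") else edge_id.replace("e", "rc", 1)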
Example #4
def process_graph(g, undirected_g, dict_edges, edges_by_nodes, two_way_edges, output_dirpath, suffix, assembler,
                  base_graph=None, contig_edges=None, chrom_names=None, edge_by_chrom=None, mapping_info=None):
    last_idx = 0
    parts_info = dict()
    graph = []
    modified_dict_edges = dict()
    loop_edges = defaultdict(set)
    hanging_nodes = []
    connected_nodes = []
    enters = []
    exits = []
    base_graph = base_graph or g

    chrom_list = []
    contig_list = []
    complex_component = False
    if suffix == "ref":
        if chrom_names:
            ## create graph for reference-based mode
            for chrom in list(natural_sort(chrom_names)):
                edges = edge_by_chrom[chrom]  # use only edges mapped to the chromosome
                graph_component = nx.DiGraph()
                for edge_id in set(edges):
                    graph_component.add_edge(dict_edges[edge_id].start, dict_edges[edge_id].end)
                viewer_data, last_idx, sub_complex_component = \
                    split_graph(graph_component, g, undirected_g, dict_edges, modified_dict_edges, loop_edges, edges_by_nodes,
                                two_way_edges, last_idx, parts_info, mapping_info=mapping_info, chrom=chrom)
                parts_info = viewer_data.parts_info
                graph.extend(viewer_data.g)
                for i in range(len(viewer_data.g)):
                    chrom_list.append(chrom)
                complex_component = complex_component or sub_complex_component
        with open(join(output_dirpath, 'reference.json'), 'a') as handle:
            handle.write("chromosomes=" + json.dumps(chrom_list) + ";\n")
    elif contig_edges and suffix == "contig":
        ## create graph for contig-focused mode
        for contig, edges in contig_edges.items():
            graph_component = nx.DiGraph()
            edge_ids = set()
            for edge in edges:
                _, _, edge_id = edge
                edge_ids.add(edge_id)
                edge_ids.add(get_match_edge_id(edge_id))
            filtered_edge_ids = set()
            for edge_id in edge_ids:
                if edge_id in dict_edges:
                    graph_component.add_edge(dict_edges[edge_id].start, dict_edges[edge_id].end)
                    filtered_edge_ids.add(edge_id)
            viewer_data, last_idx, sub_complex_component = \
                split_graph(graph_component, g, undirected_g, dict_edges, modified_dict_edges, loop_edges, edges_by_nodes,
                            two_way_edges, last_idx, parts_info, contig_edges=filtered_edge_ids)
            parts_info = viewer_data.parts_info
            for i in range(len(viewer_data.g)):
                contig_list.append(contig)
            graph.extend(viewer_data.g)
        with open(join(output_dirpath, 'contig_info.json'), 'w') as handle:
            handle.write("contigs=" + json.dumps(contig_list) + ";\n")
    elif suffix == "repeat" or suffix == "def":
        fake_edges = []
        if is_flye(assembler):
            ## add fake edges to keep forward and reverse complement components of an edge together
            for edge_id, edge in dict_edges.items():
                if edge_id.startswith("rc"): continue
                if suffix == "repeat" and not edge.repetitive: continue
                match_edge_id = get_match_edge_id(edge_id)
                if match_edge_id not in dict_edges: continue
                match_edge_nodes = [dict_edges[match_edge_id].start, dict_edges[match_edge_id].end]
                if not any([e in undirected_g.neighbors(edge.start) for e in match_edge_nodes]) and not \
                        any([e in undirected_g.neighbors(edge.end) for e in match_edge_nodes]):
                    g.add_edge(edge.end, dict_edges[match_edge_id].start)
                    g.add_edge(edge.start, dict_edges[match_edge_id].end)
                    fake_edges.append((edge.start, dict_edges[match_edge_id].end))
                    fake_edges.append((edge.end, dict_edges[match_edge_id].start))
        # split the graph into weakly connected components
        # (nx.weakly_connected_component_subgraphs was removed in networkx 2.4,
        #  so build subgraph copies from nx.weakly_connected_components instead)
        connected_components = [g.subgraph(c).copy() for c in nx.weakly_connected_components(g)]
        if fake_edges:
            g.remove_edges_from(fake_edges)
        for i, graph_component in enumerate(connected_components):
            viewer_data, last_idx, sub_complex_component = \
                split_graph(graph_component, base_graph, undirected_g, dict_edges, modified_dict_edges, loop_edges, edges_by_nodes,
                        two_way_edges, last_idx, parts_info,
                        fake_edges=fake_edges, find_hanging_nodes=suffix == "def", is_repeat_graph=suffix == "repeat")
            parts_info = viewer_data.parts_info
            graph.extend(viewer_data.g)
            hanging_nodes.extend(viewer_data.hanging_nodes)
            connected_nodes.extend(viewer_data.connected_nodes)
            enters.extend(viewer_data.enters)
            exits.extend(viewer_data.exits)
    edges_by_component = save_graph(graph, hanging_nodes, connected_nodes, enters, exits, dict_edges, modified_dict_edges,
                                    loop_edges, parts_info, output_dirpath, suffix,
                                    complex_component=complex_component,
                                    mapping_info=mapping_info, chrom_list=chrom_list, contig_list=contig_list)
    return edges_by_component
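Both this function and Example #5 order chromosome names with natural_sort so that, for example, chr2 sorts before chr10. The project's implementation lives elsewhere; a minimal sketch of such a helper, assuming the usual alphanumeric-chunk approach:

import re

def natural_sort(names):
    # split each name into digit and non-digit chunks so numeric parts compare numerically
    def key(name):
        return [int(chunk) if chunk.isdigit() else chunk.lower()
                for chunk in re.split(r'(\d+)', name)]
    return sorted(names, key=key)

print(natural_sort(["chr10", "chr2", "chr1"]))  # -> ['chr1', 'chr2', 'chr10']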
Example #5
def parse_mapping_info(mapping_fpath, json_output_dir, dict_edges):
    # assign edges to chromosomes and color each edge according to the chromosome(s) it maps to

    mapping_info = defaultdict(set)

    edge_mappings = defaultdict(lambda: defaultdict(list))
    edge_lengths = dict()
    chrom_lengths = dict()
    with open(mapping_fpath) as f:
        for line in f:
            # contig_1        257261  14      160143  -       chr13   924431  196490  356991  147365  161095  60      tp:A:P  cm:i:14049      s1:i:147260     s2:i:4375       dv:f:0.0066
            fs = line.split()
            edge_id = get_edge_agv_id(get_edge_num(fs[0]))
            start, end = int(fs[2]), int(fs[3])
            edge_lengths[edge_id] = int(fs[1])
            chrom, chrom_len = fs[5], int(fs[6])
            ref_start, ref_end = int(fs[7]), int(fs[8])
            chrom_lengths[chrom] = chrom_len
            edge_mappings[edge_id][chrom].append(
                (start, end, ref_start, ref_end))

    chroms_by_edge = defaultdict(set)
    edge_by_chrom = defaultdict(set)
    chrom_names = set()
    best_aligns = defaultdict(defaultdict)
    for edge_id in edge_mappings:
        # assign an edge to a chromosome if more than 90% of edge aligned to the chromosome
        len_threshold = 0.9 * edge_lengths[edge_id]
        gap_threshold = min(5000, 0.05 * edge_lengths[edge_id])
        for chrom, mappings in edge_mappings[edge_id].items():
            mappings.sort(key=lambda x: (x[0], -x[1]), reverse=False)
        for chrom, mappings in edge_mappings[edge_id].items():
            aligns = []
            covered_len = 0
            last_pos = 0
            last_ref_pos = 0
            align_s, align_e = 0, 0
            # calculate covered length (do not count overlaps)
            for (start, end, ref_start, ref_end) in mappings:
                start = max(start, last_pos)
                covered_len += max(0, end - start + 1)
                last_pos = max(last_pos, end + 1)

            if covered_len >= len_threshold:
                chroms_by_edge[edge_id].add(chrom)
                chrom_names.add(chrom)
                edge_by_chrom[chrom].add(edge_id)
                mappings.sort(key=lambda x: (x[2], -x[3]), reverse=False)
                for (start, end, ref_start, ref_end) in mappings:
                    ref_start = max(ref_start, last_ref_pos)
                    last_ref_pos = max(last_ref_pos, ref_end + 1)
                    if not align_s:
                        align_s = ref_start
                    if align_e and ref_start - align_e >= gap_threshold:
                        # a gap longer than gap_threshold ends the current block;
                        # keep the block only if it spans at least 500 bp
                        if align_e - align_s >= 500:
                            aligns.append((chrom, align_s, align_e))
                        align_s = ref_start
                    align_e = ref_end - 1
                if align_e and align_e - align_s >= 500:
                    aligns.append((chrom, align_s, align_e))
                aligns.sort(reverse=True, key=lambda x: x[2] - x[1])
                edge_alignment = chrom + ":"
                if aligns:
                    best_aligns[edge_id][chrom] = aligns[0][1]
                for align in aligns[:3]:  # store the top 3 alignments for each edge
                    edge_alignment += " %s-%s," % (format_pos(align[1]), format_pos(align[2]))
                dict_edges[edge_id].aligns[chrom] = edge_alignment[:-1]
                if get_match_edge_id(edge_id) in dict_edges:
                    dict_edges[get_match_edge_id(edge_id)].aligns[chrom] = edge_alignment[:-1]

    chrom_len_dict = OrderedDict(
        (chrom, chrom_lengths[chrom]) for chrom in natural_sort(chrom_names))
    non_alt_chroms = [
        c for c in chrom_names
        if 'alt' not in c and 'random' not in c and 'chrUn' not in c
    ]
    chrom_order = OrderedDict(
        (chrom, i) for i, chrom in enumerate(natural_sort(non_alt_chroms)))
    color_list = [
        '#e6194b', '#3cb44b', '#ffe119', '#1792d4', '#f58231', '#911eb4',
        '#46f0f0', '#f032e6', '#d2f53c', '#fabebe', '#00dbb1', '#dba2ff',
        '#aa6e28', '#83360e', '#800000', '#003bff', '#808000', '#8d73d4',
        '#000080', '#806680', '#51205a', '#558859', '#d1a187', '#87a1d1',
        '#87a1d1', '#afd187'
    ]

    edge_chroms = defaultdict(set)
    for edge_id, chroms in chroms_by_edge.items():
        match_edge_id = edge_id.replace('rc', 'e') if edge_id.startswith('rc') else edge_id.replace('e', 'rc')
        for chrom in chroms:
            edge_chroms[edge_id].add(chrom)
            edge_chroms[match_edge_id].add(chrom)
            if match_edge_id in dict_edges:
                edge_by_chrom[chrom].add(match_edge_id)

    is_single_chrom = len(chrom_order.keys()) == 1
    for edge_id, chroms in edge_chroms.items():
        if edge_id not in dict_edges:
            continue
        mapping_info[edge_id] = list(chroms)
        colors = set()
        for chrom in chroms:
            if chrom in chrom_order:
                if is_single_chrom:  # color an edge according to its position in reference
                    pos = best_aligns[edge_id][chrom] if best_aligns[edge_id] else \
                        best_aligns[get_match_edge_id(edge_id)][chrom]
                    color = get_rainbow_color(pos, chrom_len_dict[chrom])
                else:
                    color = color_list[chrom_order[chrom] % len(color_list)]
            else:
                color = '#808080'
            colors.add(color)
        if len(colors) <= 5:
            dict_edges[edge_id].chrom = ':'.join(list(colors))
        else:
            dict_edges[edge_id].chrom = 'white:red:black:red:black:white'
    with open(join(json_output_dir, "reference.json"), 'a') as handle:
        handle.write("chrom_lengths=" + json.dumps(chrom_len_dict) + ";\n")
        handle.write("edgeMappingInfo=" + json.dumps(mapping_info) + ";\n")
    return mapping_info, non_alt_chroms, edge_by_chrom
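get_rainbow_color and format_pos are display helpers defined elsewhere in the project. A hedged sketch of a get_rainbow_color-style helper, assuming it maps a position along a chromosome onto a hue gradient (the project's actual palette and mapping may differ):

import colorsys

def get_rainbow_color(pos, chrom_len):
    # map a 0..chrom_len position onto a red-to-blue hue gradient (assumed behaviour)
    hue = 2.0 / 3 * min(max(pos, 0), chrom_len) / chrom_len  # 0.0 = red, 2/3 = blue
    r, g, b = colorsys.hsv_to_rgb(hue, 1.0, 0.9)
    return '#%02x%02x%02x' % (int(r * 255), int(g * 255), int(b * 255))

print(get_rainbow_color(0, 1000), get_rainbow_color(1000, 1000))  # start of the chromosome vs end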