Example #1
0
def parse_canu_unitigs_info(input_dirpath, dict_edges):
    """Annotate edges with coverage and repeat flags from a Canu tigInfo table.

    The table is the file in ``input_dirpath`` matching
    ``.unitigs.layout.tigInfo``. Its first line is a header; the columns
    named ``coverage`` and ``sugRept`` are used. When either column is
    missing, or the file is empty, ``dict_edges`` is returned untouched.

    For every row whose first field resolves to an edge present in
    ``dict_edges``, the edge and its reverse complement (the id built from
    the negated edge number) get ``cov`` set to the integer coverage, and
    ``repetitive = True`` when the ``sugRept`` field equals ``"yes"``.

    :param input_dirpath: directory searched for the tigInfo file.
    :param dict_edges: mapping of edge id -> edge object; modified in place.
    :return: the same ``dict_edges`` mapping.
    """
    tiginfo_fpath = find_file_by_pattern(input_dirpath, ".unitigs.layout.tigInfo")
    if is_empty_file(tiginfo_fpath):
        return dict_edges

    with open(tiginfo_fpath) as f:
        header = f.readline().split()
        if "sugRept" not in header or "coverage" not in header:
            return dict_edges
        repeat_col = header.index("sugRept")
        cov_col = header.index("coverage")
        min_fields = max(repeat_col, cov_col) + 1

        for line in f:
            fs = line.split()
            # Blank or truncated rows cannot be indexed safely; skip them.
            if len(fs) < min_fields:
                continue
            edge_num = get_edge_num(fs[0])
            edge_id = get_edge_agv_id(edge_num)
            if edge_id not in dict_edges:
                continue
            coverage = int(float(fs[cov_col]))
            is_repeat = fs[repeat_col] == "yes"
            # Apply the same annotation to both strands, tolerating a
            # reverse-complement edge that is absent from dict_edges.
            for strand_id in (edge_id, get_edge_agv_id(-edge_num)):
                edge = dict_edges.get(strand_id)
                if edge is None:
                    continue
                edge.cov = coverage
                if is_repeat:
                    edge.repetitive = True
    return dict_edges
Example #2
0
def get_edges_from_gfa(gfa_fpath, output_dirpath, min_edge_len):
    """Write the edge sequences of a GFA file as FASTA into ``output_dirpath``.

    Sequences are taken from lines starting with ``S``: the third field is
    used when it passes ``is_acgt_seq``, otherwise the fourth field (if
    present and valid). Only sequences of at least ``min_edge_len`` bases are
    written, each under the id produced by ``get_edge_agv_id``.

    If the resulting file is empty and a non-empty ``.fasta`` with the GFA's
    base name sits next to the GFA, that file is copied instead with its
    headers rewritten to the same id scheme.

    :param gfa_fpath: path to the GFA file; a falsy value yields ``None``.
    :param output_dirpath: directory receiving the FASTA file.
    :param min_edge_len: minimum sequence length to keep.
    :return: path of the FASTA file, or ``None`` when no GFA path is given.
    """
    if not gfa_fpath:
        return None

    input_edges_fpath = join(dirname(gfa_fpath), get_filename(gfa_fpath) + ".fasta")
    edges_fpath = join(output_dirpath, basename(input_edges_fpath))

    # Extract from the GFA unless it is empty or a previous result is reusable.
    if not (is_empty_file(gfa_fpath) or can_reuse(edges_fpath, files_to_check=[gfa_fpath])):
        print("Extracting edge sequences from " + gfa_fpath + "...")
        with open(edges_fpath, "w") as out, open(gfa_fpath) as gfa:
            for record in gfa:
                if not record.startswith('S'):
                    continue
                fields = record.strip().split()
                seq_name = fields[1]
                if is_acgt_seq(fields[2]):
                    seq = fields[2]
                elif len(fields) >= 4 and is_acgt_seq(fields[3]):
                    seq = fields[3]
                else:
                    seq = None
                if not seq or len(seq) < min_edge_len:
                    continue
                out.write(">%s\n" % get_edge_agv_id(get_edge_num(seq_name)))
                out.write(seq + "\n")

    # Fall back to the FASTA that accompanies the GFA, renaming its records.
    if is_empty_file(edges_fpath) and not is_empty_file(input_edges_fpath):
        with open(edges_fpath, "w") as out, open(input_edges_fpath) as fasta:
            for record in fasta:
                if record.startswith('>'):
                    seq_name = record.strip().split()[0][1:]
                    record = ">%s\n" % get_edge_agv_id(get_edge_num(seq_name))
                out.write(record)
    return edges_fpath
Example #3
0
def parse_abyss_dot(dot_fpath, min_edge_len):
    """Build the edge dictionary from an ABySS graph written in DOT format.

    The parser expects lines shaped like::

        digraph adj {
        graph [k=50]
        edge [d=-49]
        "3+" [l=99 C=454]
        "3-" [l=99 C=454]
        "3+" -> "157446-" [d=-45]

    Lines containing ``l=`` describe an edge (length ``l`` and coverage
    ``C``); lines containing ``->`` describe a link between two oriented
    edges. An edge is kept only when its length is at least
    ``min_edge_len``. Length and coverage are clamped to a minimum of 1.

    :param dot_fpath: path to the DOT file.
    :param min_edge_len: minimum edge length to keep.
    :return: the result of ``construct_graph`` applied to the parsed edges
        and their predecessor/successor lists.
    """
    dict_edges = dict()
    predecessors = defaultdict(list)
    successors = defaultdict(list)

    # Raw strings keep the regex escapes (\d, \+, \-) out of Python's own
    # string-escape processing; compiling once avoids per-line lookups.
    edge_re = re.compile(r'"?(?P<edge_id>\d+)(?P<edge_sign>[\+\-])"? (?P<info>.+)')
    link_re = re.compile(
        r'"?(?P<start>\d+)(?P<start_sign>[\+\-])"? -> "?(?P<end>\d+)(?P<end_sign>[\+\-])"?')
    info_re = re.compile(r'l=(?P<edge_len>\d+) C=(?P<edge_cov>\d+)')

    def signed_name(number, sign):
        # "+" strands are written without a sign, "-" strands with a leading minus.
        return '-' + number if sign == '-' else number

    with open(dot_fpath) as f:
        for line in f:
            if 'l=' in line:
                #  "3+" [l=99 C=454]
                edge_match = edge_re.search(line)
                if not edge_match:
                    continue
                edge_name = signed_name(edge_match.group('edge_id'),
                                        edge_match.group('edge_sign'))
                edge_id = get_edge_agv_id(edge_name)
                info_match = info_re.search(edge_match.group('info'))
                if info_match:
                    cov = max(1, int(info_match.group('edge_cov')))
                    edge_len = max(1, int(info_match.group('edge_len')))
                    if edge_len >= min_edge_len:
                        dict_edges[edge_id] = Edge(edge_id,
                                                   edge_name,
                                                   edge_len,
                                                   cov,
                                                   element_id=edge_id)
            if '->' in line:
                #  "3+" -> "157446-" [d=-45]
                link_match = link_re.search(line)
                if not link_match:
                    continue
                start_edge_id = get_edge_agv_id(
                    signed_name(link_match.group('start'),
                                link_match.group('start_sign')))
                end_edge_id = get_edge_agv_id(
                    signed_name(link_match.group('end'),
                                link_match.group('end_sign')))
                predecessors[end_edge_id].append(start_edge_id)
                successors[start_edge_id].append(end_edge_id)

    dict_edges = construct_graph(dict_edges, predecessors, successors)
    return dict_edges
Example #4
0
def parse_flye_dot(dot_fpath, min_edge_len):
    """Build the edge dictionary from a Flye graph written in DOT format.

    Only lines containing ``label =`` are considered, e.g.::

        "7" -> "29" [label = "id 1\\l53k 59x", color = "black"] ;

    The label supplies the edge id, its length in kilobases and its
    coverage; the two quoted numbers are stored as the edge's ``start`` and
    ``end``. Edges shorter than ``min_edge_len`` are dropped. An edge whose
    colour is not ``black`` is flagged ``repetitive``; a line containing
    ``dir = both`` sets ``two_way``.

    :param dot_fpath: path to the DOT file.
    :param min_edge_len: minimum edge length (in bases) to keep.
    :return: the result of ``calculate_multiplicities`` on the parsed edges.
    """
    dict_edges = dict()

    # Raw strings so that \d, \[ and \- reach the regex engine unchanged.
    edge_re = re.compile(r'"?(?P<start>\d+)"? -> "?(?P<end>\d+)"? \[(?P<info>.+)]')
    label_re = re.compile(
        r'id (?P<edge_id>\-*.+) (?P<edge_len>[0-9\.]+)k (?P<coverage>\d+)')

    with open(dot_fpath) as f:
        for line in f:
            if 'label =' not in line:
                continue
            # The label uses the DOT "\l" line break between id and length.
            line = line.replace('\\l', ' ')
            edge_match = edge_re.search(line)
            if not edge_match:
                continue
            info = edge_match.group('info')

            # Collect "key = value" attributes; anything not in that exact
            # form is ignored instead of aborting the whole parse.
            params = {}
            for param in info.split(', '):
                key, sep, value = param.partition(' = ')
                if sep:
                    params[key] = value
            # A missing colour attribute is treated as "black" (i.e. the
            # edge is not flagged repetitive) rather than raising.
            color = params.get('color', 'black').strip().replace('"', '')

            label_match = label_re.search(info)
            if not label_match:
                continue
            edge_name = label_match.group('edge_id')
            edge_id = get_edge_agv_id(edge_name)
            cov = max(1, int(label_match.group('coverage')))
            # Length is given in kilobases in the label.
            edge_len = max(1, int(float(label_match.group('edge_len')) * 1000))
            if edge_len < min_edge_len:
                continue

            edge = Edge(edge_id, edge_name, edge_len, cov, element_id=edge_id)
            edge.color = color
            if color != "black":
                edge.repetitive = True
            edge.start = int(edge_match.group('start'))
            edge.end = int(edge_match.group('end'))
            if 'dir = both' in line:
                edge.two_way = True
            dict_edges[edge_id] = edge

    dict_edges = calculate_multiplicities(dict_edges)
    return dict_edges
Example #5
0
def parse_flye_assembly_info(input_dirpath, dict_edges):
    """Map each contig in ``assembly_info.txt`` to the edges along its path.

    The first line of the file is skipped as a header. For every other line
    the first field is the contig name and the last field is a
    comma-separated list of edge names. Edges present in ``dict_edges`` are
    laid end to end, starting from offset 0, using each edge's ``length``.

    :param input_dirpath: directory expected to contain ``assembly_info.txt``.
    :param dict_edges: mapping of edge id -> edge object (``length`` is read).
    :return: ``defaultdict(list)`` of contig name ->
        ``[(start, end, edge_id), ...]`` with ``start``/``end`` as strings;
        empty when the file is missing or empty.
    """
    contig_edges = defaultdict(list)
    info_fpath = join(input_dirpath, "assembly_info.txt")
    if is_empty_file(info_fpath):
        print(
            "Warning! Assembly_info.txt is not found, information about contigs will not be provided"
        )
        # Nothing to read: return instead of trying to open the file.
        return contig_edges

    with open(info_fpath) as f:
        next(f, None)  # skip the header line
        for line in f:
            fs = line.split()
            if not fs:  # tolerate blank lines
                continue
            contig, path = fs[0], fs[-1]
            start = 0
            for edge_name in path.split(','):
                edge_id = get_edge_agv_id(edge_name)
                if edge_id not in dict_edges:
                    continue
                end = start + dict_edges[edge_id].length
                contig_edges[contig].append((str(start), str(end), edge_id))
                start = end
    return contig_edges
Example #6
0
def parse_spades_paths(input_dirpath, dict_edges):
    """Map each scaffold in ``scaffolds.paths`` to the edges along its path.

    Expected layout (name line followed by one or more path lines)::

        NODE_1_length_8242890_cov_19.815448
        1893359+,1801779-,1893273-,400678-,1892977+

    A name line ending with ``'`` switches parsing off until the next
    ``NODE`` line, so the paths listed under such names are ignored. Each
    path token is an edge number followed by ``+`` or ``-``; ``-`` negates
    the number before the id lookup. Edges found in ``dict_edges`` are laid
    end to end using their ``length``, and 10 is added to the running
    offset after every path line.

    :param input_dirpath: directory expected to contain ``scaffolds.paths``.
    :param dict_edges: mapping of edge id -> edge object (``length`` is read).
    :return: ``defaultdict(list)`` of scaffold name ->
        ``[(start, end, edge_id), ...]`` with ``start``/``end`` as strings;
        empty when the file is missing or empty.
    """
    contig_edges = defaultdict(list)
    paths_fpath = join(input_dirpath, "scaffolds.paths")
    if is_empty_file(paths_fpath):
        print(
            "Warning! %s is not found, information about scaffold paths will not be provided"
            % paths_fpath)
        # Nothing to read: return instead of trying to open the file.
        return contig_edges

    with open(paths_fpath) as f:
        contig = None
        start = 0
        for line in f:
            line = line.strip()
            if not line:  # blank lines carry no path information
                continue
            if line.endswith("'"):
                contig = None
            elif line.startswith("NODE"):
                contig = line
                start = 0
            elif contig:
                for edge_name in line.replace(';', '').split(','):
                    if not edge_name:  # e.g. produced by a trailing comma
                        continue
                    edge_num = int(edge_name[:-1])
                    if edge_name[-1] == '-':
                        edge_num = -edge_num
                    edge_id = get_edge_agv_id(edge_num)
                    if edge_id in dict_edges:
                        end = start + dict_edges[edge_id].length
                        contig_edges[contig].append(
                            (str(start), str(end), edge_id))
                        start = end
                start += 10  # NNNNNNNNNN
    return contig_edges
Example #7
0
def create_contig_info(dict_edges, input_dirpath, output_dirpath, contig_edges,
                       edges_by_component, edges_by_repeat_component,
                       edges_by_ref_component, assembler):
    """Write ``contig_info.json`` and ``edges_base_info.json``.

    Contig information is obtained from the assembler-specific parser
    (Canu, Flye or SPAdes). Each contig record is extended with
    ``num_edges`` and, when one of its edges appears in the corresponding
    lookup, with ``g`` / ``rep_g`` / ``ref_g``. The edge file maps every
    edge found in ``dict_edges`` (and its match edge, if present) to the
    contigs that contain it, followed by the median coverage.

    When no contig information is available, empty lists are written for
    both ``contigInfo`` and ``edgeInfo``.

    ``contig_info.json`` is opened in append mode, ``edges_base_info.json``
    is overwritten. Returns ``None``.
    """

    def write_outputs(contig_payload, edge_payload):
        # Emits both output files in their JS-assignment form.
        with open(join(output_dirpath, 'contig_info.json'), 'a') as handle:
            handle.write("contigInfo=" + json.dumps(contig_payload) + ";\n")

        with open(join(output_dirpath, 'edges_base_info.json'), 'w') as handle:
            handle.write("edgeInfo=" + json.dumps(edge_payload) + ";")
            handle.write("medianCov=" +
                         json.dumps(calculate_median_cov(dict_edges)) + ";\n")

    if is_canu(assembler):
        contig_info = parse_canu_contigs_info(input_dirpath)
    elif is_flye(assembler):
        contig_info = parse_flye_contigs_info(input_dirpath)
    elif is_spades(assembler):
        contig_info = parse_spades_contigs_info(input_dirpath, contig_edges)
    else:
        contig_info = None

    if not contig_info:
        write_outputs([], [])
        return

    # Record key -> lookup consulted for that key, in the order they are set.
    component_lookups = (
        ('g', edges_by_component),
        ('rep_g', edges_by_repeat_component),
        ('ref_g', edges_by_ref_component),
    )

    edge_contigs = defaultdict(set)
    for contig, data in contig_info.items():
        edges = data['edges']
        for edge_name in set(edges):
            edge_id = get_edge_agv_id(edge_name)
            if edge_id in dict_edges:
                edge_contigs[edge_id].add(contig)
                match_edge_id = get_match_edge_id(edge_id)
                if match_edge_id in dict_edges:
                    edge_contigs[match_edge_id].add(contig)
            # Every matching edge overwrites the key, so the value kept is
            # that of the last matching edge visited.
            for key, lookup in component_lookups:
                if edge_id in lookup:
                    data[key] = lookup[edge_id]

        data['num_edges'] = str(len(edges))
        contig_info[contig] = data

    # Sets are not JSON-serialisable; convert them to lists before dumping.
    for edge_id in edge_contigs:
        edge_contigs[edge_id] = list(edge_contigs[edge_id])

    write_outputs(contig_info, edge_contigs)
    return
Example #8
0
def format_edges_file(input_fpath, output_dirpath):
    """Copy a FASTA file to ``edges.fasta`` with normalised record names.

    Every header line (starting with ``>``) is replaced by the id that
    ``get_edge_agv_id(get_edge_num(...))`` yields for the text after ``>``;
    all other lines are copied unchanged. The copy is skipped when
    ``can_reuse`` reports that an existing output is still valid.

    :param input_fpath: source FASTA; an empty file yields ``None``.
    :param output_dirpath: directory receiving ``edges.fasta``.
    :return: path to ``edges.fasta``, or ``None`` for empty input.
    """
    if is_empty_file(input_fpath):
        return None

    edges_fpath = join(output_dirpath, "edges.fasta")
    if can_reuse(edges_fpath, files_to_check=[input_fpath]):
        return edges_fpath

    with open(input_fpath) as src, open(edges_fpath, "w") as dst:
        for record_line in src:
            if record_line.startswith('>'):
                new_id = get_edge_agv_id(get_edge_num(record_line[1:]))
                record_line = ">%s\n" % new_id
            dst.write(record_line)
    return edges_fpath
Example #9
0
def parse_canu_assembly_info(input_dirpath, dict_edges):
    """Map each contig in a Canu ``.unitigs.bed`` file to its unitig edges.

    The first four whitespace-separated fields of every line are read as
    contig, start, end and unitig name. A row is kept when the unitig
    resolves to an edge present in ``dict_edges``; the contig name is
    converted with ``get_canu_id``.

    :param input_dirpath: directory searched for a file matching
        ``.unitigs.bed``.
    :param dict_edges: mapping of edge id -> edge object (membership only).
    :return: ``defaultdict(list)`` of contig id ->
        ``[(start, end, edge_id), ...]`` with ``start``/``end`` as read from
        the file; empty when the file is missing or empty.
    """
    contig_edges = defaultdict(list)
    unitigs_fpath = find_file_by_pattern(input_dirpath, ".unitigs.bed")
    if is_empty_file(unitigs_fpath):
        print(
            "Warning! Unitigs.bed is not found, information about contigs will not be provided"
        )
        # Nothing to read: return instead of trying to open the file.
        return contig_edges

    with open(unitigs_fpath) as f:
        for line in f:
            fs = line.split()
            # Blank or truncated rows cannot be unpacked; skip them.
            if len(fs) < 4:
                continue
            contig, start, end, unitig = fs[:4]
            edge_id = get_edge_agv_id(get_edge_num(unitig))
            if edge_id in dict_edges:
                contig_edges[get_canu_id(contig)].append((start, end, edge_id))
    return contig_edges
Example #10
0
def _gfa_overlap_length(cigar):
    """Return the summed lengths of the ``M`` and ``I`` runs in ``cigar``."""
    # Splitting on a captured digit group yields alternating
    # [text, digits, text, digits, ...] tokens.
    tokens = re.split(r'(\d+)', cigar.strip())
    overlap = 0
    for i in range(len(tokens) - 1):
        if tokens[i] and tokens[i + 1] in ('M', 'I'):
            overlap += int(tokens[i])
    return overlap


def parse_gfa(gfa_fpath, min_edge_len, input_dirpath=None, assembler=None):
    """Parse a GFA file into a dictionary of edges and build the graph.

    ``S`` lines create a forward and a reverse-complement ``Edge`` for each
    segment whose length is at least ``min_edge_len``. The length is the
    ``LN`` tag when present, otherwise the length of the sequence field
    (``*`` means unknown). Coverage is the ``DP`` tag, else ``KC`` divided
    by the segment length (minimum 1), else 1.

    ``L`` and ``E`` lines create links. The last field of the line is read
    as the overlap string, and a non-zero overlap is recorded for both
    edges. For SPAdes and ABySS every link is mirrored onto the
    reverse-complement strand.

    :param gfa_fpath: path to the GFA file.
    :param min_edge_len: minimum segment length to keep.
    :param input_dirpath: assembler output directory; used only for Canu.
    :param assembler: assembler name.
    :return: the result of ``construct_graph`` on the parsed edges.
    """
    dict_edges = dict()
    predecessors = defaultdict(list)
    successors = defaultdict(list)

    print("Parsing " + gfa_fpath + "...")
    links = []
    edge_overlaps = defaultdict(dict)
    with open(gfa_fpath) as f:
        for line in f:
            record_type = line[0]
            if record_type == 'S':
                fs = line.split()
                if len(fs) < 3:  # malformed segment line
                    continue
                name, seq = fs[1], fs[2]
                seq_len = None if seq == '*' else len(seq)
                tags = dict((tag.split(':')[0].lower(), tag.split(':')[-1])
                            for tag in fs[3:])
                declared_len = int(tags["ln"]) if "ln" in tags else None
                # Length used to normalise the k-mer count: the sequence
                # itself when given, otherwise the declared LN value.
                kc_len = seq_len or declared_len

                cov = 1
                if "dp" in tags:
                    cov = float(tags["dp"])  ## coverage depth
                elif "kc" in tags and kc_len:
                    cov = max(1, int(tags["kc"]) / kc_len)  ## k-mer count / edge length
                if declared_len is not None:
                    seq_len = declared_len  ## sequence length

                if seq_len and seq_len >= min_edge_len:
                    edge_num = get_edge_num(name)
                    # Create both strands; each strand receives only the
                    # overlaps recorded under its own id.
                    for strand_num in (edge_num, -edge_num):
                        strand_id = get_edge_agv_id(strand_num)
                        edge = Edge(strand_id, strand_num, seq_len, cov,
                                    element_id=strand_id)
                        for overlapped_edge, overlap in edge_overlaps[strand_id].items():
                            edge.overlaps.append(
                                (edge_id_to_name(overlapped_edge), overlapped_edge, overlap))
                        dict_edges[strand_id] = edge
                # NOTE(review): overlaps are attached only if the link lines
                # precede this segment line in the file - confirm intended.
                continue

            if record_type == 'L':
                fs = line.split()
                _, from_name, from_orient, to_name, to_orient = fs[:5]
            elif record_type == 'E':
                # E       *       2+      65397+  21      68$     0       47      47M
                fs = line.split()
                from_name, to_name = fs[2], fs[3]
                from_orient, to_orient = from_name[-1], to_name[-1]
                from_name, to_name = from_name[:-1], to_name[:-1]
            else:
                continue

            edge1 = get_edge_agv_id(get_edge_num(from_name))
            edge2 = get_edge_agv_id(get_edge_num(to_name))
            if from_orient == '-':
                edge1 = get_match_edge_id(edge1)
            if to_orient == '-':
                edge2 = get_match_edge_id(edge2)
            links.append((edge1, edge2))

            overlap = _gfa_overlap_length(fs[-1])
            if overlap:
                edge_overlaps[edge1][edge2] = overlap
                edge_overlaps[edge2][edge1] = overlap

    def add_link(src, dst):
        # Self-loops are not recorded as predecessor/successor.
        if src != dst:
            predecessors[dst].append(src)
            successors[src].append(dst)

    ### gfa retains only canonical links
    mirror_links = is_spades(assembler) or is_abyss(assembler)
    for edge1, edge2 in links:
        add_link(edge1, edge2)
        if mirror_links:
            add_link(get_match_edge_id(edge2), get_match_edge_id(edge1))

    if assembler == "canu" and input_dirpath:
        dict_edges = parse_canu_unitigs_info(input_dirpath, dict_edges)
    dict_edges = construct_graph(dict_edges, predecessors, successors)
    print("Finish parsing.")
    return dict_edges
Example #11
0
def run_quast_analysis(input_fpath,
                       reference_fpath,
                       output_dirpath,
                       json_output_dirpath,
                       threads,
                       contig_edges,
                       dict_edges=None,
                       is_meta=False):
    """Run QUAST against a reference and collect misassembly information.

    With a non-empty ``dict_edges`` the input is treated as edges:
    misassembly coordinates are appended to each edge's ``errors`` list,
    alignments are parsed, edges are mapped to the reference and the
    mapping is parsed. Otherwise the input is treated as contigs and the
    misassemblies are written to ``errors.json``.

    When the input or the reference is empty, or QUAST produces no report,
    placeholder ``reference.json`` and ``errors.json`` files are written.

    :param input_fpath: FASTA of contigs or edges.
    :param reference_fpath: reference genome FASTA.
    :param output_dirpath: directory for QUAST output.
    :param json_output_dirpath: directory for the JSON output files.
    :param threads: passed through to ``run`` and ``map_edges_to_ref``.
    :param contig_edges: not used by this function.
    :param dict_edges: mapping of edge id -> edge object, or a falsy value
        for contig mode.
    :param is_meta: passed through to ``run``.
    :return: ``(mapping_info, chrom_names, edge_by_chrom, dict_edges)`` in
        edge mode; ``(None, None, None, dict_edges)`` otherwise.
    """
    quast_output_dir = join(
        output_dirpath,
        "quast_edge_output" if dict_edges else "quast_output")
    has_inputs = not is_empty_file(input_fpath) and not is_empty_file(reference_fpath)

    ms_out_fpath = None
    if has_inputs:
        ms_out_fpath = get_mis_report_fpath(quast_output_dir, input_fpath)
        ms_out_fpath = run(input_fpath, reference_fpath, ms_out_fpath,
                           quast_output_dir, threads, is_meta)

    if not ms_out_fpath:
        if has_inputs:
            print(
                "QUAST failed! Make sure you are using the latest version of QUAST"
            )
        print("No information about %s mappings to the reference genome" %
              ("edge" if dict_edges else "contig"))
        with open(join(json_output_dirpath, "reference.json"), 'w') as handle:
            handle.write("chrom_lengths=" + json.dumps([]) + ";\n")
            handle.write("edgeMappingInfo=" + json.dumps([]) + ";\n")
            handle.write("chromGaps=" + json.dumps([]) + ";\n")
            handle.write("chromAligns=" + json.dumps([]) + ";\n")
        with open(join(json_output_dirpath, 'errors.json'), 'w') as handle:
            handle.write("misassembledContigs=[];\n")
        return None, None, None, dict_edges

    # search for misassemblies and store them for each edge and contig
    misassembled_seqs = defaultdict(list)
    with open(ms_out_fpath) as f:
        seq_id = ''
        for line in f:
            if not line.startswith("Extensive misassembly"):
                # Any other line names the sequence the next records belong to.
                seq_id = line.strip()
                continue
            match = re.search(align_pattern, line)
            if not match or len(match.groups()) < 4:
                continue
            coords = (match.group('start1'), match.group('end1'),
                      match.group('start2'), match.group('end2'))
            if dict_edges:
                edge_id = get_edge_agv_id(get_edge_num(seq_id))
                # A reported sequence may be absent from dict_edges; skip it
                # rather than fail on the lookup.
                if edge_id in dict_edges:
                    dict_edges[edge_id].errors.append(coords)
            else:
                misassembled_seqs[seq_id].append(coords)

    if not dict_edges:
        with open(join(json_output_dirpath, 'errors.json'), 'w') as handle:
            handle.write("misassembledContigs='" +
                         json.dumps(misassembled_seqs) + "';\n")
        return None, None, None, dict_edges

    parse_alignments(get_alignments_fpath(quast_output_dir, input_fpath),
                     json_output_dirpath)
    mapping_fpath = map_edges_to_ref(input_fpath, output_dirpath,
                                     reference_fpath, threads)
    mapping_info, chrom_names, edge_by_chrom = parse_mapping_info(
        mapping_fpath, json_output_dirpath, dict_edges)
    return mapping_info, chrom_names, edge_by_chrom, dict_edges
Example #12
0
def _query_covered_length(mappings):
    """Return how many query positions ``mappings`` cover, overlaps counted once.

    ``mappings`` is a list of ``(start, end, ref_start, ref_end)`` tuples;
    it is sorted in place by query start (longest first on ties).
    """
    mappings.sort(key=lambda m: (m[0], -m[1]))
    covered_len = 0
    last_pos = 0
    for start, end, _, _ in mappings:
        start = max(start, last_pos)
        covered_len += max(0, end - start + 1)
        last_pos = max(last_pos, end + 1)
    return covered_len


def _merge_reference_alignments(chrom, mappings, gap_threshold, min_align_len=500):
    """Merge the reference intervals of ``mappings`` into longer alignments.

    Consecutive intervals are chained until the gap to the next interval
    reaches ``gap_threshold``. Chains whose span is shorter than
    ``min_align_len`` are discarded. ``mappings`` is sorted in place by
    reference start. Returns ``[(chrom, start, end), ...]`` ordered from the
    longest span to the shortest.
    """
    mappings.sort(key=lambda m: (m[2], -m[3]))
    aligns = []
    last_ref_pos = 0
    align_s, align_e = 0, 0
    for _, _, ref_start, ref_end in mappings:
        ref_start = max(ref_start, last_ref_pos)
        last_ref_pos = max(last_ref_pos, ref_end + 1)
        if not align_s:
            align_s = ref_start
        if align_e and ref_start - align_e >= gap_threshold:
            # Gap too large: close the current chain, keeping it only if it
            # spans at least min_align_len, and start a new one.
            if align_e - align_s >= min_align_len:
                aligns.append((chrom, align_s, align_e))
            align_s = ref_start
        align_e = ref_end - 1
    if align_e and align_e - align_s >= min_align_len:
        aligns.append((chrom, align_s, align_e))
    aligns.sort(reverse=True, key=lambda a: a[2] - a[1])
    return aligns


def parse_mapping_info(mapping_fpath, json_output_dir, dict_edges):
    """Assign edges to reference chromosomes and colour them accordingly.

    Each line of ``mapping_fpath`` is whitespace-split and read as: column 0
    sequence name, 1 sequence length, 2-3 start/end on the sequence, 5
    chromosome, 6 chromosome length, 7-8 start/end on the chromosome (the
    sample line in the body looks like PAF - confirm with the producer).

    An edge is assigned to a chromosome when at least 90% of its length is
    covered by mappings to it. For assigned edges, a summary of up to three
    merged reference alignments is stored in ``aligns[chrom]`` of the edge
    and of its match edge, when those are present in ``dict_edges``. Each
    edge's ``chrom`` attribute is set to a ``:``-joined list of colours.

    ``chrom_lengths`` and ``edgeMappingInfo`` are appended to
    ``reference.json`` in ``json_output_dir``.

    :return: ``(mapping_info, non_alt_chroms, edge_by_chrom)`` where
        ``mapping_info`` maps edge id -> list of chromosomes,
        ``non_alt_chroms`` lists assigned chromosomes whose names contain
        none of ``alt``, ``random``, ``chrUn``, and ``edge_by_chrom`` maps
        chromosome -> set of edge ids.
    """
    # assign edges to chromosomes and color edges to corresponding colors

    mapping_info = defaultdict(set)

    edge_mappings = defaultdict(lambda: defaultdict(list))
    edge_lengths = dict()
    chrom_lengths = dict()
    with open(mapping_fpath) as f:
        for line in f:
            # contig_1        257261  14      160143  -       chr13   924431  196490  356991  147365  161095  60      tp:A:P  cm:i:14049      s1:i:147260     s2:i:4375       dv:f:0.0066
            fs = line.split()
            if len(fs) < 9:  # blank or truncated line
                continue
            edge_id = get_edge_agv_id(get_edge_num(fs[0]))
            start, end = int(fs[2]), int(fs[3])
            edge_lengths[edge_id] = int(fs[1])
            chrom, chrom_len = fs[5], int(fs[6])
            ref_start, ref_end = int(fs[7]), int(fs[8])
            chrom_lengths[chrom] = chrom_len
            edge_mappings[edge_id][chrom].append(
                (start, end, ref_start, ref_end))

    chroms_by_edge = defaultdict(set)
    edge_by_chrom = defaultdict(set)
    chrom_names = set()
    best_aligns = defaultdict(dict)
    for edge_id, mappings_by_chrom in edge_mappings.items():
        # assign an edge to a chromosome if more than 90% of edge aligned to the chromosome
        len_threshold = 0.9 * edge_lengths[edge_id]
        gap_threshold = min(5000, 0.05 * edge_lengths[edge_id])
        match_edge_id = get_match_edge_id(edge_id)
        for chrom, mappings in mappings_by_chrom.items():
            if _query_covered_length(mappings) < len_threshold:
                continue
            chroms_by_edge[edge_id].add(chrom)
            chrom_names.add(chrom)
            edge_by_chrom[chrom].add(edge_id)

            aligns = _merge_reference_alignments(chrom, mappings, gap_threshold)
            if aligns:
                best_aligns[edge_id][chrom] = aligns[0][1]
            edge_alignment = chrom + ":"
            for align in aligns[:3]:  # store top 3 alignments for each edge
                edge_alignment += " %s-%s," % (format_pos(align[1]),
                                               format_pos(align[2]))
            edge_alignment = edge_alignment[:-1]  # drop the trailing character
            # The mapped sequence or its match edge may be absent from
            # dict_edges; annotate only the ones that exist.
            for target_id in (edge_id, match_edge_id):
                if target_id in dict_edges:
                    dict_edges[target_id].aligns[chrom] = edge_alignment

    chrom_len_dict = OrderedDict(
        (chrom, chrom_lengths[chrom]) for chrom in natural_sort(chrom_names))
    non_alt_chroms = [
        c for c in chrom_names
        if 'alt' not in c and 'random' not in c and 'chrUn' not in c
    ]
    chrom_order = OrderedDict(
        (chrom, i) for i, chrom in enumerate(natural_sort(non_alt_chroms)))
    color_list = [
        '#e6194b', '#3cb44b', '#ffe119', '#1792d4', '#f58231', '#911eb4',
        '#46f0f0', '#f032e6', '#d2f53c', '#fabebe', '#00dbb1', '#dba2ff',
        '#aa6e28', '#83360e', '#800000', '#003bff', '#808000', '#8d73d4',
        '#000080', '#806680', '#51205a', '#558859', '#d1a187', '#87a1d1',
        '#87a1d1', '#afd187'
    ]

    edge_chroms = defaultdict(set)
    for edge_id, chroms in chroms_by_edge.items():
        # NOTE(review): the partner id is derived by string replacement here
        # but via get_match_edge_id() elsewhere in this function - confirm
        # that both agree.
        match_edge_id = edge_id.replace(
            'rc', 'e') if edge_id.startswith('rc') else edge_id.replace(
                'e', 'rc')
        for chrom in chroms:
            edge_chroms[edge_id].add(chrom)
            edge_chroms[match_edge_id].add(chrom)
            if match_edge_id in dict_edges:
                edge_by_chrom[chrom].add(match_edge_id)

    is_single_chrom = len(chrom_order) == 1
    for edge_id, chroms in edge_chroms.items():
        if edge_id not in dict_edges:
            continue
        mapping_info[edge_id] = list(chroms)
        colors = set()
        for chrom in chroms:
            if chrom not in chrom_order:
                color = '#808080'
            elif is_single_chrom:  # color an edge according to its position in reference
                pos = best_aligns.get(edge_id, {}).get(chrom)
                if pos is None:
                    pos = best_aligns.get(get_match_edge_id(edge_id), {}).get(chrom)
                if pos is None:
                    # No merged alignment was recorded for either strand,
                    # so there is no position to colour by; use grey.
                    color = '#808080'
                else:
                    color = get_rainbow_color(pos, chrom_len_dict[chrom])
            else:
                color = color_list[chrom_order[chrom] % len(color_list)]
            colors.add(color)
        if len(colors) <= 5:
            dict_edges[edge_id].chrom = ':'.join(list(colors))
        else:
            dict_edges[edge_id].chrom = 'white:red:black:red:black:white'
    with open(join(json_output_dir, "reference.json"), 'a') as handle:
        handle.write("chrom_lengths=" + json.dumps(chrom_len_dict) + ";\n")
        handle.write("edgeMappingInfo=" + json.dumps(mapping_info) + ";\n")
    return mapping_info, non_alt_chroms, edge_by_chrom