Exemple #1
0
def run_flye(assembly, reads_fname, out_dir, threads, no_nucl_alignment):
    """Build Flye if necessary and run its mapping binary for one assembly.

    Args:
        assembly: object providing ``fname``, ``compressed_fname``,
            ``kmers_fname``, ``chains_fname`` and ``max_aln_diff``.
        reads_fname: path handed to the binary as ``--reads``.
        out_dir: directory in which ``mapping.log`` is written.
        threads: value handed to the binary as ``--threads``.
        no_nucl_alignment: when true, ``--min-kmers`` is set to ``1000000``
            instead of ``MIN_CHAIN_KMERS``.

    The process exits with status 2 when ``make_flye()`` raises. The exit
    status of the mapping binary itself is not inspected.
    """
    try:
        make_flye()
    except Exception:
        # Exception (not a bare except) so Ctrl-C and sys.exit() propagate.
        print(
            'Failed to compile Flye! Please try to compile it manually: create %s folder and run "make" in %s'
            % (dirname(ASSEMBLY_BIN), dirname(dirname(ASSEMBLY_BIN))))
        sys.exit(2)

    min_kmers = '1000000' if no_nucl_alignment else str(MIN_CHAIN_KMERS)
    cmd = [
        ASSEMBLY_BIN,
        '--reads', reads_fname,
        '--asm', assembly.compressed_fname or assembly.fname,
        '--kmers', abspath(assembly.kmers_fname),
        '--out-file', abspath(assembly.chains_fname),
        '--out-asm', 'draft_assembly.fasta',
        '--max-diff', str(assembly.max_aln_diff),
        '--genome-size', str(get_fasta_len(assembly.fname)),
        '--config', abspath(get_flye_cfg_fname()),
        '--log', join(out_dir, 'mapping.log'),
        '--min-kmers', min_kmers,
        '--threads', str(threads),
        '--min-ovlp', str(MIN_CHAIN_LEN),
        '--kmer', str(KMER_SIZE),
    ]
    # DEVNULL discards the output without leaving open file objects behind.
    subprocess.call(cmd,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL)
Exemple #2
0
def set_params(fnames, threads):
    """Derive analysis window sizes from the longest assembly.

    Stores ``KMER_WINDOW_SIZE``, ``BP_WINDOW_SIZE``,
    ``MOVING_AVG_WINDOW_SIZE`` and ``MAX_THREADS`` on the ``config`` module.
    Exits the process with status 1 when ``check_fasta_files`` rejects
    ``fnames``.
    """
    if not check_fasta_files(fnames):
        sys.exit(1)

    longest = max(get_fasta_len(fname) for fname in fnames)
    config.KMER_WINDOW_SIZE = max(500, longest // 150)

    # Breakpoint window: 200 below 100 kbp, 500 below 1 Mbp, 1000 otherwise.
    if longest >= 1000000:
        bp_window = 1000
    elif longest >= 100000:
        bp_window = 500
    else:
        bp_window = 200
    config.BP_WINDOW_SIZE = bp_window

    # NOTE(review): min(20, max(20, x)) evaluates to 20 for every x, so the
    # length-dependent term currently has no effect -- confirm intent.
    scaled = longest // config.BP_WINDOW_SIZE // 20
    config.MOVING_AVG_WINDOW_SIZE = min(20, max(20, scaled))

    config.MAX_THREADS = threads
Exemple #3
0
def do(assemblies, reads_fname, reads_real_coords, out_dir, threads, no_reuse,
       no_nucl_alignment):
    """Map reads to the assemblies, then gather per-assembly statistics.

    ``run_flye`` is scheduled through ``run_parallel`` for each assembly
    whose ``bed_fname`` does not exist yet (or for all of them when
    ``no_reuse`` is true). Afterwards the chains of every assembly are
    post-processed, coverage is calculated and the results are handed to
    ``make_plotly_noise``.
    """
    print("")
    print("*********************************")
    print("Read mapping started...")

    flye_jobs = []
    for asm in assemblies:
        if not exists(asm.bed_fname) or no_reuse:
            # The thread budget is split evenly over all assemblies,
            # with at least one thread per job.
            threads_per_job = max(1, threads // len(assemblies))
            flye_jobs.append((asm, reads_fname, out_dir, threads_per_job,
                              no_nucl_alignment))
    run_parallel(run_flye, flye_jobs, n_jobs=min(len(assemblies), threads))

    all_data = [
        (postprocess_chains(asm, reads_real_coords),
         calculate_coverage(get_fasta_len(asm.fname), asm.bed_fname))
        for asm in assemblies
    ]
    make_plotly_noise(assemblies, all_data, out_dir)
    print("Read mapping finished")
Exemple #4
0
def do(assemblies, reads_fname, hifi_reads_fname, out_dir, tmp_dir):
    """Polish every assembly over four rounds.

    Each round first re-selects k-mers via ``select_kmers.do`` and then calls
    ``polish`` on each assembly, replacing ``assembly.fname`` with the value
    it returns. Results go to the ``polished`` sub-directory of ``out_dir``.

    The process exits with status 2 when ``make_flye()`` raises.
    """
    print("")
    print("*********************************")
    print("Running polishing module...")
    out_dir = join(out_dir, "polished")
    if not exists(out_dir):
        os.makedirs(out_dir)
    try:
        make_flye()
    except Exception:
        # Exception (not a bare except) so Ctrl-C and sys.exit() propagate.
        print('Failed to compile Flye! Please try to compile it manually: create %s folder and run "make" in %s'
              % (dirname(POLISH_BIN), dirname(dirname(POLISH_BIN))))
        sys.exit(2)

    # Single source of truth for the loop bound and the progress message.
    n_rounds = 4
    for i in range(n_rounds):
        # NOTE(review): reads_fname is passed twice on purpose-or-not;
        # kept exactly as in the original call -- confirm against
        # select_kmers.do's signature.
        select_kmers.do(assemblies, reads_fname, reads_fname, hifi_reads_fname,
                        out_dir, tmp_dir, no_reuse=True, only_polish=True)
        for assembly in assemblies:
            print("Polishing genome (%d/%d)" % (i + 1, n_rounds))
            assembly.fname = polish(assembly.fname, reads_fname, out_dir,
                                    assembly.kmers_fname,
                                    get_fasta_len(assembly.fname),
                                    config.MAX_THREADS, config.platform,
                                    get_flye_cfg_fname(), i)
    print("Polished assemblies saved to %s" % out_dir)
Exemple #5
0
def do(assemblies, reads_fname, out_dir, no_reuse=False):
    """Classify solid k-mers by how their read occurrences cluster.

    For every assembly, the k-mers loaded from ``assembly.solid_kmers_fname``
    are located in the assembly and in the reads. For each k-mer with enough
    read occurrences (``MIN_CLUMP_SIZE``), the reference coordinates of those
    occurrences are passed to ``get_clusters``; the k-mer is then counted as
    forming no clump, a single clump, or multiple clumps depending on how
    many clusters come back.

    Outputs per assembly:
        * ``assembly.good_kmers_fname`` -- the single-clump k-mers;
        * ``<out_dir>/report/<name>_kmer_stats.txt`` -- the three shares;
        * ``<out_dir>/report/<name>_kmer_analysis.png`` -- windowed counts.

    Args:
        assemblies: assembly objects to analyse.
        reads_fname: reads file passed to ``get_kmers_read_pos``.
        out_dir: output directory containing the ``report`` folder.
        no_reuse: when false, an assembly whose good-k-mers file and stats
            file both exist is not recomputed.
    """
    print("")
    print("*********************************")
    print("K-mer analysis started...")

    def format_share(count, total):
        """Return 'percent (count)'; the percent is 0.00 for an empty total."""
        percent = count * 100.0 / total if total else 0.0
        return "%.2f (%d)" % (percent, count)

    def window_counts(positions, length):
        """Count distinct marked positions per KMER_WINDOW_SIZE window."""
        marks = [0] * length
        for p in positions:
            marks[p] = 1
        return [
            sum(marks[start:start + KMER_WINDOW_SIZE])
            for start in range(0, length, KMER_WINDOW_SIZE)
        ]

    kmer_stats_table = [['Assembly'] +
                        [assembly.name for assembly in assemblies]]
    for row_title in ("K-mers forming single clump",
                      "K-mers forming multiple clumps",
                      "K-mers forming no clumps"):
        kmer_stats_table.append([row_title] + ["-" for _ in assemblies])

    for i, assembly in enumerate(assemblies):
        stats_fname = join(out_dir, "report",
                           assembly.name + "_kmer_stats.txt")
        if exists(assembly.good_kmers_fname) and exists(
                stats_fname) and not no_reuse:
            print("Reusing latest results...")
            with open(stats_fname) as f:
                for row in (1, 2, 3):
                    # strip() drops the line terminator so reused values
                    # match the freshly computed ones.
                    kmer_stats_table[row][i + 1] = \
                        f.readline().split("\t")[1].strip()
            continue

        solid_kmers = get_kmers(assembly.solid_kmers_fname)
        assembly_len = get_fasta_len(assembly.fname)
        ref_kmers_pos, _ = get_kmers_positions(assembly.fname, solid_kmers)
        read_kmer_pos, reads_coords = get_kmers_read_pos(
            assembly, reads_fname, solid_kmers)

        no_clumps = []
        one_clump = []
        multi_clumps = []
        good_kmers = []
        for kmer, pos in ref_kmers_pos.items():
            # Reference coordinates of every read occurrence of this k-mer.
            pos_in_ref = []
            for read_name, kmers_pos in read_kmer_pos.items():
                if kmer in kmers_pos and kmers_pos[kmer] in reads_coords[
                        read_name]:
                    pos_in_ref.append(
                        reads_coords[read_name][kmers_pos[kmer]])
            if not pos_in_ref or len(pos_in_ref) < MIN_CLUMP_SIZE:
                continue

            clusters = get_clusters(pos_in_ref)
            if not clusters:
                no_clumps.append(pos)
            elif len(clusters) == 1:
                one_clump.append(pos)
                good_kmers.append(kmer)
            else:
                multi_clumps.append(pos)

        all_kmers = len(one_clump) + len(multi_clumps) + len(no_clumps)
        single_share = format_share(len(one_clump), all_kmers)
        multi_share = format_share(len(multi_clumps), all_kmers)
        none_share = format_share(len(no_clumps), all_kmers)

        with open(assembly.good_kmers_fname, "w") as f:
            for kmer in good_kmers:
                f.write("%s\n" % kmer)
        with open(stats_fname, "w") as f:
            f.write("Single clump\t%s\n" % single_share)
            f.write("Multi clump\t%s\n" % multi_share)
            f.write("No clumps\t%s\n" % none_share)
        kmer_stats_table[1][i + 1] = single_share
        kmer_stats_table[2][i + 1] = multi_share
        kmer_stats_table[3][i + 1] = none_share

        one_clump_vals = window_counts(one_clump, assembly_len)
        multi_clump_vals = window_counts(multi_clumps, assembly_len)
        no_clump_vals = window_counts(no_clumps, assembly_len)

        plot_fname = join(out_dir, "report",
                          assembly.name + "_kmer_analysis.png")
        make_plot(plot_fname,
                  "K-mer analysis",
                  assembly.label,
                  xlabel="Position",
                  # Raw string: same text, without the invalid "\i" escape.
                  ylabel=r"$\it{k}$-mer counts",
                  list_vals=[one_clump_vals, multi_clump_vals, no_clump_vals],
                  legend=("Single clump", "Multiple clumps", "No clumps"),
                  max_x=assembly_len)
    # The stats table is built but currently not rendered:
    #draw_report_table("K-mer statistics", "", kmer_stats_table)
    print("K-mer analysis finished.")
Exemple #6
0
def do(assemblies, reads_fname, monomers_fname, out_dir):
    """Compare the monomers found in reads with those of each assembly.

    For every assembly the function:
        1. obtains the assembly's monomer structure (sorted by element 1 of
           each entry) and passes its stats to ``make_plotly_html``;
        2. projects every read monomer onto the assembly through the read
           coordinate maps and assigns it to an assembly monomer when the
           start and end lookups agree on the same index;
        3. plots, per assembly monomer, the fraction of assigned read
           monomers carrying the same name (1 when fewer than ``MIN_COV``
           read monomers were assigned);
        4. writes the unit structure to ``<out_dir>/<name>_units.txt``.

    Args:
        assemblies: assembly objects to analyse.
        reads_fname: reads file.
        monomers_fname: monomers file.
        out_dir: output directory containing the ``report`` folder.
    """
    print("")
    print("*********************************")
    print("Monomer analysis started...")

    reads_mm_structure = get_reads_monomer_structure(reads_fname,
                                                     monomers_fname, out_dir)

    for assembly in assemblies:
        print("")
        print("Processing %s assembly..." % assembly.label)
        ref_mm_structure, ref_stats = get_ref_monomers(assembly,
                                                       monomers_fname, out_dir)
        ref_mm_structure.sort(key=lambda x: x[1])

        make_plotly_html(assembly, ref_stats, out_dir)

        _, reads_coords = get_kmers_read_pos(assembly, reads_fname)

        assembly_len = get_fasta_len(assembly.fname)
        n_ref_monomers = len(ref_mm_structure)
        reads_monomers = [[] for _ in range(n_ref_monomers)]
        for read_name, coord_dict in reads_coords.items():
            for (mm_name, mm_start, mm_end) in reads_mm_structure[read_name]:
                # Monomers whose ends cannot be projected are skipped.
                if mm_start not in coord_dict or mm_end not in coord_dict:
                    continue
                # Length in read coordinates (an int, not a 1-tuple).
                mm_len = mm_end - mm_start
                ref_start = coord_dict[mm_start]
                ref_end = coord_dict[mm_end]
                ref_start, ref_end = min(ref_start,
                                         ref_end), max(ref_start, ref_end)
                ref_i = approx_binary_search(ref_mm_structure, 1, 0,
                                             n_ref_monomers, ref_start)
                if ref_end - ref_start > 50:
                    ref_i2 = approx_binary_search(ref_mm_structure, 2, 0,
                                                  n_ref_monomers, ref_end)
                else:
                    ref_i2 = ref_i
                if ref_i > -1 and ref_i2 == ref_i:
                    reads_monomers[ref_i].append((mm_name, mm_len, ref_start))

        read_support = []
        for i, ref_monomer in enumerate(ref_mm_structure):
            assigned = reads_monomers[i]
            # The number of assigned read monomers is the coverage of this
            # assembly monomer; an empty list can never be divided by.
            if assigned and len(assigned) >= MIN_COV:
                matching = sum(1 for m in assigned if m[0] == ref_monomer[0])
                read_support.append(matching * 1.0 / len(assigned))
            else:
                read_support.append(1)

        plot_fname = join(out_dir, "report",
                          assembly.name + "_monomer_analysis.png")
        make_plot(plot_fname,
                  "Monomer analysis",
                  assembly.label,
                  xlabel="Position",
                  ylabel="MonomersRatio",
                  plot_values=read_support,
                  plot_color="blue",
                  ymax=1.05,
                  max_x=assembly_len)

        #### UNIT ANALYSIS
        ref_unit_structure = analyze_unit_structure(ref_mm_structure)

        unit_occ = defaultdict(int)
        # Format only the file name, so a '%' in out_dir cannot break it.
        units_fname = join(out_dir, "%s_units.txt" % assembly.name)
        with open(units_fname, "w") as f:
            f.write("\t".join(["Unit", "Start", "End", "Monomer sequence\n"]))
            for i, unit in enumerate(ref_unit_structure):
                if not unit:
                    continue
                unit_str = unit[2]
                unit_occ[unit_str] += 1
                f.write("\t".join(
                    str(s) for s in [i + 1, unit[0], unit[1], unit_str]) +
                        "\n")

        print("Total units: %d, units sequences saved to %s" %
              (len(ref_unit_structure), units_fname))
Exemple #7
0
def do(assemblies, out_dir):
    """Plot the ratio of read alignment breakpoints along each assembly.

    For every assembly the alignments in ``assembly.bed_fname`` are read;
    alignment starts and ends are accumulated into per-base counters, from
    which a running coverage is derived. A second, "ideal" set of counters
    extends the first alignment seen for each read to the read's full length
    whenever more than ``MAX_MISSED_KMERS`` rare k-mers lie in the unaligned
    overhang. Windowed breakpoint ratios for both are smoothed and plotted to
    ``<out_dir>/report/<name>_bp_analysis.png``; regions whose coverage stays
    below ``MIN_BP_COV`` are passed to the plot as background bars.

    Args:
        assemblies: assembly objects to analyse.
        out_dir: output directory containing the ``report`` folder.
    """
    print("")
    print("*********************************")
    print("Breakpoint analysis started...")

    def running_mean(data):
        # NOTE(review): the divisor is a hard-coded 10 rather than
        # MOVING_AVG_WINDOW_SIZE, so this is a scaled window sum unless the
        # window is 10 -- kept as in the original, confirm intent.
        cumsum = np.cumsum(np.insert(data, 0, 0))
        return (cumsum[MOVING_AVG_WINDOW_SIZE:] -
                cumsum[:-MOVING_AVG_WINDOW_SIZE]) / 10

    for assembly in assemblies:
        assembly_len = get_fasta_len(assembly.fname)
        starts = [0] * assembly_len
        ends = [0] * assembly_len
        ideal_starts = [0] * assembly_len
        ideal_ends = [0] * assembly_len
        coverage = [0] * assembly_len
        ideal_coverage = [0] * assembly_len
        tips = [0] * assembly_len

        rare_kmers = get_kmers(assembly.kmers_fname)
        # Defined up front so a FASTA without records cannot leave it unset.
        rare_kmers_by_pos = [0] * assembly_len
        with open(assembly.fname) as handle:
            # Each record overwrites the previous one; only the last record
            # of the file is reflected after the loop.
            for record in SeqIO.parse(handle, 'fasta'):
                assembly_len = len(record.seq)
                assembly_seq = str(record.seq)
                rare_kmers_by_pos = [0] * assembly_len
                for i in range(len(assembly_seq) - KMER_SIZE + 1):
                    kmer = assembly_seq[i:i + KMER_SIZE]
                    if kmer in rare_kmers or rev_comp(kmer) in rare_kmers:
                        rare_kmers_by_pos[i] = 1

        used_reads = set()
        with open(assembly.bed_fname) as f:
            for line in f:
                fs = line.split()
                _, ref_s, ref_e, read_name, align_start, align_end, read_len = fs
                ref_s, ref_e, align_start, align_end, read_len = map(
                    int, (ref_s, ref_e, align_start, align_end, read_len))

                align_start, align_end = min(align_start, align_end), max(
                    align_start, align_end)
                tips[ref_s] += 1
                tips[ref_e - 1] += 1
                starts[ref_s] += 1
                ends[ref_e - 1] += 1
                # Ideal boundaries use the first alignment of a read only.
                if read_name in used_reads:
                    continue

                # Clamp to 0: a negative slice start would wrap around to
                # the end of the list instead of the assembly start.
                overhang_start = max(0, ref_s - align_start)
                if sum(rare_kmers_by_pos[overhang_start:ref_s]
                       ) > MAX_MISSED_KMERS:
                    ideal_starts[overhang_start] += 1
                else:
                    ideal_starts[ref_s] += 1

                if sum(rare_kmers_by_pos[ref_e:ref_e +
                                         read_len]) > MAX_MISSED_KMERS:
                    ideal_ends[min(assembly_len - 1,
                                   ref_e + read_len - align_end - 1)] += 1
                else:
                    # Clamp so an alignment ending at the last base does not
                    # index past the end of the list.
                    ideal_ends[min(assembly_len - 1, ref_e)] += 1
                used_reads.add(read_name)

        cur_cov = 0
        ideal_cur_cov = 0
        uncovered_regions = []
        prev_s, prev_e = -1, -1
        for i in range(assembly_len):
            cur_cov += starts[i]
            cur_cov -= ends[i]
            ideal_cur_cov += ideal_starts[i]
            ideal_cur_cov -= ideal_ends[i]
            ideal_coverage[i] = ideal_cur_cov
            coverage[i] = cur_cov
            if cur_cov < MIN_BP_COV:
                if prev_s != -1:
                    prev_e = i
                else:
                    prev_s = i
            elif prev_s != -1 and prev_e != -1:
                uncovered_regions.append((prev_s, prev_e))
                prev_s, prev_e = -1, -1
            else:
                prev_s, prev_e = -1, -1
        # Report an uncovered region that extends to the assembly end.
        if prev_s != -1 and prev_e != -1:
            uncovered_regions.append((prev_s, prev_e))

        factor = 2
        step = BP_WINDOW_SIZE // factor
        real_bp_ratio = [
            sum(tips[i:i + BP_WINDOW_SIZE]) * 1.0 /
            max(1, coverage[i] + sum(starts[i + 1:i + BP_WINDOW_SIZE]))
            if max(coverage[i:i + BP_WINDOW_SIZE]) >= MIN_BP_COV else 0
            for i in range(0, len(coverage), step)
        ]
        # NOTE(review): the ideal ratio is scaled by 1.1 while the real one
        # uses 1.0 -- kept as in the original, confirm intent.
        ideal_bp_ratio = [
            (sum(ideal_starts[i:i + BP_WINDOW_SIZE]) +
             sum(ideal_ends[i:i + BP_WINDOW_SIZE])) * 1.1 / max(
                 1, ideal_coverage[i] +
                 sum(ideal_starts[i + 1:i + BP_WINDOW_SIZE]))
            if max(ideal_coverage[i:i + BP_WINDOW_SIZE]) >= MIN_BP_COV else 0
            for i in range(0, len(coverage), step)
        ]

        # Zero the two outermost windows on each side before smoothing.
        for i in range(2):
            ideal_bp_ratio[i], ideal_bp_ratio[-i - 1] = 0, 0
            real_bp_ratio[i], real_bp_ratio[-i - 1] = 0, 0
        real_vals = [min(1, v) for v in running_mean(real_bp_ratio)]
        ideal_vals = [min(1, v) for v in running_mean(ideal_bp_ratio)]

        plot_fname = join(out_dir, "report",
                          assembly.name + "_bp_analysis.png")
        # Only regions spanning more than 10 steps are drawn.
        uncovered_bars = [(r[0] / step, r[1] / step) for r in uncovered_regions
                          if (r[1] / step - r[0] / step) > 10]
        make_plot(plot_fname,
                  "Breakpoint",
                  assembly.label,
                  xlabel="position",
                  ylabel="breakpointRatio",
                  fill_values=real_vals,
                  fill_color="red",
                  fill_values2=ideal_vals,
                  fill_color2="gray",
                  ymax=1,
                  max_x=assembly_len,
                  bg_bars=uncovered_bars)
    print("Breakpoint analysis finished.")
Exemple #8
0
def do(assemblies, reads_file, out_dir):
    """Run the pairwise discordance test between assemblies.

    For every pair of assemblies, the k-mers present in both
    ``good_kmers_fname`` files are written to ``shared_kmers.txt`` and
    located in both assemblies and in the reads. Each read gains a point for
    every shared k-mer it places within 1000 bp of the k-mer's position when
    mapped to the first assembly, and loses one for the second. Reads whose
    score exceeds ``KMER_SIZE`` in absolute value are counted as voting for
    the respective assembly and passed to ``draw_discordance_coverage``.

    Does nothing when fewer than two assemblies are given.

    Args:
        assemblies: assembly objects to compare.
        reads_file: reads file passed to ``get_kmers_read_pos``.
        out_dir: output directory containing the ``report`` folder.
    """
    if len(assemblies) < 2:
        return
    print("")
    print("*********************************")
    print("Discordance test started...")
    for (assembly1, assembly2) in itertools.combinations(assemblies, 2):
        kmers1 = get_kmers(assembly1.good_kmers_fname)
        kmers2 = get_kmers(assembly2.good_kmers_fname)
        shared_kmers = kmers1.intersection(kmers2)
        # NOTE(review): the same file name is used for every pair, so with
        # more than two assemblies only the last pair's k-mers remain.
        with open(join(out_dir, "shared_kmers.txt"), "w") as f:
            for k in shared_kmers:
                f.write(k + "\n")

        ref_kmers_pos1, _ = get_kmers_positions(assembly1.fname, shared_kmers)
        ref_kmers_pos2, _ = get_kmers_positions(assembly2.fname, shared_kmers)

        score1 = 0
        voting_reads1 = 0
        score2 = 0
        voting_reads2 = 0

        read_kmer_pos1, reads_coords1 = get_kmers_read_pos(
            assembly1, reads_file, shared_kmers)
        read_kmer_pos2, reads_coords2 = get_kmers_read_pos(
            assembly2, reads_file, shared_kmers)

        selected_reads1 = []
        selected_reads2 = []
        for read_name in reads_coords1:
            read_name = slugify(read_name)
            read_score = 0
            in_reads1 = read_name in read_kmer_pos1
            in_reads2 = read_name in read_kmer_pos2
            for kmer in shared_kmers:
                if kmer in ref_kmers_pos1 and in_reads1 and \
                        kmer in read_kmer_pos1[read_name]:
                    read_pos = read_kmer_pos1[read_name][kmer]
                    coords1 = reads_coords1[read_name]
                    if read_pos in coords1:
                        mapped = coords1[read_pos]
                        # NOTE(review): the second test compares a position
                        # in assembly 2 with a coordinate on assembly 1 --
                        # kept as in the original, confirm intent. The
                        # membership check avoids a KeyError when the k-mer
                        # has no recorded position in assembly 2.
                        if abs(ref_kmers_pos1[kmer] - mapped) <= 1000 or (
                                kmer in ref_kmers_pos2 and
                                abs(ref_kmers_pos2[kmer] - mapped) <= 1000):
                            score1 += 1
                            read_score += 1
                if kmer in ref_kmers_pos2 and in_reads2 and \
                        kmer in read_kmer_pos2[read_name]:
                    read_pos = read_kmer_pos2[read_name][kmer]
                    coords2 = reads_coords2[read_name]
                    if read_pos in coords2 and abs(
                            ref_kmers_pos2[kmer] - coords2[read_pos]) <= 1000:
                        score2 += 1
                        read_score -= 1
            if read_score > KMER_SIZE:
                voting_reads1 += 1
                selected_reads1.append(read_name)
            elif read_score < -KMER_SIZE:
                voting_reads2 += 1
                selected_reads2.append(read_name)
        total_discordance = score1 - score2
        print(
            "Discordance between %s and %s: %d. There are %d (%d) discordant reads voting for %s (%s)."
            % (assembly1.name, assembly2.name, total_discordance,
               voting_reads1, voting_reads2, assembly1.name, assembly2.name))
        plot_fname = join(
            out_dir, "report",
            "discordance_%s_vs_%s.png" % (assembly1.name, assembly2.name))
        draw_discordance_coverage(
            max(get_fasta_len(assembly1.fname),
                get_fasta_len(assembly2.fname)), assembly1.name,
            assembly2.name, assembly1.bed_fname, assembly2.bed_fname,
            selected_reads1, selected_reads2, "Discordance coverage", out_dir,
            plot_fname)
    print("Discordance test finished.")