Example 1
def get_consensus(alignment_path, contigs_path, contigs_info, num_proc,
                  platform):
    """
    Main function
    """
    aln_reader = SynchronizedSamReader(
        alignment_path,
        fp.read_sequence_dict(contigs_path),
        max_coverage=cfg.vals["max_read_coverage"],
        use_secondary=True)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    for _ in range(num_proc):
        threads.append(
            multiprocessing.Process(target=_thread_worker,
                                    args=(aln_reader, contigs_info, platform,
                                          results_queue, error_queue)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
            if t.exitcode == -9:
                logger.error("Looks like the system ran out of memory")
            if t.exitcode != 0:
                raise Exception(
                    "One of the processes exited with code: {0}".format(
                        t.exitcode))
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()
        raise

    if not error_queue.empty():
        raise error_queue.get()
    aln_reader.close()

    out_fasta = {}
    total_aln_errors = []
    while not results_queue.empty():
        ctg_id, ctg_seq, aln_errors = results_queue.get()
        total_aln_errors.extend(aln_errors)
        if len(ctg_seq) > 0:
            out_fasta[ctg_id] = ctg_seq

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.info("Alignment error rate: %f", mean_aln_error)

    return out_fasta
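
The worker entry point _thread_worker is not shown in these snippets. Below is a hypothetical sketch of the contract it appears to satisfy, inferred only from how the queues are consumed above and from the chunked reader API used in Example 7 (is_eof/get_chunk); it is not the actual Flye implementation.

def _thread_worker(aln_reader, contigs_info, platform, results_queue, error_queue):
    # Hypothetical sketch: each worker drains the shared SAM reader and
    # reports per-contig results through the queues.
    try:
        while not aln_reader.is_eof():          # reader API as used in Example 7
            ctg_id, ctg_aln = aln_reader.get_chunk()
            if ctg_id is None:
                break
            ctg_seq = ""        # placeholder: the real worker builds the consensus here
            aln_errors = []     # placeholder: per-read alignment error rates
            results_queue.put((ctg_id, ctg_seq, aln_errors))
    except Exception as exc:
        error_queue.put(exc)    # surfaced by the parent via error_queue.get()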
Example 2
def make_bubbles(alignment_path, contigs_info, contigs_path, err_mode,
                 num_proc, bubbles_out):
    """
    The main function: takes an alignment and returns bubbles
    """
    CHUNK_SIZE = 1000000

    contigs_fasta = fp.read_sequence_dict(contigs_path)
    aln_reader = SynchronizedSamReader(alignment_path,
                                       contigs_fasta,
                                       cfg.vals["max_read_coverage"],
                                       use_secondary=True)
    chunk_feeder = SynchonizedChunkManager(contigs_fasta,
                                           chunk_size=CHUNK_SIZE)

    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()
    bubbles_out_lock = multiprocessing.Lock()
    bubbles_out_handle = open(bubbles_out, "w")

    process_in_parallel(
        _thread_worker,
        (aln_reader, chunk_feeder, contigs_info, err_mode, results_queue,
         error_queue, bubbles_out_handle, bubbles_out_lock), num_proc)
    if not error_queue.empty():
        raise error_queue.get()

    #logging
    total_bubbles = 0
    total_long_bubbles = 0
    total_long_branches = 0
    total_empty = 0
    total_aln_errors = []
    coverage_stats = defaultdict(list)

    while not results_queue.empty():
        (ctg_id, num_bubbles, num_long_bubbles, num_empty, num_long_branch,
         aln_errors, mean_coverage) = results_queue.get()
        total_long_bubbles += num_long_bubbles
        total_long_branches += num_long_branch
        total_empty += num_empty
        total_aln_errors.extend(aln_errors)
        total_bubbles += num_bubbles
        coverage_stats[ctg_id].append(mean_coverage)

    for ctg in coverage_stats:
        coverage_stats[ctg] = int(
            sum(coverage_stats[ctg]) / len(coverage_stats[ctg]))

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.debug("Generated %d bubbles", total_bubbles)
    logger.debug("Split %d long bubbles", total_long_bubbles)
    logger.debug("Skipped %d empty bubbles", total_empty)
    logger.debug("Skipped %d bubbles with long branches", total_long_branches)
    ###

    return coverage_stats, mean_aln_error
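
process_in_parallel is not defined in these snippets; the sketch below assumes it simply wraps the explicit Process/SIGINT pattern spelled out in Examples 1 and 4. The names and details are an assumption, not the actual Flye helper.

import logging
import multiprocessing
import signal

logger = logging.getLogger(__name__)

def process_in_parallel(target, args, num_proc):
    # Sketch only: mirrors the explicit pattern of Examples 1 and 4.
    # Children inherit SIG_IGN for SIGINT so that only the parent handles Ctrl-C.
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    workers = [multiprocessing.Process(target=target, args=args)
               for _ in range(num_proc)]
    signal.signal(signal.SIGINT, orig_sigint)

    for w in workers:
        w.start()
    try:
        for w in workers:
            w.join()
            if w.exitcode == -9:
                logger.error("Looks like the system ran out of memory")
            if w.exitcode != 0:
                raise Exception("One of the processes exited with code: {0}"
                                .format(w.exitcode))
    except KeyboardInterrupt:
        for w in workers:
            w.terminate()
        raise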
Example 3
def get_consensus(alignment_path, contigs_path, contigs_info, num_proc,
                  platform):
    """
    Main function
    """

    CHUNK_SIZE = 1000000
    contigs_fasta = fp.read_sequence_dict(contigs_path)
    mp_manager = multiprocessing.Manager()
    aln_reader = SynchronizedSamReader(
        alignment_path,
        contigs_fasta,
        mp_manager,
        max_coverage=cfg.vals["max_read_coverage"],
        use_secondary=True)
    chunk_feeder = SynchonizedChunkManager(contigs_fasta, mp_manager,
                                           CHUNK_SIZE)

    #manager = multiprocessing.Manager()
    results_queue = mp_manager.Queue()
    error_queue = mp_manager.Queue()

    process_in_parallel(
        _thread_worker,
        (aln_reader, chunk_feeder, platform, results_queue, error_queue),
        num_proc)

    if not error_queue.empty():
        raise error_queue.get()

    chunk_consensus = defaultdict(list)
    total_aln_errors = []
    while not results_queue.empty():
        ctg_id, region_start, ctg_seq, aln_errors = results_queue.get()
        total_aln_errors.extend(aln_errors)
        if len(ctg_seq) > 0:
            chunk_consensus[ctg_id].append((region_start, ctg_seq))

    out_fasta = {}
    for ctg in chunk_consensus:
        sorted_chunks = [
            x[1] for x in sorted(chunk_consensus[ctg], key=lambda p: p[0])
        ]
        out_fasta[ctg] = "".join(sorted_chunks)

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.info("Alignment error rate: %f", mean_aln_error)

    return out_fasta
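
For reference, the returned dictionary maps each contig id to its stitched consensus sequence. One plausible way to persist it reuses fp.write_fasta_dict, which the commented-out code in Examples 5 and 7 refers to; the file names, contigs_info value, and platform string below are illustrative assumptions, not taken from these snippets.

# contigs_info and the paths are placeholders; "nano" is an assumed platform value.
consensus = get_consensus("alignment.bam", "contigs.fasta", contigs_info,
                          num_proc=8, platform="nano")
fp.write_fasta_dict(consensus, "consensus.fasta")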
Example 4
def make_bubbles(alignment_path, contigs_info, contigs_path, err_mode,
                 num_proc, bubbles_out):
    """
    The main function: takes an alignment and returns bubbles
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       cfg.vals["max_read_coverage"],
                                       use_secondary=True)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    bubbles_out_lock = multiprocessing.Lock()
    bubbles_out_handle = open(bubbles_out, "w")
    for _ in range(num_proc):
        threads.append(
            multiprocessing.Process(
                target=_thread_worker,
                args=(aln_reader, contigs_info, err_mode, results_queue,
                      error_queue, bubbles_out_handle, bubbles_out_lock)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
            if t.exitcode == -9:
                logger.error("Looks like the system ran out of memory")
            if t.exitcode != 0:
                raise Exception(
                    "One of the processes exited with code: {0}".format(
                        t.exitcode))
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()
        raise

    if not error_queue.empty():
        raise error_queue.get()
    aln_reader.close()

    total_bubbles = 0
    total_long_bubbles = 0
    total_long_branches = 0
    total_empty = 0
    total_aln_errors = []
    coverage_stats = {}

    while not results_queue.empty():
        (ctg_id, num_bubbles, num_long_bubbles, num_empty, num_long_branch,
         aln_errors, mean_coverage) = results_queue.get()
        total_long_bubbles += num_long_bubbles
        total_long_branches += num_long_branch
        total_empty += num_empty
        total_aln_errors.extend(aln_errors)
        total_bubbles += num_bubbles
        coverage_stats[ctg_id] = mean_coverage

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.debug("Generated %d bubbles", total_bubbles)
    logger.debug("Split %d long bubbles", total_long_bubbles)
    logger.debug("Skipped %d empty bubbles", total_empty)
    logger.debug("Skipped %d bubbles with long branches", total_long_branches)

    return coverage_stats, mean_aln_error
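
Examples 2 and 4 hand each worker an open bubbles_out_handle together with a multiprocessing.Lock; presumably the workers serialize their writes with that lock. A hypothetical sketch of such a write helper (not the actual Flye worker code):

def _write_bubbles_serialized(bubbles_text, out_handle, out_lock):
    # Hypothetical helper: hold the shared lock while appending, so output
    # from different worker processes does not interleave in bubbles_out.
    with out_lock:
        out_handle.write(bubbles_text)
        out_handle.flush()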
Example 5
def generate_polished_edges(edges_file, gfa_file, polished_contigs, work_dir,
                            error_mode, polished_stats, num_threads):
    """
    Generate polished graph edges sequences by extracting them from
    polished contigs
    """
    logger.debug("Generating polished GFA")

    edges_new_coverage = {}
    with open(polished_stats, "r") as f:
        for line in f:
            if line.startswith("#"):
                continue
            ctg, _len, coverage = line.strip().split()
            ctg_id = ctg.split("_")[1]
            edges_new_coverage[ctg_id] = int(coverage)

    alignment_file = os.path.join(work_dir, "edges_aln.bam")
    polished_dict = fp.read_sequence_dict(polished_contigs)
    make_alignment(polished_contigs, [edges_file],
                   num_threads,
                   work_dir,
                   error_mode,
                   alignment_file,
                   reference_mode=True,
                   sam_output=True)
    aln_reader = SynchronizedSamReader(alignment_file, polished_dict,
                                       multiprocessing.Manager(),
                                       cfg.vals["max_read_coverage"])
    aln_by_edge = defaultdict(list)

    #getting one best alignment for each contig
    #for ctg in polished_dict:
    #    ctg_aln = aln_reader.get_alignments(ctg)
    for aln in aln_reader.get_all_alignments():
        aln_by_edge[aln.qry_id].append(aln)
    #logger.debug("Bam parsing done")

    MIN_CONTAINMENT = 0.9
    updated_seqs = 0
    edges_dict = fp.read_sequence_dict(edges_file)
    for edge in edges_dict:
        if edge in aln_by_edge:
            aln_by_edge[edge].sort(key=lambda a: a.qry_end - a.qry_start,
                                   reverse=True)
            main_aln = aln_by_edge[edge][0]
            map_start = main_aln.trg_start
            map_end = main_aln.trg_end
            for aln in aln_by_edge[edge]:
                if aln.trg_id == main_aln.trg_id and aln.trg_sign == main_aln.trg_sign:
                    map_start = min(map_start, aln.trg_start)
                    map_end = max(map_end, aln.trg_end)

            new_seq = polished_dict[main_aln.trg_id][map_start:map_end]
            if main_aln.qry_sign == "-":
                new_seq = fp.reverse_complement(new_seq)

            #print(edge, main_aln.qry_len, len(new_seq), main_aln.qry_start, main_aln.qry_end)
            if len(new_seq) / aln.qry_len > MIN_CONTAINMENT:
                edges_dict[edge] = new_seq
                updated_seqs += 1

    #writes fasta file with polished edges
    #edges_polished = os.path.join(work_dir, "polished_edges.fasta")
    #fp.write_fasta_dict(edges_dict, edges_polished)

    #writes gfa file with polished edges
    with open(os.path.join(work_dir, "polished_edges.gfa"), "w") as gfa_polished, \
         open(gfa_file, "r") as gfa_in:
        for line in gfa_in:
            if line.startswith("S"):
                seq_id = line.split()[1]
                coverage_tag = line.split()[3]
                seq_num = seq_id.split("_")[1]
                if seq_num in edges_new_coverage:
                    #logger.info("from {0} to {1}".format(coverage_tag, edges_new_coverage[seq_num]))
                    coverage_tag = "dp:i:{0}".format(
                        edges_new_coverage[seq_num])
                gfa_polished.write("S\t{0}\t{1}\t{2}\n".format(
                    seq_id, edges_dict[seq_id], coverage_tag))
            else:
                gfa_polished.write(line)

    logger.debug("%d sequences remained unpolished",
                 len(edges_dict) - updated_seqs)
    os.remove(alignment_file)
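
The GFA rewriting above only touches S (segment) lines, swapping in the polished sequence and, where a recomputed depth is available, a new dp tag. A minimal illustration of that transformation on a single made-up line:

# Toy S-line from the unpolished GFA: record type, segment id, sequence, depth tag
line = "S\tedge_5\tACGTACGT\tdp:i:17"
seq_id = line.split()[1]                 # "edge_5"
seq_num = seq_id.split("_")[1]           # "5"
edges_dict = {"edge_5": "ACGTTACGT"}     # polished sequence (toy value)
edges_new_coverage = {"5": 21}           # recomputed depth (toy value)

coverage_tag = "dp:i:{0}".format(edges_new_coverage[seq_num])
print("S\t{0}\t{1}\t{2}".format(seq_id, edges_dict[seq_id], coverage_tag))
# -> S   edge_5   ACGTTACGT   dp:i:21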
Example 6
def find_divergence(alignment_path, contigs_path, contigs_info, frequency_path,
                    positions_path, div_sum_path, min_aln_rate, platform,
                    num_proc, sub_thresh, del_thresh, ins_thresh):
    """
    Main function: takes in an alignment and finds the divergent positions
    """
    if not os.path.isfile(alignment_path) or not os.path.isfile(contigs_path):
        ctg_profile = []
        positions = _write_frequency_path(frequency_path, ctg_profile,
                                          sub_thresh, del_thresh, ins_thresh)
        total_header = "".join([
            "Total_positions_{0}_".format(len(positions["total"])),
            "with_thresholds_sub_{0}".format(sub_thresh),
            "_del_{0}_ins_{1}".format(del_thresh, ins_thresh)
        ])
        sub_header = "".join([
            "Sub_positions_{0}_".format(len(positions["sub"])),
            "with_threshold_sub_{0}".format(sub_thresh)
        ])
        del_header = "".join([
            "Del_positions_{0}_".format(len(positions["del"])),
            "with_threshold_del_{0}".format(del_thresh)
        ])
        ins_header = "".join([
            "Ins_positions_{0}_".format(len(positions["ins"])),
            "with_threshold_ins_{0}".format(ins_thresh)
        ])
        _write_positions(positions_path, positions, total_header, sub_header,
                         del_header, ins_header)

        window_len = 1000
        sum_header = "Tentative Divergent Position Summary"
        _write_div_summary(div_sum_path, sum_header, positions,
                           len(ctg_profile), window_len)
        return

    contigs_fasta = fp.read_sequence_dict(contigs_path)
    aln_reader = SynchronizedSamReader(alignment_path, contigs_fasta,
                                       config.vals["max_read_coverage"])
    chunk_feeder = SynchonizedChunkManager(contigs_fasta)

    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    process_in_parallel(_thread_worker,
                        (aln_reader, chunk_feeder, contigs_info, platform,
                         results_queue, error_queue), num_proc)

    if not error_queue.empty():
        raise error_queue.get()

    total_aln_errors = []
    while not results_queue.empty():
        _, ctg_profile, aln_errors = results_queue.get()

        positions = _write_frequency_path(frequency_path, ctg_profile,
                                          sub_thresh, del_thresh, ins_thresh)
        total_header = "".join([
            "Total_positions_{0}_".format(len(positions["total"])),
            "with_thresholds_sub_{0}".format(sub_thresh),
            "_del_{0}_ins_{1}".format(del_thresh, ins_thresh)
        ])
        sub_header = "".join([
            "Sub_positions_{0}_".format(len(positions["sub"])),
            "with_threshold_sub_{0}".format(sub_thresh)
        ])
        del_header = "".join([
            "Del_positions_{0}_".format(len(positions["del"])),
            "with_threshold_del_{0}".format(del_thresh)
        ])
        ins_header = "".join([
            "Ins_positions_{0}_".format(len(positions["ins"])),
            "with_threshold_ins_{0}".format(ins_thresh)
        ])
        _write_positions(positions_path, positions, total_header, sub_header,
                         del_header, ins_header)

        window_len = 1000
        sum_header = "Tentative Divergent Position Summary"
        _write_div_summary(div_sum_path, sum_header, positions,
                           len(ctg_profile), window_len)

        logger.debug("Total positions: %d", len(positions["total"]))
        total_aln_errors.extend(aln_errors)

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.debug("Alignment error rate: %f", mean_aln_error)
Example 7
def generate_polished_edges(edges_file, gfa_file, polished_contigs, work_dir,
                            error_mode, num_threads):
    """
    Generate polished graph edges sequences by extracting them from
    polished contigs
    """
    logger.debug("Generating polished GFA")

    alignment_file = os.path.join(work_dir, "edges_aln.sam")
    polished_dict = fp.read_sequence_dict(polished_contigs)
    make_alignment(polished_contigs, [edges_file],
                   num_threads,
                   work_dir,
                   error_mode,
                   alignment_file,
                   reference_mode=True,
                   sam_output=True)
    aln_reader = SynchronizedSamReader(alignment_file, polished_dict,
                                       cfg.vals["max_read_coverage"])
    aln_reader.init_reading()
    aln_by_edge = defaultdict(list)

    #getting one best alignment for each contig
    while not aln_reader.is_eof():
        _, ctg_aln = aln_reader.get_chunk()
        for aln in ctg_aln:
            aln_by_edge[aln.qry_id].append(aln)
    aln_reader.stop_reading()

    MIN_CONTAINMENT = 0.9
    updated_seqs = 0
    edges_dict = fp.read_sequence_dict(edges_file)
    for edge in edges_dict:
        if edge in aln_by_edge:
            main_aln = aln_by_edge[edge][0]
            map_start = main_aln.trg_start
            map_end = main_aln.trg_end
            for aln in aln_by_edge[edge]:
                if aln.trg_id == main_aln.trg_id and aln.trg_sign == main_aln.trg_sign:
                    map_start = min(map_start, aln.trg_start)
                    map_end = max(map_end, aln.trg_end)

            new_seq = polished_dict[main_aln.trg_id][map_start:map_end]
            if main_aln.qry_sign == "-":
                new_seq = fp.reverse_complement(new_seq)

            #print edge, main_aln.qry_len, len(new_seq), main_aln.qry_start, main_aln.qry_end
            if len(new_seq) / aln.qry_len > MIN_CONTAINMENT:
                edges_dict[edge] = new_seq
                updated_seqs += 1

    #writes fasta file with polished edges
    #edges_polished = os.path.join(work_dir, "polished_edges.fasta")
    #fp.write_fasta_dict(edges_dict, edges_polished)

    #writes gfa file with polished edges
    with open(os.path.join(work_dir, "polished_edges.gfa"), "w") as gfa_polished, \
         open(gfa_file, "r") as gfa_in:
        for line in gfa_in:
            if line.startswith("S"):
                seq_id = line.split()[1]
                coverage_tag = line.split()[3]
                gfa_polished.write("S\t{0}\t{1}\t{2}\n".format(
                    seq_id, edges_dict[seq_id], coverage_tag))
            else:
                gfa_polished.write(line)

    logger.debug("%d sequences remained unpolished",
                 len(edges_dict) - updated_seqs)
    os.remove(alignment_file)
Example 8
def find_divergence(alignment_path, contigs_path, contigs_info, frequency_path,
                    positions_path, div_sum_path, min_aln_rate, platform,
                    num_proc, sub_thresh, del_thresh, ins_thresh):
    """
    Main function: takes in an alignment and finds the divergent positions
    """
    if not os.path.isfile(alignment_path) or not os.path.isfile(contigs_path):
        ctg_profile = []
        positions = _write_frequency_path(frequency_path, ctg_profile,
                                          sub_thresh, del_thresh, ins_thresh)
        total_header = "".join([
            "Total_positions_{0}_".format(len(positions["total"])),
            "with_thresholds_sub_{0}".format(sub_thresh),
            "_del_{0}_ins_{1}".format(del_thresh, ins_thresh)
        ])
        sub_header = "".join([
            "Sub_positions_{0}_".format(len(positions["sub"])),
            "with_threshold_sub_{0}".format(sub_thresh)
        ])
        del_header = "".join([
            "Del_positions_{0}_".format(len(positions["del"])),
            "with_threshold_del_{0}".format(del_thresh)
        ])
        ins_header = "".join([
            "Ins_positions_{0}_".format(len(positions["ins"])),
            "with_threshold_ins_{0}".format(ins_thresh)
        ])
        _write_positions(positions_path, positions, total_header, sub_header,
                         del_header, ins_header)

        window_len = 1000
        sum_header = "Tentative Divergent Position Summary"
        _write_div_summary(div_sum_path, sum_header, positions,
                           len(ctg_profile), window_len)
        return

    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       config.vals["max_read_coverage"])
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    for _ in range(num_proc):
        threads.append(
            multiprocessing.Process(target=_thread_worker,
                                    args=(aln_reader, contigs_info, platform,
                                          results_queue, error_queue)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()

    if not error_queue.empty():
        raise error_queue.get()

    total_aln_errors = []
    while not results_queue.empty():
        _, ctg_profile, aln_errors = results_queue.get()

        positions = _write_frequency_path(frequency_path, ctg_profile,
                                          sub_thresh, del_thresh, ins_thresh)
        total_header = "".join([
            "Total_positions_{0}_".format(len(positions["total"])),
            "with_thresholds_sub_{0}".format(sub_thresh),
            "_del_{0}_ins_{1}".format(del_thresh, ins_thresh)
        ])
        sub_header = "".join([
            "Sub_positions_{0}_".format(len(positions["sub"])),
            "with_threshold_sub_{0}".format(sub_thresh)
        ])
        del_header = "".join([
            "Del_positions_{0}_".format(len(positions["del"])),
            "with_threshold_del_{0}".format(del_thresh)
        ])
        ins_header = "".join([
            "Ins_positions_{0}_".format(len(positions["ins"])),
            "with_threshold_ins_{0}".format(ins_thresh)
        ])
        _write_positions(positions_path, positions, total_header, sub_header,
                         del_header, ins_header)

        window_len = 1000
        sum_header = "Tentative Divergent Position Summary"
        _write_div_summary(div_sum_path, sum_header, positions,
                           len(ctg_profile), window_len)

        logger.debug("Total positions: %d", len(positions["total"]))
        total_aln_errors.extend(aln_errors)

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.debug("Alignment error rate: %f", mean_aln_error)