Example #1
def _is_simple_kmer(profile, position):
    """
    Checks if the kmer centered at the given position is simple
    """
    SIMPLE_LEN = cfg.vals["simple_kmer_length"]

    extended_len = SIMPLE_LEN * 2
    nucl_str = [
        p.nucl for p in profile[position - extended_len // 2:position +
                                extended_len // 2]
    ]

    #single nucleotide homopolymers
    for i in range(extended_len // 2 - SIMPLE_LEN // 2,
                   extended_len // 2 + SIMPLE_LEN // 2 - 1):
        if nucl_str[i] == nucl_str[i + 1]:
            return False

    #dinucleotide homopolymers
    for shift in [0, 1]:
        for i in range(SIMPLE_LEN - shift - 1):
            pos = extended_len // 2 - SIMPLE_LEN + shift + i * 2
            if (nucl_str[pos:pos + 2] == nucl_str[pos + 2:pos + 4]):
                return False

    #trinucleotide homopolymers
    #for shift in [0, 1, 2]:
    #    for i in xrange(SIMPLE_LEN - shift - 1):
    #        pos = shift + i * 3
    #        if (nucl_str[pos : pos + 3] == nucl_str[pos + 3 : pos + 6]):
    #            #logger.debug("tri" + "".join(nucl_str))
    #            return False

    return True
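
A minimal stand-alone illustration of the same idea on a plain nucleotide string (a hypothetical helper, not part of the module): a window is rejected as "simple" if it contains an immediate single- or di-nucleotide repeat.

def _window_is_simple(window):
    #single-nucleotide repeat, e.g. "TT"
    if any(window[i] == window[i + 1] for i in range(len(window) - 1)):
        return False
    #di-nucleotide repeat, e.g. "ACAC"
    if any(window[i:i + 2] == window[i + 2:i + 4] for i in range(len(window) - 3)):
        return False
    return True

print(_window_is_simple("AGTCAGCT"))   #True
print(_window_is_simple("ACACGTAG"))   #False ("AC" repeats immediately)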
Example #2
def _run(args):
    """
    Runs the pipeline
    """
    logger.info("Starting Flye " + _version())
    logger.debug("Cmd: %s", " ".join(sys.argv))
    logger.debug("Python version: " + sys.version)

    if args.genome_size:
        _set_genome_size(args)

    for read_file in args.reads:
        if not os.path.exists(read_file):
            raise ResumeException("Can't open " + read_file)
        if " " in read_file:
            raise ResumeException("Path to reads contain spaces: " + read_file)

    save_file = os.path.join(args.out_dir, "params.json")
    jobs = _create_job_list(args, args.out_dir, args.log_file)

    if args.stop_after and args.stop_after not in [j.name for j in jobs]:
        raise ResumeException("Stop after: unknown stage '{0}'".format(
            args.stop_after))

    current_job = 0
    if args.resume or args.resume_from:
        if not os.path.exists(save_file):
            raise ResumeException("Can't find save file")

        logger.info("Resuming previous run")
        if args.resume_from:
            job_to_resume = args.resume_from
        else:
            job_to_resume = json.load(open(save_file, "r"))["stage_name"]

        can_resume = False
        for i in range(len(jobs)):
            if jobs[i].name == job_to_resume:
                jobs[i].load(save_file)
                current_job = i
                if not jobs[i - 1].completed(save_file):
                    raise ResumeException(
                        "Can't resume: stage '{0}' incomplete".format(
                            jobs[i - 1].name))
                can_resume = True
                break

        if not can_resume:
            raise ResumeException(
                "Can't resume: stage {0} does not exist".format(job_to_resume))

    for i in range(current_job, len(jobs)):
        jobs[i].save(save_file)
        jobs[i].run()
        if args.stop_after == jobs[i].name:
            if i + 1 < len(jobs):
                jobs[i + 1].save(save_file)
            logger.info("Pipeline stopped as requested by --stop-after")
            break
Example #3
    def trim_and_transpose(_self, alignments, region_start, region_end):
        """
        Transforms alignments so that they are strictly within the interval,
        and shifts the coordinates relative to this interval
        """
        MIN_ALN = 100

        trimmed_aln = []
        for aln in alignments:
            if aln.trg_start >= region_start and aln.trg_end <= region_end:
                trimmed_aln.append(copy(aln))
                continue

            #trimming from left
            new_qry_start = aln.qry_start
            new_trg_start = aln.trg_start
            left_offset = None
            for left_offset in range(len(aln.trg_seq)):
                if new_trg_start >= region_start:
                    break
                if aln.trg_seq[left_offset] != "-":
                    new_trg_start += 1
                if aln.qry_seq[left_offset] != "-":
                    new_qry_start += 1

            #trimming from right
            new_qry_end = aln.qry_end
            new_trg_end = aln.trg_end
            right_offset = None
            for right_offset in range(len(aln.trg_seq)):
                if new_trg_end <= region_end:
                    break
                if aln.trg_seq[-1 - right_offset] != "-":
                    new_trg_end -= 1
                if aln.qry_seq[-1 - right_offset] != "-":
                    new_qry_end -= 1

            #keep the trimmed alignment only if it is still long enough
            if new_trg_end - new_trg_start > MIN_ALN:
                new_qry_seq = aln.qry_seq[left_offset : len(aln.qry_seq) - right_offset]
                new_trg_seq = aln.trg_seq[left_offset : len(aln.trg_seq) - right_offset]
                trimmed_aln.append(aln._replace(qry_start=new_qry_start, qry_end=new_qry_end,
                                                trg_start=new_trg_start, trg_end=new_trg_end,
                                                qry_seq=new_qry_seq, trg_seq=new_trg_seq))

            #print("Aln trg", aln.trg_start, aln.trg_end, "qry", aln.qry_start, aln.qry_end)
            #print("Left offset", left_offset, "right offset", right_offset)
            #print("New aln", new_trg_start, new_trg_end, new_qry_start, new_qry_end)
            #print("")

        for i, aln in enumerate(trimmed_aln):
            trimmed_aln[i] = aln._replace(trg_start=aln.trg_start - region_start,
                                          trg_end=aln.trg_end - region_start,
                                          trg_len=region_end - region_start)

        #print(len(alignmens), len(trimmed_aln))

        return trimmed_aln
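
A minimal sketch of the final coordinate transpose above, with hypothetical numbers: an alignment spanning target positions 1200-1800 inside the region [1000, 2000) is shifted to 200-800, and trg_len becomes the region length.

region_start, region_end = 1000, 2000
trg_start, trg_end = 1200, 1800
print(trg_start - region_start, trg_end - region_start, region_end - region_start)
#200 800 1000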
Example #4
    def __init__(self, reference_fasta, multiproc_manager, chunk_size=None):
        #prepare list of chunks to read
        self.fetch_list = []
        self.chunk_size = chunk_size

        #will be shared between processes
        #self.shared_manager = multiprocessing.Manager()
        self.shared_num_jobs = multiprocessing.Value(ctypes.c_int, 0)
        self.shared_lock = multiproc_manager.Lock()
        self.shared_eof = multiprocessing.Value(ctypes.c_bool, False)


        for ctg_id in reference_fasta:
            ctg_len = len(reference_fasta[ctg_id])
            chunk_size = self.chunk_size if self.chunk_size is not None else ctg_len
            for i in range(0, max(ctg_len // chunk_size, 1)):
                reg_start = i * chunk_size
                reg_end = (i + 1) * chunk_size
                if ctg_len - reg_end < chunk_size:
                    reg_end = ctg_len
                self.fetch_list.append(ContigRegion(ctg_id, reg_start, reg_end))
                #logger.debug("Region: {0} {1} {2}".format(ctg_id, reg_start, reg_end))

        if len(self.fetch_list) == 0:
            self.shared_eof.value = True
Example #5
def _contig_profile(alignment, platform, genome_len):
    """
    Computes alignment profile
    """
    #max_aln_err = config.vals["err_modes"][platform]["max_aln_error"]
    aln_errors = []
    profile = [Profile() for _ in range(genome_len)]
    for aln in alignment:
        #if aln.err_rate > max_aln_err: continue
        aln_errors.append(aln.err_rate)

        qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq)
        trg_seq = shift_gaps(qry_seq, aln.trg_seq)
        #qry_seq = aln.qry_seq
        #trg_seq = aln.trg_seq

        trg_pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(trg_seq, qry_seq):
            if trg_nuc == "-":
                trg_pos -= 1
            if trg_pos >= genome_len:
                trg_pos -= genome_len

            prof_elem = profile[trg_pos]
            if trg_nuc == "-":
                prof_elem.insertions[qry_nuc] += 1
            else:
                prof_elem.nucl = trg_nuc
                prof_elem.matches[qry_nuc] += 1

            trg_pos += 1

    return profile, aln_errors
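
A hedged sketch of the per-position container this function assumes (the real Profile class is defined elsewhere in the module); only the attributes used above are shown: per-nucleotide counters for matches and insertions.

from collections import defaultdict

class Profile:
    def __init__(self):
        self.nucl = "-"
        self.insertions = defaultdict(int)
        self.matches = defaultdict(int)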
Example #6
def shift_gaps(seq_trg, seq_qry):
    """
    Shifts all ambiguous query gaps to the right
    """
    lst_trg, lst_qry = list("$" + seq_trg + "$"), list("$" + seq_qry + "$")
    is_gap = False
    gap_start = 0
    for i in range(len(lst_trg)):
        if is_gap and lst_qry[i] != "-":
            is_gap = False
            swap_left = gap_start - 1
            swap_right = i - 1

            while (swap_left > 0 and swap_right >= gap_start and
                   lst_qry[swap_left] == lst_trg[swap_right]):
                lst_qry[swap_left], lst_qry[swap_right] = \
                            lst_qry[swap_right], lst_qry[swap_left]
                swap_left -= 1
                swap_right -= 1

        if not is_gap and lst_qry[i] == "-":
            is_gap = True
            gap_start = i

    return "".join(lst_qry[1 : -1])
Example #7
def _split_long_bubbles(bubbles):
    MAX_BUBBLE = cfg.vals["max_bubble_length"]
    #MAX_BUBBLE = 50
    #MAX_BRANCH = MAX_BUBBLE * 1.5
    new_bubbles = []
    long_branches = 0

    for bubble in bubbles:
        median_branch = sorted(bubble.branches,
                               key=len)[len(bubble.branches) // 2]
        num_chunks = len(median_branch) // MAX_BUBBLE
        #if len(median_branch) > MAX_BRANCH:
        if num_chunks > 1:
            #logger.debug("Splitting: pos:{0} len:{1}".format(bubble.position, len(median_branch)))
            long_branches += 1

            for part_num in range(num_chunks):
                new_branches = []
                for b in bubble.branches:
                    chunk_len = len(b) // num_chunks
                    start = part_num * chunk_len
                    if part_num != num_chunks - 1:
                        end = (part_num + 1) * chunk_len
                    else:
                        end = len(b)
                    new_branches.append(b[start:end])

                new_bubbles.append(Bubble(bubble.contig_id, bubble.position))
                new_bubbles[-1].consensus = new_branches[0]
                new_bubbles[-1].branches = new_branches
                new_bubbles[-1].sub_position = part_num

        else:
            new_bubbles.append(bubble)

    return new_bubbles, long_branches
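
A minimal sketch of the chunk arithmetic used above, with hypothetical numbers: a branch of length 25 split into 2 chunks yields slices [0:12] and [12:25], the last chunk absorbing the remainder.

branch_len, num_chunks = 25, 2
chunk_len = branch_len // num_chunks
bounds = [(p * chunk_len,
           (p + 1) * chunk_len if p != num_chunks - 1 else branch_len)
          for p in range(num_chunks)]
print(bounds)   #[(0, 12), (12, 25)]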
Example #8
def _aln_score(aln):
    #WINDOW, wnd_primary_cov and cov_threshold come from the enclosing scope
    #(see the later get_uniform_alignments example)
    wnd_good = 0
    wnd_bad = 0
    for i in range(aln.trg_start // WINDOW, aln.trg_end // WINDOW + 1):
        if wnd_primary_cov[i] < cov_threshold:
            wnd_good += 1
        else:
            wnd_bad += 1
    return wnd_good, wnd_bad
Example #9
def get_consensus(alignment_path, contigs_path, contigs_info, num_proc,
                  platform):
    """
    Main function
    """
    aln_reader = SynchronizedSamReader(
        alignment_path,
        fp.read_sequence_dict(contigs_path),
        max_coverage=cfg.vals["max_read_coverage"],
        use_secondary=True)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    for _ in range(num_proc):
        threads.append(
            multiprocessing.Process(target=_thread_worker,
                                    args=(aln_reader, contigs_info, platform,
                                          results_queue, error_queue)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
            if t.exitcode == -9:
                logger.error("Looks like the system ran out of memory")
            if t.exitcode != 0:
                raise Exception(
                    "One of the processes exited with code: {0}".format(
                        t.exitcode))
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()
        raise

    if not error_queue.empty():
        raise error_queue.get()
    aln_reader.close()

    out_fasta = {}
    total_aln_errors = []
    while not results_queue.empty():
        ctg_id, ctg_seq, aln_errors = results_queue.get()
        total_aln_errors.extend(aln_errors)
        if len(ctg_seq) > 0:
            out_fasta[ctg_id] = ctg_seq

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.info("Alignment error rate: %f", mean_aln_error)

    return out_fasta
Example #10
def write_fasta_dict(fasta_dict, filename):
    """
    Writes a dictionary of sequences to a file in FASTA format
    """
    with open(filename, "w") as f:
        for header in sorted(fasta_dict):
            f.write(">{0}\n".format(header))

            for i in range(0, len(fasta_dict[header]), 60):
                f.write(fasta_dict[header][i:i + 60] + "\n")
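
A minimal usage sketch with hypothetical sequences; each record is wrapped at 60 characters per line.

write_fasta_dict({"ctg_1": "ACGT" * 20, "ctg_2": "TTGCA"}, "example_out.fasta")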
Example #11
def _get_partition(profile, err_mode):
    """
    Partitions genome into sub-alignments at solid regions / simple kmers
    """
    #logger.debug("Partitioning genome")
    SOLID_LEN = cfg.vals["solid_kmer_length"]
    SIMPLE_LEN = cfg.vals["simple_kmer_length"]
    MAX_BUBBLE = cfg.vals["max_bubble_length"]

    solid_flags = [False for _ in range(len(profile))]
    prof_pos = 0
    while prof_pos < len(profile) - SOLID_LEN:
        if _is_solid_kmer(profile, prof_pos, err_mode):
            for i in range(prof_pos, prof_pos + SOLID_LEN):
                solid_flags[i] = True
            prof_pos += SOLID_LEN
        else:
            prof_pos += 1

    partition = []
    prev_partition = SOLID_LEN

    long_bubbles = 0
    prof_pos = SOLID_LEN
    while prof_pos < len(profile) - SOLID_LEN:
        cur_partition = prof_pos + SIMPLE_LEN // 2
        landmark = (all(solid_flags[prof_pos:prof_pos + SIMPLE_LEN])
                    and _is_simple_kmer(profile, cur_partition))

        if prof_pos - prev_partition > MAX_BUBBLE:
            long_bubbles += 1

        if landmark or prof_pos - prev_partition > MAX_BUBBLE:
            partition.append(cur_partition)
            prev_partition = cur_partition
            prof_pos += SOLID_LEN
        else:
            prof_pos += 1

    #logger.debug("Partitioned into {0} segments".format(len(partition) + 1))
    #logger.debug("Long bubbles: {0}".format(long_bubbles))

    return partition, long_bubbles
Example #12
def unite_mapping_segments(segments):
    segments.sort(key=lambda segment: segment.start)
    united_segments = [segments[0]]

    for i in range(1, len(segments)):
        if segments[i].start <= united_segments[-1].end:
            if segments[i].end > united_segments[-1].end:
                united_segments[-1].end = segments[i].end
        else:
            united_segments.append(segments[i])

    return united_segments
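
A minimal usage sketch with a mutable stand-in segment class (the function updates .end in place, so an immutable type such as a namedtuple would not work here).

class _Segment:
    def __init__(self, start, end):
        self.start = start
        self.end = end

merged = unite_mapping_segments([_Segment(5, 10), _Segment(0, 3), _Segment(8, 15)])
print([(s.start, s.end) for s in merged])   #[(0, 3), (5, 15)]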
Example #13
def get_uniform_alignments(alignments, seq_len):
    """
    Keeps the top alignments for each position within the contig,
    assuming a uniform coverage distribution
    """
    def _get_median(lst):
        if not lst:
            raise ValueError("_get_median() arg is an empty sequence")
        sorted_list = sorted(lst)
        if len(lst) % 2 == 1:
            return sorted_list[len(lst) // 2]
        else:
            mid1 = sorted_list[(len(lst) // 2) - 1]
            mid2 = sorted_list[(len(lst) // 2)]
            return (mid1 + mid2) / 2

    WINDOW = 100
    MIN_COV = 10
    COV_RATE = 1.25

    #split contig into windows, get median read coverage over all windows and
    #determine the quality threshold cutoffs for each window
    wnd_primary_cov = [0 for _ in range(seq_len // WINDOW + 1)]
    wnd_aln_quality = [[] for _ in range(seq_len // WINDOW + 1)]
    wnd_qual_thresholds = [1.0 for _ in range(seq_len // WINDOW + 1)]
    for aln in alignments:
        for i in range(aln.trg_start // WINDOW, aln.trg_end // WINDOW):
            if not aln.is_secondary:
                wnd_primary_cov[i] += 1
            wnd_aln_quality[i].append(aln.err_rate)

    #for each window, select the top X alignments, where X is the median read coverage
    cov_threshold = max(int(COV_RATE * _get_median(wnd_primary_cov)), MIN_COV)
    for i in range(len(wnd_aln_quality)):
        if len(wnd_aln_quality[i]) > cov_threshold:
            wnd_qual_thresholds[i] = sorted(wnd_aln_quality[i])[cov_threshold]

    #for each alignment, count in how many windows it passes the threshold
    filtered_alignments = []
    total_sequence = 0
    filtered_sequence = 0
    for aln in alignments:
        good_windows = 0
        total_windows = aln.trg_end // WINDOW - aln.trg_start // WINDOW
        total_sequence += aln.trg_end - aln.trg_start
        for i in range(aln.trg_start // WINDOW, aln.trg_end // WINDOW):
            if aln.err_rate <= wnd_qual_thresholds[i]:
                good_windows += 1

        if good_windows > total_windows // 2:
            filtered_alignments.append(aln)
            filtered_sequence += aln.trg_end - aln.trg_start

    #filtered_reads_rate = 1 - float(len(filtered_alignments)) / len(alignments)
    #filtered_seq_rate = 1 - float(filtered_sequence) / total_sequence
    #logger.debug("Filtered {0:7.2f}% reads, {1:7.2f}% sequence"
    #                .format(filtered_reads_rate * 100, filtered_seq_rate * 100))

    return filtered_alignments
Example #14
def find_connected_components(graph):
    def dfs(start_vertex, connected_components_counter):
        dfs_stack = [start_vertex]
        used[start_vertex] = True
        while len(dfs_stack):
            vertex = dfs_stack.pop()
            connected_components[vertex] = connected_components_counter
            for neighbour in graph[vertex]:
                if not used[neighbour]:
                    dfs_stack.append(neighbour)
                    used[neighbour] = True

    n_vertices = len(graph)
    connected_components = [0 for _ in range(n_vertices)]
    connected_components_counter = 0
    used = [False for _ in range(n_vertices)]

    for i in range(n_vertices):
        if not used[i]:
            dfs(i, connected_components_counter)
            connected_components_counter += 1

    return connected_components, connected_components_counter
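
A minimal usage sketch on a toy adjacency list: vertices 0, 1 and 2 form one component, vertex 3 is isolated.

toy_graph = [[1], [0, 2], [1], []]
labels, n_components = find_connected_components(toy_graph)
print(labels, n_components)   #[0, 0, 0, 1] 2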
Example #15
def split_into_chunks(fasta_in, chunk_size, fasta_out):
    out_dict = {}
    for header, seq in fp.stream_sequence(fasta_in):
        #print len(seq)
        for i in range(0, max(len(seq) // chunk_size, 1)):
            chunk_hdr = "{0}$chunk_{1}".format(header, i)
            start = i * chunk_size
            end = (i + 1) * chunk_size
            if len(seq) - end < chunk_size:
                end = len(seq)

            #print(start, end)
            out_dict[chunk_hdr] = seq[start : end]

    fp.write_fasta_dict(out_dict, fasta_out)
Example #16
def split_into_chunks(fasta_in, chunk_size):
    out_dict = {}
    for header, seq in iteritems(fasta_in):
        #print len(seq)
        for i in range(0, max(len(seq) // chunk_size, 1)):
            chunk_hdr = "{0}$chunk_{1}".format(header, i)
            start = i * chunk_size
            end = (i + 1) * chunk_size
            if len(seq) - end < chunk_size:
                end = len(seq)

            #print(start, end)
            out_dict[chunk_hdr] = seq[start : end]

    return out_dict
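
A minimal usage sketch (this version assumes iteritems is imported from six or a similar compatibility shim, so a plain dict works): a 25 bp sequence split into 10 bp chunks, where the final chunk absorbs the remainder so no chunk ends up shorter than chunk_size.

chunks = split_into_chunks({"ctg_1": "A" * 25}, 10)
print(sorted(chunks))   #['ctg_1$chunk_0', 'ctg_1$chunk_1']
print(len(chunks["ctg_1$chunk_0"]), len(chunks["ctg_1$chunk_1"]))   #10 15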
Example #17
def _is_solid_kmer(profile, position, err_mode):
    """
    Checks if the kmer at the given position is solid
    """
    MISSMATCH_RATE = cfg.vals["err_modes"][err_mode]["solid_missmatch"]
    INS_RATE = cfg.vals["err_modes"][err_mode]["solid_indel"]
    SOLID_LEN = cfg.vals["solid_kmer_length"]

    for i in range(position, position + SOLID_LEN):
        if profile[i].coverage == 0:
            return False
        local_missmatch = (profile[i].num_missmatch +
                           profile[i].num_deletions) / profile[i].coverage
        local_ins = profile[i].num_inserts / profile[i].coverage
        if local_missmatch > MISSMATCH_RATE or local_ins > INS_RATE:
            return False
    return True
Example #18
def _contig_profile(alignment, platform):
    """
    Computes alignment profile
    """

    if not alignment:
        return []

    genome_len = alignment[0].trg_len

    aln_errors = []
    profile = [Profile() for _ in range(genome_len)]
    #max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
    for aln in alignment:
        #if aln.err_rate > max_aln_err: continue
        aln_errors.append(aln.err_rate)

        #after gap shifting it is possible that
        #two gaps are aligned against each other
        qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq)
        trg_seq = shift_gaps(qry_seq, aln.trg_seq)

        trg_pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(trg_seq, qry_seq):
            if trg_nuc == "-":
                trg_pos -= 1
            if trg_pos >= genome_len:
                trg_pos -= genome_len

            #total += 1
            prof_elem = profile[trg_pos]
            if trg_nuc == "-" and qry_nuc != "-":
                prof_elem.insertions[aln.qry_id] += qry_nuc
            else:
                prof_elem.nucl = trg_nuc
                prof_elem.matches[qry_nuc] += 1

            trg_pos += 1

    #print "len", genome_len, "median coverage", cov_threshold
    #print "total bases: ", total, "discarded bases: ", discarded
    #print "filtered", float(discarded) / total
    #print ""

    return profile, aln_errors
Example #19
def _compute_profile(alignment, platform, genome_len):
    """
    Computes alignment profile
    """
    max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
    min_aln_len = cfg.vals["min_polish_aln_len"]
    aln_errors = []
    #filtered = 0
    profile = [ProfileInfo() for _ in range(genome_len)]
    for aln in alignment:
        if aln.err_rate > max_aln_err or len(aln.qry_seq) < min_aln_len:
            #filtered += 1
            continue

        aln_errors.append(aln.err_rate)

        qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq)
        trg_seq = shift_gaps(qry_seq, aln.trg_seq)

        trg_pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(trg_seq, qry_seq):
            if trg_nuc == "-":
                trg_pos -= 1
            if trg_pos >= genome_len:
                trg_pos -= genome_len

            prof_elem = profile[trg_pos]
            if trg_nuc == "-":
                prof_elem.num_inserts += 1
            else:
                prof_elem.nucl = trg_nuc
                prof_elem.coverage += 1

                if qry_nuc == "-":
                    prof_elem.num_deletions += 1
                elif trg_nuc != qry_nuc:
                    prof_elem.num_missmatch += 1

            trg_pos += 1

    #logger.debug("Filtered: {0} out of {1}".format(filtered, len(alignment)))
    return profile, aln_errors
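
A hedged sketch of the per-position record assumed above (the real ProfileInfo is defined elsewhere in the module); only the fields this function touches are shown.

class ProfileInfo:
    def __init__(self):
        self.nucl = ""
        self.coverage = 0
        self.num_inserts = 0
        self.num_deletions = 0
        self.num_missmatch = 0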
Example #20
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads,
           error_mode, output_progress):
    """
    High-level polisher interface
    """
    logger_state = logger.disabled
    if not output_progress:
        logger.disabled = True

    subs_matrix = os.path.join(
        cfg.vals["pkg_root"], cfg.vals["err_modes"][error_mode]["subs_matrix"])
    hopo_matrix = os.path.join(
        cfg.vals["pkg_root"], cfg.vals["err_modes"][error_mode]["hopo_matrix"])
    stats_file = os.path.join(work_dir, "contigs_stats.txt")

    prev_assembly = contig_seqs
    contig_lengths = None
    coverage_stats = None
    for i in range(num_iters):
        logger.info("Polishing genome (%d/%d)", i + 1, num_iters)

        #split into 1Mb chunks to reduce RAM usage
        #slightly vary chunk size between iterations
        CHUNK_SIZE = 1000000 - (i % 2) * 100000
        chunks_file = os.path.join(work_dir, "chunks_{0}.fasta".format(i + 1))
        chunks = split_into_chunks(fp.read_sequence_dict(prev_assembly),
                                   CHUNK_SIZE)
        fp.write_fasta_dict(chunks, chunks_file)

        ####
        logger.info("Running minimap2")
        alignment_file = os.path.join(work_dir,
                                      "minimap_{0}.sam".format(i + 1))
        make_alignment(chunks_file,
                       read_seqs,
                       num_threads,
                       work_dir,
                       error_mode,
                       alignment_file,
                       reference_mode=True,
                       sam_output=True)

        #####
        logger.info("Separating alignment into bubbles")
        contigs_info = get_contigs_info(chunks_file)
        bubbles_file = os.path.join(work_dir,
                                    "bubbles_{0}.fasta".format(i + 1))
        coverage_stats, mean_aln_error = \
            make_bubbles(alignment_file, contigs_info, chunks_file,
                         error_mode, num_threads,
                         bubbles_file)

        logger.info("Alignment error rate: %f", mean_aln_error)
        consensus_out = os.path.join(work_dir,
                                     "consensus_{0}.fasta".format(i + 1))
        polished_file = os.path.join(work_dir,
                                     "polished_{0}.fasta".format(i + 1))
        if os.path.getsize(bubbles_file) == 0:
            logger.info("No reads were aligned during polishing")
            if not output_progress:
                logger.disabled = logger_state
            with open(stats_file, "w") as f:
                f.write("#seq_name\tlength\tcoverage\n")
            open(polished_file, "w").close()
            return polished_file, stats_file

        #####
        logger.info("Correcting bubbles")
        _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix, consensus_out,
                        num_threads, output_progress)
        polished_fasta, polished_lengths = _compose_sequence(consensus_out)
        merged_chunks = merge_chunks(polished_fasta)
        fp.write_fasta_dict(merged_chunks, polished_file)

        #Cleanup
        os.remove(chunks_file)
        os.remove(bubbles_file)
        os.remove(consensus_out)
        os.remove(alignment_file)

        contig_lengths = polished_lengths
        prev_assembly = polished_file

    #merge information from chunks
    contig_lengths = merge_chunks(contig_lengths, fold_function=sum)
    coverage_stats = merge_chunks(coverage_stats,
                                  fold_function=lambda l: sum(l) // len(l))

    with open(stats_file, "w") as f:
        f.write("#seq_name\tlength\tcoverage\n")
        for ctg_id in contig_lengths:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, contig_lengths[ctg_id],
                                             coverage_stats[ctg_id]))

    if not output_progress:
        logger.disabled = logger_state

    return prev_assembly, stats_file
Example #21
def _write_div_summary(div_sum_path, sum_header, positions, seq_len,
                       window_len):
    pos_list = sorted(positions["total"])
    av_div = 0.0
    if seq_len != 0:
        av_div = len(pos_list) / float(seq_len)

    position_gaps = [0 for _ in range(len(pos_list) + 1)]
    curr_pos = 0
    for i, p in enumerate(pos_list):
        position_gaps[i] = p - curr_pos
        curr_pos = p
    position_gaps[-1] = seq_len - curr_pos

    mean_position_gap = _mean(position_gaps)
    max_position_gap = max(position_gaps)

    window_len = 1000
    position_counts = [0 for _ in range(((seq_len - 1) // window_len) + 1)]
    window_divs = [0.0 for _ in range(((seq_len - 1) // window_len) + 1)]
    curr_p_i = 0
    for i in range(len(window_divs)):
        start = i * window_len
        end = (i + 1) * window_len - 1
        if i == len(window_divs) - 1:
            end = seq_len - 1

        curr_window_len = end - start + 1

        if curr_p_i < len(pos_list) and pos_list[curr_p_i] < start:
            raise PositionIOError('Problem with position indices')
        while curr_p_i < len(pos_list) and pos_list[curr_p_i] <= end:
            position_counts[i] += 1
            curr_p_i += 1

        window_divs[i] = 0.0
        if curr_window_len != 0:
            window_divs[i] = position_counts[i] / float(curr_window_len)

    mean_window_div = _mean(window_divs)
    median_window_div = _get_median(window_divs)
    min_window_div = min(window_divs)

    with open(div_sum_path, 'w') as f:
        f.write("{0}\n\n".format(sum_header))

        f.write("{0:33}\t{1}\n".format("Sequence Length:", seq_len))
        f.write("{0:33}\t{1:.4f}\n\n".format("Average Divergence:", av_div))

        f.write("{0:33}\t{1}\n".format("Total Substitution Positions:",
                                       len(positions["sub"])))
        f.write("{0:33}\t{1}\n".format("Total Deletion Positions:",
                                       len(positions["del"])))
        f.write("{0:33}\t{1}\n".format("Total Insertion Positions:",
                                       len(positions["ins"])))
        f.write("{0:33}\t{1}\n".format("Total Positions:",
                                       len(positions["total"])))
        mixed_count = (len(positions["sub"]) + len(positions["del"]) +
                       len(positions["ins"])) - len(positions["total"])
        f.write("{0:33}\t{1}\n\n".format("Mixed Positions:", mixed_count))

        f.write("{0:33}\t{1:.2f}\n".format("Mean Position Gap:",
                                           mean_position_gap))
        f.write("{0:33}\t{1}\n".format("Max Position Gap:", max_position_gap))

        f.write("{0:33}\t{1}\n".format("Window Length:", window_len))
        f.write("{0:33}\t{1:.5f}\n".format("Mean Window Divergence:",
                                           mean_window_div))
        f.write("{0:33}\t{1:.5f}\n".format("Median Window Divergence:",
                                           median_window_div))
        f.write("{0:33}\t{1:.5f}\n".format("Min Window Divergence:",
                                           min_window_div))
Example #22
def polish(contig_seqs, read_seqs, work_dir, num_iters, num_threads,
           read_platform, read_type, output_progress):
    """
    High-level polisher interface
    """
    logger_state = logger.disabled
    if not output_progress:
        logger.disabled = True

    subs_matrix = os.path.join(
        cfg.vals["pkg_root"],
        cfg.vals["err_modes"][read_platform]["subs_matrix"])
    hopo_matrix = os.path.join(
        cfg.vals["pkg_root"],
        cfg.vals["err_modes"][read_platform]["hopo_matrix"])
    use_hopo = cfg.vals["err_modes"][read_platform]["hopo_enabled"]
    use_hopo = use_hopo and (read_type == "raw")
    stats_file = os.path.join(work_dir, "contigs_stats.txt")

    bam_input = read_seqs[0].endswith("bam")

    prev_assembly = contig_seqs
    contig_lengths = None
    coverage_stats = None
    for i in range(num_iters):
        logger.info("Polishing genome (%d/%d)", i + 1, num_iters)

        ####
        if not bam_input:
            logger.info("Running minimap2")
            alignment_file = os.path.join(work_dir,
                                          "minimap_{0}.bam".format(i + 1))
            make_alignment(prev_assembly,
                           read_seqs,
                           num_threads,
                           work_dir,
                           read_platform,
                           alignment_file,
                           reference_mode=True,
                           sam_output=True)
        else:
            logger.info("Polishing with provided bam")
            alignment_file = read_seqs[0]

        #####
        logger.info("Separating alignment into bubbles")
        contigs_info = get_contigs_info(prev_assembly)
        bubbles_file = os.path.join(work_dir,
                                    "bubbles_{0}.fasta".format(i + 1))
        coverage_stats, mean_aln_error = \
            make_bubbles(alignment_file, contigs_info, prev_assembly,
                         read_platform, num_threads,
                         bubbles_file)

        logger.info("Alignment error rate: %f", mean_aln_error)
        consensus_out = os.path.join(work_dir,
                                     "consensus_{0}.fasta".format(i + 1))
        polished_file = os.path.join(work_dir,
                                     "polished_{0}.fasta".format(i + 1))
        if os.path.getsize(bubbles_file) == 0:
            logger.info("No reads were aligned during polishing")
            if not output_progress:
                logger.disabled = logger_state
            with open(stats_file, "w") as f:
                f.write("#seq_name\tlength\tcoverage\n")
            open(polished_file, "w").close()
            return polished_file, stats_file

        #####
        logger.info("Correcting bubbles")
        _run_polish_bin(bubbles_file, subs_matrix, hopo_matrix, consensus_out,
                        num_threads, output_progress, use_hopo)
        polished_fasta, polished_lengths = _compose_sequence(consensus_out)
        fp.write_fasta_dict(polished_fasta, polished_file)

        #Cleanup
        os.remove(bubbles_file)
        os.remove(consensus_out)
        if not bam_input:
            os.remove(alignment_file)

        contig_lengths = polished_lengths
        prev_assembly = polished_file

    with open(stats_file, "w") as f:
        f.write("#seq_name\tlength\tcoverage\n")
        for ctg_id in contig_lengths:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, contig_lengths[ctg_id],
                                             coverage_stats[ctg_id]))

    if not output_progress:
        logger.disabled = logger_state

    return prev_assembly, stats_file
Example #23
def get_uniform_alignments(alignments):
    """
    Keeps the top alignments for each position within the contig,
    assuming a uniform coverage distribution
    """
    if not alignments:
        return []

    WINDOW = 100
    MIN_COV = 20
    GOOD_RATE = 0.66
    MIN_QV = 20

    def is_reliable(aln):
        return not aln.is_secondary and not aln.is_supplementary and aln.map_qv >= MIN_QV

    seq_len = alignments[0].trg_len
    ctg_id = alignments[0].trg_id

    #split contig into windows, get median read coverage over all windows and
    #determine the quality threshold cutoffs for each window
    wnd_primary_cov = [0 for _ in range(seq_len // WINDOW + 1)]
    wnd_all_cov = [0 for _ in range(seq_len // WINDOW + 1)]

    for aln in alignments:
        for i in range(aln.trg_start // WINDOW, aln.trg_end // WINDOW + 1):
            if is_reliable(aln):
                wnd_primary_cov[i] += 1
            wnd_all_cov[i] += 1

    cov_threshold = max(int(get_median(wnd_primary_cov)), MIN_COV)
    orig_primary_cov = copy(wnd_primary_cov)

    selected_alignments = []
    original_sequence = 0
    primary_sequence = 0
    secondary_sequence = 0
    primary_aln = 0
    secondary_aln = 0

    def _aln_score(aln):
        wnd_good = 0
        wnd_bad = 0
        for i in range(aln.trg_start // WINDOW, aln.trg_end // WINDOW + 1):
            if wnd_primary_cov[i] < cov_threshold:
                wnd_good += 1
            else:
                wnd_bad += 1
        return wnd_good, wnd_bad

    sec_aln_scores = {}
    for aln in alignments:
        original_sequence += aln.trg_end - aln.trg_start

        #always keep primary alignments, regardless of local coverage
        if is_reliable(aln):
            primary_sequence += aln.trg_end - aln.trg_start
            primary_aln += 1
            selected_alignments.append(aln)

        #if alignment is secondary, count how many windows it helps to improve
        else:
            wnd_good, wnd_bad = _aln_score(aln)
            sec_aln_scores[aln.qry_id] = (wnd_good, wnd_bad, aln)

    #now, greedily add secondary alignments as long as they add useful coverage
    _score_fun = lambda x: (sec_aln_scores[x][0] - 2 * sec_aln_scores[x][1],
                            sec_aln_scores[x][2].trg_end - sec_aln_scores[x][2].trg_start)
    sorted_sec_aln = [x for x in sorted(sec_aln_scores, reverse=True, key=_score_fun)]
    for aln_id in sorted_sec_aln:
        aln = sec_aln_scores[aln_id][2]
        #recompute scores
        wnd_good, wnd_bad = _aln_score(aln)
        to_take = wnd_good / (wnd_good + wnd_bad) > GOOD_RATE
        if to_take:
            selected_alignments.append(aln)
            secondary_aln += 1
            secondary_sequence += aln.trg_end - aln.trg_start
            for i in range(aln.trg_start // WINDOW, aln.trg_end // WINDOW + 1):
                wnd_primary_cov[i] += 1

        #logger.debug("\tSec score: {} {} {} {}".format(aln_id, wnd_good, wnd_bad, to_take))

    #logger.debug("Seq: {0} pri_cov: {1} all_cov: {2}".format(ctg_id, get_median(orig_primary_cov),
    #                                                         get_median(wnd_all_cov)) + "\n" +
    #            "\tOriginal seq: {0}, reads: {1}".format(original_sequence, len(alignments)) + "\n" +
    #            "\tPrimary seq: {0}, reads: {1}".format(primary_sequence, primary_aln) + "\n" +
    #            "\tSecondary seq: {0}, reads: {1}".format(secondary_sequence, secondary_aln) + "\n" +
    #            "\tSelected size: {0}, median coverage: {1}".format(len(selected_alignments), get_median(wnd_primary_cov)))

    median_cov = get_median(wnd_primary_cov)
    #mean_cov = sum(wnd_primary_cov) / (len(wnd_primary_cov) + 1)

    return selected_alignments, median_cov
Example #24
    def _parse_cigar(self, cigar_str, read_str, ctg_str, ctg_pos):
        #ctg_str = self.ref_fasta[ctg_name]
        trg_seq = []
        qry_seq = []
        trg_start = ctg_pos - 1
        trg_pos = ctg_pos - 1
        qry_start = 0
        qry_pos = 0

        left_hard = True
        left_soft = True
        hard_clipped_left = 0
        hard_clipped_right = 0
        soft_clipped_left = 0
        soft_clipped_right = 0
        for token in self.cigar_parser.findall(cigar_str):
            size, op = int(token[:-1]), token[-1:]
            if op == b"H":
                if left_hard:
                    qry_start += size
                    hard_clipped_left += size
                else:
                    hard_clipped_right += size
            elif op == b"S":
                qry_pos += size
                if left_soft:
                    soft_clipped_left += size
                else:
                    soft_clipped_right += size
            elif op == b"M":
                qry_seq.append(read_str[qry_pos : qry_pos + size].upper())
                trg_seq.append(ctg_str[trg_pos : trg_pos + size].upper())
                qry_pos += size
                trg_pos += size
            elif op == b"I":
                qry_seq.append(read_str[qry_pos : qry_pos + size].upper())
                trg_seq.append(b"-" * size)
                qry_pos += size
            elif op == b"D":
                qry_seq.append(b"-" * size)
                trg_seq.append(ctg_str[trg_pos : trg_pos + size].upper())
                trg_pos += size
            else:
                raise AlignmentException("Unsupported CIGAR operation: " + str(op))
            left_hard = False
            if op != b"H":
                left_soft = False

        trg_seq = b"".join(trg_seq)
        qry_seq = b"".join(qry_seq)
        matches = 0
        for i in range(len(trg_seq)):
            if trg_seq[i] == qry_seq[i]:
                matches += 1
        err_rate = 1 - matches / len(trg_seq)

        trg_end = trg_pos
        qry_end = qry_pos + hard_clipped_left
        qry_len = qry_end + hard_clipped_right
        qry_start += soft_clipped_left
        qry_end -= soft_clipped_right

        return (trg_start, trg_end, len(ctg_str), trg_seq,
                qry_start, qry_end, qry_len, qry_seq, err_rate)
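
The cigar_parser attribute is defined elsewhere on the class; a hedged sketch of a tokenizer that would produce the (length, operation) tokens the loop above consumes from a byte-string CIGAR.

import re

cigar_parser = re.compile(b"[0-9]+[MIDNSHP=X]")
print(cigar_parser.findall(b"3S10M2I5M"))   #[b'3S', b'10M', b'2I', b'5M']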
Example #25
def find_divergence(alignment_path, contigs_path, contigs_info, frequency_path,
                    positions_path, div_sum_path, min_aln_rate, platform,
                    num_proc, sub_thresh, del_thresh, ins_thresh):
    """
    Main function: takes in an alignment and finds the divergent positions
    """
    if not os.path.isfile(alignment_path) or not os.path.isfile(contigs_path):
        ctg_profile = []
        positions = _write_frequency_path(frequency_path, ctg_profile,
                                          sub_thresh, del_thresh, ins_thresh)
        total_header = "".join([
            "Total_positions_{0}_".format(len(positions["total"])),
            "with_thresholds_sub_{0}".format(sub_thresh),
            "_del_{0}_ins_{1}".format(del_thresh, ins_thresh)
        ])
        sub_header = "".join([
            "Sub_positions_{0}_".format(len(positions["sub"])),
            "with_threshold_sub_{0}".format(sub_thresh)
        ])
        del_header = "".join([
            "Del_positions_{0}_".format(len(positions["del"])),
            "with_threshold_del_{0}".format(del_thresh)
        ])
        ins_header = "".join([
            "Ins_positions_{0}_".format(len(positions["ins"])),
            "with_threshold_ins_{0}".format(ins_thresh)
        ])
        _write_positions(positions_path, positions, total_header, sub_header,
                         del_header, ins_header)

        window_len = 1000
        sum_header = "Tentative Divergent Position Summary"
        _write_div_summary(div_sum_path, sum_header, positions,
                           len(ctg_profile), window_len)
        return

    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       config.vals["max_read_coverage"])
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    for _ in range(num_proc):
        threads.append(
            multiprocessing.Process(target=_thread_worker,
                                    args=(aln_reader, contigs_info, platform,
                                          results_queue, error_queue)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()

    if not error_queue.empty():
        raise error_queue.get()

    total_aln_errors = []
    while not results_queue.empty():
        _, ctg_profile, aln_errors = results_queue.get()

        positions = _write_frequency_path(frequency_path, ctg_profile,
                                          sub_thresh, del_thresh, ins_thresh)
        total_header = "".join([
            "Total_positions_{0}_".format(len(positions["total"])),
            "with_thresholds_sub_{0}".format(sub_thresh),
            "_del_{0}_ins_{1}".format(del_thresh, ins_thresh)
        ])
        sub_header = "".join([
            "Sub_positions_{0}_".format(len(positions["sub"])),
            "with_threshold_sub_{0}".format(sub_thresh)
        ])
        del_header = "".join([
            "Del_positions_{0}_".format(len(positions["del"])),
            "with_threshold_del_{0}".format(del_thresh)
        ])
        ins_header = "".join([
            "Ins_positions_{0}_".format(len(positions["ins"])),
            "with_threshold_ins_{0}".format(ins_thresh)
        ])
        _write_positions(positions_path, positions, total_header, sub_header,
                         del_header, ins_header)

        window_len = 1000
        sum_header = "Tentative Divergent Position Summary"
        _write_div_summary(div_sum_path, sum_header, positions,
                           len(ctg_profile), window_len)

        logger.debug("Total positions: %d", len(positions["total"]))
        total_aln_errors.extend(aln_errors)

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.debug("Alignment error rate: %f", mean_aln_error)
Example #26
def make_bubbles(alignment_path, contigs_info, contigs_path, err_mode,
                 num_proc, bubbles_out):
    """
    The main function: takes an alignment and returns bubbles
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       cfg.vals["max_read_coverage"],
                                       use_secondary=True)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    bubbles_out_lock = multiprocessing.Lock()
    bubbles_out_handle = open(bubbles_out, "w")
    for _ in range(num_proc):
        threads.append(
            multiprocessing.Process(
                target=_thread_worker,
                args=(aln_reader, contigs_info, err_mode, results_queue,
                      error_queue, bubbles_out_handle, bubbles_out_lock)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
            if t.exitcode == -9:
                logger.error("Looks like the system ran out of memory")
            if t.exitcode != 0:
                raise Exception(
                    "One of the processes exited with code: {0}".format(
                        t.exitcode))
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()
        raise

    if not error_queue.empty():
        raise error_queue.get()
    aln_reader.close()

    total_bubbles = 0
    total_long_bubbles = 0
    total_long_branches = 0
    total_empty = 0
    total_aln_errors = []
    coverage_stats = {}

    while not results_queue.empty():
        (ctg_id, num_bubbles, num_long_bubbles, num_empty, num_long_branch,
         aln_errors, mean_coverage) = results_queue.get()
        total_long_bubbles += num_long_bubbles
        total_long_branches += num_long_branch
        total_empty += num_empty
        total_aln_errors.extend(aln_errors)
        total_bubbles += num_bubbles
        coverage_stats[ctg_id] = mean_coverage

    mean_aln_error = sum(total_aln_errors) / (len(total_aln_errors) + 1)
    logger.debug("Generated %d bubbles", total_bubbles)
    logger.debug("Split %d long bubbles", total_long_bubbles)
    logger.debug("Skipped %d empty bubbles", total_empty)
    logger.debug("Skipped %d bubbles with long branches", total_long_branches)

    return coverage_stats, mean_aln_error
Example #27
def _compute_profile(alignment, ref_sequence):
    """
    Computes alignment profile
    """
    if len(alignment) == 0:
        raise Exception("No alignmemnts!")
    genome_len = alignment[0].trg_len

    #max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
    min_aln_len = cfg.vals["min_polish_aln_len"]
    aln_errors = []
    #filtered = 0
    profile = [ProfileInfo() for _ in range(genome_len)]

    for i in range(genome_len):
        profile[i].nucl = ref_sequence[i]

    for aln in alignment:
        #if aln.err_rate > max_aln_err or len(aln.qry_seq) < min_aln_len:
        if len(aln.qry_seq) < min_aln_len:
            #filtered += 1
            continue

        aln_errors.append(aln.err_rate)

        qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq)
        trg_seq = shift_gaps(qry_seq, aln.trg_seq)

        trg_pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(trg_seq, qry_seq):
            if trg_nuc == "-":
                trg_pos -= 1
            #if trg_pos >= genome_len:
            #    trg_pos -= genome_len

            prof_elem = profile[trg_pos]
            if trg_nuc == "-":
                prof_elem.insertions[aln.qry_id] += qry_nuc
                #prof_elem.num_inserts += 1
            else:
                #prof_elem.nucl = trg_nuc
                prof_elem.coverage += 1

                if qry_nuc == "-":
                    prof_elem.num_deletions += 1
                elif trg_nuc != qry_nuc:
                    prof_elem.num_missmatch += 1

            trg_pos += 1

    for i in range(genome_len):
        for ins_read, ins_str in profile[i].insertions.items():
            profile[i].propagated_ins += 1
            span = len(ins_str)
            for j in range(max(0, i - span), i):
                profile[j].propagated_ins += 1
            for j in range(i + 1, min(i + span + 1, genome_len)):
                profile[j].propagated_ins += 1

    #logger.debug("Filtered: {0} out of {1}".format(filtered, len(alignment)))
    return profile, aln_errors
Example #28
def extract_unique_plasmids(trimmed_reads_mapping,
                            trimmed_reads_path,
                            mapping_rate_threshold=0.8,
                            max_length_difference=500,
                            min_sequence_length=1000):
    trimmed_reads = set()
    for hit in read_paf(trimmed_reads_mapping):
        trimmed_reads.add(hit.query)
        trimmed_reads.add(hit.target)

    trimmed_reads = list(trimmed_reads)
    n_trimmed_reads = len(trimmed_reads)
    read2int = dict()
    int2read = dict()

    for i in range(n_trimmed_reads):
        read2int[trimmed_reads[i]] = i
        int2read[i] = trimmed_reads[i]

    similarity_graph = [[] for _ in range(n_trimmed_reads)]

    #each hit group stores the alignments for a single (query, target) pair
    for hit_group in read_paf_grouped(trimmed_reads_mapping):
        if hit_group[0].query == hit_group[0].target:
            continue

        query_mapping_segments = []
        target_mapping_segments = []
        for hit in hit_group:
            query_mapping_segments.append(
                unmapped.MappingSegment(hit.query_start, hit.query_end))
            target_mapping_segments.append(
                unmapped.MappingSegment(hit.target_start, hit.target_end))

        query_length = hit_group[0].query_length
        target_length = hit_group[0].target_length
        query_mapping_rate = unmapped.calc_mapping_rate(
            query_length, query_mapping_segments)
        target_mapping_rate = unmapped.calc_mapping_rate(
            target_length, target_mapping_segments)

        if (query_mapping_rate > mapping_rate_threshold
                or target_mapping_rate > mapping_rate_threshold):
            #abs(query_length - target_length) < max_length_difference:
            vertex1 = read2int[hit_group[0].query]
            vertex2 = read2int[hit_group[0].target]
            similarity_graph[vertex1].append(vertex2)
            similarity_graph[vertex2].append(vertex1)

    connected_components, n_components = \
        utils.find_connected_components(similarity_graph)

    groups = [[] for _ in range(n_components)]
    for i in range(len(connected_components)):
        groups[connected_components[i]].append(int2read[i])

    #for g in groups:
    #    logger.debug("Group {0}".format(len(g)))
    #    for s in g:
    #        logger.debug("\t{0}".format(seq_lengths[s]))

    groups = [group for group in groups if len(group) > 1]
    trimmed_reads_dict = fp.read_sequence_dict(trimmed_reads_path)
    unique_plasmids = dict()

    for group in groups:
        sequence = trimmed_reads_dict[group[0]]
        if len(sequence) >= min_sequence_length:
            unique_plasmids[group[0]] = sequence

    return unique_plasmids
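
unmapped.calc_mapping_rate is not shown in these examples; a hedged sketch of the idea it presumably implements, reusing unite_mapping_segments from Example #12: merge the mapped segments, then divide the covered length by the sequence length.

def calc_mapping_rate_sketch(seq_length, mapping_segments):
    united = unite_mapping_segments(mapping_segments)
    covered = sum(s.end - s.start for s in united)
    return covered / seq_length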