Beispiel #1
0
def _is_simple_kmer(profile, position):
    """
    Tells whether the kmer centered at the given position has no short repeats.

    Returns False as soon as two equal adjacent nucleotides are found in the
    central part of the window, or two equal adjacent dinucleotides are found
    in the extended window. Returns True otherwise.
    """
    simple_len = cfg.vals["simple_kmer_length"]
    window_len = simple_len * 2
    half_window = window_len // 2

    #nucleotides of the extended window around the center
    window = [elem.nucl
              for elem in profile[position - half_window :
                                  position + half_window]]

    #mononucleotide repeats: equal neighbours inside the central part
    center_begin = half_window - simple_len // 2
    center_end = half_window + simple_len // 2 - 1
    if any(window[idx] == window[idx + 1]
           for idx in range(center_begin, center_end)):
        return False

    #dinucleotide repeats: equal neighbouring pairs, scanned in both frames
    for frame in (0, 1):
        frame_begin = half_window - simple_len + frame
        for step in range(simple_len - frame - 1):
            left = frame_begin + step * 2
            if window[left : left + 2] == window[left + 2 : left + 4]:
                return False

    #a trinucleotide repeat check used to follow here; it is disabled
    return True
Beispiel #2
0
def shift_gaps(seq_trg, seq_qry):
    """
    Normalizes the placement of ambiguous gaps in the query sequence.

    Both sequences are padded with a "$" sentinel on each side. Whenever a
    run of "-" in the query ends, the query character just before the run is
    swapped with the last gap character for as long as it equals the target
    character at that gap position. Returns the query as a string, without
    the sentinels.
    """
    target = list("$" + seq_trg + "$")
    query = list("$" + seq_qry + "$")

    gap_open = False
    gap_begin = 0
    for idx in range(len(target)):
        at_gap = query[idx] == "-"

        if gap_open and not at_gap:
            #the gap run has just ended: slide it over matching characters
            gap_open = False
            left = gap_begin - 1
            right = idx - 1
            while (left > 0 and right >= gap_begin
                   and query[left] == target[right]):
                query[left], query[right] = query[right], query[left]
                left -= 1
                right -= 1
        elif at_gap and not gap_open:
            gap_open = True
            gap_begin = idx

    return "".join(query[1:-1])
Beispiel #3
0
def _get_partition(profile, err_mode):
    """
    Picks the positions at which the profile is cut into sub-alignments.

    A cut is placed where a fully solid stretch also passes
    _is_simple_kmer(), or where the distance from the previous cut exceeds
    the configured maximum bubble length. Returns the list of cut positions
    and a counter of scanned positions that exceeded that distance.
    """
    solid_len = cfg.vals["solid_kmer_length"]
    simple_len = cfg.vals["simple_kmer_length"]
    max_bubble = cfg.vals["max_bubble_length"]
    scan_end = len(profile) - solid_len

    #first pass: mark every position covered by a solid kmer
    solid_flags = [False] * len(profile)
    pos = 0
    while pos < scan_end:
        if not _is_solid_kmer(profile, pos, err_mode):
            pos += 1
            continue
        solid_flags[pos : pos + solid_len] = [True] * solid_len
        pos += solid_len

    #second pass: choose the cut points
    partition = []
    long_bubbles = 0
    last_cut = solid_len
    pos = solid_len
    while pos < scan_end:
        candidate = pos + simple_len // 2
        is_landmark = (all(solid_flags[pos : pos + simple_len]) and
                       _is_simple_kmer(profile, candidate))
        too_long = pos - last_cut > max_bubble

        if too_long:
            long_bubbles += 1

        if not (is_landmark or too_long):
            pos += 1
            continue

        partition.append(candidate)
        last_cut = candidate
        pos += solid_len

    return partition, long_bubbles
Beispiel #4
0
def get_consensus(alignment_path, contigs_path, contigs_info, num_proc,
                  platform):
    """
    Main function: computes consensus sequences with num_proc worker processes.

    Each worker runs _thread_worker on a shared SynchronizedSamReader and
    reports through two manager queues (results and errors).

    Returns a dict mapping contig id to its consensus sequence; contigs with
    an empty sequence are left out.

    Raises Exception if a worker exits with a non-zero code, and re-raises
    the first exception found in the error queue. On KeyboardInterrupt all
    workers are terminated and the interrupt is propagated.
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       max_coverage=cfg.vals["max_read_coverage"],
                                       use_secondary=True)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    #making sure the main process catches SIGINT
    #NOTE(review): the workers are only launched by start() below, after the
    #original handler is restored - confirm whether start() was meant to run
    #while SIGINT is ignored
    orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
    threads = []
    for _ in range(num_proc):
        threads.append(multiprocessing.Process(target=_thread_worker,
                                               args=(aln_reader, contigs_info,
                                                     platform, results_queue,
                                                     error_queue)))
    signal.signal(signal.SIGINT, orig_sigint)

    for t in threads:
        t.start()
    try:
        for t in threads:
            t.join()
            #exit code -9 means the process was killed by SIGKILL
            if t.exitcode == -9:
                logger.error("Looks like the system ran out of memory")
            if t.exitcode != 0:
                raise Exception("One of the processes exited with code: {0}"
                                .format(t.exitcode))
    except KeyboardInterrupt:
        for t in threads:
            t.terminate()
        raise

    if not error_queue.empty():
        raise error_queue.get()

    out_fasta = {}
    total_aln_errors = []
    while not results_queue.empty():
        ctg_id, ctg_seq, aln_errors = results_queue.get()
        total_aln_errors.extend(aln_errors)
        if len(ctg_seq) > 0:
            out_fasta[ctg_id] = ctg_seq

    #true mean; the empty case is handled explicitly instead of padding
    #the denominator, which skewed the reported rate
    if total_aln_errors:
        mean_aln_error = float(sum(total_aln_errors)) / len(total_aln_errors)
    else:
        mean_aln_error = 0.0
    logger.info("Alignment error rate: %f", mean_aln_error)

    return out_fasta
Beispiel #5
0
def get_uniform_alignments(alignments, seq_len):
    """
    Keeps the best alignments per contig window, assuming uniform coverage.

    The contig is cut into fixed-size windows. A coverage cap is derived from
    the median primary-alignment depth over all windows; in every window
    holding more alignments than the cap, the error rate found at the cap
    index of the sorted rates becomes that window's cutoff. An alignment is
    kept when it is within the cutoff in more than half (integer division)
    of the windows it spans. Returns the kept alignments in input order.
    """
    WINDOW = 100
    MIN_COV = 10
    COV_RATE = 1.25

    def _get_median(values):
        if not values:
            raise ValueError("_get_median() arg is an empty sequence")
        ordered = sorted(values)
        half, is_odd = divmod(len(values), 2)
        if is_odd:
            return ordered[half]
        return (ordered[half - 1] + ordered[half]) / 2

    def _window_span(aln):
        return range(aln.trg_start // WINDOW, aln.trg_end // WINDOW)

    #per-window primary depth and the error rates of all alignments in it
    num_windows = seq_len // WINDOW + 1
    primary_depth = [0] * num_windows
    window_rates = [[] for _ in range(num_windows)]
    for aln in alignments:
        for wnd in _window_span(aln):
            if not aln.is_secondary:
                primary_depth[wnd] += 1
            window_rates[wnd].append(aln.err_rate)

    #windows holding more alignments than the cap get a stricter cutoff
    cov_threshold = max(int(COV_RATE * _get_median(primary_depth)), MIN_COV)
    cutoffs = [sorted(rates)[cov_threshold] if len(rates) > cov_threshold
               else 1.0
               for rates in window_rates]

    #keep an alignment if it passes in more than half of its windows
    filtered_alignments = []
    for aln in alignments:
        span = _window_span(aln)
        num_good = sum(1 for wnd in span if aln.err_rate <= cutoffs[wnd])
        if num_good > len(span) // 2:
            filtered_alignments.append(aln)

    return filtered_alignments
Beispiel #6
0
def split_into_chunks(fasta_in, chunk_size):
    """
    Cuts every sequence of fasta_in into pieces of about chunk_size.

    Each piece is stored under "<header>$chunk_<index>". A sequence always
    yields at least one piece, and a piece is extended to the end of the
    sequence whenever fewer than chunk_size characters would remain after it.
    """
    chunks = {}
    for header, seq in iteritems(fasta_in):
        seq_len = len(seq)
        num_chunks = max(seq_len // chunk_size, 1)
        for idx in range(num_chunks):
            begin = idx * chunk_size
            stop = begin + chunk_size
            #absorb a too-short tail into this piece
            if seq_len - stop < chunk_size:
                stop = seq_len
            chunks["{0}$chunk_{1}".format(header, idx)] = seq[begin:stop]

    return chunks
Beispiel #7
0
def _is_solid_kmer(profile, position, err_mode):
    """
    Tells whether the kmer starting at the given position is solid.

    Every position of the kmer must have non-zero coverage, and both its
    (mismatches + deletions) / coverage and insertions / coverage ratios
    must stay within the limits configured for err_mode.
    """
    mode_cfg = cfg.vals["err_modes"][err_mode]
    max_missmatch = mode_cfg["solid_missmatch"]
    max_ins = mode_cfg["solid_indel"]
    kmer_end = position + cfg.vals["solid_kmer_length"]

    for idx in range(position, kmer_end):
        site = profile[idx]
        depth = site.coverage
        if depth == 0:
            return False

        missmatch_rate = (site.num_missmatch + site.num_deletions) / depth
        ins_rate = site.num_inserts / depth
        if missmatch_rate > max_missmatch or ins_rate > max_ins:
            return False

    return True
Beispiel #8
0
def _contig_profile(alignment, platform, genome_len):
    """
    Builds the per-position alignment profile of a contig.

    The alignments are first reduced with get_uniform_alignments(). For every
    remaining alignment its error rate is recorded and its columns are added
    to the profile. Returns the profile together with the list of error rates.
    The platform argument is not used here.
    """
    #leave the best uniform alignments
    selected = get_uniform_alignments(alignment, genome_len)

    profile = [Profile() for _ in range(genome_len)]
    aln_errors = []
    for aln in selected:
        aln_errors.append(aln.err_rate)

        #after gap shifting it is possible that
        #two gaps are aligned against each other
        qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq)
        trg_seq = shift_gaps(qry_seq, aln.trg_seq)

        trg_pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(trg_seq, qry_seq):
            trg_is_gap = trg_nuc == "-"

            #a gap in the target belongs to the previous target position
            if trg_is_gap:
                trg_pos -= 1
            #positions past the end continue from the start of the contig
            if trg_pos >= genome_len:
                trg_pos -= genome_len

            site = profile[trg_pos]
            if trg_is_gap and qry_nuc != "-":
                site.insertions[aln.qry_id] += qry_nuc
            else:
                site.nucl = trg_nuc
                site.matches[qry_nuc] += 1

            trg_pos += 1

    return profile, aln_errors
Beispiel #9
0
def _compute_profile(alignment, platform, genome_len):
    """
    Builds the per-position coverage and error profile of a contig.

    Alignments whose error rate exceeds the platform limit, or whose query
    sequence is shorter than the configured minimum, are skipped. Returns the
    profile together with the error rates of the alignments that were used.
    """
    err_limit = cfg.vals["err_modes"][platform]["max_aln_error"]
    len_limit = cfg.vals["min_polish_aln_len"]

    profile = [ProfileInfo() for _ in range(genome_len)]
    aln_errors = []
    for aln in alignment:
        too_noisy = aln.err_rate > err_limit
        if too_noisy or len(aln.qry_seq) < len_limit:
            continue

        aln_errors.append(aln.err_rate)

        qry_seq = shift_gaps(aln.trg_seq, aln.qry_seq)
        trg_seq = shift_gaps(qry_seq, aln.trg_seq)

        trg_pos = aln.trg_start
        for trg_nuc, qry_nuc in zip(trg_seq, qry_seq):
            trg_is_gap = trg_nuc == "-"

            #a gap in the target belongs to the previous target position
            if trg_is_gap:
                trg_pos -= 1
            #positions past the end continue from the start of the contig
            if trg_pos >= genome_len:
                trg_pos -= genome_len

            site = profile[trg_pos]
            if trg_is_gap:
                site.num_inserts += 1
            else:
                site.nucl = trg_nuc
                site.coverage += 1
                if qry_nuc == "-":
                    site.num_deletions += 1
                elif trg_nuc != qry_nuc:
                    site.num_missmatch += 1

            trg_pos += 1

    return profile, aln_errors
Beispiel #10
0
def make_bubbles(alignment_path, contigs_info, contigs_path,
                 err_mode, num_proc, bubbles_out):
    """
    The main function: takes an alignment and writes bubbles to bubbles_out.

    Each of the num_proc worker processes runs _thread_worker on a shared
    SynchronizedSamReader; the workers receive the output file handle and a
    lock guarding it, and report per-contig statistics through a manager
    queue.

    Returns a tuple (coverage_stats, mean_aln_error): a dict mapping contig
    id to its mean coverage, and the mean error rate over all reported
    alignments (0.0 when none were reported).

    Raises Exception if a worker exits with a non-zero code, and re-raises
    the first exception found in the error queue. On KeyboardInterrupt all
    workers are terminated and the interrupt is propagated.
    """
    aln_reader = SynchronizedSamReader(alignment_path,
                                       fp.read_sequence_dict(contigs_path),
                                       cfg.vals["max_read_coverage"],
                                       use_secondary=True)
    manager = multiprocessing.Manager()
    results_queue = manager.Queue()
    error_queue = manager.Queue()

    bubbles_out_lock = multiprocessing.Lock()
    bubbles_out_handle = open(bubbles_out, "w")
    #the handle is closed on every exit path, including worker failures
    try:
        #making sure the main process catches SIGINT
        #NOTE(review): the workers are only launched by start() below, after
        #the original handler is restored - confirm whether start() was meant
        #to run while SIGINT is ignored
        orig_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
        threads = []
        for _ in range(num_proc):
            threads.append(multiprocessing.Process(target=_thread_worker,
                                                   args=(aln_reader, contigs_info,
                                                         err_mode, results_queue,
                                                         error_queue, bubbles_out_handle,
                                                         bubbles_out_lock)))
        signal.signal(signal.SIGINT, orig_sigint)

        for t in threads:
            t.start()
        try:
            for t in threads:
                t.join()
                #exit code -9 means the process was killed by SIGKILL
                if t.exitcode == -9:
                    logger.error("Looks like the system ran out of memory")
                if t.exitcode != 0:
                    raise Exception("One of the processes exited with code: {0}"
                                    .format(t.exitcode))
        except KeyboardInterrupt:
            for t in threads:
                t.terminate()
            raise
    finally:
        bubbles_out_handle.close()

    if not error_queue.empty():
        raise error_queue.get()

    total_bubbles = 0
    total_long_bubbles = 0
    total_long_branches = 0
    total_empty = 0
    total_aln_errors = []
    coverage_stats = {}

    while not results_queue.empty():
        (ctg_id, num_bubbles, num_long_bubbles,
            num_empty, num_long_branch,
            aln_errors, mean_coverage) = results_queue.get()
        total_long_bubbles += num_long_bubbles
        total_long_branches += num_long_branch
        total_empty += num_empty
        total_aln_errors.extend(aln_errors)
        total_bubbles += num_bubbles
        coverage_stats[ctg_id] = mean_coverage

    #true mean; the empty case is handled explicitly instead of padding
    #the denominator, which skewed the returned rate
    if total_aln_errors:
        mean_aln_error = float(sum(total_aln_errors)) / len(total_aln_errors)
    else:
        mean_aln_error = 0.0
    logger.debug("Generated %d bubbles", total_bubbles)
    logger.debug("Split %d long bubbles", total_long_bubbles)
    logger.debug("Skipped %d empty bubbles", total_empty)
    logger.debug("Skipped %d bubbles with long branches", total_long_branches)

    return coverage_stats, mean_aln_error
Beispiel #11
0
    def get_chunk(self):
        """
        Reads the records of the next contig and turns them into alignments.

        Alignment file is expected to be sorted!

        Under the shared lock, lines are read from the stored file position
        until a record of a different contig (or the end of file) is met.
        Header lines, lines with fewer than 11 fields and unmapped records
        are skipped, as are secondary records unless use_secondary is set.
        The collected records are then converted outside the lock.

        Returns a tuple (contig id, list of Alignment), or (None, []) when
        there is nothing left to read.

        Raises AlignmentException if a record refers to a contig that was
        already processed, and Exception if a record has "*" in place of
        its query sequence.
        """

        chunk_buffer = []
        parsed_contig = None

        with self.lock:
            self.aln_file.seek(self.position.value)
            if self.eof.value:
                return None, []

            current_contig = None
            while True:
                #remember where this line starts: if it belongs to the next
                #contig, the next call starts reading from it again
                self.position.value = self.aln_file.tell()
                line = self.aln_file.readline()
                if not line:
                    break
                if _is_sam_header(line):
                    continue

                tokens = line.strip().split()
                if len(tokens) < 11:
                    continue

                read_contig = tokens[2]
                flags = int(tokens[1])
                is_unmapped = flags & 0x4
                is_secondary = flags & 0x100
                #supplementary records (0x800) are allowed through

                if is_unmapped:
                    continue
                if is_secondary and not self.use_secondary:
                    continue
                if read_contig in self.processed_contigs:
                    raise AlignmentException("Alignment file is not sorted")

                if read_contig != current_contig:
                    prev_contig = current_contig
                    current_contig = read_contig

                    if prev_contig is not None:
                        #first record of the next contig: this chunk is done
                        self.processed_contigs.add(prev_contig)
                        parsed_contig = prev_contig
                        break
                    else:
                        chunk_buffer = [tokens]
                else:
                    chunk_buffer.append(tokens)

            if not parsed_contig:
                #the file ended while reading the current contig
                self.eof.value = True
                parsed_contig = current_contig
        #end with

        sequence_length = 0
        alignments = []
        for tokens in chunk_buffer:
            read_id = tokens[0]
            read_contig = tokens[2]
            qry_seq = tokens[10]
            trg_seq = tokens[9]
            ctg_pos = int(tokens[3])
            flags = int(tokens[1])
            #0x10 is the SAM reverse-strand bit; the former mask 0x16 also
            #matched the proper-pair (0x2) and unmapped (0x4) bits
            is_reversed = flags & 0x10
            is_secondary = flags & 0x100

            if qry_seq == b"*":
                raise Exception("Error parsing SAM: record without read sequence")

            #both sequences are gapped; lengths are counted without gaps
            trg_start = ctg_pos
            qry_start = 0
            qry_len = len(qry_seq) - qry_seq.count(b'-')
            qry_end = qry_len
            trg_end = trg_start + len(trg_seq) - trg_seq.count(b'-')
            trg_len = len(self.ref_fasta[read_contig])

            #error rate = share of columns where the two sequences differ
            matches = 0
            for i in range(len(trg_seq)):
                if trg_seq[i] == qry_seq[i]:
                    matches += 1
            err_rate = 1 - float(matches) / len(trg_seq)
            aln = Alignment(_STR(read_id), _STR(read_contig),
                            qry_start, qry_end, "-" if is_reversed else "+", qry_len,
                            trg_start, trg_end, "+", trg_len,
                            _STR(qry_seq), _STR(trg_seq),
                            err_rate, is_secondary)
            alignments.append(aln)

            #stop once the collected sequence exceeds the coverage limit
            sequence_length += qry_end - qry_start
            #In rare cases minimap2 does not output SQ tag, so need to check
            if _STR(parsed_contig) in self.seq_lengths:
                contig_length = self.seq_lengths[_STR(parsed_contig)]
                if sequence_length // contig_length > self.max_coverage:
                    break

        if parsed_contig is None:
            return None, []
        return _STR(parsed_contig), alignments
Beispiel #12
0
    def parse_cigar(self, cigar_str, read_str, ctg_name, ctg_pos):
        """
        Expands a CIGAR string into a pair of gapped sequences.

        Walks the operations H, S, M, I and D, building the target and query
        sequences with "-" in place of gaps. ctg_pos is treated as 1-based.

        Returns a tuple (trg_start, trg_end, trg_len, trg_seq,
        qry_start, qry_end, qry_len, qry_seq, err_rate), where err_rate is
        the share of columns in which the two sequences differ.

        Raises AlignmentException on any other operation.
        """
        reference = self.ref_fasta[ctg_name]
        trg_start = ctg_pos - 1
        ref_cursor = trg_start
        read_cursor = 0

        trg_parts = []
        qry_parts = []
        hard_left = hard_right = 0
        soft_left = soft_right = 0
        #clipping counts as left-side until the flags below are raised
        seen_any_op = False
        seen_non_hard_op = False

        for token in self.cigar_parser.findall(cigar_str):
            size = int(token[:-1])
            op = token[-1:]

            if op == b"H":
                if seen_any_op:
                    hard_right += size
                else:
                    hard_left += size
            elif op == b"S":
                #soft-clipped bases are present in read_str, so skip them
                read_cursor += size
                if seen_non_hard_op:
                    soft_right += size
                else:
                    soft_left += size
            elif op == b"M":
                read_stop = read_cursor + size
                ref_stop = ref_cursor + size
                qry_parts.append(read_str[read_cursor:read_stop].upper())
                trg_parts.append(reference[ref_cursor:ref_stop].upper())
                read_cursor = read_stop
                ref_cursor = ref_stop
            elif op == b"I":
                read_stop = read_cursor + size
                qry_parts.append(read_str[read_cursor:read_stop].upper())
                trg_parts.append(b"-" * size)
                read_cursor = read_stop
            elif op == b"D":
                ref_stop = ref_cursor + size
                qry_parts.append(b"-" * size)
                trg_parts.append(reference[ref_cursor:ref_stop].upper())
                ref_cursor = ref_stop
            else:
                raise AlignmentException("Unsupported CIGAR operation: " + str(op))

            seen_any_op = True
            if op != b"H":
                seen_non_hard_op = True

        trg_seq = b"".join(trg_parts)
        qry_seq = b"".join(qry_parts)
        matches = sum(1 for idx in range(len(trg_seq))
                      if trg_seq[idx] == qry_seq[idx])
        err_rate = 1 - float(matches) / len(trg_seq)

        #query coordinates include the hard-clipped bases missing from read_str
        qry_start = hard_left + soft_left
        qry_end = read_cursor + hard_left - soft_right
        qry_len = read_cursor + hard_left + hard_right

        return (trg_start, ref_cursor, len(reference), trg_seq,
                qry_start, qry_end, qry_len, qry_seq, err_rate)