def run_lmerge(cloneset, mismatch_rate, insertion_rate, deletion_rate,
        confidence):
    """Run LMerge algorithm.

    :param cloneset: a set of clones; sequences may differ in length.
    :param mismatch_rate: probability any given base in the cloneset is
    erroneous. float value with range [0,1].
    :param insertion_rate: probability of an insertion at any given position.
    float value with range [0,1].
    :param deletion_rate: probability any given base has been deleted.
    float value with range [0,1].
    :param confidence: confidence level used for clone size estimation.
    """
    if len(cloneset) == 0 or \
            (mismatch_rate == insertion_rate == deletion_rate == 0):
        return cloneset

    assert(0 <= mismatch_rate < 1.0)
    assert(0 <= insertion_rate < 1.0)
    assert(0 <= deletion_rate < 1.0)
    assert(0 < confidence < 1.0)

    max_mutation_count = float(cloneset.base_count) * mismatch_rate
    max_insertion_count = float(cloneset.base_count) * insertion_rate
    max_deletion_count = float(cloneset.base_count) * deletion_rate

    logger.info("LMerge start: mismatch_rate: %(mismatch_rate)s, \
insertion_rate: %(insertion_rate)s, deletion_rate: %(deletion_rate)s\n\
max_mutation_count: %(max_mutation_count)s, \
max_insertion_count: %(max_insertion_count)s, \
max_deletion_count: %(max_deletion_count)s" % locals())

    z_conf = norm_ppf(confidence)
    insertion_count = 0
    deletion_count = 0
    oof2if_only = True # only perform out-of-frame to in-frame merges

    # Divide the cloneset into length families (i.e. clones with equal length
    # sequences).
    by_seqlen = lambda clone: len(clone.seq)
    lenfams = {}
    for seqlen, clones in groupby(sorted(cloneset, key=by_seqlen), by_seqlen):
        lenfams[seqlen] = CloneSet(clones)

    clone2mergeinfo = {}
    mismatch_probs = {}

    # TODO: calculate more precise upper bound on the maximum length difference
    max_lendiff = max(lenfams.iterkeys())
    logger.info("LMerge max length difference: %s" % max_lendiff)
    for lendiff in xrange(1, max_lendiff + 1):
        logger.info("LMerge: processing length difference %s" % lendiff)
        for seqlen in sorted(lenfams, reverse=True):
            lenfam = lenfams[seqlen]
            if oof2if_only and seqlen % 3 != 0:
                # skip out-of-frame length family
                logger.debug(
                        "Skipping out-of-frame length family (seqlen: %s)" %
                        seqlen)
                continue
            del_prob = binom_pmf(lendiff, seqlen, deletion_rate)
            # note: there is one extra place for an insertion to take place
            # than there are bases in the sequences (hence the seqlen + 1).
            ins_prob = binom_pmf(lendiff, seqlen + 1, insertion_rate)
            max_indel_prob = max(ins_prob, del_prob)
            if 1.0 / lenfam.sequence_count > max_indel_prob:
                logger.debug(
                        "Expecting < 1 indels for length family (seqlen: %s)" %
                        seqlen)
                continue
            else:
                # TODO: it might be better to calculate what the estimated
                # true clone count should be, since that would also allow
                # clones that had a 0 count in the raw data to absorb indel
                # sequences (though this is probably an exceedingly rare
                # event).

                # Since we will compare against clone.orig_count values,
                # we need to calculate the minimum orig_count.
                min_clone_orig_count = (1.0 / max_indel_prob) * \
                        binom_pmf(0, seqlen, mismatch_rate)
                logger.debug(
                        "(seqlen: %s; lendiff: %s) min_clone_orig_count: %s" % (
                        seqlen, lendiff, min_clone_orig_count))

            dels_lenfam = lenfams.get(seqlen - lendiff, CloneSet())
            ins_lenfam = lenfams.get(seqlen + lendiff, CloneSet())

            if dels_lenfam.count == 0 and ins_lenfam.count == 0:
                logger.debug("lenfams %s and %s contain no sequences" % (
                    seqlen - lendiff, seqlen + lendiff))
                continue

            for clone in sorted(lenfam, key=lambda clone: -clone.orig_count):
                if clone.orig_count < min_clone_orig_count:
                    break
                if not clone in cloneset:
                    continue
                if oof2if_only and not len(clone.seq) % 3 == 0:
                    continue

                hd2prob = mismatch_probs.setdefault(seqlen, [
                    binom_pmf(hd, seqlen, mismatch_rate)
                    for hd in xrange(seqlen + 1)])
                mergeinfo = clone2mergeinfo.setdefault(clone,
                        MergeInfo(clone, hd2prob, z_conf))
                # Maximum Levenshtein distance
                max_ld = lendiff + mergeinfo.maxhd

                for nb_lenfam, p in ((dels_lenfam, del_prob),
                        (ins_lenfam, ins_prob)):
                    max_neighbor_count = ceil(p * clone.count)
                    # TODO: calculate banded Levenshtein distance for speedup?
                    nbs = [(nb, ed(nb.seq, clone.seq)) for nb in nb_lenfam]
                    nbs = [(nb, levdist) for nb, levdist in nbs
                            if levdist <= max_ld]
                    for nb, levdist in sorted(nbs,
                            key=lambda (nb, levdist): (levdist, nb.count,
                                min(nb.qual))):
                        hd = levdist - lendiff
                        if hd * nb.count + cloneset.mutation_count > \
                                max_mutation_count and hd > 0:
                            logger.debug("Skipping merge: \
hd (%s) * nb.count (%s) + cloneset.mutation_count (%s) > \
max_mutation_count (%s)" % (hd, nb.count, cloneset.mutation_count,
                                max_mutation_count))
                            continue
                        if len(nb.seq) > seqlen:
                            if lendiff * nb.count + deletion_count > \
                                    max_deletion_count:
                                logger.debug("Skipping merge: \
lendiff (%s) * nb.count (%s) + deletion_count (%s) > \
max_deletion_count (%s)" % (lendiff, nb.count, deletion_count,
                                    max_deletion_count))
                                continue
                        else:
                            if lendiff * nb.count + insertion_count > \
                                    max_insertion_count:
                                logger.debug("Skipping merge: \
lendiff (%s) * nb.count (%s) + insertion_count (%s) > \
max_insertion_count (%s)" % (lendiff, nb.count, insertion_count,
                                    max_insertion_count))
                                continue
                        if nb.count > max_neighbor_count:
                            continue
                        if not nb in cloneset:
                            continue

                        # TODO: do a proper merge here, calculating pairwise
                        # sequence alignment.

                        # Since the clone.count cannot be modified directly,
                        # first create a new neighbor with the same
                        # sequence as clone, and then add this new neighbor
                        # to clone to update its count that way.
                        try:
                            nb2 = type(nb)(nb.v, nb.d, nb.j, nb.c,
                                    other=(clone.seq, clone.qual, nb.count))
                        except: # apparently nb is not an AnnotatedClone
                            nb2 = type(nb)((clone.seq, clone.qual, nb.count))
                        new_clone = clone.add(nb2)
                        if new_clone is not None:
                            cloneset.remove(nb)
                            cloneset.remove(clone)
                            cloneset.add(new_clone)
                            if len(nb.seq) > len(clone.seq):
                                deletion_count += lendiff * nb.count
                            else:
                                insertion_count += lendiff * nb.count
                            logger.debug("merged: %s to %s" % (nb, clone))
    return cloneset
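
# Illustrative sketch (not part of the pipeline): run_lmerge above splits the
# cloneset into "length families" with the sorted-then-groupby idiom. Since
# itertools.groupby only groups adjacent items, the clones must be sorted by
# the same key first. The helper name and the toy sequences below are made up;
# the real code groups Clone objects by len(clone.seq) and wraps each group in
# a CloneSet.
def _example_length_families(seqs=("ACGT", "ACG", "TTTT", "AC")):
    from itertools import groupby
    by_len = lambda s: len(s)
    return {length: list(group)
            for length, group in groupby(sorted(seqs, key=by_len), by_len)}
# _example_length_families() == {2: ['AC'], 3: ['ACG'], 4: ['ACGT', 'TTTT']}
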
def run_qmerge_on_bin(cloneset, mismatch_rate, confidence, max_Q):
    if mismatch_rate == 0 or len(cloneset) == 0:
        return cloneset

    assert(mismatch_rate < 1.0)

    seq_len = len(iter(cloneset).next().seq)
    z_conf = norm_ppf(confidence)
    n = float(cloneset.base_count)
    p = float(mismatch_rate)
    max_mutation_count = n*p + z_conf*sqrt(n*p*(1-p))

    # Calculate binomial pmf for current sequence length and mismatch_rate.
    hd2prob = [ binom_pmf(hd, seq_len, mismatch_rate)
            for hd in xrange(seq_len + 1) ]
    hd2nes = [ cloneset.sequence_count * prob for prob in hd2prob ]

    max_Qm = int(ceil(10 * log10(cloneset.base_count)))
    max_hd = max([ i if r >= 1.0 else float("nan")
            for i, r in enumerate(hd2nes) ])

    if isnan(max_hd) or max_hd < 1:
        logger.info("maxhd < 1, not performing QMerge (len: %s)" % seq_len)
        return cloneset

    if isinstance(cloneset, SearchableCloneSet):
        scls = cloneset
    else:
        scls = SearchableCloneSet(cloneset)
    del cloneset

    tasks = Heap()

    basemsg = lambda: "QMerge (len: %s): " % seq_len
    fullbasemsg = lambda: "QMerge (len: %s;max_hd: %s, max_Qm: %s;max_Q: %s;\
#clones: %s;#sequences: %s;#bases: %s;#mutations: %s;max_mutations: %s;\
#tasks: %s): " % (
            seq_len, max_hd, max_Qm, max_Q, scls.count, scls.sequence_count,
            scls.base_count, scls.mutation_count, max_mutation_count,
            len(tasks))

    logger.info(fullbasemsg() + "start")

    clone2task = {}
    for hd, clone1, clone2 in scls.pairs(seq_len, max_hd):
        add_task(scls, clone2task, tasks, clone1, clone2, max_Qm, max_Q)

    logger.debug(fullbasemsg() + "finished searching for pairs")

    while len(tasks) > 0 and scls.mutation_count <= max_mutation_count:
        task = tasks.pop()
        clone1, clone2 = task.clone1, task.clone2

        # Remove current task from clone2task dict to keep it in sync with
        # the tasks heap.
        if clone1 in clone2task and task == clone2task[clone1]:
            del clone2task[clone1]
        if clone2 in clone2task and task == clone2task[clone2]:
            del clone2task[clone2]

        # Check if task can be performed
        if not clone1 in scls or not clone2 in scls:
            # Task cannot be performed, need to search for an alternate
            # "best task" for one or both clones.
            for clone in (clone1, clone2):
                if merged_clone != clone and clone in scls:
                    # New tasks may need to be created
                    for hd, neighbor in scls.neighbors(clone, max_hd):
                        add_task(scls, clone2task, tasks, clone, neighbor,
                                max_Qm, max_Q)
            continue

        # Recalculate merge because clones may have changed by other merges.
        mc_diff, merged_clone = scls.premerge(clone1, clone2)
        if merged_clone is None:
            continue
        if scls.mutation_count + mc_diff > max_mutation_count:
            continue

        hd, Qm, max_mut_q = clone_merge_stats(clone1, clone2)
        if Qm > max_Qm:
            continue
        if max_mut_q >= max_Q:
            continue

        logger.debug(basemsg() + "merging: %s(%s) %s(%s)" % (
            clone1.seq, clone1.count, clone2.seq, clone2.count))
        new_clone = not merged_clone in scls
        scls.enact_merge(clone1, clone2, merged_clone)

        if new_clone:
            logger.debug(basemsg() + "new clone: %s" % (merged_clone.seq))
            for hd, neighbor in scls.neighbors(merged_clone, max_hd):
                add_task(scls, clone2task, tasks, merged_clone, neighbor,
                        max_Qm, max_Q)

    logger.info(fullbasemsg() + "end")
    return scls
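
# Illustrative sketch (not part of the pipeline): both QMerge and IMerge cap
# the total number of bases they may attribute to sequencing error with a
# one-sided normal approximation of a Binomial(n, p) upper bound,
# max_mutation_count = n*p + z*sqrt(n*p*(1-p)), where n is the number of bases
# in the cloneset, p the mismatch_rate, and z = norm_ppf(confidence). The
# numbers below (n = 1e6 bases, p = 1e-3, z ~= 1.645 for ~95% confidence) are
# made-up example values, not defaults of this module.
def _example_mutation_budget(n=1e6, p=1e-3, z=1.645):
    from math import sqrt
    return n * p + z * sqrt(n * p * (1 - p))
# _example_mutation_budget() is roughly 1052, i.e. about 1052 erroneous bases
# would be tolerated before further merges are refused.
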
def run_imerge_on_bin(cloneset, mismatch_rate, confidence):
    """Run IMerge algorithm.

    :param cloneset: a set of clones of equal length sequences.
    :param mismatch_rate: probability any given base in the cloneset is
    erroneous. float value with range [0,1].
    :param confidence: confidence level used for clone size estimation.
    """
    if mismatch_rate == 0 or len(cloneset) == 0:
        return cloneset

    assert(mismatch_rate < 1.0)
    assert(0.0 < confidence < 1.0)

    seq_len = len(iter(cloneset).next().seq)
    z_conf = norm_ppf(confidence)
    n = float(cloneset.base_count)
    p = float(mismatch_rate)
    max_mutation_count = n*p + z_conf*sqrt(n*p*(1-p))

    if isinstance(cloneset, SearchableCloneSet):
        scls = cloneset
    else:
        scls = SearchableCloneSet(cloneset)
    del cloneset

    # Calculate binomial pmf for current sequence length and mismatch_rate.
    hd2prob = [ binom_pmf(hd, seq_len, mismatch_rate)
            for hd in xrange(seq_len + 1) ]

    clone2mergeinfo = {clone : MergeInfo(clone, hd2prob, z_conf)
            for clone in scls}
    cycle_maxhd = max([mergeinfo.maxhd for mergeinfo in
            clone2mergeinfo.itervalues()])

    basemsg = lambda: "IMerge (len: %s): " % seq_len
    fullbasemsg = lambda: "IMerge (len: %s;cycle_maxhd: %s;\
#clones: %s;#sequences: %s;#bases: %s;#mutations: %s;max_mutations: %s): " % (
            seq_len, cycle_maxhd, scls.count, scls.sequence_count,
            scls.base_count, scls.mutation_count, max_mutation_count)

    if isnan(cycle_maxhd):
        logger.info(fullbasemsg() + "stopping because cycle_maxhd is NaN")
        return scls

    logger.info(fullbasemsg() + "start")
    for cycle_hd in xrange(1, cycle_maxhd + 1):
        logger.debug(basemsg() + "cycle %s" % cycle_hd)
        for clone in sorted(scls, key=lambda clone:
                (-clone.count, clone.seq)):
            if not clone in scls:
                logger.debug(basemsg() + "%s not in scls" % clone.seq)
                continue

            mergeinfo = clone2mergeinfo[clone]

            if cycle_hd > mergeinfo.maxhd:
                logger.debug(basemsg() + "%s (maxhd: %s; cycle_hd: %s)" % (
                    clone.seq, mergeinfo.maxhd, cycle_hd))
                continue

            mergeinfo.sweep += mergeinfo.hd2nes[cycle_hd]

            logger.debug(basemsg() + "seq: %s (%s; sweep: %s)" % (clone.seq,
                clone.count, mergeinfo.sweep))

            for hd, neighbor in sorted(mergeinfo.neighborhood(scls, cycle_hd),
                    key=lambda (hd, neighbor): (hd, neighbor.count)):
                if clone == neighbor or neighbor not in scls:
                    continue
                if hd > cycle_hd:
                    break
                if mergeinfo.sweep >= neighbor.count:
                    mc_diff, merged_clone = scls.premerge(clone, neighbor)
                    if merged_clone is None:
                        continue
                    if scls.mutation_count + mc_diff > max_mutation_count:
                        logger.debug(fullbasemsg() + "mc_diff = %s, \
clone.count = %s, neighbor.count = %s" % (mc_diff, clone.count,
                            neighbor.count))
                        continue
                    logger.debug(basemsg() + "merging: %s(%s) %s(%s)" % (
                        clone.seq, clone.count, neighbor.seq, neighbor.count))
                    scls.enact_merge(clone, neighbor, merged_clone)
                    mergeinfo.sweep -= neighbor.count
                    del clone2mergeinfo[neighbor]
                    del clone2mergeinfo[clone]
                    mergeinfo.clone = merged_clone
                    clone2mergeinfo[merged_clone] = mergeinfo
                    clone = merged_clone

    logger.info(fullbasemsg() + "end")
    return scls
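
# Hypothetical driver sketch: this module does not show how run_imerge_on_bin
# is invoked, but since IMerge expects "a set of clones of equal length
# sequences", a caller would first bin a mixed-length cloneset the same way
# run_lmerge builds its length families and then process each bin separately.
# The function name below is made up; CloneSet, groupby and run_imerge_on_bin
# are the names used elsewhere in this module.
def _example_run_imerge_per_bin(cloneset, mismatch_rate, confidence):
    by_seqlen = lambda clone: len(clone.seq)
    results = []
    for seqlen, clones in groupby(sorted(cloneset, key=by_seqlen), by_seqlen):
        results.append(
                run_imerge_on_bin(CloneSet(clones), mismatch_rate, confidence))
    return results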