Beispiel #1
0
def run_imerge_on_bin(cloneset, mismatch_rate, confidence):
    """Run the IMerge algorithm on one bin of clones.

    Clones are visited from largest to smallest count, once per Hamming
    distance cycle (1 up to the largest ``maxhd`` found among the per-clone
    ``MergeInfo`` objects). In each cycle a clone absorbs neighbors within
    the cycle distance as long as its accumulated ``sweep`` value covers the
    neighbor's count and the set-wide mutation count stays at or below
    ``max_mutation_count``.

    :param cloneset: a set of clones of equal length sequences.
    :param mismatch_rate: probability any given base in the cloneset is
        erroneous. float value with range [0,1).
    :param confidence: confidence level used for clone size estimation;
        must lie strictly between 0 and 1.
    :returns: ``cloneset`` itself when ``mismatch_rate`` is 0 or the set is
        empty; otherwise the ``SearchableCloneSet`` (the input itself if it
        already was one) on which the merges were enacted.
    """
    if mismatch_rate == 0 or len(cloneset) == 0:
        return cloneset

    assert (mismatch_rate < 1.0)
    assert (0.0 < confidence < 1.0)

    # All sequences in the bin have equal length, so any clone will do.
    seq_len = len(next(iter(cloneset)).seq)
    z_conf = norm_ppf(confidence)
    n = float(cloneset.base_count)
    p = float(mismatch_rate)
    # Binomial mean n*p plus z_conf standard deviations.
    max_mutation_count = n * p + z_conf * sqrt(n * p * (1 - p))

    if isinstance(cloneset, SearchableCloneSet):
        scls = cloneset
    else:
        scls = SearchableCloneSet(cloneset)
    del cloneset

    # Calculate binomial pmf for current sequence length and mismatch_rate.
    hd2prob = [binom_pmf(hd, seq_len, mismatch_rate)
               for hd in range(seq_len + 1)]
    clone2mergeinfo = {clone: MergeInfo(clone, hd2prob, z_conf)
                       for clone in scls}

    # NOTE(review): max() is order-dependent when NaN values are present, so
    # whether a single NaN maxhd ends up here depends on iteration order --
    # confirm whether MergeInfo.maxhd can be NaN for individual clones.
    cycle_maxhd = max(mergeinfo.maxhd
                      for mergeinfo in clone2mergeinfo.values())

    def basemsg():
        return "IMerge (len: %s): " % seq_len

    def fullbasemsg():
        return ("IMerge (len: %s;cycle_maxhd: %s;#clones: %s;#sequences: %s;"
                "#bases: %s;#mutations: %s;max_mutations: %s): " % (
                    seq_len, cycle_maxhd, scls.count, scls.sequence_count,
                    scls.base_count, scls.mutation_count,
                    max_mutation_count))

    if isnan(cycle_maxhd):
        # The message builder has to be *called*; the previous code
        # concatenated the function object itself and applied "%" to a
        # string without placeholders, both of which raise TypeError.
        logger.info(fullbasemsg() + "stopping because cycle_maxhd is NaN")
        return scls

    logger.info(fullbasemsg() + "start")
    for cycle_hd in range(1, cycle_maxhd + 1):
        logger.debug(basemsg() + "cycle %s" % cycle_hd)
        # Snapshot of the set, biggest clones first (ties broken on sequence);
        # clones absorbed during this cycle are skipped below.
        for clone in sorted(scls, key=lambda c: (-c.count, c.seq)):
            if clone not in scls:
                logger.debug(basemsg() + "%s not in scls" % clone.seq)
                continue

            mergeinfo = clone2mergeinfo[clone]

            if cycle_hd > mergeinfo.maxhd:
                logger.debug(basemsg() + "%s (maxhd: %s; cycle_hd: %s)" %
                             (clone.seq, mergeinfo.maxhd, cycle_hd))
                continue

            mergeinfo.sweep += mergeinfo.hd2nes[cycle_hd]

            logger.debug(basemsg() + "seq: %s (%s; sweep: %s)" %
                         (clone.seq, clone.count, mergeinfo.sweep))

            # Closest neighbors first, smaller ones first within a distance.
            neighbors = sorted(mergeinfo.neighborhood(scls, cycle_hd),
                               key=lambda pair: (pair[0], pair[1].count))
            for hd, neighbor in neighbors:
                if clone == neighbor or neighbor not in scls:
                    continue
                if hd > cycle_hd:
                    break
                if mergeinfo.sweep >= neighbor.count:
                    mc_diff, merged_clone = scls.premerge(clone, neighbor)

                    if merged_clone is None:
                        continue

                    if scls.mutation_count + mc_diff > max_mutation_count:
                        logger.debug(
                            fullbasemsg() +
                            "mc_diff = %s,clone.count = %s, "
                            "neighbor.count = %s" %
                            (mc_diff, clone.count, neighbor.count))
                        continue

                    logger.debug(
                        basemsg() + "merging: %s(%s) %s(%s)" %
                        (clone.seq, clone.count, neighbor.seq,
                         neighbor.count))
                    scls.enact_merge(clone, neighbor, merged_clone)

                    # Move the bookkeeping over to the merged clone and keep
                    # absorbing neighbors on its behalf.
                    mergeinfo.sweep -= neighbor.count
                    del clone2mergeinfo[neighbor]
                    del clone2mergeinfo[clone]
                    mergeinfo.clone = merged_clone
                    clone2mergeinfo[merged_clone] = mergeinfo
                    clone = merged_clone
    logger.info(fullbasemsg() + "end")
    return scls
Beispiel #2
0
def run_lmerge(cloneset, mismatch_rate, insertion_rate, deletion_rate,
               confidence):
    """Run the LMerge algorithm: merge clones that differ in length.

    The cloneset is split into length families (clones with equal length
    sequences). For every length difference and every in-frame family
    (sequence length divisible by 3), each sufficiently large clone absorbs
    neighbors from the families that are ``lendiff`` shorter (deletions) or
    longer (insertions), provided the Levenshtein distance is at most
    ``lendiff + mergeinfo.maxhd`` and the mutation, insertion and deletion
    budgets derived from the rates are not exceeded.

    :param cloneset: the clones to process; modified in place.
    :param mismatch_rate: per-base mismatch probability, in [0, 1).
    :param insertion_rate: per-position insertion probability, in [0, 1).
    :param deletion_rate: per-base deletion probability, in [0, 1).
    :param confidence: confidence level passed to ``norm_ppf``; must lie
        strictly between 0 and 1.
    :returns: ``cloneset`` (the same object that was passed in).
    """
    if len(cloneset) == 0 or \
            (mismatch_rate == insertion_rate == deletion_rate == 0):
        return cloneset

    assert (0 <= mismatch_rate < 1.0)
    assert (0 <= insertion_rate < 1.0)
    assert (0 <= deletion_rate < 1.0)
    assert (0 < confidence < 1.0)

    base_count = float(cloneset.base_count)
    max_mutation_count = base_count * mismatch_rate
    max_insertion_count = base_count * insertion_rate
    max_deletion_count = base_count * deletion_rate

    logger.info(
        "(LMerge start: ) mismatch_rate: %s, insertion_rate: %s, "
        "deletion_rate: %s\nmax_mutation_count: %s, "
        "max_insertion_count: %s, max_deletion_count: %s" % (
            mismatch_rate, insertion_rate, deletion_rate,
            max_mutation_count, max_insertion_count, max_deletion_count))

    z_conf = norm_ppf(confidence)

    insertion_count = 0
    deletion_count = 0

    oof2if_only = True  # only perform out-of-frame to in-frame merges

    # Divide the cloneset into length families (i.e. clones with equal length
    # sequences)
    by_seqlen = lambda clone: len(clone.seq)
    lenfams = {}
    for seqlen, clones in groupby(sorted(cloneset, key=by_seqlen), by_seqlen):
        lenfams[seqlen] = CloneSet(clones)

    clone2mergeinfo = {}  # clone -> MergeInfo, built on first use
    mismatch_probs = {}   # seqlen -> binomial pmf table, built on first use

    # TODO: calculate more precise upper bound on the maximum length difference
    max_lendiff = max(lenfams)
    logger.info("LMerge max length difference: %s" % max_lendiff)
    for lendiff in range(1, max_lendiff + 1):
        logger.info("LMerge: processing length difference %s" % lendiff)
        for seqlen in sorted(lenfams, reverse=True):
            lenfam = lenfams[seqlen]

            if oof2if_only and seqlen % 3 != 0:
                # skip out-of-frame length family
                logger.debug(
                    "Skipping out-of-frame length family (seqlen: %s)" %
                    seqlen)
                continue

            del_prob = binom_pmf(lendiff, seqlen, deletion_rate)
            # note: there is one extra place for an insertion to take place
            # than there are bases in the sequences (hence the seqlen + 1).
            ins_prob = binom_pmf(lendiff, seqlen + 1, insertion_rate)

            max_indel_prob = max(ins_prob, del_prob)

            if 1.0 / lenfam.sequence_count > max_indel_prob:
                logger.debug(
                    "Expecting < 1 indels for length family (seqlen: %s)" %
                    seqlen)
                continue

            # TODO: it might be better to calculate what the estimated
            # true clone count should be. Since then clones that had a
            # 0 count in the raw data are then enabled to absorb indel
            # sequences (though this is probably an exceedingly rare event)

            # Since we will compare against clone.orig_count values,
            # we need to calculate the minimum orig_count
            min_clone_orig_count = (1.0 / max_indel_prob) * \
                binom_pmf(0, seqlen, mismatch_rate)
            logger.debug(
                "(seqlen: %s; lendiff: %s) min_clone_orig_count: %s" % (
                    seqlen, lendiff, min_clone_orig_count))

            dels_lenfam = lenfams.get(seqlen - lendiff, CloneSet())
            ins_lenfam = lenfams.get(seqlen + lendiff, CloneSet())

            if dels_lenfam.count == 0 and ins_lenfam.count == 0:
                logger.debug("lenfams %s and %s contain no sequences" %
                             (seqlen - lendiff, seqlen + lendiff))
                continue

            for clone in sorted(lenfam, key=lambda clone: -clone.orig_count):
                if clone.orig_count < min_clone_orig_count:
                    # Sorted by descending orig_count: the rest is too small.
                    break
                if clone not in cloneset:
                    continue
                if oof2if_only and not len(clone.seq) % 3 == 0:
                    continue

                # Build the pmf table / MergeInfo only on a cache miss
                # (setdefault would evaluate its default on every call).
                if seqlen not in mismatch_probs:
                    mismatch_probs[seqlen] = [
                        binom_pmf(hd, seqlen, mismatch_rate)
                        for hd in range(seqlen + 1)]
                hd2prob = mismatch_probs[seqlen]
                if clone not in clone2mergeinfo:
                    clone2mergeinfo[clone] = MergeInfo(clone, hd2prob, z_conf)
                mergeinfo = clone2mergeinfo[clone]

                # Maximum levenshtein distance
                max_ld = lendiff + mergeinfo.maxhd

                # The third field records which kind of indel the family
                # represents. The old code tried to derive this from
                # `nb.seq > seqlen` / `len(nb.seq) > clone.seq`, which
                # compare a string with an integer and are therefore
                # constant, so the deletion budget was always checked but
                # never charged and the insertion budget always charged but
                # never checked. Shorter neighbors (dels_lenfam, weighted
                # by del_prob) are deletions, longer ones are insertions.
                for nb_lenfam, p, is_deletion in (
                        (dels_lenfam, del_prob, True),
                        (ins_lenfam, ins_prob, False)):
                    max_neighbor_count = ceil(p * clone.count)

                    # TODO: calculate banded Levenshtein distance for speedup?
                    nbs = [(nb, ed(nb.seq, clone.seq)) for nb in nb_lenfam]
                    nbs = [(nb, levdist) for nb, levdist in nbs
                           if levdist <= max_ld]
                    nbs.sort(key=lambda item: (item[1], item[0].count,
                                               min(item[0].qual)))

                    for nb, levdist in nbs:
                        hd = levdist - lendiff
                        if hd * nb.count + cloneset.mutation_count > \
                                max_mutation_count and hd > 0:
                            logger.debug(
                                "Skipping merge: hd (%s) * nb.count (%s) + "
                                "cloneset.mutation_count (%s) > "
                                "max_mutation_count(%s)" % (
                                    hd, nb.count, cloneset.mutation_count,
                                    max_mutation_count))
                            continue
                        if is_deletion:
                            if lendiff * nb.count + deletion_count > \
                                    max_deletion_count:
                                logger.debug(
                                    "Skipping merge: lendiff (%s) * "
                                    "nb.count (%s) + deletion_count (%s) > "
                                    "max_deletion_count (%s)" % (
                                        lendiff, nb.count, deletion_count,
                                        max_deletion_count))
                                continue
                        else:
                            if lendiff * nb.count + insertion_count > \
                                    max_insertion_count:
                                logger.debug(
                                    "Skipping merge: lendiff (%s) * "
                                    "nb.count (%s) + insertion_count (%s) > "
                                    "max_insertion_count(%s)" % (
                                        lendiff, nb.count, insertion_count,
                                        max_insertion_count))
                                continue

                        if nb.count > max_neighbor_count:
                            continue
                        if nb not in cloneset:
                            continue

                        # TODO: do a proper merge here, calculating pairwise
                        # sequence alignment

                        # Since the clone.count cannot be modified directly,
                        # first create a new neighbor with the same
                        # sequence as clone, and then add this new neighbor
                        # to clone to update its count that way.
                        try:
                            nb2 = type(nb)(nb.v,
                                           nb.d,
                                           nb.j,
                                           nb.c,
                                           other=(clone.seq, clone.qual,
                                                  nb.count))
                        except Exception:
                            # apparently nb is not an AnnotatedClone
                            nb2 = type(nb)((clone.seq, clone.qual, nb.count))

                        new_clone = clone.add(nb2)
                        if new_clone is None:
                            # Nothing was merged, so nothing to record.
                            continue

                        cloneset.remove(nb)
                        cloneset.remove(clone)
                        cloneset.add(new_clone)
                        if is_deletion:
                            deletion_count += lendiff * nb.count
                        else:
                            insertion_count += lendiff * nb.count
                        logger.debug("merged: %s to %s" % (nb, clone))
                        # Continue with the clone that is now in the set;
                        # the old object has just been removed and must not
                        # be merged/removed a second time.
                        clone = new_clone

    return cloneset
Beispiel #3
0
def run_lmerge(cloneset, mismatch_rate, insertion_rate, deletion_rate,
        confidence):
    """Run the LMerge algorithm: merge clones that differ in length.

    The cloneset is split into length families (clones with equal length
    sequences). For every length difference and every in-frame family
    (sequence length divisible by 3), each sufficiently large clone absorbs
    neighbors from the families that are ``lendiff`` shorter (deletions) or
    longer (insertions), provided the Levenshtein distance is at most
    ``lendiff + mergeinfo.maxhd`` and the mutation, insertion and deletion
    budgets derived from the rates are not exceeded.

    :param cloneset: the clones to process; modified in place.
    :param mismatch_rate: per-base mismatch probability, in [0, 1).
    :param insertion_rate: per-position insertion probability, in [0, 1).
    :param deletion_rate: per-base deletion probability, in [0, 1).
    :param confidence: confidence level passed to ``norm_ppf``; must lie
        strictly between 0 and 1.
    :returns: ``cloneset`` (the same object that was passed in).
    """
    if len(cloneset) == 0 or \
            (mismatch_rate == insertion_rate == deletion_rate == 0):
        return cloneset

    assert(0 <= mismatch_rate < 1.0)
    assert(0 <= insertion_rate < 1.0)
    assert(0 <= deletion_rate < 1.0)
    assert(0 < confidence < 1.0)

    base_count = float(cloneset.base_count)
    max_mutation_count = base_count * mismatch_rate
    max_insertion_count = base_count * insertion_rate
    max_deletion_count = base_count * deletion_rate

    logger.info(
        "(LMerge start: ) mismatch_rate: %s, insertion_rate: %s, "
        "deletion_rate: %s\nmax_mutation_count: %s, "
        "max_insertion_count: %s, max_deletion_count: %s" % (
            mismatch_rate, insertion_rate, deletion_rate,
            max_mutation_count, max_insertion_count, max_deletion_count))

    z_conf = norm_ppf(confidence)

    insertion_count = 0
    deletion_count = 0

    oof2if_only = True # only perform out-of-frame to in-frame merges

    # Divide the cloneset into length families (i.e. clones with equal length
    # sequences)
    by_seqlen = lambda clone: len(clone.seq)
    lenfams = {}
    for seqlen, clones in groupby(sorted(cloneset, key=by_seqlen),
            by_seqlen):
        lenfams[seqlen] = CloneSet(clones)

    clone2mergeinfo = {}  # clone -> MergeInfo, created lazily
    mismatch_probs = {}   # seqlen -> binomial pmf table, created lazily

    # TODO: calculate more precise upper bound on the maximum length difference
    max_lendiff = max(lenfams)
    logger.info("LMerge max length difference: %s" % max_lendiff)
    for lendiff in range(1, max_lendiff + 1):
        logger.info("LMerge: processing length difference %s" % lendiff)
        for seqlen in sorted(lenfams, reverse=True):
            lenfam = lenfams[seqlen]

            if oof2if_only and seqlen % 3 != 0:
                # skip out-of-frame length family
                logger.debug(
                    "Skipping out-of-frame length family (seqlen: %s)" %
                    seqlen)
                continue

            del_prob = binom_pmf(lendiff, seqlen, deletion_rate)
            # note: there is one extra place for an insertion to take place
            # than there are bases in the sequences (hence the seqlen + 1).
            ins_prob = binom_pmf(lendiff, seqlen + 1, insertion_rate)

            max_indel_prob = max(ins_prob, del_prob)

            if 1.0 / lenfam.sequence_count > max_indel_prob:
                logger.debug(
                    "Expecting < 1 indels for length family (seqlen: %s)" %
                    seqlen)
                continue

            # TODO: it might be better to calculate what the estimated
            # true clone count should be. Since then clones that had a
            # 0 count in the raw data are then enabled to absorb indel
            # sequences (though this is probably an exceedingly rare event)

            # Since we will compare against clone.orig_count values,
            # we need to calculate the minimum orig_count
            min_clone_orig_count = (1.0 / max_indel_prob) * \
                binom_pmf(0, seqlen, mismatch_rate)
            logger.debug(
                "(seqlen: %s; lendiff: %s) min_clone_orig_count: %s" % (
                    seqlen, lendiff, min_clone_orig_count))

            dels_lenfam = lenfams.get(seqlen - lendiff, CloneSet())
            ins_lenfam = lenfams.get(seqlen + lendiff, CloneSet())

            if dels_lenfam.count == 0 and ins_lenfam.count == 0:
                logger.debug("lenfams %s and %s contain no sequences" % (
                    seqlen - lendiff, seqlen + lendiff))
                continue

            for clone in sorted(lenfam, key=lambda clone: -clone.orig_count):
                if clone.orig_count < min_clone_orig_count:
                    # Descending order: every remaining clone is too small.
                    break
                if clone not in cloneset:
                    continue
                if oof2if_only and not len(clone.seq) % 3 == 0:
                    continue

                # dict.setdefault evaluates its default eagerly, which would
                # rebuild the pmf table and a MergeInfo for every clone;
                # only build them when they are actually missing.
                if seqlen not in mismatch_probs:
                    mismatch_probs[seqlen] = [
                        binom_pmf(hd, seqlen, mismatch_rate)
                        for hd in range(seqlen + 1)]
                hd2prob = mismatch_probs[seqlen]
                if clone not in clone2mergeinfo:
                    clone2mergeinfo[clone] = MergeInfo(clone, hd2prob, z_conf)
                mergeinfo = clone2mergeinfo[clone]

                # Maximum levenshtein distance
                max_ld = lendiff + mergeinfo.maxhd

                # Carry the indel kind along with the family. Previously it
                # was re-derived via `nb.seq > seqlen` and
                # `len(nb.seq) > clone.seq`; both compare str with int, so
                # they were constant and the deletion/insertion budgets were
                # checked and charged inconsistently. Neighbors from the
                # shorter family (dels_lenfam / del_prob) are deletions,
                # those from the longer family are insertions.
                for nb_lenfam, p, is_deletion in (
                        (dels_lenfam, del_prob, True),
                        (ins_lenfam, ins_prob, False)):
                    max_neighbor_count = ceil(p * clone.count)

                    # TODO: calculate banded Levenshtein distance for speedup?
                    nbs = [(nb, ed(nb.seq, clone.seq)) for nb in nb_lenfam]
                    nbs = [(nb, levdist) for nb, levdist in nbs
                           if levdist <= max_ld]
                    nbs.sort(key=lambda item: (item[1], item[0].count,
                                               min(item[0].qual)))

                    for nb, levdist in nbs:
                        hd = levdist - lendiff
                        if hd * nb.count + cloneset.mutation_count > \
                                max_mutation_count and hd > 0:
                            logger.debug(
                                "Skipping merge: hd (%s) * nb.count (%s) + "
                                "cloneset.mutation_count (%s) > "
                                "max_mutation_count(%s)" % (
                                    hd, nb.count, cloneset.mutation_count,
                                    max_mutation_count))
                            continue
                        if is_deletion:
                            if lendiff * nb.count + deletion_count > \
                                    max_deletion_count:
                                logger.debug(
                                    "Skipping merge: lendiff (%s) * "
                                    "nb.count (%s) + deletion_count (%s) > "
                                    "max_deletion_count (%s)" % (
                                        lendiff, nb.count, deletion_count,
                                        max_deletion_count))
                                continue
                        else:
                            if lendiff * nb.count + insertion_count > \
                                    max_insertion_count:
                                logger.debug(
                                    "Skipping merge: lendiff (%s) * "
                                    "nb.count (%s) + insertion_count (%s) > "
                                    "max_insertion_count(%s)" % (
                                        lendiff, nb.count, insertion_count,
                                        max_insertion_count))
                                continue

                        if nb.count > max_neighbor_count:
                            continue
                        if nb not in cloneset:
                            continue

                        # TODO: do a proper merge here, calculating pairwise
                        # sequence alignment

                        # Since the clone.count cannot be modified directly,
                        # first create a new neighbor with the same
                        # sequence as clone, and then add this new neighbor
                        # to clone to update its count that way.
                        try:
                            nb2 = type(nb)(nb.v, nb.d, nb.j, nb.c,
                                    other=(clone.seq, clone.qual, nb.count))
                        except Exception:
                            # apparently nb is not an AnnotatedClone
                            nb2 = type(nb)((clone.seq, clone.qual, nb.count))

                        new_clone = clone.add(nb2)
                        if new_clone is None:
                            # No merge happened; do not log or count one.
                            continue

                        cloneset.remove(nb)
                        cloneset.remove(clone)
                        cloneset.add(new_clone)
                        if is_deletion:
                            deletion_count += lendiff * nb.count
                        else:
                            insertion_count += lendiff * nb.count
                        logger.debug("merged: %s to %s" % (nb, clone))
                        # The old clone object was just removed from the
                        # set; carry on with its replacement so a further
                        # merge does not operate on the stale object.
                        clone = new_clone

    return cloneset
Beispiel #4
0
def run_qmerge_on_bin(cloneset, mismatch_rate, confidence, max_Q):
    """Run the QMerge algorithm on one bin of clones.

    Candidate merges ("tasks") for all clone pairs within ``max_hd`` are
    pushed on a heap via ``add_task`` and popped one at a time. A popped
    task is enacted only if both clones are still in the set, ``premerge``
    yields a merged clone, the mutation budget is respected, and the stats
    from ``clone_merge_stats`` satisfy ``Qm <= max_Qm`` and
    ``max_mut_q < max_Q``.

    :param cloneset: a set of clones; the sequence length is taken from an
        arbitrary member, so presumably all sequences have equal length.
    :param mismatch_rate: per-base mismatch probability, must be < 1.0.
    :param confidence: confidence level passed to ``norm_ppf`` to derive the
        mutation budget.
    :param max_Q: merges whose ``max_mut_q`` is at or above this value are
        skipped; also forwarded to ``add_task``.
    :returns: ``cloneset`` itself when nothing is to be done (zero mismatch
        rate, empty set, or no Hamming distance >= 1 with at least one
        expected sequence); otherwise the ``SearchableCloneSet`` on which
        the merges were enacted.
    """
    if mismatch_rate == 0 or len(cloneset) == 0:
        return cloneset

    assert(mismatch_rate < 1.0)

    seq_len = len(next(iter(cloneset)).seq)
    z_conf = norm_ppf(confidence)
    n = float(cloneset.base_count)
    p = float(mismatch_rate)
    # Binomial mean n*p plus z_conf standard deviations.
    max_mutation_count = n*p + z_conf*sqrt(n*p*(1-p))

    # Calculate binomial pmf for current sequence length and mismatch_rate.
    hd2prob = [binom_pmf(hd, seq_len, mismatch_rate)
               for hd in range(seq_len + 1)]

    # Expected number of sequences per Hamming distance.
    hd2nes = [cloneset.sequence_count * prob for prob in hd2prob]

    max_Qm = int(ceil(10 * log10(cloneset.base_count)))

    # Largest distance at which at least one sequence is expected. The
    # previous max() over a list padded with NaN was order-dependent: with
    # hd2nes[0] < 1 the leading NaN won and larger qualifying distances
    # were ignored.
    expected_hds = [hd for hd, nes in enumerate(hd2nes) if nes >= 1.0]
    if not expected_hds or max(expected_hds) < 1:
        logger.info("maxhd < 1, not performing QMerge (len: %s)" % seq_len)
        return cloneset
    max_hd = max(expected_hds)

    if isinstance(cloneset, SearchableCloneSet):
        scls = cloneset
    else:
        scls = SearchableCloneSet(cloneset)
    del cloneset

    tasks = Heap()

    def basemsg():
        return "QMerge (len: %s): " % seq_len

    def fullbasemsg():
        return ("QMerge (len: %s;max_hd: %s, max_Qm: %s;max_Q: %s;"
                "#clones: %s;#sequences: %s;#bases: %s;#mutations: %s;"
                "max_mutations: %s;#tasks: %s): " % (
                    seq_len, max_hd, max_Qm, max_Q, scls.count,
                    scls.sequence_count, scls.base_count,
                    scls.mutation_count, max_mutation_count, len(tasks)))

    logger.info(fullbasemsg() + "start")
    clone2task = {}
    for hd, clone1, clone2 in scls.pairs(seq_len, max_hd):
        add_task(scls, clone2task, tasks, clone1, clone2, max_Qm, max_Q)
    logger.debug(fullbasemsg() + "finished searching for pairs")

    # Result of the most recent premerge; defined up front so the stale-task
    # branch below can never hit an unbound name.
    merged_clone = None
    while len(tasks) > 0 and scls.mutation_count <= max_mutation_count:
        task = tasks.pop()
        clone1, clone2 = task.clone1, task.clone2

        # Remove current task from clone2task dict to keep it in sync with
        # the tasks heap.
        if clone1 in clone2task and task == clone2task[clone1]:
            del clone2task[clone1]
        if clone2 in clone2task and task == clone2task[clone2]:
            del clone2task[clone2]

        # Check if task can be performed
        if clone1 not in scls or clone2 not in scls:
            # Task cannot be performed, need to search for an alternate
            # "best task" for one or both clones.
            for clone in (clone1, clone2):
                if merged_clone != clone and clone in scls:
                    # New tasks may need to be created
                    for hd, neighbor in scls.neighbors(clone, max_hd):
                        add_task(scls, clone2task, tasks, clone, neighbor,
                                 max_Qm, max_Q)
            continue

        # Recalculate merge because clones may have changed by other merges.
        mc_diff, merged_clone = scls.premerge(clone1, clone2)

        if merged_clone is None:
            continue

        if scls.mutation_count + mc_diff > max_mutation_count:
            continue

        hd, Qm, max_mut_q = clone_merge_stats(clone1, clone2)
        if Qm > max_Qm:
            continue

        if max_mut_q >= max_Q:
            continue

        logger.debug(basemsg() + "merging: %s(%s) %s(%s)" % (
            clone1.seq, clone1.count, clone2.seq, clone2.count))

        # Must be determined before the merge puts merged_clone in the set.
        new_clone = merged_clone not in scls
        scls.enact_merge(clone1, clone2, merged_clone)

        if new_clone:
            logger.debug(basemsg() + "new clone: %s" % (merged_clone.seq))
            for hd, neighbor in scls.neighbors(merged_clone, max_hd):
                add_task(scls, clone2task, tasks, merged_clone, neighbor,
                         max_Qm, max_Q)

    logger.info(fullbasemsg() + "end")

    return scls
Beispiel #5
0
def run_imerge_on_bin(cloneset, mismatch_rate, confidence):
    """Run the IMerge algorithm on one bin of clones.

    Clones are visited from largest to smallest count, once per Hamming
    distance cycle (1 up to the largest ``maxhd`` found among the per-clone
    ``MergeInfo`` objects). In each cycle a clone absorbs neighbors within
    the cycle distance as long as its accumulated ``sweep`` value covers the
    neighbor's count and the set-wide mutation count stays at or below
    ``max_mutation_count``.

    :param cloneset: a set of clones of equal length sequences.
    :param mismatch_rate: probability any given base in the cloneset is
        erroneous. float value with range [0,1).
    :param confidence: confidence level used for clone size estimation;
        must lie strictly between 0 and 1.
    :returns: ``cloneset`` itself when ``mismatch_rate`` is 0 or the set is
        empty; otherwise the ``SearchableCloneSet`` (the input itself if it
        already was one) on which the merges were enacted.
    """
    if mismatch_rate == 0 or len(cloneset) == 0:
        return cloneset

    assert(mismatch_rate < 1.0)
    assert(0.0 < confidence < 1.0)

    # Every sequence in the bin has the same length; sample any clone.
    seq_len = len(next(iter(cloneset)).seq)
    z_conf = norm_ppf(confidence)
    n = float(cloneset.base_count)
    p = float(mismatch_rate)
    # Binomial mean n*p plus z_conf standard deviations.
    max_mutation_count = n*p + z_conf*sqrt(n*p*(1-p))

    if isinstance(cloneset, SearchableCloneSet):
        scls = cloneset
    else:
        scls = SearchableCloneSet(cloneset)
    del cloneset

    # Calculate binomial pmf for current sequence length and mismatch_rate.
    hd2prob = [binom_pmf(hd, seq_len, mismatch_rate)
               for hd in range(seq_len + 1)]
    clone2mergeinfo = {}
    for member in scls:
        clone2mergeinfo[member] = MergeInfo(member, hd2prob, z_conf)

    # NOTE(review): max() is order-dependent when NaN values are present, so
    # whether a single NaN maxhd ends up here depends on iteration order --
    # confirm whether MergeInfo.maxhd can be NaN for individual clones.
    cycle_maxhd = max(info.maxhd for info in clone2mergeinfo.values())

    def basemsg():
        return "IMerge (len: %s): " % seq_len

    def fullbasemsg():
        return ("IMerge (len: %s;cycle_maxhd: %s;#clones: %s;#sequences: %s;"
                "#bases: %s;#mutations: %s;max_mutations: %s): " % (
                    seq_len, cycle_maxhd, scls.count, scls.sequence_count,
                    scls.base_count, scls.mutation_count,
                    max_mutation_count))

    if isnan(cycle_maxhd):
        # Previously this added the (uncalled) message function to a string
        # and %-formatted a string without placeholders: a guaranteed
        # TypeError instead of a clean early exit.
        logger.info(fullbasemsg() + "stopping because cycle_maxhd is NaN")
        return scls

    logger.info(fullbasemsg() + "start")
    for cycle_hd in range(1, cycle_maxhd + 1):
        logger.debug(basemsg() + "cycle %s" % cycle_hd)
        # Iterate over a sorted snapshot (largest clones first, ties on
        # sequence); clones merged away during the cycle are skipped.
        for clone in sorted(scls, key=lambda c: (-c.count, c.seq)):
            if clone not in scls:
                logger.debug(basemsg() + "%s not in scls" % clone.seq)
                continue

            mergeinfo = clone2mergeinfo[clone]

            if cycle_hd > mergeinfo.maxhd:
                logger.debug(basemsg() + "%s (maxhd: %s; cycle_hd: %s)" % (
                    clone.seq, mergeinfo.maxhd, cycle_hd))
                continue

            mergeinfo.sweep += mergeinfo.hd2nes[cycle_hd]

            logger.debug(basemsg() + "seq: %s (%s; sweep: %s)" % (
                clone.seq, clone.count, mergeinfo.sweep))

            # Nearest neighbors first; within a distance, smallest first.
            candidates = sorted(mergeinfo.neighborhood(scls, cycle_hd),
                                key=lambda pair: (pair[0], pair[1].count))
            for hd, neighbor in candidates:
                if clone == neighbor or neighbor not in scls:
                    continue
                if hd > cycle_hd:
                    break
                if mergeinfo.sweep >= neighbor.count:
                    mc_diff, merged_clone = scls.premerge(clone, neighbor)

                    if merged_clone is None:
                        continue

                    if scls.mutation_count + mc_diff > max_mutation_count:
                        logger.debug(
                            fullbasemsg() +
                            "mc_diff = %s,clone.count = %s, "
                            "neighbor.count = %s" % (
                                mc_diff, clone.count, neighbor.count))
                        continue

                    logger.debug(basemsg() + "merging: %s(%s) %s(%s)" % (
                        clone.seq, clone.count, neighbor.seq,
                        neighbor.count))
                    scls.enact_merge(clone, neighbor, merged_clone)

                    # Hand the bookkeeping over to the merged clone and go
                    # on absorbing neighbors for it.
                    mergeinfo.sweep -= neighbor.count
                    del clone2mergeinfo[neighbor]
                    del clone2mergeinfo[clone]
                    mergeinfo.clone = merged_clone
                    clone2mergeinfo[merged_clone] = mergeinfo
                    clone = merged_clone
    logger.info(fullbasemsg() + "end")
    return scls
Beispiel #6
0
def run_qmerge_on_bin(cloneset, mismatch_rate, confidence, max_Q):
    """Run the QMerge algorithm on one bin of clones.

    Candidate merges ("tasks") for all clone pairs within ``max_hd`` are
    pushed on a heap via ``add_task`` and popped one at a time. A popped
    task is enacted only if both clones are still in the set, ``premerge``
    yields a merged clone, the mutation budget is respected, and the stats
    from ``clone_merge_stats`` satisfy ``Qm <= max_Qm`` and
    ``max_mut_q < max_Q``.

    :param cloneset: a set of clones; the sequence length is taken from an
        arbitrary member, so presumably all sequences have equal length.
    :param mismatch_rate: per-base mismatch probability, must be < 1.0.
    :param confidence: confidence level passed to ``norm_ppf`` to derive the
        mutation budget.
    :param max_Q: merges whose ``max_mut_q`` is at or above this value are
        skipped; also forwarded to ``add_task``.
    :returns: ``cloneset`` itself when nothing is to be done (zero mismatch
        rate, empty set, or no Hamming distance >= 1 with at least one
        expected sequence); otherwise the ``SearchableCloneSet`` on which
        the merges were enacted.
    """
    if mismatch_rate == 0 or len(cloneset) == 0:
        return cloneset

    assert(mismatch_rate < 1.0)

    seq_len = len(next(iter(cloneset)).seq)
    z_conf = norm_ppf(confidence)
    n = float(cloneset.base_count)
    p = float(mismatch_rate)
    # Binomial mean n*p plus z_conf standard deviations.
    max_mutation_count = n*p + z_conf*sqrt(n*p*(1-p))

    # Calculate binomial pmf for current sequence length and mismatch_rate.
    hd2prob = [binom_pmf(hd, seq_len, mismatch_rate)
               for hd in range(seq_len + 1)]

    # Expected number of sequences per Hamming distance.
    sequence_count = cloneset.sequence_count
    hd2nes = [sequence_count * prob for prob in hd2prob]

    max_Qm = int(ceil(10 * log10(cloneset.base_count)))

    # Pick the largest distance with at least one expected sequence.
    # Taking max() over a list that contains NaN placeholders (as was done
    # before) depends on element order: a NaN in first position is never
    # replaced, so hd2nes[0] < 1 disabled QMerge even when larger distances
    # qualified.
    max_hd = None
    for hd, nes in enumerate(hd2nes):
        if nes >= 1.0:
            max_hd = hd
    if max_hd is None or max_hd < 1:
        logger.info("maxhd < 1, not performing QMerge (len: %s)" % seq_len)
        return cloneset

    if isinstance(cloneset, SearchableCloneSet):
        scls = cloneset
    else:
        scls = SearchableCloneSet(cloneset)
    del cloneset

    tasks = Heap()

    def basemsg():
        return "QMerge (len: %s): " % seq_len

    def fullbasemsg():
        return ("QMerge (len: %s;max_hd: %s, max_Qm: %s;max_Q: %s;"
                "#clones: %s;#sequences: %s;#bases: %s;#mutations: %s;"
                "max_mutations: %s;#tasks: %s): " % (
                    seq_len, max_hd, max_Qm, max_Q, scls.count,
                    scls.sequence_count, scls.base_count,
                    scls.mutation_count, max_mutation_count, len(tasks)))

    logger.info(fullbasemsg() + "start")
    clone2task = {}
    for hd, clone1, clone2 in scls.pairs(seq_len, max_hd):
        add_task(scls, clone2task, tasks, clone1, clone2, max_Qm, max_Q)
    logger.debug(fullbasemsg() + "finished searching for pairs")

    # Holds the outcome of the latest premerge. Initialised here because the
    # stale-task branch reads it and would otherwise rely on an earlier
    # iteration having assigned it.
    merged_clone = None
    while len(tasks) > 0 and scls.mutation_count <= max_mutation_count:
        task = tasks.pop()
        clone1, clone2 = task.clone1, task.clone2

        # Remove current task from clone2task dict to keep it in sync with
        # the tasks heap.
        if clone1 in clone2task and task == clone2task[clone1]:
            del clone2task[clone1]
        if clone2 in clone2task and task == clone2task[clone2]:
            del clone2task[clone2]

        # Check if task can be performed
        if clone1 not in scls or clone2 not in scls:
            # Task cannot be performed, need to search for an alternate
            # "best task" for one or both clones.
            for clone in (clone1, clone2):
                if merged_clone != clone and clone in scls:
                    # New tasks may need to be created
                    for hd, neighbor in scls.neighbors(clone, max_hd):
                        add_task(scls, clone2task, tasks, clone, neighbor,
                                 max_Qm, max_Q)
            continue

        # Recalculate merge because clones may have changed by other merges.
        mc_diff, merged_clone = scls.premerge(clone1, clone2)

        if merged_clone is None:
            continue

        if scls.mutation_count + mc_diff > max_mutation_count:
            continue

        hd, Qm, max_mut_q = clone_merge_stats(clone1, clone2)
        if Qm > max_Qm:
            continue

        if max_mut_q >= max_Q:
            continue

        logger.debug(basemsg() + "merging: %s(%s) %s(%s)" % (
            clone1.seq, clone1.count, clone2.seq, clone2.count))

        # Has to be checked before enact_merge adds merged_clone to the set.
        new_clone = merged_clone not in scls
        scls.enact_merge(clone1, clone2, merged_clone)

        if new_clone:
            logger.debug(basemsg() + "new clone: %s" % (merged_clone.seq))
            for hd, neighbor in scls.neighbors(merged_clone, max_hd):
                add_task(scls, clone2task, tasks, merged_clone, neighbor,
                         max_Qm, max_Q)

    logger.info(fullbasemsg() + "end")

    return scls