def tree_threeway_counts(tree, lca_depths, alphabet=DnaPairs, attr='Sequence'):
    """Build three-way Counts objects for every triple of traversed nodes.

    tree: object providing traverse() (whose nodes are used as the leaves)
        and outgroupLast(a, b, c), which returns the same three nodes
        reordered so that the third one is treated as the outgroup.
    lca_depths: kept in the signature for callers; not used by this function.
    alphabet: pair alphabet handed to Counts.fromTriple.
    attr: name of the node attribute that holds the sequence.

    Returns a dict (not an array). For each triple it holds two entries,
    keyed by node Ids: (i, j, k) and (j, i, k), where k is the outgroup.
    If there is a polytomy, the outgroup is whichever node outgroupLast
    puts last (originally documented as an arbitrary pick).

    Note: every traversed node must already carry the attribute named by
    attr, otherwise getattr raises AttributeError.
    """
    reorder = tree.outgroupLast
    nodes = list(tree.traverse())
    triple_counts = {}
    for node_a, node_b, node_c in three_item_combos(nodes):
        ordered = reorder(node_a, node_b, node_c)
        in_1, in_2, out = ordered
        # Pull the sequences in outgroup-last order.
        seq_in_1, seq_in_2, seq_out = [getattr(node, attr) for node in ordered]

        # One entry per ordering of the two non-outgroup members.
        forward = Counts.fromTriple(seq_in_1, seq_in_2, seq_out, alphabet)
        triple_counts[(in_1.Id, in_2.Id, out.Id)] = forward
        backward = Counts.fromTriple(seq_in_2, seq_in_1, seq_out, alphabet)
        triple_counts[(in_2.Id, in_1.Id, out.Id)] = backward
    return triple_counts
Exemple #2
0
def tree_threeway_counts(tree, lca_depths, alphabet=DnaPairs, attr='Sequence'):
    """Build three-way Counts objects for every triple of traversed nodes.

    tree: object providing traverse() (whose nodes are used as the leaves)
        and outgroupLast(a, b, c), which returns the same three nodes
        reordered so that the third one is treated as the outgroup.
    lca_depths: kept in the signature for callers; not used by this function.
    alphabet: pair alphabet handed to Counts.fromTriple.
    attr: name of the node attribute that holds the sequence.

    Returns a dict (not an array). For each triple it holds two entries,
    keyed by node Ids: (i, j, k) and (j, i, k), where k is the outgroup.
    If there is a polytomy, the outgroup is whichever node outgroupLast
    puts last (originally documented as an arbitrary pick).

    Note: every traversed node must already carry the attribute named by
    attr, otherwise getattr raises AttributeError.
    """
    reorder = tree.outgroupLast
    nodes = list(tree.traverse())
    triple_counts = {}
    for node_a, node_b, node_c in three_item_combos(nodes):
        ordered = reorder(node_a, node_b, node_c)
        in_1, in_2, out = ordered
        # Pull the sequences in outgroup-last order.
        seq_in_1, seq_in_2, seq_out = [getattr(node, attr) for node in ordered]

        # One entry per ordering of the two non-outgroup members.
        forward = Counts.fromTriple(seq_in_1, seq_in_2, seq_out, alphabet)
        triple_counts[(in_1.Id, in_2.Id, out.Id)] = forward
        backward = Counts.fromTriple(seq_in_2, seq_in_1, seq_out, alphabet)
        triple_counts[(in_2.Id, in_1.Id, out.Id)] = backward
    return triple_counts
def tree_threeway_counts_sample(tree, lca_depths, alphabet=DnaPairs, \
    attr='Sequence', n=1000, check_rates=True, clean_f=None):
    """Like tree_threeway_counts, but takes random sample (w/o replacement).

    tree: object providing traverse() (whose nodes are used as the leaves)
        and outgroupLast(a, b, c), which reorders a triple so the outgroup
        comes last.
    lca_depths: only forwarded to tree_threeway_counts; unused when sampling.
    alphabet: pair alphabet handed to Counts.fromTriple.
    attr: name of the node attribute that holds the sequence.
    n: maximum number of sampled triples to accept.
    check_rates: when sampling, skip triples whose counts fail the zero-row
        test or whose counts.toProbs().toRates() raises a numeric error.
    clean_f: optional callable applied to every Counts object before it is
        checked and stored.

    Returns a dict mapping (Id, Id, outgroup Id) to Counts objects.

    If the tree has fewer than n triples, every triple is computed with
    tree_threeway_counts instead (clean_f is applied, check_rates is not).
    Otherwise at most n entries are returned; there may be fewer, because
    different draws can map to the same key and rejected triples are not
    retried.
    """
    leaves = list(tree.traverse())
    num_leaves = len(leaves)
    #do normal threeway counts if number of triples < n
    #(product of three consecutive ints is divisible by 3, so // is exact)
    num_triples = num_leaves * (num_leaves - 1) * (num_leaves - 2) // 3
    if num_triples < n:
        counts = tree_threeway_counts(tree, lca_depths, alphabet, attr)
        if clean_f:
            return dict([(k, clean_f(v)) for k, v in counts.items()])
        return counts
    #if we got here, need to sample
    outgroup_last = tree.outgroupLast
    accepted = 0
    seen = set()    #ordered draws already examined, accepted or rejected
    result = {}
    #the len(seen) bound guarantees termination even if every triple is
    #rejected by the rate check
    while accepted < n and len(seen) < num_triples:
        curr = choice(leaves), choice(leaves), choice(leaves)
        ids = [c.Id for c in curr]
        if len(set(ids)) < len(curr):   #picked same thing twice
            continue
        if curr in seen:                #resampling same combo
            continue
        seen.add(curr)
        new_first, new_second, new_third = outgroup_last(*curr)
        seq_1 = getattr(new_first, attr)
        seq_2 = getattr(new_second, attr)
        seq_3 = getattr(new_third, attr)

        counts = Counts.fromTriple(seq_1, seq_2, seq_3, alphabet)
        if clean_f:
            counts = clean_f(counts)
        key = (new_first.Id, new_second.Id, new_third.Id)
        #check rates if we need to
        if check_rates:
            try:
                #skip probs with zero rows
                if not min(max(counts._data, 1)):
                    continue
                #called only to find out whether the conversion raises
                counts.toProbs().toRates()
            except (ZeroDivisionError, OverflowError, ValueError, \
                FloatingPointError):
                continue
        #store the counts whether or not rates were checked
        result[key] = counts
        accepted += 1
    return result
Exemple #4
0
def tree_threeway_counts_sample(tree, lca_depths, alphabet=DnaPairs, \
    attr='Sequence', n=1000, check_rates=True, clean_f=None):
    """Like tree_threeway_counts, but takes random sample (w/o replacement).

    tree: object providing traverse() (whose nodes are used as the leaves)
        and outgroupLast(a, b, c), which reorders a triple so the outgroup
        comes last.
    lca_depths: only forwarded to tree_threeway_counts; unused when sampling.
    alphabet: pair alphabet handed to Counts.fromTriple.
    attr: name of the node attribute that holds the sequence.
    n: maximum number of sampled triples to accept.
    check_rates: when sampling, skip triples whose counts fail the zero-row
        test or whose counts.toProbs().toRates() raises a numeric error.
    clean_f: optional callable applied to every Counts object before it is
        checked and stored.

    Returns a dict mapping (Id, Id, outgroup Id) to Counts objects.

    If the tree has fewer than n triples, every triple is computed with
    tree_threeway_counts instead (clean_f is applied, check_rates is not).
    Otherwise at most n entries are returned; there may be fewer, because
    different draws can map to the same key and rejected triples are not
    retried.
    """
    leaves = list(tree.traverse())
    num_leaves = len(leaves)
    #do normal threeway counts if number of triples < n
    #(product of three consecutive ints is divisible by 3, so // is exact)
    num_triples = num_leaves * (num_leaves - 1) * (num_leaves - 2) // 3
    if num_triples < n:
        counts = tree_threeway_counts(tree, lca_depths, alphabet, attr)
        if clean_f:
            return dict([(k, clean_f(v)) for k, v in counts.items()])
        return counts
    #if we got here, need to sample
    outgroup_last = tree.outgroupLast
    accepted = 0
    seen = set()    #ordered draws already examined, accepted or rejected
    result = {}
    #the len(seen) bound guarantees termination even if every triple is
    #rejected by the rate check
    while accepted < n and len(seen) < num_triples:
        curr = choice(leaves), choice(leaves), choice(leaves)
        ids = [c.Id for c in curr]
        if len(set(ids)) < len(curr):   #picked same thing twice
            continue
        if curr in seen:                #resampling same combo
            continue
        seen.add(curr)
        new_first, new_second, new_third = outgroup_last(*curr)
        seq_1 = getattr(new_first, attr)
        seq_2 = getattr(new_second, attr)
        seq_3 = getattr(new_third, attr)

        counts = Counts.fromTriple(seq_1, seq_2, seq_3, alphabet)
        if clean_f:
            counts = clean_f(counts)
        key = (new_first.Id, new_second.Id, new_third.Id)
        #check rates if we need to
        if check_rates:
            try:
                #skip probs with zero rows
                if not min(max(counts._data, 1)):
                    continue
                #called only to find out whether the conversion raises
                counts.toProbs().toRates()
            except (ZeroDivisionError, OverflowError, ValueError, \
                FloatingPointError):
                continue
        #store the counts whether or not rates were checked
        result[key] = counts
        accepted += 1
    return result
def tree_twoway_counts(tree, alphabet=DnaPairs, average=True, attr='Sequence'):
    """Build pairwise Counts objects for the nodes yielded by tree.traverse().

    tree: object providing traverse(); each yielded node needs an Id and
        the attribute named by attr.
    alphabet: pair alphabet handed to Counts.fromPair.
    average: if true, each pair gets a single entry under (i, j), built
        with Counts.fromPair's defaults. If false, both (i, j) and (j, i)
        are stored, each built with False as the fourth positional argument
        (presumably switching off averaging -- confirm against fromPair).
    attr: name of the node attribute that holds the sequence.

    Returns a dict keyed by tuples of node Ids.
    """
    nodes = list(tree.traverse())
    single_entry = bool(average)
    pair_counts = {}
    for node_a, node_b in two_item_combos(nodes):
        seq_a = getattr(node_a, attr)
        seq_b = getattr(node_b, attr)
        if single_entry:
            # One entry covers the pair; only the (a, b) key is filled.
            pair_counts[(node_a.Id, node_b.Id)] = \
                Counts.fromPair(seq_a, seq_b, alphabet)
            continue
        # Directional counts: store each ordering separately.
        pair_counts[(node_a.Id, node_b.Id)] = \
            Counts.fromPair(seq_a, seq_b, alphabet, False)
        pair_counts[(node_b.Id, node_a.Id)] = \
            Counts.fromPair(seq_b, seq_a, alphabet, False)
    return pair_counts
Exemple #6
0
def tree_twoway_counts(tree, alphabet=DnaPairs, average=True, attr='Sequence'):
    """Build pairwise Counts objects for the nodes yielded by tree.traverse().

    tree: object providing traverse(); each yielded node needs an Id and
        the attribute named by attr.
    alphabet: pair alphabet handed to Counts.fromPair.
    average: if true, each pair gets a single entry under (i, j), built
        with Counts.fromPair's defaults. If false, both (i, j) and (j, i)
        are stored, each built with False as the fourth positional argument
        (presumably switching off averaging -- confirm against fromPair).
    attr: name of the node attribute that holds the sequence.

    Returns a dict keyed by tuples of node Ids.
    """
    nodes = list(tree.traverse())
    single_entry = bool(average)
    pair_counts = {}
    for node_a, node_b in two_item_combos(nodes):
        seq_a = getattr(node_a, attr)
        seq_b = getattr(node_b, attr)
        if single_entry:
            # One entry covers the pair; only the (a, b) key is filled.
            pair_counts[(node_a.Id, node_b.Id)] = \
                Counts.fromPair(seq_a, seq_b, alphabet)
            continue
        # Directional counts: store each ordering separately.
        pair_counts[(node_a.Id, node_b.Id)] = \
            Counts.fromPair(seq_a, seq_b, alphabet, False)
        pair_counts[(node_b.Id, node_a.Id)] = \
            Counts.fromPair(seq_b, seq_a, alphabet, False)
    return pair_counts
def dna_count_cleaner(counts):
    """Trim a Counts object down to its leading 4x4 block.

    counts: object whose _data supports two-dimensional slicing.

    Returns a new Counts built from the first four rows and columns of
    counts._data with the DnaPairs alphabet (i.e. just the 4-letter DNA
    alphabet, per the original description).
    """
    four_by_four = counts._data[:4, :4]
    return Counts(four_by_four, DnaPairs)