Esempio n. 1
0
def majority_consensus(trees, cutoff=0):
    """Build an extended majority-rule consensus tree from several trees.

    Clades are taken in order of decreasing occurrence count and added to
    the consensus as long as their support (percentage of input trees that
    contain them) is not below ``cutoff * 100`` and they are compatible
    with every clade accepted so far. With the default cutoff of 0 no
    clade is rejected for low support. Each accepted clade stores its
    support percentage as ``confidence`` and the mean of its summed
    branch lengths as ``branch_length``.

    :Parameters:
        trees : iterable
            iterable of trees to produce consensus tree.
        cutoff : number
            support threshold as a fraction; it is multiplied by 100
            before being compared with the percentage support.

    Returns a ``BaseTree.Tree`` built on the consensus root clade.
    Raises ``ValueError`` when the most frequent bitstring does not cover
    all terminals of the first tree.
    """
    tree_iter = iter(trees)
    first_tree = next(tree_iter)

    terms = first_tree.get_terminals()
    bitstr_counts, tree_count = _count_clades(
        itertools.chain([first_tree], tree_iter))

    def _rank(bs):
        # occurrences first, then number of tips, then the bit pattern
        return (bitstr_counts[bs][0], bs.count('1'), str(bs))

    ordered = sorted(bitstr_counts.keys(), key=_rank, reverse=True)
    root_bitstr = ordered[0]
    root = BaseTree.Clade()
    if root_bitstr.count('1') != len(terms):
        raise ValueError('Taxons in provided trees should be consistent')
    root.clades.extend(terms)

    # maps every accepted bitstring to the clade object representing it
    bitstr_clades = {root_bitstr: root}
    threshold = cutoff * 100.0
    n_terms = len(terms)

    for bitstr in ordered[1:]:
        occurrences, length_total = bitstr_counts[bitstr]
        support = 100.0 * occurrences / tree_count
        # candidates are sorted by occurrence, so nothing later can pass
        if support < threshold:
            break

        clade_terms = [terms[idx] for idx in bitstr.index_one()]
        clade = BaseTree.Clade()
        clade.clades.extend(clade_terms)
        clade.confidence = support
        clade.branch_length = length_total / occurrences

        # accepted bitstrings, largest clade first
        by_size = sorted(bitstr_clades,
                         key=lambda bs: bs.count('1'),
                         reverse=True)

        conflicting = False
        parent_bitstr = None
        child_bitstrs = []  # accepted clades nested directly inside this one
        for accepted in by_size:
            if not accepted.iscompatible(bitstr):
                conflicting = True
                break
            # the scan goes from large to small, so the last container
            # seen is the tightest one
            if accepted.contains(bitstr):
                parent_bitstr = accepted
            # keep only the largest mutually independent sub-clades
            if (bitstr.contains(accepted) and accepted != bitstr
                    and all(kept.independent(accepted)
                            for kept in child_bitstrs)):
                child_bitstrs.append(accepted)
        if conflicting:
            continue

        if parent_bitstr:
            # take the parent out, swap the tips now owned by the new
            # clade for the clade itself, then register the parent again
            parent_clade = bitstr_clades.pop(parent_bitstr)
            parent_clade.clades = [
                node for node in parent_clade.clades
                if node not in clade_terms
            ]
            parent_clade.clades.append(clade)
            bitstr_clades[parent_bitstr] = parent_clade

        if child_bitstrs:
            absorbed_terms = []
            for child_bs in child_bitstrs:
                absorbed_terms.extend(
                    terms[idx] for idx in child_bs.index_one())
                child_clade = bitstr_clades[child_bs]
                # re-hang the sub-clade from the parent onto the new clade
                parent_clade.clades.remove(child_clade)
                clade.clades.append(child_clade)
            # tips covered by a moved sub-clade must not stay as direct
            # children of the new clade
            clade.clades = [
                node for node in clade.clades
                if node not in absorbed_terms
            ]

        bitstr_clades[bitstr] = clade

        # stop once the recorded clade count reaches one of the two
        # terminal-count-based limits
        n_recorded = len(bitstr_clades)
        if n_recorded == n_terms - 1:
            break
        if n_recorded == n_terms - 2 and len(root.clades) == 3:
            break
    return BaseTree.Tree(root=root)
Esempio n. 2
0
def majority_consensus(trees, cutoff=0, mcmc=False, n=1):
    """Search majority rule consensus tree(s) from multiple trees.

    This is an extended majority rule method, which means that you can set
    any cutoff between 0 and 1 instead of 0.5. With the default cutoff of 0
    no clade is rejected for low support. Each consensus clade stores the
    fraction of trees that contain it (``count / tree_count``, a value
    between 0 and 1) as ``confidence`` and the average of its summed branch
    lengths as ``branch_length``.

    Up to ``n`` consensus trees are built. The first one considers the
    bitstrings in sorted order; every later one is seeded with a bitstring
    that was found incompatible while building an earlier tree.

    :Parameters:
        trees : iterable
            iterable of trees to produce consensus tree or a list of tuples
            output of mcmc if mcmc=True, tuples like
            (tree, number of occurrences in MCMC)

        cutoff : float
            Must be between 0 and 1. cutoff=0.5 means that all clades in the
            consensus tree must occur in at least 50% of trees, cutoff=1 is
            the same as strict consensus.

        mcmc : Boolean
            True if parameter trees is an iterable of tuples, output of mcmc

        n : integer
            Maximum number of best consensus trees returned - if the number
            is too big, it may be impossible to produce that many different
            consensus trees.

    :Returns:
        A single ``BaseTree.Tree`` when ``n == 1``; otherwise a list of
        ``BaseTree.Tree`` objects (possibly shorter than ``n``).

    :Raises:
        ValueError
            if ``cutoff`` is outside [0, 1], or if the top-ranked bitstring
            does not cover all terminals of the first tree.

    """
    if not (0 <= cutoff <= 1):
        raise ValueError("Cutoff must be a number between 0 and 1")
    tree_iter = iter(trees)
    # NOTE: an empty ``trees`` iterable makes next() raise StopIteration here
    first_tree = next(tree_iter)
    if mcmc:
        # in mcmc mode each item is a tuple whose first element is the tree
        terms = first_tree[0].get_terminals()
        term_names = [term.name for term in terms]
        bitstr_counts, tree_count = _count_clades_mcmc(itertools.chain([first_tree], tree_iter), term_names)
    else:
        terms = first_tree.get_terminals()
        term_names = [term.name for term in terms]
        bitstr_counts, tree_count = _count_clades(itertools.chain([first_tree], tree_iter), term_names)
    # Sort bitstrs by descending #occurrences, then #tips, then tip order
    bitstrs = sorted(
        bitstr_counts.keys(),
        key=lambda bitstr: (bitstr_counts[bitstr][0], bitstr.count("1"), str(bitstr)),
        reverse=True,
    )
    # the top-ranked bitstring is used as the root and must cover every terminal
    if not bitstrs[0].count("1") == len(terms):
        raise ValueError("Taxons in provided trees should be consistent")
    # Queue of seed lists; each seed list is tried before the regular sorted
    # order when building one consensus tree. The first seed is the root
    # bitstring itself, which is skipped inside the loop, so the first tree
    # follows the plain sorted order.
    possible_starts = queue.Queue()
    possible_starts.put([bitstrs[0]])
    # bitstrings accepted into any tree built so far (shared across trees)
    clades_used = set()
    consensus_trees = []
    # we will try to produce n different consensus trees, starting with bitstrings
    # that were not compatible with previous trees
    while len(consensus_trees) < n and not possible_starts.empty():
        # every tree starts as a star: a fresh root holding all terminals
        # (the same terminal objects are reused for each tree)
        root = BaseTree.Clade()
        root.clades.extend(terms)

        # Make a bitstr-to-clades dict and store root clade
        bitstr_clades = {bitstrs[0]: root}
        new_start = possible_starts.get()
        new_start_appeared = 0
        for bitstr in itertools.chain(new_start, bitstrs[1:]):
            # the seed also occurs in bitstrs[1:]; process it only the first
            # time it is seen
            if bitstr == new_start[0]:
                new_start_appeared += 1
                if new_start_appeared > 1:
                    continue
            # the root bitstring is already in the tree
            if bitstr == bitstrs[0]:
                continue
            # apply majority rule
            count_in_trees, branch_length_sum = bitstr_counts[bitstr]
            confidence = count_in_trees / tree_count
            if confidence < cutoff:
                break
            # create the candidate inner clade holding its terminals directly
            clade_terms = [terms[i] for i in bitstr.index_one()]
            clade = BaseTree.Clade()
            clade.clades.extend(clade_terms)
            clade.confidence = confidence
            clade.branch_length = branch_length_sum / count_in_trees
            # accepted bitstrings, largest clade first
            bsckeys = sorted(bitstr_clades, key=lambda bs: bs.count("1"), reverse=True)

            # check if current clade is compatible with previous clades and
            # record its possible parent and child clades.
            compatible = True
            parent_bitstr = None
            child_bitstrs = []  # multiple independent children
            for bs in bsckeys:
                if not bs.iscompatible(bitstr):
                    # remember a rejected bitstring as the seed of a later
                    # tree, unless some tree has already used it
                    if bitstr not in clades_used:
                        possible_starts.put([bitstr])
                    compatible = False
                    break
                # assign the closest ancestor as its parent
                # as bsckeys is sorted, it should be the last one
                if bs.contains(bitstr):
                    parent_bitstr = bs
                # assign the closest descendant as its child
                # the largest and independent clades
                if (
                    bitstr.contains(bs)
                    and bs != bitstr
                    and all(c.independent(bs) for c in child_bitstrs)
                ):
                    child_bitstrs.append(bs)
            if not compatible:
                continue

            if parent_bitstr:
                # insert current clade; remove old bitstring
                parent_clade = bitstr_clades.pop(parent_bitstr)
                # update parent clade children: drop the terminals that now
                # belong to the new clade
                parent_clade.clades = [
                    c for c in parent_clade.clades if c not in clade_terms
                ]
                # set current clade as child of parent_clade
                parent_clade.clades.append(clade)
                # update bitstring
                # parent = parent ^ bitstr
                # update clade (re-inserting moves the key to the end of the dict)
                bitstr_clades[parent_bitstr] = parent_clade

            if child_bitstrs:
                # NOTE(review): this branch uses parent_clade, which is only
                # assigned in the block above; it relies on a parent having
                # been found for this bitstring
                remove_list = []
                for c in child_bitstrs:
                    remove_list.extend(c.index_one())
                    child_clade = bitstr_clades[c]
                    # move the existing sub-clade from the parent to the new clade
                    parent_clade.clades.remove(child_clade)
                    clade.clades.append(child_clade)
                # terminals covered by a moved sub-clade must not remain as
                # direct children of the new clade
                remove_terms = [terms[i] for i in remove_list]
                clade.clades = [c for c in clade.clades if c not in remove_terms]
            # put new clade
            bitstr_clades[bitstr] = clade
            clades_used.add(bitstr)
        consensus_trees.append(BaseTree.Tree(root=root))
    # a single requested tree is returned bare, not wrapped in a list
    if n == 1:
        return consensus_trees[0]
    return consensus_trees
Esempio n. 3
0
    def nj(self, distance_matrix):
        """Construct and return a Neighbor Joining tree.

        The input matrix is deep-copied, so it is not modified. While more
        than two nodes remain, the pair minimising
        ``d(i, j) - r(i) - r(j)`` is joined under a new clade named
        ``"Inner<k>"``, where ``r`` is the row sum divided by
        ``len(dm) - 2``. The last remaining clade is then attached to the
        last inner clade, which becomes the root.

        Matrices with one or two taxa never enter the joining loop and are
        handled separately: a single taxon yields a tree whose root is that
        terminal; two taxa are joined under one clade named ``"Inner"`` with
        the distance split equally between the two branches.

        :Parameters:
            distance_matrix : DistanceMatrix
                The distance matrix for tree construction.

        Returns an unrooted ``BaseTree.Tree``.
        Raises ``TypeError`` if ``distance_matrix`` is not a
        ``DistanceMatrix``.
        """
        if not isinstance(distance_matrix, DistanceMatrix):
            raise TypeError("Must provide a DistanceMatrix object.")

        # make a copy of the distance matrix to be used
        dm = copy.deepcopy(distance_matrix)
        # init terminal clades
        clades = [BaseTree.Clade(None, name) for name in dm.names]
        # init node distance
        node_dist = [0] * len(dm)
        # init minimum index
        min_i = 0
        min_j = 0
        inner_count = 0

        # special cases for minimal matrices: the loop below would not run,
        # leaving inner_clade undefined for the final rooting step
        if len(dm) == 1:
            return BaseTree.Tree(clades[0], rooted=False)
        if len(dm) == 2:
            # the only pair is (1, 0); split its distance evenly
            clade1 = clades[1]
            clade2 = clades[0]
            clade1.branch_length = dm[1, 0] / 2.0
            clade2.branch_length = dm[1, 0] - clade1.branch_length
            inner_clade = BaseTree.Clade(None, "Inner")
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)
            return BaseTree.Tree(inner_clade, rooted=False)

        while len(dm) > 2:
            size = len(dm)
            # calculate nodeDist: row sum scaled by (size - 2)
            for i in range(size):
                node_dist[i] = sum(dm[i, j] for j in range(size)) / (size - 2)

            # find minimum distance pair; ties keep the earliest pair found
            min_dist = dm[1, 0] - node_dist[1] - node_dist[0]
            min_i = 0
            min_j = 1
            for i in range(1, size):
                for j in range(0, i):
                    temp = dm[i, j] - node_dist[i] - node_dist[j]
                    if min_dist > temp:
                        min_dist = temp
                        min_i = i
                        min_j = j
            # create clade
            clade1 = clades[min_i]
            clade2 = clades[min_j]
            inner_count += 1
            inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)
            # assign branch length
            clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] -
                                    node_dist[min_j]) / 2.0
            clade2.branch_length = dm[min_i, min_j] - clade1.branch_length

            # update node list: the new clade takes slot min_j
            clades[min_j] = inner_clade
            del clades[min_i]

            # rebuild distance matrix,
            # set the distances of new node at the index of min_j
            for k in range(size):
                if k != min_i and k != min_j:
                    dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] -
                                    dm[min_i, min_j]) / 2.0

            dm.names[min_j] = "Inner" + str(inner_count)
            del dm[min_i]

        # set the last clade as one of the child of the inner_clade
        root = None
        if clades[0] == inner_clade:
            clades[0].branch_length = 0
            clades[1].branch_length = dm[1, 0]
            clades[0].clades.append(clades[1])
            root = clades[0]
        else:
            clades[0].branch_length = dm[1, 0]
            clades[1].branch_length = 0
            clades[1].clades.append(clades[0])
            root = clades[1]

        return BaseTree.Tree(root, rooted=False)
Esempio n. 4
0
    def nj(self, distance_matrix):
        """Construct and return a Neighbor Joining tree (GPU node distances).

        Same joining procedure as the plain Neighbor Joining method, except
        that the per-node distance ``r(i) = sum_j d(i, j) / (N - 2)`` is
        computed by the ``DeviceNodeDist`` CUDA kernel compiled below
        (``SourceModule`` / ``cuda`` are presumably PyCUDA - confirm against
        the file's imports). The kernel indexes the flattened matrix as a
        lower triangle with row ``i`` starting at ``i * (i + 1) / 2``, so
        ``dm.matrix`` is assumed to be a list of rows of length ``i + 1``.

        Matrices with one or two taxa are handled without the GPU: a single
        taxon yields a tree whose root is that terminal; two taxa are joined
        under one clade named ``"Inner"`` with the distance split equally.

        :Parameters:
            distance_matrix : DistanceMatrix
                The distance matrix for tree construction; it is deep-copied
                and not modified.

        Returns an unrooted ``BaseTree.Tree``.
        Raises ``TypeError`` if ``distance_matrix`` is not a
        ``DistanceMatrix``.
        """
        if not isinstance(distance_matrix, DistanceMatrix):
            raise TypeError("Must provide a DistanceMatrix object.")

        # make a copy of the distance matrix to be used
        dm = copy.deepcopy(distance_matrix)
        # init terminal clades
        clades = [BaseTree.Clade(None, name) for name in dm.names]
        # init node distance
        node_dist = [0] * len(dm)
        # init minimum index
        min_i = 0
        min_j = 0
        inner_count = 0
        # special cases for Minimum Alignment Matrices
        if len(dm) == 1:
            root = clades[0]

            return BaseTree.Tree(root, rooted=False)
        elif len(dm) == 2:
            # minimum distance will always be [1,0]
            min_i = 1
            min_j = 0
            clade1 = clades[min_i]
            clade2 = clades[min_j]
            clade1.branch_length = dm[min_i, min_j] / 2.0
            clade2.branch_length = dm[min_i, min_j] - clade1.branch_length
            inner_clade = BaseTree.Clade(None, "Inner")
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)
            clades[0] = inner_clade
            root = clades[0]

            return BaseTree.Tree(root, rooted=False)

        mod = SourceModule("""
                          #include <stdio.h>
                          #include <stdlib.h>
                          __global__ void DeviceNodeDist(float *device_dm, float *device_node_dist, float N)
                          {
                              const int tid = threadIdx.y + blockIdx.y* blockDim.y;
                              if (tid >= N) return;
                              for(int i = 0; i< N; i++){
                                if(tid< i){
                                    device_node_dist[tid] += device_dm[(i*(i+1))/2 + tid];

                                }else{
                                    device_node_dist[tid] += device_dm[(tid*(tid+1))/2 + i];

                                }

                               }

                          device_node_dist[tid]= device_node_dist[tid]/ (N-2.0);
                          }""")
        # look the kernel up once instead of on every iteration
        device_node_dist_kernel = mod.get_function("DeviceNodeDist")

        while len(dm) > 2:
            length = len(dm)

            # calculate nodeDist
            # flatten the matrix rows into one float32 array for the device
            flat_dm = []
            for row in dm.matrix:
                flat_dm.extend(row)
            host_dm = np.array(flat_dm, dtype=np.float32)
            host_node_dist = np.zeros(length, dtype=np.float32)

            # choose the block size from the dataset size
            if length < 128:
                block_size = 128
            elif length < 256:
                block_size = 256
            elif length < 512:
                block_size = 512
            else:
                block_size = 1024

            block_dim = (1, block_size, 1)
            # integer division: grid dimensions must be integers
            grid_dim = (1, length // block_size + 1, 1)

            # allocate GPU device memory
            device_dm = cuda.mem_alloc(host_dm.nbytes)
            device_node_dist = cuda.mem_alloc(host_node_dist.nbytes)
            try:
                # copy inputs from host to device
                cuda.memcpy_htod(device_dm, host_dm)
                # the kernel accumulates with +=, so the output buffer must
                # start from zeros rather than uninitialised device memory
                cuda.memcpy_htod(device_node_dist, host_node_dist)

                finished = cuda.Event()
                device_node_dist_kernel(device_dm, device_node_dist,
                                        np.float32(length),
                                        block=block_dim, grid=grid_dim)
                finished.record()
                finished.synchronize()

                cuda.memcpy_dtoh(host_node_dist, device_node_dist)
            finally:
                # release device buffers even if the launch or a copy fails
                device_dm.free()
                device_node_dist.free()

            node_dist[0:length] = host_node_dist.tolist()

            # minimum distance calculation; ties keep the earliest pair
            min_dist = dm[1, 0] - node_dist[1] - node_dist[0]
            min_i = 0
            min_j = 1
            for i in range(1, length):
                for j in range(0, i):
                    temp = dm[i, j] - node_dist[i] - node_dist[j]
                    if min_dist > temp:
                        min_dist = temp
                        min_i = i
                        min_j = j

            # create clade
            clade1 = clades[min_i]
            clade2 = clades[min_j]
            inner_count += 1
            inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)
            # assign branch length
            clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] -
                                    node_dist[min_j]) / 2.0
            clade2.branch_length = dm[min_i, min_j] - clade1.branch_length

            # update node list: the new clade takes slot min_j
            clades[min_j] = inner_clade
            del clades[min_i]

            # rebuild distance matrix,
            # set the distances of new node at the index of min_j
            for k in range(length):
                if k != min_i and k != min_j:
                    dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] -
                                    dm[min_i, min_j]) / 2.0

            dm.names[min_j] = "Inner" + str(inner_count)
            del dm[min_i]

        # set the last clade as one of the child of the inner_clade
        root = None
        if clades[0] == inner_clade:
            clades[0].branch_length = 0
            clades[1].branch_length = dm[1, 0]
            clades[0].clades.append(clades[1])
            root = clades[0]
        else:
            clades[0].branch_length = dm[1, 0]
            clades[1].branch_length = 0
            clades[1].clades.append(clades[0])
            root = clades[1]

        return BaseTree.Tree(root, rooted=False)
Esempio n. 5
0
    def upgma(self, distance_matrix):
        """Construct and return an UPGMA tree.

        Constructs and returns an Unweighted Pair Group Method
        with Arithmetic mean (UPGMA) tree.

        The input matrix is deep-copied, so it is not modified. While more
        than one node remains, the pair with the smallest distance is joined
        under a new clade named ``"Inner<k>"`` (on ties the last pair
        scanned wins). Each child is placed at half that distance, minus
        its own height (``self._height_of``) when it is not a terminal. The
        merged node's distance to every other node is the mean of the two
        joined rows.

        A single-taxon matrix never enters the joining loop; the lone
        terminal is returned as the root with branch length 0.

        :Parameters:
            distance_matrix : DistanceMatrix
                The distance matrix for tree construction.

        Returns a ``BaseTree.Tree`` whose root has ``branch_length`` 0.
        Raises ``TypeError`` if ``distance_matrix`` is not a
        ``DistanceMatrix``.
        """
        if not isinstance(distance_matrix, DistanceMatrix):
            raise TypeError("Must provide a DistanceMatrix object.")

        # make a copy of the distance matrix to be used
        dm = copy.deepcopy(distance_matrix)
        # init terminal clades
        clades = [BaseTree.Clade(None, name) for name in dm.names]

        # single taxon: there is no pair to join, and the loop below would
        # leave inner_clade undefined
        if len(dm) == 1:
            root = clades[0]
            root.branch_length = 0
            return BaseTree.Tree(root)

        # init minimum index
        min_i = 0
        min_j = 0
        inner_count = 0
        while len(dm) > 1:
            min_dist = dm[1, 0]
            # find minimum index; ">=" means the first comparison (1, 0)
            # always resets min_i/min_j for this round
            for i in range(1, len(dm)):
                for j in range(0, i):
                    if min_dist >= dm[i, j]:
                        min_dist = dm[i, j]
                        min_i = i
                        min_j = j

            # create clade
            clade1 = clades[min_i]
            clade2 = clades[min_j]
            inner_count += 1
            inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)
            # assign branch length: the new node sits at half the pair
            # distance; inner children subtract their own height
            half_dist = min_dist / 2.0
            for child in (clade1, clade2):
                if child.is_terminal():
                    child.branch_length = half_dist
                else:
                    child.branch_length = half_dist - self._height_of(child)

            # update node list: the new clade takes slot min_j
            clades[min_j] = inner_clade
            del clades[min_i]

            # rebuild distance matrix,
            # set the distances of new node at the index of min_j
            for k in range(0, len(dm)):
                if k != min_i and k != min_j:
                    dm[min_j, k] = (dm[min_i, k] + dm[min_j, k]) / 2.0

            dm.names[min_j] = "Inner" + str(inner_count)

            del dm[min_i]
        # the last clade created is the root
        inner_clade.branch_length = 0
        return BaseTree.Tree(inner_clade)