Beispiel #1
0
    def __init__(self, transmat, tree, ncat=1, alpha=1):
        """
        Initialise the simulator with a transition matrix and a tree.

        For every node yielded by ``tree.preorder(skip_seed=True)`` a stack
        of transition probability matrices, one per gamma rate category, is
        precomputed and stored on the node as ``node.pmats`` (an array of
        shape ``(ncat, nstates, nstates)``).

        The tree should have branch lengths. Edges whose length is missing
        (falsy) or zero are treated as length 0; if any are found a single
        warning is issued and initialisation continues.

        :param transmat: transition matrix object; this method reads its
            ``model.states``, ``model.size`` and ``freqs`` attributes and
            calls ``get_p_matrix(length)`` on it.
        :param tree: tree object whose nodes expose ``edge.length``.
        :param ncat: number of gamma rate categories.
        :param alpha: alpha value handed to ``discrete_gamma``.
        """
        # Local import keeps this method self-contained with respect to the
        # module's import block.
        import warnings

        # store the tree
        self.tree = tree
        self.states = np.array(transmat.model.states)
        self.state_indices = np.array(list(range(transmat.model.size)), dtype=np.intc)
        # initialise equilibrium frequency distribution
        self.freqs = transmat.freqs
        # Gamma rate categories
        self.ncat = ncat
        self.alpha = alpha
        self.gamma_rates = discrete_gamma(alpha, ncat)

        # initialise probabilities on tree
        nstates = self.states.shape[0]  # loop-invariant, so computed once
        n_zero_length = 0
        for node in self.tree.preorder(skip_seed=True):
            # A missing (None) edge length is treated the same as zero.
            length = node.edge.length or 0
            if length == 0:
                n_zero_length += 1
            node.pmats = np.empty((ncat, nstates, nstates))
            for i in range(ncat):
                node.pmats[i] = transmat.get_p_matrix(length * self.gamma_rates[i])

        # Report once, with a count, rather than once per offending edge.
        if n_zero_length:
            warnings.warn(
                'This tree has {} zero length edge(s)'.format(n_zero_length),
                stacklevel=2,
            )

        self.sequences = {}
Beispiel #2
0
    def __init__(self, transmat, tree, ncat=1, alpha=1):
        """
        Initialise the simulator with a transition matrix and a tree.

        For every node yielded by ``tree.preorder(skip_seed=True)`` a stack
        of transition probability matrices, one per gamma rate category, is
        precomputed and stored on the node as ``node.pmats`` (an array of
        shape ``(ncat, nstates, nstates)``).

        The tree should have branch lengths. Edges whose length is missing
        (falsy) or zero are treated as length 0; if any are found a single
        warning is issued and initialisation continues.

        :param transmat: transition matrix object; this method reads its
            ``model.states``, ``model.size`` and ``freqs`` attributes and
            calls ``get_p_matrix(length)`` on it.
        :param tree: tree object whose nodes expose ``edge.length``.
        :param ncat: number of gamma rate categories.
        :param alpha: alpha value handed to ``discrete_gamma``.
        """
        # Local import keeps this method self-contained with respect to the
        # module's import block.
        import warnings

        # store the tree
        self.tree = tree
        self.states = np.array(transmat.model.states)
        self.state_indices = np.array(list(range(transmat.model.size)), dtype=np.intc)
        # initialise equilibrium frequency distribution
        self.freqs = transmat.freqs
        # Gamma rate categories
        self.ncat = ncat
        self.alpha = alpha
        self.gamma_rates = discrete_gamma(alpha, ncat)

        # initialise probabilities on tree
        nstates = self.states.shape[0]  # loop-invariant, so computed once
        n_zero_length = 0
        for node in self.tree.preorder(skip_seed=True):
            # A missing (None) edge length is treated the same as zero.
            length = node.edge.length or 0
            if length == 0:
                n_zero_length += 1
            node.pmats = np.empty((ncat, nstates, nstates))
            for i in range(ncat):
                node.pmats[i] = transmat.get_p_matrix(length * self.gamma_rates[i])

        # Report once, with a count, rather than once per offending edge.
        if n_zero_length:
            warnings.warn(
                'This tree has {} zero length edge(s)'.format(n_zero_length),
                stacklevel=2,
            )

        self.sequences = {}
Beispiel #3
0
def pairdists(alignment, subs_model, alpha=None, ncat=4, tolerance=1e-6, verbose=False):
    """Calculate all pairwise distances and variances for an alignment.

    Each pair of sequences is handed to ``brent_optimise``, which returns a
    branch length and a variance for that pair; the results are collected
    into two symmetric matrices.

    :param alignment: alignment object; must provide ``get_names()`` and be
        accepted by ``alignment_to_partials``.
    :param subs_model: substitution model, an instance of
        ``phylo_utils.models.Model``.
    :param alpha: gamma shape parameter. If ``None``, ``alpha`` is set to 1.0
        and ``ncat`` to 1.
    :param ncat: number of gamma rate categories.
    :param tolerance: accepted for interface compatibility; not used by the
        code in this function.
    :param verbose: forwarded to ``brent_optimise``.
    :returns: ``(dm, vm)``, the distance and variance matrices built with
        ``DistanceMatrix.from_array``.
    :raises ValueError: if ``subs_model`` is not a ``Model`` instance, or if
        its ``size`` differs from the number of states in the alignment's
        partials.
    """
    # Check
    if not isinstance(subs_model, phylo_utils.models.Model):
        raise ValueError("Can't handle this model: {}".format(subs_model))

    if alpha is None:
        alpha = 1.0
        ncat = 1

    # Set up markov model
    tm = TransitionMatrix(subs_model)

    # NOTE(review): the rates are computed but never passed on to the
    # likelihood nodes or the optimiser below - confirm whether rate
    # heterogeneity is meant to be applied here.
    gamma_rates = discrete_gamma(alpha, ncat)
    partials = alignment_to_partials(alignment)
    seqnames = alignment.get_names()
    nseq = len(seqnames)
    distances = np.zeros((nseq, nseq))
    variances = np.zeros((nseq, nseq))

    # Check the model has the appropriate size
    n_states = partials[seqnames[0]].shape[1]
    if subs_model.size != n_states:
        # Fall back to the class name in case the model exposes no ``name``.
        model_name = getattr(subs_model, 'name', type(subs_model).__name__)
        raise ValueError("Model {} expects {} states, but the alignment has {}".format(
            model_name, subs_model.size, n_states))

    nodes = [phylo_utils.likelihood.LnlModel(tm) for _ in range(nseq)]
    for node, header in zip(nodes, seqnames):
        node.set_partials(partials[header])  # retrieve partial likelihoods from partials dictionary

    for i, j in itertools.combinations(range(nseq), 2):
        brlen, var = brent_optimise(nodes[i], nodes[j], verbose=verbose)
        distances[i, j] = distances[j, i] = brlen
        variances[i, j] = variances[j, i] = var
    dm = DistanceMatrix.from_array(distances, names=seqnames)
    vm = DistanceMatrix.from_array(variances, names=seqnames)
    return dm, vm
Beispiel #4
0
def pairdists(alignment, subs_model, alpha=None, ncat=4, tolerance=1e-6, verbose=False):
    """Calculate all pairwise distances and variances for an alignment.

    Each pair of sequences is handed to ``brent_optimise``, which returns a
    branch length and a variance for that pair; the results are collected
    into two symmetric matrices.

    :param alignment: alignment object; must provide ``get_names()`` and be
        accepted by ``alignment_to_partials``.
    :param subs_model: substitution model, an instance of
        ``phylo_utils.models.Model``.
    :param alpha: gamma shape parameter. If ``None``, ``alpha`` is set to 1.0
        and ``ncat`` to 1.
    :param ncat: number of gamma rate categories.
    :param tolerance: accepted for interface compatibility; not used by the
        code in this function.
    :param verbose: forwarded to ``brent_optimise``.
    :returns: ``(dm, vm)``, the distance and variance matrices built with
        ``DistanceMatrix.from_array``.
    :raises ValueError: if ``subs_model`` is not a ``Model`` instance, or if
        its ``size`` differs from the number of states in the alignment's
        partials.
    """
    # Check
    if not isinstance(subs_model, phylo_utils.models.Model):
        raise ValueError("Can't handle this model: {}".format(subs_model))

    if alpha is None:
        alpha = 1.0
        ncat = 1

    # Set up markov model
    tm = TransitionMatrix(subs_model)

    # NOTE(review): the rates are computed but never passed on to the
    # likelihood nodes or the optimiser below - confirm whether rate
    # heterogeneity is meant to be applied here.
    gamma_rates = discrete_gamma(alpha, ncat)
    partials = alignment_to_partials(alignment)
    seqnames = alignment.get_names()
    nseq = len(seqnames)
    distances = np.zeros((nseq, nseq))
    variances = np.zeros((nseq, nseq))

    # Check the model has the appropriate size
    n_states = partials[seqnames[0]].shape[1]
    if subs_model.size != n_states:
        # Fall back to the class name in case the model exposes no ``name``.
        model_name = getattr(subs_model, 'name', type(subs_model).__name__)
        raise ValueError("Model {} expects {} states, but the alignment has {}".format(
            model_name, subs_model.size, n_states))

    nodes = [phylo_utils.likelihood.LnlModel(tm) for _ in range(nseq)]
    for node, header in zip(nodes, seqnames):
        node.set_partials(partials[header])  # retrieve partial likelihoods from partials dictionary

    for i, j in itertools.combinations(range(nseq), 2):
        brlen, var = brent_optimise(nodes[i], nodes[j], verbose=verbose)
        distances[i, j] = distances[j, i] = brlen
        variances[i, j] = variances[j, i] = var
    dm = DistanceMatrix.from_array(distances, names=seqnames)
    vm = DistanceMatrix.from_array(variances, names=seqnames)
    return dm, vm
Beispiel #5
0
def pairdists(alignment, ncat=4, tolerance=1e-6):
    """Calculate all pairwise distances and variances for an alignment.

    The substitution model, frequencies and gamma alpha are read from
    ``alignment.parameters.partitions``. For every pair of sequences the
    branch length is optimised with a damped Newton iteration on the
    log-likelihood; the variance is reported as ``|-1 / d2lk|`` at the
    final accepted branch length.

    :param alignment: alignment object carrying fitted parameters; must
        provide ``get_names()`` and be accepted by ``alignment_to_partials``.
    :param ncat: number of gamma rate categories.
    :param tolerance: convergence threshold on the absolute first derivative
        of the log-likelihood; also scales the slack allowed before a step
        is rejected as making the likelihood worse.
    :returns: ``(dm, vm)``, distance and variance matrices built with
        ``DistanceMatrix.from_array``; ``None`` (after logging an error) if
        the alignment has no usable parameters.
    :raises ValueError: if the model name is neither ``'LG'`` nor ``'GTR'``.
    """
    def calc(brlen, key1, key2):
        """
        Calculate l'hood + derivs at branch length = brlen for the sequence
        pair (key1, key2), averaged over the gamma rate categories.
        """
        terms = []
        for k in range(ncat):
            scaled = gamma_rates[k]*brlen  # rate-scaled length, computed once
            terms.append(sitewise_lik_derivs(tm.get_p_matrix(scaled),
                                             tm.get_dp_matrix(scaled),
                                             tm.get_d2p_matrix(scaled),
                                             tm.freqs, partials[key1], partials[key2])*(1.0/ncat))
        result = sum(terms)
        # Columns are used as per-site likelihood, first and second
        # derivative; the sums below are the derivatives of log-likelihood.
        lk = np.log(result[:,0]).sum()
        dlk = (result[:,1]/result[:,0]).sum()
        d2lk = ((result[:,0]*result[:,2] - result[:,1]**2)/result[:,0]**2).sum()
        return lk, dlk, d2lk

    def get_step(brlen, dlk, d2lk):
        """Newton step from brlen, halved until brlen + step is non-negative."""
        # NOTE(review): if d2lk is exactly 0 and dlk < 0 the step would be
        # -inf and the halving loop could not terminate - presumably never
        # hit in practice; confirm.
        step = dlk / np.abs(d2lk) # abs makes optimiser backtrack from a minimum likelihood
        while (brlen + step) < 0:
            step *= 0.5
        return step

    # Only a missing attribute means "no parameters"; anything else
    # (including KeyboardInterrupt) must propagate.
    try:
        model = alignment.parameters.partitions.model
        freqs = alignment.parameters.partitions.frequencies
        alpha = alignment.parameters.partitions.alpha
    except AttributeError:
        logger.error('No parameters available')
        return

    if model == 'LG':
        subs_model = LG(freqs)
    elif model == 'GTR':
        rates = alignment.parameters.partitions.rates
        subs_model = GTR(rates, freqs, True)
    else:
        raise ValueError("Can't handle this model: {}".format(model))

    # Set up markov model
    tm = TransitionMatrix(subs_model)
    gamma_rates = discrete_gamma(alpha, ncat)
    partials = alignment_to_partials(alignment)
    seqnames = alignment.get_names()
    nseq = len(seqnames)
    distances = np.zeros((nseq, nseq))
    variances = np.zeros((nseq, nseq))

    maxiter = 100  # cap on likelihood evaluations per pair inside the loop

    for i, j in itertools.combinations(range(nseq), 2):
        key1 = seqnames[i]
        key2 = seqnames[j]

        brlen = 1.0  # just have a guess
        maxlk, dlk, d2lk = calc(brlen, key1, key2)
        step = get_step(brlen, dlk, d2lk)

        # This is the newton optimiser. Falling out of the loop without
        # hitting ``break`` means it failed to converge within maxiter.
        for _ in range(maxiter):
            # Evaluate the trial point without touching the derivatives of
            # the current (accepted) branch length.
            trial_lk, trial_dlk, trial_d2lk = calc(brlen + step, key1, key2)
            if (trial_lk - maxlk) < -1000*tolerance:
                # the likelihood got worse, so the step was too big:
                # keep the old values, halve the step and try again
                step *= 0.5
                continue

            # successful move. update brlen and the derivatives that go with it
            brlen = brlen + step
            maxlk = trial_lk
            dlk, d2lk = trial_dlk, trial_d2lk
            step = get_step(brlen, dlk, d2lk)

            if np.abs(dlk) < tolerance:
                break  # Converged

        distances[i, j] = distances[j, i] = brlen
        # d2lk always belongs to the accepted brlen, even if the last trial
        # step was rejected.
        variances[i, j] = variances[j, i] = np.abs(-1.0/d2lk)
    dm = DistanceMatrix.from_array(distances, names=seqnames)
    vm = DistanceMatrix.from_array(variances, names=seqnames)
    return dm, vm