def test_coarse_vs_fine_grained_missingness():
    # Check that log likelihoods are the same in both representations.

    # Get the analysis results for the coarse-grained missingness.
    scene = get_partial_scene()
    coarse_observations = dict(
        nodes=[0, 0, 1],
        variables=[0, 1, 0],
        iid_observations=[
            [0, 0, 1],
            [0, 1, 0],
            [0, 0, 0],
            [1, 1, 1],
            [0, 0, 0],
            [1, 0, 1]])
    scene['observed_data'] = coarse_observations
    request = {'property': 'DNNLOGL'}
    j_in = dict(scene=scene, requests=[request])
    j_out_coarse = process_json_in(j_in)

    # Get the analysis results for the fine-grained missingness.
    scene = get_partial_scene()
    fine_observations = dict(
        nodes=[0, 0, 1, 1],
        variables=[0, 1, 0, 1],
        iid_observations=[
            [0, 0, 1, -1],
            [0, 1, 0, -1],
            [0, 0, 0, -1],
            [1, 1, 1, -1],
            [0, 0, 0, -1],
            [1, 0, 1, -1]])
    scene['observed_data'] = fine_observations
    request = {'property': 'DNNLOGL'}
    j_in = dict(scene=scene, requests=[request])
    j_out_fine = process_json_in(j_in)

    # Assert that the log likelihoods are the same.
    # Note that this uses exact equality comparison for floating point,
    # so if this test fails in the future then consider allowing
    # some epsilon of closeness.
    assert_equal(j_out_coarse, j_out_fine)
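
The comment above suggests falling back to an approximate comparison if exact floating point equality ever breaks. A minimal sketch of that, assuming the numeric results live under the 'responses' key of the output dict:

from numpy.testing import assert_allclose, assert_equal

def assert_outputs_close(j_out_a, j_out_b, rtol=1e-12):
    # Compare the status exactly and the numeric responses approximately.
    assert_equal(j_out_a['status'], j_out_b['status'])
    assert_allclose(j_out_a['responses'], j_out_b['responses'], rtol=rtol)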
Example #2
def run_analysis(scene, likelihoods, kappa):

    # Copy the scene because we are going to do some surgery.
    scene = copy.deepcopy(scene)

    # Change the edge rate scaling factors.
    scene['tree'] = get_analysis_tree()

    # Define the model to be used for the analysis.
    analysis_process = get_K80_process_definition(kappa)
    scene['process_definitions'] = [analysis_process]

    # Request nucleotide transition count expectations.
    ts_transition_request = dict(
            property = 'WSNTRAN',
            observation_reduction = get_observation_reduction(likelihoods),
            transition_reduction = get_ts_reduction())

    # Request nucleotide transversion count expectations.
    tv_transition_request = dict(
            property = 'WSNTRAN',
            observation_reduction = get_observation_reduction(likelihoods),
            transition_reduction = get_tv_reduction())

    # Define the requests.
    requests = [ts_transition_request, tv_transition_request]

    # Run the analysis.
    j_in = dict(
            scene = scene,
            requests = requests)
    j_out = process_json_in(j_in)
    posterior_ts, posterior_tv = j_out['responses']

    # Re-run the analysis without any observations.
    scene['observed_data'] = dict(
            nodes = [],
            variables = [],
            iid_observations = [[] for p in likelihoods])
    j_in = dict(
            scene = scene,
            requests = requests)
    j_out = process_json_in(j_in)
    prior_ts, prior_tv = j_out['responses']

    return prior_ts, prior_tv, posterior_ts, posterior_tv
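
The get_observation_reduction helper used in the requests above is not shown in this listing. A plausible sketch, assuming the reduction is a dict of observation indices and weights (the key names are an assumption, not taken from the original code), would weight each iid observation by its pattern likelihood:

def get_observation_reduction(likelihoods):
    # Hypothetical helper: weight observation i by its pattern likelihood.
    return dict(
            observation_indices=list(range(len(likelihoods))),
            weights=list(likelihoods))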
Example #3
def test_coarse_vs_fine_grained_missingness():
    # Check that log likelihoods are the same in both representations.

    # Get the analysis results for the coarse-grained missingness.
    scene = get_partial_scene()
    coarse_observations = dict(
        nodes = [0, 0, 1],
        variables = [0, 1, 0],
        iid_observations = [
            [0, 0, 1],
            [0, 1, 0],
            [0, 0, 0],
            [1, 1, 1],
            [0, 0, 0],
            [1, 0, 1]])
    scene['observed_data'] = coarse_observations
    request = {'property' : 'DNNLOGL'}
    j_in = dict(scene = scene, requests = [request])
    j_out_coarse = process_json_in(j_in)

    # Get the analysis results for the fine-grained missingness.
    scene = get_partial_scene()
    fine_observations = dict(
        nodes = [0, 0, 1, 1],
        variables = [0, 1, 0, 1],
        iid_observations = [
            [0, 0, 1, -1],
            [0, 1, 0, -1],
            [0, 0, 0, -1],
            [1, 1, 1, -1],
            [0, 0, 0, -1],
            [1, 0, 1, -1]])
    scene['observed_data'] = fine_observations
    request = {'property' : 'DNNLOGL'}
    j_in = dict(scene = scene, requests = [request])
    j_out_fine = process_json_in(j_in)

    # Assert that the log likelihoods are the same.
    # Note that this uses exact equality comparison for floating point,
    # so if this test fails in the future then consider allowing
    # some epsilon of closeness.
    assert_equal(j_out_coarse, j_out_fine)
Example #4
def _process_request(r):
    j_out = interface.process_json_in(dict(scene=_get_scene(), requests=[r]))
    assert_equal(set(j_out), {'status', 'responses'})
    assert_equal(j_out['status'], 'feasible')
    assert_equal(len(j_out['responses']), 1)
    out = j_out['responses'][0]
    prefix = r['property'][:3].lower()
    suffix = r['property'][-4:].lower()
    if suffix == 'node':
        assert_equal(len(np.array(out).shape), prefix.count('d') + 1)
    else:
        assert_equal(len(np.array(out).shape), prefix.count('d'))
    return out
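
For context, a hedged usage sketch of the helper above: the shape check appears to rely on the property-code convention in which each 'd' among the first three letters (observation, edge and state axes) keeps that axis distinct in the response, and a property ending in 'node' carries one extra per-node axis.

# 'DNNLOGL' keeps the observation axis distinct, so it should return a
# 1-D array of per-observation log likelihoods, while 'SNNLOGL' sums
# that axis away and returns a scalar.
per_site_logl = _process_request({'property': 'DNNLOGL'})
total_logl = _process_request({'property': 'SNNLOGL'})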
Example #5
def main():
    ts = np.linspace(1e-5, 30, 100)
    n = len(ts)
    j_in = {
        "scene" : {
            "node_count" : n+1,
            "process_count" : 1,
            "state_space_shape" : [4],
            "tree" : {
                "row_nodes" : [n]*n,
                "column_nodes" : range(n),
                "edge_rate_scaling_factors" : (0.5 * ts).tolist(),
                "edge_processes" : [0]*n
            },
            "root_prior" : {
                "states" : [[0]],
                "probabilities" : [1]
            },
            "process_definitions" : [{
                "row_states" : [[0], [1], [2]],
                "column_states" : [[1], [2], [3]],
                "transition_rates" : [1, 2, 3]
            }],
            "observed_data" : {
                "nodes" : [],
                "variables" : [],
                "iid_observations" : [[]]
            }
        },
        "requests" : [{"property" : "SDDDWEL"}]
    }
    j_out = process_json_in(j_in)
    a, b, c, d = zip(*j_out['responses'][0])
    lines = plt.plot(
            ts, a, 'blue',
            ts, b, 'green',
            ts, c, 'red',
            ts, d, 'skyblue')
    plt.ylabel("Time-averaged expected sojourn time")
    plt.xlabel("Time")

    # Use a transparent legend frame.
    plt.legend(
            lines,
            ('State 1', 'State 2', 'State 3', 'State 4 (absorbing)'),
            loc='center right',
            framealpha=0)

    # Use a transparent background for the figure.
    plt.savefig('out00.svg', transparent=True)
Example #6
def main():
    ts = np.linspace(1e-5, 30, 100)
    n = len(ts)
    j_in = {
        "scene": {
            "node_count":
            n + 1,
            "process_count":
            1,
            "state_space_shape": [4],
            "tree": {
                "row_nodes": [n] * n,
                "column_nodes": range(n),
                "edge_rate_scaling_factors": (0.5 * ts).tolist(),
                "edge_processes": [0] * n
            },
            "root_prior": {
                "states": [[0]],
                "probabilities": [1]
            },
            "process_definitions": [{
                "row_states": [[0], [1], [2]],
                "column_states": [[1], [2], [3]],
                "transition_rates": [1, 2, 3]
            }],
            "observed_data": {
                "nodes": [],
                "variables": [],
                "iid_observations": [[]]
            }
        },
        "requests": [{
            "property": "SDDDWEL"
        }]
    }
    j_out = process_json_in(j_in)
    a, b, c, d = zip(*j_out['responses'][0])
    lines = plt.plot(ts, a, 'blue', ts, b, 'green', ts, c, 'red', ts, d,
                     'skyblue')
    plt.ylabel("Time-averaged expected sojourn time")
    plt.xlabel("Time")

    # Use a transparent legend frame.
    plt.legend(lines, ('State 1', 'State 2', 'State 3', 'State 4 (absorbing)'),
               loc='center right',
               framealpha=0)

    # Use a transparent background for the figure.
    plt.savefig('out00.svg', transparent=True)
Example #7
def get_pattern_likelihoods(scene, rate_scaling_factor):
    scene = copy.deepcopy(scene)
    scene['tree'] = get_simulation_tree(rate_scaling_factor)

    # Define the request for per-pattern log likelihoods.
    log_likelihoods_request = {'property' : 'DNNLOGL'}

    # Get the per-pattern log likelihoods.
    j_in = dict(
            scene = scene,
            requests = [log_likelihoods_request])
    j_out = process_json_in(j_in)
    pattern_log_likelihoods = j_out['responses'][0]
    pattern_likelihoods = np.exp(pattern_log_likelihoods)

    # The sum of likelihoods over all patterns should be 1.
    assert_allclose(pattern_likelihoods.sum(), 1)

    # Return the pattern likelihoods.
    return pattern_likelihoods.tolist()
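
The unit-sum assertion above only holds if iid_observations enumerates every possible observable pattern exactly once. As a hypothetical illustration (the real patterns depend on the scene's observed nodes and state space), two observed binary variables would need all four patterns:

from itertools import product

all_patterns = [list(p) for p in product(range(2), repeat=2)]
# -> [[0, 0], [0, 1], [1, 0], [1, 1]]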
Example #8
def run_partitioned_analysis(scene, partitioned_likelihoods, kappa):

    # Copy the scene because we are going to do some surgery.
    scene = copy.deepcopy(scene)
    scene['tree'] = get_analysis_tree()
    analysis_process = get_K80_process_definition(kappa)
    scene['process_definitions'] = [analysis_process]

    # Request nucleotide transition count expectations.
    ts_requests = []
    for likelihoods in partitioned_likelihoods:
        ts_request = dict(
                property = 'WSNTRAN',
                observation_reduction = get_observation_reduction(likelihoods),
                transition_reduction = get_ts_reduction())
        ts_requests.append(ts_request)

    # Request nucleotide transversion count expectations.
    tv_requests = []
    for likelihoods in partitioned_likelihoods:
        tv_request = dict(
                property = 'WSNTRAN',
                observation_reduction = get_observation_reduction(likelihoods),
                transition_reduction = get_tv_reduction())
        tv_requests.append(tv_request)

    # Run the analysis.
    j_in = dict(
            scene = scene,
            requests = ts_requests + tv_requests)
    j_out = process_json_in(j_in)
    npart = len(partitioned_likelihoods)
    ts_responses = j_out['responses'][:npart]
    tv_responses = j_out['responses'][npart:]

    partitioned_ts = np.mean(ts_responses)
    partitioned_tv = np.mean(tv_responses)

    return partitioned_ts, partitioned_tv
Example #9
def main():
    with open('in01.json') as fin:
        j_in = json.load(fin)
    scene = j_in['scene']
    observation_reduction = j_in['requests'][0]['observation_reduction']
    node_count = scene['node_count']
    edge_count = node_count - 1

    # These starting points for EM are OK.
    rates = [0.001 for r in range(edge_count)]
    #rates = [0.01 for r in range(edge_count)]
    #rates = [0.1 for r in range(edge_count)]

    # These starting points are questionable.
    #rates = [0.15 for r in range(edge_count)]
    #rates = [0.2 for r in range(edge_count)]

    # These starting points are not really feasible.
    #rates = [0.5 for r in range(edge_count)]
    #rates = [0.95 for r in range(edge_count)]
    #rates = [1 for r in range(edge_count)]

    # Initialize rates.
    scene['tree']['edge_rate_scaling_factors'] = rates

    # Update rates according to EM.
    rates = optimize_em(j_in['scene'], observation_reduction, 3)

    # Show the log likelihood
    scene['tree']['edge_rate_scaling_factors'] = rates
    j_in = dict(
            scene = scene,
            requests = [dict(
                property = 'WNNLOGL',
                observation_reduction = observation_reduction)])
    ll = process_json_in(j_in)['responses'][0]
    print(ll)
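
The optimize_em helper is not shown in this listing. A minimal sketch of the per-edge update it presumably performs, mirroring the explicit change/opportunity ratio used in a later example; the property codes 'SDNTRAN' and 'SDWDWEL' and the exact reductions are assumptions made by analogy with the codes above, not verified here:

def em_update_edge_rates(scene, exit_rates, all_transitions):
    # One EM-style update: expected transition counts on each edge divided
    # by the expected opportunity (dwell time weighted by state exit rates).
    requests = [
            dict(property='SDNTRAN',
                 transition_reduction=all_transitions),
            dict(property='SDWDWEL',
                 state_reduction=dict(
                     states=[[i] for i in range(len(exit_rates))],
                     weights=list(exit_rates)))]
    j_out = process_json_in(dict(scene=scene, requests=requests))
    per_edge_change, per_edge_opportunity = j_out['responses']
    return [c / d for c, d in zip(per_edge_change, per_edge_opportunity)]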
Example #10
def _process_ex(Q, d, observable_node, debug=False):
    """
    Use the more advanced interface.

    """
    state_space_shape = (2, 3)
    nstates = np.prod(state_space_shape)
    nnodes = 4
    nedges = nnodes - 1
    nodes = range(nnodes)
    edges = range(nedges)
    states = list(product(
        range(state_space_shape[0]),
        range(state_space_shape[1]),
        ))
    state_pairs = list(permutations(states, 2))
    ntrans = len(state_pairs)
    assert_equal(ntrans, nstates * (nstates - 1))
    row, col = zip(*state_pairs)
    idx_row = np.ravel_multi_index(np.transpose(row), state_space_shape)
    idx_col = np.ravel_multi_index(np.transpose(col), state_space_shape)
    transition_rates = [Q[i, j] for i, j in zip(idx_row, idx_col)]

    dwell_states = [[0, 0], [0, 1], [0, 2]]
    dwell_expect = [1, 1, 1]

    scene = dict(
            node_count = nnodes,
            process_count = 1,
            state_space_shape = state_space_shape,
            root_prior = dict(
                states = states,
                probabilities = d.tolist()),
            tree = dict(
                row_nodes = nodes[:-1],
                column_nodes = nodes[1:],
                edge_processes = [0]*nedges,
                edge_rate_scaling_factors = [0.2]*nedges,
                ),
            process_definitions = [dict(
                row_states = [i for i, j in state_pairs],
                column_states = [j for i, j in state_pairs],
                transition_rates = transition_rates,
                )],
            observed_data = dict(
                nodes = [observable_node],
                variables = [1],
                iid_observations = [
                    [0],
                    [2],
                    [1],
                    [0],
                    [1],
                    ]))

    dwell_request = dict(
            property = 'ddwdwel',
            state_reduction = dict(
                states = dwell_states,
                weights = dwell_expect))

    transition_request = dict(
            property = 'ddntran',
            transition_reduction = dict(
                row_states = [i for i, j in state_pairs],
                column_states = [j for i, j in state_pairs],
                weights = [1 for i, j in state_pairs]))

    j_in = dict(
        scene=scene,
        requests=[dwell_request, transition_request])

    return interface.process_json_in(j_in, debug=debug)
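
As a quick check of the index mapping used above: with state_space_shape (2, 3), the multivariate state (1, 2) flattens to index 1 * 3 + 2 == 5, which is exactly what ravel_multi_index computes.

import numpy as np
from numpy.testing import assert_equal

assert_equal(np.ravel_multi_index((1, 2), (2, 3)), 5)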
Example #11
def main():
    nstates = len(s_aas)
    assert_equal(nstates, 20)
    d = {a: i for i, a in enumerate(s_aas)}
    distn = [float(x) for x in s_distn.strip().split()]
    assert_equal(len(distn), nstates)
    lines = s_mtmam.splitlines()
    assert_equal(len(lines), nstates - 1)
    rate_matrix = np.zeros((nstates, nstates), dtype=int)
    for i, line in enumerate(lines):
        row_index = i + 1
        row = [int(x) for x in line.strip().split()]
        assert_equal(len(row), row_index)
        rate_matrix[row_index, :row_index] = row
    rate_matrix = np.multiply(rate_matrix + rate_matrix.T, distn)
    exit_rates = rate_matrix.sum(axis=1)

    # This is a partial scene, missing the root distribution,
    # the process definition, and the observed data.
    scene = {
        "node_count": 12,
        "process_count": 1,
        "state_space_shape": [20],
        "tree": {
            "row_nodes": [0, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11],
            "column_nodes": [8, 1, 2, 7, 9, 3, 10, 6, 11, 4, 5],
            "edge_rate_scaling_factors": [
                0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
                0.001, 0.001
            ],
            "edge_processes": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        }
    }

    # Add the root distribution.
    scene['root_prior'] = {
        "states": [[i] for i in range(nstates)],
        "probabilities": distn
    }

    # Add the process definition.
    triples = []
    for i in range(nstates):
        for j in range(nstates):
            r = rate_matrix[i, j]
            if i != j and r:
                triples.append((i, j, r))
    row_states, col_states, rates = zip(*triples)
    scene['process_definitions'] = [{
        "row_states": [[s] for s in row_states],
        "column_states": [[s] for s in col_states],
        "transition_rates": rates
    }]

    # Add the observed data.
    sequences = []
    with open('mtCDNApri.aa') as fin:
        lines = fin.readlines()
        header = lines[0]
        for line in lines[1:]:
            name, sequence = line.strip().split()
            sequences.append([d[x] for x in sequence])
    columns = [list(x) for x in zip(*sequences)]
    nsites = len(columns)
    scene['observed_data'] = {
        "nodes": [0, 1, 2, 3, 4, 5, 6],
        "variables": [0, 0, 0, 0, 0, 0, 0],
        "iid_observations": columns
    }

    # Update the edge rates according to a few iterations of EM.
    observation_reduction = None
    em_iterations = 6
    edge_rates = optimize_em(scene, observation_reduction, em_iterations)
    scene['tree']['edge_rate_scaling_factors'] = edge_rates

    # Report the log likelihood for the updated edge rates.
    j_in = dict(scene=scene, requests=[dict(property='SNNLOGL')])
    ll = process_json_in(j_in)['responses'][0]
    print(ll)
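
As a sanity check not present in the original: because the rates built above have the form q[i, j] = s[i, j] * pi[j] with a symmetric s, the process is time-reversible, so the probability flow pi[i] * q[i, j] should be a symmetric matrix.

import numpy as np
from numpy.testing import assert_allclose

def check_detailed_balance(rate_matrix, distn):
    # Detailed balance: pi_i * q_ij == pi_j * q_ji for every pair of states.
    pi = np.asarray(distn)
    flow = pi[:, None] * np.asarray(rate_matrix)
    assert_allclose(flow, flow.T)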
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--n", type=int, default=1000, help="population size")
    parser.add_argument("--mu", type=float, default=0.3, help="extinction")
    parser.add_argument("--lam", type=float, default=0.5, help="speciation")
    args = parser.parse_args()

    n = args.n
    mu = args.mu
    lam = args.lam

    edge_rates = [5, 10, 5, 5]
    wrap_scene = get_scene(edge_rates, mu, lam, n, WRAP)
    absorb_scene = get_scene(edge_rates, mu, lam, n, ABSORB)

    # Log likelihood.
    logl_request = dict(property="snnlogl")

    # Unweighted sum over observations and over edges,
    # and weighted sum over transitions consisting of the unweighted sum
    # over transitions corresponding to extinction events.
    extinction_request = dict(
        property="ssntran",
        transition_reduction=dict(
            row_states=[[1, i] for i in range(2, n)],
            column_states=[[1, i - 1] for i in range(2, n)],
            weights=[1] * (n - 2),
        ),
    )

    # Unweighted sum over observations, and weighted sum over states.
    extant_request = dict(
        property="snwnode",
        state_reduction=dict(states=[[1, i] for i in range(n)], weights=range(n)),
    )

    # Unweighted sum over observations, weighted sum over edges,
    # and weighted sum over states.
    dwell_request = dict(
        property="swwdwel",
        edge_reduction=dict(edges=[0, 1, 2, 3], weights=edge_rates),
        state_reduction=dict(states=[[1, i] for i in range(n)], weights=range(n)),
    )

    # Compute only the likelihood for the absorbing high population boundary.
    j_out = process_json_in(dict(scene=absorb_scene, requests=[logl_request]))
    absorb_likelihood = exp(j_out["responses"][0])

    # Compute more stuff for the wrapping boundary.
    j_in = dict(scene=wrap_scene, requests=[logl_request, extinction_request, extant_request, dwell_request])
    j_out = process_json_in(j_in)

    logl, extinction, extant, dwell = j_out["responses"]
    wrap_likelihood = exp(logl)
    print("gene population limit:", n)
    print("gene birth rate:", lam)
    print("gene death rate:", mu)
    print("likelihood:", wrap_likelihood)
    print("upper bound likelihood for unbounded population:", absorb_likelihood)
    print("unconditional probability of exceeding the population cap:", absorb_likelihood - wrap_likelihood)
    print("expected number of extinctions:", extinction)
    print("expected number of extant lineages at each node:")
    for i, x in enumerate(extant):
        print(i, ":", x)
    print("expected total size of the gene tree:", dwell)
Example #13
def main():
    nstates = len(s_aas)
    assert_equal(nstates, 20)
    d = {a : i for i, a in enumerate(s_aas)}
    distn = [float(x) for x in s_distn.strip().split()]
    assert_equal(len(distn), nstates)
    lines = s_mtmam.splitlines()
    assert_equal(len(lines), nstates-1)
    rate_matrix = np.zeros((nstates, nstates), dtype=int)
    for i, line in enumerate(lines):
        row_index = i + 1
        row = [int(x) for x in line.strip().split()]
        assert_equal(len(row), row_index)
        rate_matrix[row_index, :row_index] = row
    rate_matrix = np.multiply(rate_matrix + rate_matrix.T, distn)
    exit_rates = rate_matrix.sum(axis=1)

    # This is a partial scene, missing the root distribution,
    # the process definition, and the observed data.
    scene = {
            "node_count" : 12,
            "process_count" : 1,
            "state_space_shape" : [20],
            "tree" : {
                "row_nodes" : [
                    0, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11],
                "column_nodes" : [
                    8, 1, 2, 7, 9, 3, 10, 6, 11, 4, 5],
                "edge_rate_scaling_factors" : [
                    0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
                    0.001, 0.001, 0.001, 0.001, 0.001],
                "edge_processes" : [
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                }
            }

    # Add the root distribution.
    scene['root_prior'] = {
            "states" : [[i] for i in range(nstates)],
            "probabilities" : distn
            }

    # Add the process definition.
    triples = []
    for i in range(nstates):
        for j in range(nstates):
            r = rate_matrix[i, j]
            if i != j and r:
                triples.append((i, j, r))
    row_states, col_states, rates = zip(*triples)
    scene['process_definitions'] = [{
        "row_states" : [[s] for s in row_states],
        "column_states" : [[s] for s in col_states],
        "transition_rates" : rates
        }]

    # Add the observed data.
    sequences = []
    with open('mtCDNApri.aa') as fin:
        lines = fin.readlines()
        header = lines[0]
        for line in lines[1:]:
            name, sequence = line.strip().split()
            sequences.append([d[x] for x in sequence])
    columns = [list(x) for x in zip(*sequences)]
    nsites = len(columns)
    scene['observed_data'] = {
            "nodes" : [0, 1, 2, 3, 4, 5, 6],
            "variables" : [0, 0, 0, 0, 0, 0, 0],
            "iid_observations" : columns
            }

    # Update the edge rates according to a few iterations of EM.
    observation_reduction = None
    em_iterations = 6
    edge_rates = optimize_em(scene, observation_reduction, em_iterations)
    scene['tree']['edge_rate_scaling_factors'] = edge_rates

    # Report the log likelihood for the updated edge rates.
    j_in = dict(
            scene = scene,
            requests = [dict(property = 'SNNLOGL')])
    ll = process_json_in(j_in)['responses'][0]
    print(ll)
Example #14
def main(args):

    # Get the paralog names.
    paralog_names = args.paralogs

    # Read the tree.
    with open(args.tree) as fin:
        tree_string = fin.read().strip()
    name_to_node, edges = get_tree_info(tree_string)
    edge_count = len(edges)
    node_count = edge_count + 1

    # Read the alignment.
    with open(args.alignment) as alignment_fd:
        info = get_alignment_info(alignment_fd, name_to_node, paralog_names)
    nodes, variables, iid_observations = info
    nsites = len(iid_observations)

    print('number of sites in the alignment:', nsites)
    print('number of sequences:', len(nodes))

    # Compute the empirical distribution of the nucleotides.
    counts = np.zeros(4)
    for k in np.ravel(iid_observations):
        counts[k] += 1
    empirical_pi = counts / counts.sum()

    # Initialize some guesses.
    edge_rates = [0.01] * edge_count
    pi = empirical_pi
    kappa = 2.0

    # Define the tree component of the scene
    row_nodes, column_nodes = zip(*edges)
    tree = dict(
            row_nodes = list(row_nodes),
            column_nodes = list(column_nodes),
            edge_rate_scaling_factors = edge_rates,
            edge_processes = [0] * edge_count)

    # Define the root distribution.
    root_prior = get_root_prior(pi)

    # Define the observed data.
    observed_data = dict(
            nodes = nodes,
            variables = variables,
            iid_observations = iid_observations)

    # Assemble the scene.
    scene = dict(
            node_count = node_count,
            process_count = 1,
            state_space_shape = [4, 4],
            tree = tree,
            root_prior = root_prior,
            observed_data = observed_data)

    arr = []
    j_out = None
    iterative_improvement_count = 5

    tm_start = time.time()
    for i in range(iterative_improvement_count):

        # if j_out is available, recompute kappa and edge rates
        if j_out is not None:
            responses = j_out['responses']
            (
                    ll,
                    per_edge_opportunity,
                    per_edge_change,
                    ts_opportunity,
                    tv_opportunity,
                    ts_change,
                    tv_change) = responses
            edge_rates = []
            for change, dwell in zip(per_edge_change, per_edge_opportunity):
                # In this model, edge rates are with respect to
                # the univariate process.
                bivariate_rate = change / dwell
                univariate_rate = bivariate_rate / 2
                edge_rates.append(univariate_rate)
            kappa = (ts_change / ts_opportunity) / (tv_change / tv_opportunity)

        defn = get_joint_hky_process_definition(pi, kappa)
        j_in = dict(scene = scene)
        j_in['scene']['tree']['edge_rate_scaling_factors'] = edge_rates
        j_in['scene']['process_definitions'] = [defn]
        j_in['requests'] = get_requests(edge_rates, pi, kappa)
        j_out = process_json_in(j_in)
        arr.append(copy.deepcopy(j_out))
    tm_stop = time.time()
    print(
            'seconds for', iterative_improvement_count,
            'initial iterations:', tm_stop - tm_start)

    # Improve the estimates using a numerical search.
    P0 = pack_global_params(pi, kappa)
    B0 = np.log(edge_rates)
    tm_start = time.time()
    verbose = False
    observation_reduction = None
    result, P_opt, B_opt = optimize_quasi_newton(
            verbose,
            scene,
            observation_reduction,
            _get_process_definitions,
            _get_root_prior,
            P0, B0)
    tm_stop = time.time()
    print('seconds for quasi-newton search:', tm_stop - tm_start)

    # Unpack and report the results.
    pi, kappa = unpack_global_params(P_opt)
    edge_rates = np.exp(B_opt)
    print('negative log likelihood:', result.fun)
    print('nucleotide distribution:')
    for nt, p in zip('ACGT', pi):
        print(nt, ':', p)
    print('kappa:', kappa)
    print('edge rates:')
    print(edge_rates)
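
The pack_global_params and unpack_global_params helpers used above are not shown. One plausible sketch, assuming the nucleotide weights and kappa are simply log-transformed into an unconstrained vector for the quasi-Newton search (the actual transform may differ):

import numpy as np

def pack_global_params(pi, kappa):
    # Map (pi, kappa) to an unconstrained parameter vector.
    return np.concatenate([np.log(pi), [np.log(kappa)]])

def unpack_global_params(P):
    # Invert the packing, renormalizing the nucleotide distribution.
    weights = np.exp(P[:4])
    return weights / weights.sum(), np.exp(P[4])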
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--n', type=int, default=1000, help='population size')
    parser.add_argument('--mu', type=float, default=0.3, help='extinction')
    parser.add_argument('--lam', type=float, default=0.5, help='speciation')
    args = parser.parse_args()

    n = args.n
    mu = args.mu
    lam = args.lam

    edge_rates = [5, 10, 5, 5]
    wrap_scene = get_scene(edge_rates, mu, lam, n, WRAP)
    absorb_scene = get_scene(edge_rates, mu, lam, n, ABSORB)

    # Log likelihood.
    logl_request = dict(property='snnlogl')

    # Unweighted sum over observations and over edges,
    # and weighted sum over transitions consisting of the unweighted sum
    # over transitions corresponding to extinction events.
    extinction_request = dict(property='ssntran',
                              transition_reduction=dict(
                                  row_states=[[1, i] for i in range(2, n)],
                                  column_states=[[1, i - 1]
                                                 for i in range(2, n)],
                                  weights=[1] * (n - 2),
                              ))

    # Unweighted sum over observations, and weighted sum over states.
    extant_request = dict(property='snwnode',
                          state_reduction=dict(
                              states=[[1, i] for i in range(n)],
                              weights=range(n),
                          ))

    # Unweighted sum over observations, weighted sum over edges,
    # and weighted sum over states.
    dwell_request = dict(property='swwdwel',
                         edge_reduction=dict(edges=[0, 1, 2, 3],
                                             weights=edge_rates),
                         state_reduction=dict(states=[[1, i]
                                                      for i in range(n)],
                                              weights=range(n)))

    # Compute only the likelihood for the absorbing high population boundary.
    j_out = process_json_in(dict(scene=absorb_scene, requests=[logl_request]))
    absorb_likelihood = exp(j_out['responses'][0])

    # Compute more stuff for the wrapping boundary.
    j_in = dict(scene=wrap_scene,
                requests=[
                    logl_request,
                    extinction_request,
                    extant_request,
                    dwell_request,
                ])
    j_out = process_json_in(j_in)

    logl, extinction, extant, dwell = j_out['responses']
    wrap_likelihood = exp(logl)
    print('gene population limit:', n)
    print('gene birth rate:', lam)
    print('gene death rate:', mu)
    print('likelihood:', wrap_likelihood)
    print('upper bound likelihood for unbounded population:',
          absorb_likelihood)
    print('unconditional probability of exceeding the population cap:',
          absorb_likelihood - wrap_likelihood)
    print('expected number of extinctions:', extinction)
    print('expected number of extant lineages at each node:')
    for i, x in enumerate(extant):
        print(i, ':', x)
    print('expected total size of the gene tree:', dwell)
Example #16
def main():

    print('initializing...')

    # Initialize some values for one of the analyses.
    (
            pi,
            kappa,
            omega,
            tau,
            suffix_length,
            paralog_to_index,
            fasta_filename,
            edges, edge_rates, name_to_node,
            ) = initialization_a2()
    use_empirical_pi = True
    use_uninformative_edge_rates = True
    use_zerotau = True
    #use_empirical_pi = False
    #use_uninformative_edge_rates = False

    print('building the tree...')

    edge_count = len(edges)
    node_count = edge_count + 1
    if use_uninformative_edge_rates:
        edge_rates = [0.1] * edge_count
    row_nodes, column_nodes = zip(*edges)
    tree = dict(
            row_nodes = list(row_nodes),
            column_nodes = list(column_nodes),
            edge_rate_scaling_factors = edge_rates,
            edge_processes = [0] * edge_count)

    print('reading the genetic code...')

    # Define the genetic code.
    codon_residue_pairs = []
    for line in _code.splitlines():
        line = line.strip()
        if line:
            row = line.upper().split()
            idx_string, residue, codon = row
            if residue != 'STOP':
                codon_residue_pairs.append((codon, residue))
    nstates = 61
    assert_equal(len(codon_residue_pairs), nstates)
    codon_to_state = {c : i for i, (c, r) in enumerate(codon_residue_pairs)}

    print('reading the fasta file...')

    # Read the fasta file.
    # At the same time, compute the empirical nucleotide distribution
    # without regard to the underlying tree.
    acgt_counts = np.zeros(4)
    observable_nodes = []
    sequences = []
    variables = []
    with open(fasta_filename) as fin:
        lines = [line.strip() for line in fin]
        lines = [line for line in lines if line]
    for name_line, sequence_line in grouper(2, lines):
        for c in sequence_line.upper():
            acgt_counts['ACGT'.index(c)] += 1
        assert_(name_line.startswith('>'))
        suffix = name_line[-suffix_length:]
        name = name_line[1:-suffix_length]
        paralog_idx = paralog_to_index[suffix]
        sequence = []
        for triple in grouper(3, sequence_line):
            codon = ''.join(triple)
            state = codon_to_state[codon]
            sequence.append(state)
        variables.append(paralog_idx)
        observable_nodes.append(name_to_node[name])
        sequences.append(sequence)

    print('defining the observed data...')

    # Define the observed data.
    columns = list(zip(*sequences))
    nsites = len(columns)
    print('number of sites in the alignment:', nsites)
    observed_data = dict(
            nodes = observable_nodes,
            variables = variables,
            iid_observations = [list(column) for column in columns])

    if use_empirical_pi:
        print('computing the empirical nucleotide distribution...')

        # Define the empirical nucleotide distribution.
        pi = acgt_counts / acgt_counts.sum()
        print('empirical nucleotide distribution:', pi)

    print('defining the distribution over codons...')

    # Define the distribution over codons.
    codon_weights = np.zeros(nstates)
    for i, (codon, r) in enumerate(codon_residue_pairs):
        codon_weights[i] = np.prod([pi['ACGT'.index(x)] for x in codon])
    codon_distribution = codon_weights / codon_weights.sum()
    root_prior = dict(
            states = [[i, i] for i in range(nstates)],
            probabilities = codon_distribution.tolist())

    print('defining the codon gene conversion process...')

    # Define the process.
    process_definition = get_geneconv_process_definition(
            pi, kappa, omega, tau, codon_distribution, codon_residue_pairs)


    print('assembling the scene...')
    
    # Assemble the scene.
    scene = dict(
            node_count = node_count,
            process_count = 1,
            state_space_shape = [nstates, nstates],
            tree = tree,
            root_prior = root_prior,
            process_definitions = [process_definition],
            observed_data = observed_data)

    print('computing the log likelihood...')

    # Ask for the log likelihood, summed over sites.
    log_likelihood_request = dict(property = 'SNNLOGL')
    j_in = dict(
            scene = scene,
            requests = [log_likelihood_request])
    j_out = process_json_in(j_in)
    print(j_out)

    print('updating edge specific rate scaling factors using EM...')

    # Use the generic EM edge rate scaling factor updating function.
    observation_reduction = None
    em_iterations = 1
    edge_rates = optimize_em(scene, observation_reduction, em_iterations)

    # Update the scene to reflect the edge rates.
    print('updated edge rate scaling factors:')
    print(edge_rates)
    scene['tree']['edge_rate_scaling_factors'] = edge_rates

    print('checking log likelihood after having updated edge rates...')

    # Check the log likelihood again.
    j_in = dict(
            scene = scene,
            requests = [log_likelihood_request])
    j_out = process_json_in(j_in)
    print(j_out)

    print('computing the maximum likelihood estimates...')

    # Improve the estimates using a numerical search.
    if use_zerotau:
        P0 = pack_global_params_zerotau(pi, kappa, omega)
        get_process_definitions = partial(
                _get_process_definitions_zerotau, codon_residue_pairs)
        get_root_prior = partial(
                _get_root_prior_zerotau, codon_residue_pairs)
    else:
        P0 = pack_global_params(pi, kappa, omega, tau)
        get_process_definitions = partial(
                _get_process_definitions, codon_residue_pairs)
        get_root_prior = partial(
                _get_root_prior, codon_residue_pairs)
    B0 = np.log(edge_rates)
    verbose = True
    observation_reduction = None
    result, P_opt, B_opt = optimize_quasi_newton(
            verbose,
            scene,
            observation_reduction,
            get_process_definitions,
            get_root_prior,
            P0, B0)

    # Unpack and report the results.
    if use_zerotau:
        tau = 0
        pi, kappa, omega = unpack_global_params_zerotau(P_opt)
    else:
        pi, kappa, omega, tau = unpack_global_params(P_opt)
    edge_rates = np.exp(B_opt)
    print('pi:', pi)
    print('kappa:', kappa)
    print('omega:', omega)
    print('tau:', tau)
    print('edge rates:')
    for rate in edge_rates:
        print(rate)
    print()
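
The grouper helper used above to pair FASTA header and sequence lines, and to walk codon triples, is not shown in this listing. A minimal sketch following the standard itertools recipe, with the (n, iterable) argument order used here:

from itertools import zip_longest

def grouper(n, iterable, fillvalue=None):
    # Collect the iterable into fixed-length chunks, e.g. grouper(3, 'ABCDEF')
    # yields ('A', 'B', 'C') and ('D', 'E', 'F').
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)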
Example #17
def main(args):

    # Get the paralog names.
    paralog_names = args.paralogs

    # Read the tree.
    with open(args.tree) as fin:
        tree_string = fin.read().strip()
    name_to_node, edges = get_tree_info(tree_string)
    edge_count = len(edges)
    node_count = edge_count + 1

    # Read the alignment.
    with open(args.alignment) as alignment_fd:
        info = get_alignment_info(alignment_fd, name_to_node, paralog_names)
    nodes, variables, iid_observations = info
    nsites = len(iid_observations)

    print('number of sites in the alignment:', nsites)
    print('number of sequences:', len(nodes))

    # Compute the empirical distribution of the nucleotides.
    counts = np.zeros(4)
    for k in np.ravel(iid_observations):
        counts[k] += 1
    empirical_pi = counts / counts.sum()

    # Initialize some guesses.
    edge_rates = [0.01] * edge_count
    pi = empirical_pi
    kappa = 2.0

    # Define the tree component of the scene
    row_nodes, column_nodes = zip(*edges)
    tree = dict(
            row_nodes = list(row_nodes),
            column_nodes = list(column_nodes),
            edge_rate_scaling_factors = edge_rates,
            edge_processes = [0] * edge_count)

    # Define the root distribution.
    root_prior = get_root_prior(pi)

    # Define the observed data.
    observed_data = dict(
            nodes = nodes,
            variables = variables,
            iid_observations = iid_observations)

    # Assemble the scene.
    process_defn = get_joint_hky_process_definition(pi, kappa)
    scene = dict(
            node_count = node_count,
            process_count = 1,
            state_space_shape = [4, 4],
            tree = tree,
            root_prior = root_prior,
            process_definitions = [process_defn],
            observed_data = observed_data)

    print('computing the log likelihood...')

    # Ask for the log likelihood, summed over sites.
    log_likelihood_request = dict(property = 'SNNLOGL')
    j_in = dict(
            scene = scene,
            requests = [log_likelihood_request])
    j_out = process_json_in(j_in)
    print(j_out)

    print('updating edge specific rate scaling factors using EM...')

    # Use the generic EM edge rate scaling factor updating function.
    observation_reduction = None
    em_iterations = 1
    edge_rates = optimize_em(scene, observation_reduction, em_iterations)

    # Update the scene to reflect the edge rates.
    print('updated edge rate scaling factors:')
    print(edge_rates)
    scene['tree']['edge_rate_scaling_factors'] = edge_rates

    print('checking log likelihood after having updated edge rates...')

    # Check the log likelihood again.
    j_in = dict(
            scene = scene,
            requests = [log_likelihood_request])
    j_out = process_json_in(j_in)
    print(j_out)

    print('computing the maximum likelihood estimates...')

    # Improve the estimates using a numerical search.
    P0 = pack_global_params(pi, kappa)
    B0 = np.log(edge_rates)
    verbose = False
    observation_reduction = None
    result, P_opt, B_opt = optimize_quasi_newton(
            verbose,
            scene,
            observation_reduction,
            _get_process_definitions,
            _get_root_prior,
            P0, B0)

    # Unpack and report the results.
    pi, kappa = unpack_global_params(P_opt)
    edge_rates = np.exp(B_opt)
    print('negative log likelihood:', result.fun)
    print('nucleotide distribution:')
    for nt, p in zip('ACGT', pi):
        print(nt, ':', p)
    print('kappa:', kappa)
    print('edge rates:')
    print(edge_rates)
Example #18
def main():

    print('initializing...')

    # Initialize some values for one of the analyses.
    (
        pi,
        kappa,
        omega,
        tau,
        suffix_length,
        paralog_to_index,
        fasta_filename,
        edges,
        edge_rates,
        name_to_node,
    ) = initialization_a2()
    use_empirical_pi = True
    use_uninformative_edge_rates = True
    use_zerotau = True
    #use_empirical_pi = False
    #use_uninformative_edge_rates = False

    print('building the tree...')

    edge_count = len(edges)
    node_count = edge_count + 1
    if use_uninformative_edge_rates:
        edge_rates = [0.1] * edge_count
    row_nodes, column_nodes = zip(*edges)
    tree = dict(row_nodes=list(row_nodes),
                column_nodes=list(column_nodes),
                edge_rate_scaling_factors=edge_rates,
                edge_processes=[0] * edge_count)

    print('reading the genetic code...')

    # Define the genetic code.
    codon_residue_pairs = []
    for line in _code.splitlines():
        line = line.strip()
        if line:
            row = line.upper().split()
            idx_string, residue, codon = row
            if residue != 'STOP':
                codon_residue_pairs.append((codon, residue))
    nstates = 61
    assert_equal(len(codon_residue_pairs), nstates)
    codon_to_state = {c: i for i, (c, r) in enumerate(codon_residue_pairs)}

    print('reading the fasta file...')

    # Read the fasta file.
    # At the same time, compute the empirical nucleotide distribution
    # without regard to the underlying tree.
    acgt_counts = np.zeros(4)
    observable_nodes = []
    sequences = []
    variables = []
    with open(fasta_filename) as fin:
        lines = [line.strip() for line in fin]
        lines = [line for line in lines if line]
    for name_line, sequence_line in grouper(2, lines):
        for c in sequence_line.upper():
            acgt_counts['ACGT'.index(c)] += 1
        assert_(name_line.startswith('>'))
        suffix = name_line[-suffix_length:]
        name = name_line[1:-suffix_length]
        paralog_idx = paralog_to_index[suffix]
        sequence = []
        for triple in grouper(3, sequence_line):
            codon = ''.join(triple)
            state = codon_to_state[codon]
            sequence.append(state)
        variables.append(paralog_idx)
        observable_nodes.append(name_to_node[name])
        sequences.append(sequence)

    print('defining the observed data...')

    # Define the observed data.
    columns = list(zip(*sequences))
    nsites = len(columns)
    print('number of sites in the alignment:', nsites)
    observed_data = dict(nodes=observable_nodes,
                         variables=variables,
                         iid_observations=[list(column) for column in columns])

    if use_empirical_pi:
        print('computing the empirical nucleotide distribution...')

        # Define the empirical nucleotide distribution.
        pi = acgt_counts / acgt_counts.sum()
        print('empirical nucleotide distribution:', pi)

    print('defining the distribution over codons...')

    # Define the distribution over codons.
    codon_weights = np.zeros(nstates)
    for i, (codon, r) in enumerate(codon_residue_pairs):
        codon_weights[i] = np.prod([pi['ACGT'.index(x)] for x in codon])
    codon_distribution = codon_weights / codon_weights.sum()
    root_prior = dict(states=[[i, i] for i in range(nstates)],
                      probabilities=codon_distribution.tolist())

    print('defining the codon gene conversion process...')

    # Define the process.
    process_definition = get_geneconv_process_definition(
        pi, kappa, omega, tau, codon_distribution, codon_residue_pairs)

    print('assembling the scene...')

    # Assemble the scene.
    scene = dict(node_count=node_count,
                 process_count=1,
                 state_space_shape=[nstates, nstates],
                 tree=tree,
                 root_prior=root_prior,
                 process_definitions=[process_definition],
                 observed_data=observed_data)

    print('computing the log likelihood...')

    # Ask for the log likelihood, summed over sites.
    log_likelihood_request = dict(property='SNNLOGL')
    j_in = dict(scene=scene, requests=[log_likelihood_request])
    j_out = process_json_in(j_in)
    print(j_out)

    print('updating edge specific rate scaling factors using EM...')

    # Use the generic EM edge rate scaling factor updating function.
    observation_reduction = None
    em_iterations = 1
    edge_rates = optimize_em(scene, observation_reduction, em_iterations)

    # Update the scene to reflect the edge rates.
    print('updated edge rate scaling factors:')
    print(edge_rates)
    scene['tree']['edge_rate_scaling_factors'] = edge_rates

    print('checking log likelihood after having updated edge rates...')

    # Check the log likelihood again.
    j_in = dict(scene=scene, requests=[log_likelihood_request])
    j_out = process_json_in(j_in)
    print(j_out)

    print('computing the maximum likelihood estimates...')

    # Improve the estimates using a numerical search.
    if use_zerotau:
        P0 = pack_global_params_zerotau(pi, kappa, omega)
        get_process_definitions = partial(_get_process_definitions_zerotau,
                                          codon_residue_pairs)
        get_root_prior = partial(_get_root_prior_zerotau, codon_residue_pairs)
    else:
        P0 = pack_global_params(pi, kappa, omega, tau)
        get_process_definitions = partial(_get_process_definitions,
                                          codon_residue_pairs)
        get_root_prior = partial(_get_root_prior, codon_residue_pairs)
    B0 = np.log(edge_rates)
    verbose = True
    observation_reduction = None
    result, P_opt, B_opt = optimize_quasi_newton(verbose, scene,
                                                 observation_reduction,
                                                 get_process_definitions,
                                                 get_root_prior, P0, B0)

    # Unpack and report the results.
    if use_zerotau:
        tau = 0
        pi, kappa, omega = unpack_global_params_zerotau(P_opt)
    else:
        pi, kappa, omega, tau = unpack_global_params(P_opt)
    edge_rates = np.exp(B_opt)
    print('pi:', pi)
    print('kappa:', kappa)
    print('omega:', omega)
    print('tau:', tau)
    print('edge rates:')
    for rate in edge_rates:
        print(rate)
    print()