def test_coarse_vs_fine_grained_missingness():
    # Check that log likelihoods are the same in both representations.

    # Get the analysis results for the coarse-grained missingness.
    scene = get_partial_scene()
    coarse_observations = dict(
        nodes=[0, 0, 1],
        variables=[0, 1, 0],
        iid_observations=[
            [0, 0, 1],
            [0, 1, 0],
            [0, 0, 0],
            [1, 1, 1],
            [0, 0, 0],
            [1, 0, 1]])
    scene['observed_data'] = coarse_observations
    request = {'property': 'DNNLOGL'}
    j_in = dict(scene=scene, requests=[request])
    j_out_coarse = process_json_in(j_in)

    # Get the analysis results for the fine-grained missingness.
    scene = get_partial_scene()
    fine_observations = dict(
        nodes=[0, 0, 1, 1],
        variables=[0, 1, 0, 1],
        iid_observations=[
            [0, 0, 1, -1],
            [0, 1, 0, -1],
            [0, 0, 0, -1],
            [1, 1, 1, -1],
            [0, 0, 0, -1],
            [1, 0, 1, -1]])
    scene['observed_data'] = fine_observations
    request = {'property': 'DNNLOGL'}
    j_in = dict(scene=scene, requests=[request])
    j_out_fine = process_json_in(j_in)

    # Assert that the log likelihoods are the same.
    # Note that this uses exact equality comparison for floating point,
    # so if this test fails in the future then consider allowing
    # some epsilon of closeness.
    assert_equal(j_out_coarse, j_out_fine)
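# A hedged sketch of the epsilon-based comparison hinted at in the comment
# above, in case exact floating point equality ever becomes too strict.
# The helper name is hypothetical and is not used by the test above; it
# assumes the outputs carry a 'status' string and numeric 'responses'.
from numpy.testing import assert_allclose, assert_equal


def _assert_responses_close(j_out_a, j_out_b, rtol=1e-12):
    # Statuses must match exactly; numerical responses only approximately.
    assert_equal(j_out_a['status'], j_out_b['status'])
    for resp_a, resp_b in zip(j_out_a['responses'], j_out_b['responses']):
        assert_allclose(resp_a, resp_b, rtol=rtol)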
def run_analysis(scene, likelihoods, kappa):
    # Copy the scene because we are going to do some surgery.
    scene = copy.deepcopy(scene)

    # Change the edge rate scaling factors.
    scene['tree'] = get_analysis_tree()

    # Define the model to be used for the analysis.
    analysis_process = get_K80_process_definition(kappa)
    scene['process_definitions'] = [analysis_process]

    # Request nucleotide transition count expectations.
    ts_transition_request = dict(
        property='WSNTRAN',
        observation_reduction=get_observation_reduction(likelihoods),
        transition_reduction=get_ts_reduction())

    # Request nucleotide transversion count expectations.
    tv_transition_request = dict(
        property='WSNTRAN',
        observation_reduction=get_observation_reduction(likelihoods),
        transition_reduction=get_tv_reduction())

    # Define the requests.
    requests = [ts_transition_request, tv_transition_request]

    # Run the analysis.
    j_in = dict(scene=scene, requests=requests)
    j_out = process_json_in(j_in)
    posterior_ts, posterior_tv = j_out['responses']

    # Re-run the analysis without any observations.
    scene['observed_data'] = dict(
        nodes=[],
        variables=[],
        iid_observations=[[] for p in likelihoods])
    j_in = dict(scene=scene, requests=requests)
    j_out = process_json_in(j_in)
    prior_ts, prior_tv = j_out['responses']

    return prior_ts, prior_tv, posterior_ts, posterior_tv
def _process_request(r):
    j_out = interface.process_json_in(dict(scene=_get_scene(), requests=[r]))
    assert_equal(set(j_out), {'status', 'responses'})
    assert_equal(j_out['status'], 'feasible')
    assert_equal(len(j_out['responses']), 1)
    out = j_out['responses'][0]
    # The three-letter prefix of the property name encodes the reductions
    # over observations, edges, and states; each 'd' (distinct) keeps that
    # axis in the response, and per-node properties add one more axis.
    prefix = r['property'][:3].lower()
    suffix = r['property'][-4:].lower()
    if suffix == 'node':
        assert_equal(len(np.array(out).shape), prefix.count('d') + 1)
    else:
        assert_equal(len(np.array(out).shape), prefix.count('d'))
    return out
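# A minimal usage sketch (assumed; not part of the original test module)
# exercising _process_request with two property names that appear elsewhere
# in this document, to illustrate the dimensionality convention noted above.
def _demo_request_shapes():
    logl = _process_request({'property': 'snnlogl'})      # scalar log likelihood
    per_site = _process_request({'property': 'dnnlogl'})  # one value per observation
    assert_equal(np.ndim(logl), 0)
    assert_equal(np.ndim(per_site), 1)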
def main():
    ts = np.linspace(1e-5, 30, 100)
    n = len(ts)
    j_in = {
        "scene": {
            "node_count": n + 1,
            "process_count": 1,
            "state_space_shape": [4],
            "tree": {
                "row_nodes": [n] * n,
                "column_nodes": list(range(n)),
                "edge_rate_scaling_factors": (0.5 * ts).tolist(),
                "edge_processes": [0] * n
            },
            "root_prior": {
                "states": [[0]],
                "probabilities": [1]
            },
            "process_definitions": [{
                "row_states": [[0], [1], [2]],
                "column_states": [[1], [2], [3]],
                "transition_rates": [1, 2, 3]
            }],
            "observed_data": {
                "nodes": [],
                "variables": [],
                "iid_observations": [[]]
            }
        },
        "requests": [{"property": "SDDDWEL"}]
    }
    j_out = process_json_in(j_in)
    a, b, c, d = zip(*j_out['responses'][0])
    lines = plt.plot(
        ts, a, 'blue',
        ts, b, 'green',
        ts, c, 'red',
        ts, d, 'skyblue')
    plt.ylabel("Time-averaged expected sojourn time")
    plt.xlabel("Time")
    # Use a transparent legend frame.
    plt.legend(
        lines,
        ('State 1', 'State 2', 'State 3', 'State 4 (absorbing)'),
        loc='center right', framealpha=0)
    # Use a transparent background for the figure.
    plt.savefig('out00.svg', transparent=True)
def get_pattern_likelihoods(scene, rate_scaling_factor):
    scene = copy.deepcopy(scene)
    scene['tree'] = get_simulation_tree(rate_scaling_factor)

    # Define the request for per-pattern log likelihoods.
    log_likelihoods_request = {'property': 'DNNLOGL'}

    # Get the per-pattern log likelihoods.
    j_in = dict(scene=scene, requests=[log_likelihoods_request])
    j_out = process_json_in(j_in)
    pattern_log_likelihoods = j_out['responses'][0]
    pattern_likelihoods = np.exp(pattern_log_likelihoods)

    # The sum of likelihoods over all patterns should be 1.
    assert_allclose(pattern_likelihoods.sum(), 1)

    # Return the pattern likelihoods.
    return pattern_likelihoods.tolist()
def run_partitioned_analysis(scene, partitioned_likelihoods, kappa):
    # Copy the scene because we are going to do some surgery.
    scene = copy.deepcopy(scene)
    scene['tree'] = get_analysis_tree()
    analysis_process = get_K80_process_definition(kappa)
    scene['process_definitions'] = [analysis_process]

    # Request nucleotide transition count expectations.
    ts_requests = []
    for likelihoods in partitioned_likelihoods:
        ts_request = dict(
            property='WSNTRAN',
            observation_reduction=get_observation_reduction(likelihoods),
            transition_reduction=get_ts_reduction())
        ts_requests.append(ts_request)

    # Request nucleotide transversion count expectations.
    tv_requests = []
    for likelihoods in partitioned_likelihoods:
        tv_request = dict(
            property='WSNTRAN',
            observation_reduction=get_observation_reduction(likelihoods),
            transition_reduction=get_tv_reduction())
        tv_requests.append(tv_request)

    # Run the analysis.
    j_in = dict(scene=scene, requests=ts_requests + tv_requests)
    j_out = process_json_in(j_in)

    # The responses are ordered like the requests:
    # all transition requests first, then all transversion requests.
    npartitions = len(ts_requests)
    ts_responses = j_out['responses'][:npartitions]
    tv_responses = j_out['responses'][npartitions:]
    partitioned_ts = np.mean(ts_responses)
    partitioned_tv = np.mean(tv_responses)
    return partitioned_ts, partitioned_tv
def main():
    with open('in01.json') as fin:
        j_in = json.load(fin)
    scene = j_in['scene']
    observation_reduction = j_in['requests'][0]['observation_reduction']
    node_count = scene['node_count']
    edge_count = node_count - 1

    # These starting points for EM are OK.
    rates = [0.001 for r in range(edge_count)]
    #rates = [0.01 for r in range(edge_count)]
    #rates = [0.1 for r in range(edge_count)]

    # These starting points are questionable.
    #rates = [0.15 for r in range(edge_count)]
    #rates = [0.2 for r in range(edge_count)]

    # These starting points are not really feasible.
    #rates = [0.5 for r in range(edge_count)]
    #rates = [0.95 for r in range(edge_count)]
    #rates = [1 for r in range(edge_count)]

    # Initialize rates.
    scene['tree']['edge_rate_scaling_factors'] = rates

    # Update rates according to EM.
    rates = optimize_em(j_in['scene'], observation_reduction, 3)

    # Show the log likelihood.
    scene['tree']['edge_rate_scaling_factors'] = rates
    j_in = dict(
        scene=scene,
        requests=[dict(
            property='WNNLOGL',
            observation_reduction=observation_reduction)])
    ll = process_json_in(j_in)['responses'][0]
    print(ll)
def _process_ex(Q, d, observable_node, debug=False):
    """
    Use the more advanced interface.

    """
    state_space_shape = (2, 3)
    nstates = np.prod(state_space_shape)
    nnodes = 4
    nedges = nnodes - 1
    nodes = range(nnodes)
    edges = range(nedges)
    states = list(product(
        range(state_space_shape[0]),
        range(state_space_shape[1]),
        ))
    state_pairs = list(permutations(states, 2))
    ntrans = len(state_pairs)
    assert_equal(ntrans, nstates * (nstates - 1))
    row, col = zip(*state_pairs)
    idx_row = np.ravel_multi_index(np.transpose(row), state_space_shape)
    idx_col = np.ravel_multi_index(np.transpose(col), state_space_shape)
    transition_rates = [Q[i, j] for i, j in zip(idx_row, idx_col)]
    dwell_states = [[0, 0], [0, 1], [0, 2]]
    dwell_expect = [1, 1, 1]
    scene = dict(
        node_count=nnodes,
        process_count=1,
        state_space_shape=state_space_shape,
        root_prior=dict(
            states=states,
            probabilities=d.tolist()),
        tree=dict(
            row_nodes=nodes[:-1],
            column_nodes=nodes[1:],
            edge_processes=[0] * nedges,
            edge_rate_scaling_factors=[0.2] * nedges,
            ),
        process_definitions=[dict(
            row_states=[i for i, j in state_pairs],
            column_states=[j for i, j in state_pairs],
            transition_rates=transition_rates,
            )],
        observed_data=dict(
            nodes=[observable_node],
            variables=[1],
            iid_observations=[
                [0],
                [2],
                [1],
                [0],
                [1],
                ]))
    dwell_request = dict(
        property='ddwdwel',
        state_reduction=dict(
            states=dwell_states,
            weights=dwell_expect))
    transition_request = dict(
        property='ddntran',
        transition_reduction=dict(
            row_states=[i for i, j in state_pairs],
            column_states=[j for i, j in state_pairs],
            weights=[1 for i, j in state_pairs]))
    j_in = dict(
        scene=scene,
        requests=[dwell_request, transition_request])
    return interface.process_json_in(j_in, debug=debug)
def main():
    nstates = len(s_aas)
    assert_equal(nstates, 20)
    d = {a: i for i, a in enumerate(s_aas)}
    distn = [float(x) for x in s_distn.strip().split()]
    assert_equal(len(distn), nstates)
    lines = s_mtmam.splitlines()
    assert_equal(len(lines), nstates - 1)
    rate_matrix = np.zeros((nstates, nstates), dtype=int)
    for i, line in enumerate(lines):
        row_index = i + 1
        row = [int(x) for x in line.strip().split()]
        assert_equal(len(row), row_index)
        rate_matrix[row_index, :row_index] = row
    rate_matrix = np.multiply(rate_matrix + rate_matrix.T, distn)
    exit_rates = rate_matrix.sum(axis=1)

    # This is a partial scene, missing the root distribution,
    # the process definition, and the observed data.
    scene = {
        "node_count": 12,
        "process_count": 1,
        "state_space_shape": [20],
        "tree": {
            "row_nodes": [0, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11],
            "column_nodes": [8, 1, 2, 7, 9, 3, 10, 6, 11, 4, 5],
            "edge_rate_scaling_factors": [
                0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
                0.001, 0.001, 0.001, 0.001, 0.001],
            "edge_processes": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        }
    }

    # Add the root distribution.
    scene['root_prior'] = {
        "states": [[i] for i in range(nstates)],
        "probabilities": distn
    }

    # Add the process definition.
    triples = []
    for i in range(nstates):
        for j in range(nstates):
            r = rate_matrix[i, j]
            if i != j and r:
                triples.append((i, j, r))
    row_states, col_states, rates = zip(*triples)
    scene['process_definitions'] = [{
        "row_states": [[s] for s in row_states],
        "column_states": [[s] for s in col_states],
        "transition_rates": rates
    }]

    # Add the observed data.
    sequences = []
    with open('mtCDNApri.aa') as fin:
        lines = fin.readlines()
        header = lines[0]
        for line in lines[1:]:
            name, sequence = line.strip().split()
            sequences.append([d[x] for x in sequence])
    columns = [list(x) for x in zip(*sequences)]
    nsites = len(columns)
    scene['observed_data'] = {
        "nodes": [0, 1, 2, 3, 4, 5, 6],
        "variables": [0, 0, 0, 0, 0, 0, 0],
        "iid_observations": columns
    }

    # Update the edge rates according to a few iterations of EM.
    observation_reduction = None
    em_iterations = 6
    edge_rates = optimize_em(scene, observation_reduction, em_iterations)
    scene['tree']['edge_rate_scaling_factors'] = edge_rates

    # Report the log likelihood for the updated edge rates.
    j_in = dict(scene=scene, requests=[dict(property='SNNLOGL')])
    ll = process_json_in(j_in)['responses'][0]
    print(ll)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--n", type=int, default=1000, help="population size")
    parser.add_argument("--mu", type=float, default=0.3, help="extinction")
    parser.add_argument("--lam", type=float, default=0.5, help="speciation")
    args = parser.parse_args()
    n = args.n
    mu = args.mu
    lam = args.lam
    edge_rates = [5, 10, 5, 5]
    wrap_scene = get_scene(edge_rates, mu, lam, n, WRAP)
    absorb_scene = get_scene(edge_rates, mu, lam, n, ABSORB)

    # Log likelihood.
    logl_request = dict(property="snnlogl")

    # Unweighted sum over observations and over edges,
    # and weighted sum over transitions consisting of the unweighted sum
    # over transitions corresponding to extinction events.
    extinction_request = dict(
        property="ssntran",
        transition_reduction=dict(
            row_states=[[1, i] for i in range(2, n)],
            column_states=[[1, i - 1] for i in range(2, n)],
            weights=[1] * (n - 2),
        ),
    )

    # Unweighted sum over observations, and weighted sum over states.
    extant_request = dict(
        property="snwnode",
        state_reduction=dict(
            states=[[1, i] for i in range(n)],
            weights=range(n)))

    # Unweighted sum over observations, weighted sum over edges,
    # and weighted sum over states.
    dwell_request = dict(
        property="swwdwel",
        edge_reduction=dict(edges=[0, 1, 2, 3], weights=edge_rates),
        state_reduction=dict(
            states=[[1, i] for i in range(n)],
            weights=range(n)),
    )

    # Compute only the likelihood for the absorbing high population boundary.
    j_out = process_json_in(dict(scene=absorb_scene, requests=[logl_request]))
    absorb_likelihood = exp(j_out["responses"][0])

    # Compute more stuff for the wrapping boundary.
    j_in = dict(
        scene=wrap_scene,
        requests=[
            logl_request,
            extinction_request,
            extant_request,
            dwell_request])
    j_out = process_json_in(j_in)
    logl, extinction, extant, dwell = j_out["responses"]
    wrap_likelihood = exp(logl)

    print("gene population limit:", n)
    print("gene birth rate:", lam)
    print("gene death rate:", mu)
    print("likelihood:", wrap_likelihood)
    print("upper bound likelihood for unbounded population:",
          absorb_likelihood)
    print("unconditional probability of exceeding the population cap:",
          absorb_likelihood - wrap_likelihood)
    print("expected number of extinctions:", extinction)
    print("expected number of extant lineages at each node:")
    for i, x in enumerate(extant):
        print(i, ":", x)
    print("expected total size of the gene tree:", dwell)
def main(args):
    # Get the paralog names.
    paralog_names = args.paralogs

    # Read the tree.
    with open(args.tree) as fin:
        tree_string = fin.read().strip()
    name_to_node, edges = get_tree_info(tree_string)
    edge_count = len(edges)
    node_count = edge_count + 1

    # Read the alignment.
    with open(args.alignment) as alignment_fd:
        info = get_alignment_info(alignment_fd, name_to_node, paralog_names)
    nodes, variables, iid_observations = info
    nsites = len(iid_observations)
    print('number of sites in the alignment:', nsites)
    print('number of sequences:', len(nodes))

    # Compute the empirical distribution of the nucleotides.
    counts = np.zeros(4)
    for k in np.ravel(iid_observations):
        counts[k] += 1
    empirical_pi = counts / counts.sum()

    # Initialize some guesses.
    edge_rates = [0.01] * edge_count
    pi = empirical_pi
    kappa = 2.0

    # Define the tree component of the scene.
    row_nodes, column_nodes = zip(*edges)
    tree = dict(
        row_nodes=list(row_nodes),
        column_nodes=list(column_nodes),
        edge_rate_scaling_factors=edge_rates,
        edge_processes=[0] * edge_count)

    # Define the root distribution.
    root_prior = get_root_prior(pi)

    # Define the observed data.
    observed_data = dict(
        nodes=nodes,
        variables=variables,
        iid_observations=iid_observations)

    # Assemble the scene.
    scene = dict(
        node_count=node_count,
        process_count=1,
        state_space_shape=[4, 4],
        tree=tree,
        root_prior=root_prior,
        observed_data=observed_data)

    arr = []
    j_out = None
    iterative_improvement_count = 5
    tm_start = time.time()
    for i in range(iterative_improvement_count):

        # If j_out is available, recompute kappa and edge rates.
        if j_out is not None:
            responses = j_out['responses']
            (ll,
             per_edge_opportunity, per_edge_change,
             ts_opportunity, tv_opportunity,
             ts_change, tv_change) = responses
            edge_rates = []
            for change, dwell in zip(per_edge_change, per_edge_opportunity):
                # In this model, edge rates are with respect to
                # the univariate process.
                bivariate_rate = change / dwell
                univariate_rate = bivariate_rate / 2
                edge_rates.append(univariate_rate)
            kappa = (ts_change / ts_opportunity) / (tv_change / tv_opportunity)

        defn = get_joint_hky_process_definition(pi, kappa)
        j_in = dict(scene=scene)
        j_in['scene']['tree']['edge_rate_scaling_factors'] = edge_rates
        j_in['scene']['process_definitions'] = [defn]
        j_in['requests'] = get_requests(edge_rates, pi, kappa)
        j_out = process_json_in(j_in)
        arr.append(copy.deepcopy(j_out))
    tm_stop = time.time()
    print(
        'seconds for', iterative_improvement_count, 'initial iterations:',
        tm_stop - tm_start)

    # Improve the estimates using a numerical search.
    P0 = pack_global_params(pi, kappa)
    B0 = np.log(edge_rates)
    tm_start = time.time()
    verbose = False
    observation_reduction = None
    result, P_opt, B_opt = optimize_quasi_newton(
        verbose, scene, observation_reduction,
        _get_process_definitions, _get_root_prior, P0, B0)
    tm_stop = time.time()
    print('seconds for quasi-newton search:', tm_stop - tm_start)

    # Unpack and report the results.
    pi, kappa = unpack_global_params(P_opt)
    edge_rates = np.exp(B_opt)
    print('negative log likelihood:', result.fun)
    print('nucleotide distribution:')
    for nt, p in zip('ACGT', pi):
        print(nt, ':', p)
    print('kappa:', kappa)
    print('edge rates:')
    print(edge_rates)
def main():
    print('initializing...')

    # Initialize some values for one of the analyses.
    (pi, kappa, omega, tau,
     suffix_length, paralog_to_index,
     fasta_filename, edges, edge_rates, name_to_node) = initialization_a2()
    use_empirical_pi = True
    use_uninformative_edge_rates = True
    use_zerotau = True
    #use_empirical_pi = False
    #use_uninformative_edge_rates = False

    print('building the tree...')
    edge_count = len(edges)
    node_count = edge_count + 1
    if use_uninformative_edge_rates:
        edge_rates = [0.1] * edge_count
    row_nodes, column_nodes = zip(*edges)
    tree = dict(
        row_nodes=list(row_nodes),
        column_nodes=list(column_nodes),
        edge_rate_scaling_factors=edge_rates,
        edge_processes=[0] * edge_count)

    print('reading the genetic code...')

    # Define the genetic code.
    codon_residue_pairs = []
    for line in _code.splitlines():
        line = line.strip()
        if line:
            row = line.upper().split()
            idx_string, residue, codon = row
            if residue != 'STOP':
                codon_residue_pairs.append((codon, residue))
    nstates = 61
    assert_equal(len(codon_residue_pairs), nstates)
    codon_to_state = {c: i for i, (c, r) in enumerate(codon_residue_pairs)}

    print('reading the fasta file...')

    # Read the fasta file.
    # At the same time, compute the empirical nucleotide distribution
    # without regard to the underlying tree.
    acgt_counts = np.zeros(4)
    observable_nodes = []
    sequences = []
    variables = []
    with open(fasta_filename) as fin:
        lines = [line.strip() for line in fin]
        lines = [line for line in lines if line]
        for name_line, sequence_line in grouper(2, lines):
            for c in sequence_line.upper():
                acgt_counts['ACGT'.index(c)] += 1
            assert_(name_line.startswith('>'))
            suffix = name_line[-suffix_length:]
            name = name_line[1:-suffix_length]
            paralog_idx = paralog_to_index[suffix]
            sequence = []
            for triple in grouper(3, sequence_line):
                codon = ''.join(triple)
                state = codon_to_state[codon]
                sequence.append(state)
            variables.append(paralog_idx)
            observable_nodes.append(name_to_node[name])
            sequences.append(sequence)

    print('defining the observed data...')

    # Define the observed data.
    columns = list(zip(*sequences))
    nsites = len(columns)
    print('number of sites in the alignment:', nsites)
    observed_data = dict(
        nodes=observable_nodes,
        variables=variables,
        iid_observations=[list(column) for column in columns])

    if use_empirical_pi:
        print('computing the empirical nucleotide distribution...')
        # Define the empirical nucleotide distribution.
        pi = acgt_counts / acgt_counts.sum()
        print('empirical nucleotide distribution:', pi)

    print('defining the distribution over codons...')

    # Define the distribution over codons.
    codon_weights = np.zeros(nstates)
    for i, (codon, r) in enumerate(codon_residue_pairs):
        codon_weights[i] = np.prod([pi['ACGT'.index(x)] for x in codon])
    codon_distribution = codon_weights / codon_weights.sum()
    root_prior = dict(
        states=[[i, i] for i in range(nstates)],
        probabilities=codon_distribution.tolist())

    print('defining the codon gene conversion process...')

    # Define the process.
    process_definition = get_geneconv_process_definition(
        pi, kappa, omega, tau, codon_distribution, codon_residue_pairs)

    print('assembling the scene...')

    # Assemble the scene.
    scene = dict(
        node_count=node_count,
        process_count=1,
        state_space_shape=[nstates, nstates],
        tree=tree,
        root_prior=root_prior,
        process_definitions=[process_definition],
        observed_data=observed_data)

    print('computing the log likelihood...')

    # Ask for the log likelihood, summed over sites.
    log_likelihood_request = dict(property='SNNLOGL')
    j_in = dict(scene=scene, requests=[log_likelihood_request])
    j_out = process_json_in(j_in)
    print(j_out)

    print('updating edge specific rate scaling factors using EM...')

    # Use the generic EM edge rate scaling factor updating function.
    observation_reduction = None
    em_iterations = 1
    edge_rates = optimize_em(scene, observation_reduction, em_iterations)

    # Update the scene to reflect the edge rates.
    print('updated edge rate scaling factors:')
    print(edge_rates)
    scene['tree']['edge_rate_scaling_factors'] = edge_rates

    print('checking log likelihood after having updated edge rates...')

    # Check the log likelihood again.
    j_in = dict(scene=scene, requests=[log_likelihood_request])
    j_out = process_json_in(j_in)
    print(j_out)

    print('computing the maximum likelihood estimates...')

    # Improve the estimates using a numerical search.
    if use_zerotau:
        P0 = pack_global_params_zerotau(pi, kappa, omega)
        get_process_definitions = partial(
            _get_process_definitions_zerotau, codon_residue_pairs)
        get_root_prior = partial(
            _get_root_prior_zerotau, codon_residue_pairs)
    else:
        P0 = pack_global_params(pi, kappa, omega, tau)
        get_process_definitions = partial(
            _get_process_definitions, codon_residue_pairs)
        get_root_prior = partial(
            _get_root_prior, codon_residue_pairs)
    B0 = np.log(edge_rates)
    verbose = True
    observation_reduction = None
    result, P_opt, B_opt = optimize_quasi_newton(
        verbose, scene, observation_reduction,
        get_process_definitions, get_root_prior, P0, B0)

    # Unpack and report the results.
    if use_zerotau:
        tau = 0
        pi, kappa, omega = unpack_global_params_zerotau(P_opt)
    else:
        pi, kappa, omega, tau = unpack_global_params(P_opt)
    edge_rates = np.exp(B_opt)
    print('pi:', pi)
    print('kappa:', kappa)
    print('omega:', omega)
    print('tau:', tau)
    print('edge rates:')
    for rate in edge_rates:
        print(rate)
    print()
def main(args):
    # Get the paralog names.
    paralog_names = args.paralogs

    # Read the tree.
    with open(args.tree) as fin:
        tree_string = fin.read().strip()
    name_to_node, edges = get_tree_info(tree_string)
    edge_count = len(edges)
    node_count = edge_count + 1

    # Read the alignment.
    with open(args.alignment) as alignment_fd:
        info = get_alignment_info(alignment_fd, name_to_node, paralog_names)
    nodes, variables, iid_observations = info
    nsites = len(iid_observations)
    print('number of sites in the alignment:', nsites)
    print('number of sequences:', len(nodes))

    # Compute the empirical distribution of the nucleotides.
    counts = np.zeros(4)
    for k in np.ravel(iid_observations):
        counts[k] += 1
    empirical_pi = counts / counts.sum()

    # Initialize some guesses.
    edge_rates = [0.01] * edge_count
    pi = empirical_pi
    kappa = 2.0

    # Define the tree component of the scene.
    row_nodes, column_nodes = zip(*edges)
    tree = dict(
        row_nodes=list(row_nodes),
        column_nodes=list(column_nodes),
        edge_rate_scaling_factors=edge_rates,
        edge_processes=[0] * edge_count)

    # Define the root distribution.
    root_prior = get_root_prior(pi)

    # Define the observed data.
    observed_data = dict(
        nodes=nodes,
        variables=variables,
        iid_observations=iid_observations)

    # Assemble the scene.
    process_defn = get_joint_hky_process_definition(pi, kappa)
    scene = dict(
        node_count=node_count,
        process_count=1,
        state_space_shape=[4, 4],
        tree=tree,
        root_prior=root_prior,
        process_definitions=[process_defn],
        observed_data=observed_data)

    print('computing the log likelihood...')

    # Ask for the log likelihood, summed over sites.
    log_likelihood_request = dict(property='SNNLOGL')
    j_in = dict(scene=scene, requests=[log_likelihood_request])
    j_out = process_json_in(j_in)
    print(j_out)

    print('updating edge specific rate scaling factors using EM...')

    # Use the generic EM edge rate scaling factor updating function.
    observation_reduction = None
    em_iterations = 1
    edge_rates = optimize_em(scene, observation_reduction, em_iterations)

    # Update the scene to reflect the edge rates.
    print('updated edge rate scaling factors:')
    print(edge_rates)
    scene['tree']['edge_rate_scaling_factors'] = edge_rates

    print('checking log likelihood after having updated edge rates...')

    # Check the log likelihood again.
    j_in = dict(scene=scene, requests=[log_likelihood_request])
    j_out = process_json_in(j_in)
    print(j_out)

    print('computing the maximum likelihood estimates...')

    # Improve the estimates using a numerical search.
    P0 = pack_global_params(pi, kappa)
    B0 = np.log(edge_rates)
    verbose = False
    observation_reduction = None
    result, P_opt, B_opt = optimize_quasi_newton(
        verbose, scene, observation_reduction,
        _get_process_definitions, _get_root_prior, P0, B0)

    # Unpack and report the results.
    pi, kappa = unpack_global_params(P_opt)
    edge_rates = np.exp(B_opt)
    print('negative log likelihood:', result.fun)
    print('nucleotide distribution:')
    for nt, p in zip('ACGT', pi):
        print(nt, ':', p)
    print('kappa:', kappa)
    print('edge rates:')
    print(edge_rates)