def test_shuffled_edges():
    d = copy.deepcopy(d_original)
    original_dwell = json.loads(arbplf_dwell(json.dumps(d)))
    d = copy.deepcopy(d_original)
    original_ll = json.loads(arbplf_ll(json.dumps(d)))
    d = copy.deepcopy(d_original)
    d['site_reduction'] = {'aggregation' : 'sum'}
    original_em_update = json.loads(arbplf_em_update(json.dumps(d)))
    iter_count = 10
    for i in range(iter_count):
        d_shuffled, perm = _shuffle_edges(d_original)
        # the ll output does not have an edge column
        d = copy.deepcopy(d_shuffled)
        ll = json.loads(arbplf_ll(json.dumps(d)))
        assert_equal(ll, original_ll)
        d = copy.deepcopy(d_shuffled)
        dwell = json.loads(arbplf_dwell(json.dumps(d)))
        dwell_prime = _perm_output_edges(dwell, perm)
        dwell_prime['data'].sort()
        assert_equal(dwell_prime, original_dwell)
        d = copy.deepcopy(d_shuffled)
        d['site_reduction'] = {'aggregation' : 'sum'}
        em_update = json.loads(arbplf_em_update(json.dumps(d)))
        em_update_prime = _perm_output_edges(em_update, perm)
        em_update_prime['data'].sort()
        assert_equal(em_update_prime, original_em_update)
def test_shuffled_nodes():
    d = copy.deepcopy(d_original)
    original_dwell = json.loads(arbplf_dwell(json.dumps(d)))
    d = copy.deepcopy(d_original)
    original_ll = json.loads(arbplf_ll(json.dumps(d)))
    d = copy.deepcopy(d_original)
    d['site_reduction'] = {'aggregation' : 'sum'}
    original_em_update = json.loads(arbplf_em_update(json.dumps(d)))
    iter_count = 10
    for i in range(iter_count):
        d_shuffled = _shuffle_nodes(d_original)
        d = copy.deepcopy(d_shuffled)
        dwell = json.loads(arbplf_dwell(json.dumps(d)))
        assert_equal(dwell, original_dwell)
        d = copy.deepcopy(d_shuffled)
        ll = json.loads(arbplf_ll(json.dumps(d)))
        assert_equal(ll, original_ll)
        d = copy.deepcopy(d_shuffled)
        d['site_reduction'] = {'aggregation' : 'sum'}
        em_update = json.loads(arbplf_em_update(json.dumps(d)))
        assert_equal(em_update, original_em_update)
def generic_neg_ll(get_s, X):
    s_in = get_s(X)
    s_out = arbplf_ll(s_in)
    df = pd.read_json(StringIO(s_out), orient='split', precise_float=True)
    y = -df.value.values[0]
    print(y)
    return y
def myll(d):
    """
    Provides a dict -> pandas.DataFrame wrapper of the JSON arbplf_ll.
    """
    s = arbplf_ll(json.dumps(d))
    df = pd.read_json(StringIO(s), orient='split', precise_float=True)
    return df
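# A hedged usage sketch for the wrapper above (not part of the original
# script): the input dict mirrors the tiny two-state pure-birth model from
# test_heterogeneous_edge_rates later in this collection, for which the
# expected log likelihood is -3.0. The returned DataFrame has a single
# 'value' column.
example_input = {
    "model_and_data": {
        "edges": [[0, 1], [1, 2]],
        "edge_rate_coefficients": [1, 2],
        "rate_matrix": [[0, 1], [0, 0]],
        "probability_array": [[[1, 0], [1, 1], [1, 0]]]},
    "site_reduction": {"aggregation": "only"}}
df = myll(example_input)
print(df.value.values[0])  # expected to print -3.0 for this model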
def summarize(d):
    print('newton delta:')
    print(arbplf_newton_delta(json.dumps(d)))
    print('deriv:')
    print(arbplf_deriv(json.dumps(d)))
    print('ll:')
    print(arbplf_ll(json.dumps(d)))
def test_heterogeneous_edge_rates():
    # try changing one of the edge rate coefficients
    d = {
        "model_and_data": {
            "edges": [[0, 1], [1, 2]],
            "edge_rate_coefficients": [1, 2],
            "rate_matrix": [[0, 1], [0, 0]],
            "probability_array": [[[1, 0], [1, 1], [1, 0]]]},
        "site_reduction": {"aggregation": "only"}}
    actual_marginal = json.loads(arbplf_marginal(json.dumps(d)))
    assert_equal(actual_marginal, desired_marginal)
    g = copy.deepcopy(d)
    g['trans_reduction'] = dict(selection=[[0, 1], [1, 0]])
    actual_trans = json.loads(arbplf_trans(json.dumps(g)))
    assert_equal(actual_trans, desired_trans)
    actual_ll = json.loads(arbplf_ll(json.dumps(d)))
    desired_ll = {"columns": ["value"], "data": [[-3.0]]}
    assert_equal(actual_ll, desired_ll)
    actual_em_update = json.loads(arbplf_em_update(json.dumps(d)))
    assert_equal(actual_em_update, desired_em_update)
    actual_dwell = json.loads(arbplf_dwell(json.dumps(d)))
    assert_equal(actual_dwell, desired_dwell)
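# A quick sanity check of the -3.0 value asserted above (a sketch, not part of
# the original test; it assumes the probability_array row at the root acts as
# the root prior when no explicit root_prior is given): with rate matrix
# [[0, 1], [0, 0]] the exit rate from state 0 is 1, both the root and the tip
# are pinned to state 0, and the two edges have scaled lengths 1 and 2, so the
# likelihood is exp(-1) * exp(-2) = exp(-3) and the log likelihood is -3.
from math import exp, log
assert abs(log(exp(-1) * exp(-2)) - (-3.0)) < 1e-12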
def test_edges_are_not_preordered():
    # Try switching the order of the edges in the input
    # and increasing the birth rate in the rate matrix.
    d = {
        "model_and_data": {
            "edges": [[1, 2], [0, 1]],
            "edge_rate_coefficients": [1, 2],
            "rate_matrix": [[0, 2], [0, 0]],
            "probability_array": [[[1, 0], [1, 1], [1, 0]]]},
        "site_reduction": {"aggregation": "only"}}
    actual_marginal = json.loads(arbplf_marginal(json.dumps(d)))
    assert_equal(actual_marginal, desired_marginal)
    g = copy.deepcopy(d)
    g['trans_reduction'] = dict(selection=[[0, 1], [1, 0]])
    actual_trans = json.loads(arbplf_trans(json.dumps(g)))
    assert_equal(actual_trans, desired_trans)
    actual_ll = json.loads(arbplf_ll(json.dumps(d)))
    desired_ll = {"columns": ["value"], "data": [[-6.0]]}
    assert_equal(actual_ll, desired_ll)
    actual_em_update = json.loads(arbplf_em_update(json.dumps(d)))
    assert_equal(actual_em_update, desired_em_update)
    actual_dwell = json.loads(arbplf_dwell(json.dumps(d)))
    assert_equal(actual_dwell, desired_dwell)
def test_truncated_ll():
    d = copy.deepcopy(D)
    d['model_and_data']['probability_array'][0][-1] = [0, 1]
    s = arbplf_ll(json.dumps(d))
    df = pd.read_json(StringIO(s), orient='split', precise_float=True)
    actual = df.value.values[0]
    # compute the desired closed form solution
    T = sum(rates)
    desired = log(1 - exp(-T))
    # compare actual and desired result
    assert_allclose(actual, desired)
def test_simplified_felsenstein_fig_16_4_example():
    x = {
        "model_and_data" : {
            "edges" : [[5, 0], [5, 1], [5, 6], [6, 2], [6, 7], [7, 3], [7, 4]],
            "edge_rate_coefficients" : [0.01, 0.2, 0.15, 0.3, 0.05, 0.3, 0.02],
            "rate_matrix" : [
                [0, 3, 3, 3],
                [3, 0, 3, 3],
                [3, 3, 0, 3],
                [3, 3, 3, 0]],
            "probability_array" : [[
                [1, 0, 0, 0],
                [0, 1, 0, 0],
                [0, 1, 0, 0],
                [0, 1, 0, 0],
                [0, 0, 1, 0],
                [0.25, 0.25, 0.25, 0.25],
                [1, 1, 1, 1],
                [1, 1, 1, 1]]]},
        "site_reduction" : {
            "aggregation" : "sum"}}
    arbplf_ll(json.dumps(x))
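# The test above only exercises arbplf_ll and discards its output. A minimal
# sketch (not part of the original test) of how the returned JSON could be
# inspected inside the test, reusing the dict x and the pandas pattern from
# the other snippets in this collection (pd and StringIO assumed imported):
#     s = arbplf_ll(json.dumps(x))
#     df = pd.read_json(StringIO(s), orient='split', precise_float=True)
#     print(df.value.values)  # one summed log likelihood, per the "sum" site_reduction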
def objective(X):
    # 7 branch length parameters
    # 3 stationary distribution parameters
    # 5 symmetric GTR parameters
    # 1 gamma rate mixture shape parameter
    print('in:', X)
    #edge_rate_coefficients = np.exp(X[:7]).tolist()
    edge_rate_coefficients = X[:7].tolist()
    p = expit(X[7:7+3])
    q = expit(-X[7:7+3])
    root_prior = np.zeros(4)
    root_prior[0] = p[0] * p[1]
    root_prior[1] = p[0] * q[1]
    root_prior[2] = q[0] * p[2]
    root_prior[3] = q[0] * q[2]
    a, b, c, d, e = np.exp(X[7+3:7+3+5])
    rate_matrix = np.array([
        [0, a, b, c],
        [a, 0, d, e],
        [b, d, 0, 1],
        [c, e, 1, 0]])
    rate_matrix = (rate_matrix * root_prior).tolist()
    root_prior = root_prior.tolist()
    gamma_shape = np.exp(X[-1])
    rate_mixture = dict(
        prior = [1/rate_category_count] * rate_category_count,
        rates = discretized_gamma(rate_category_count, gamma_shape))
    model_and_data = dict(
        edges = [[5, 0], [5, 1], [6, 5], [6, 2], [7, 6], [7, 3], [7, 4]],
        edge_rate_coefficients = edge_rate_coefficients,
        root_prior = root_prior,
        rate_matrix = rate_matrix,
        rate_mixture = rate_mixture,
        probability_array = probability_array)
    d = dict(
        model_and_data = model_and_data,
        site_reduction = dict(aggregation='sum'))
    s = arbplf_ll(json.dumps(d))
    df = pd.read_json(StringIO(s), orient='split', precise_float=True)
    log_likelihood = df.values[0, 0]
    y = -log_likelihood
    print('out:', y)
    return y
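# A sanity check of the root prior parameterization above (a sketch, not part
# of the original objective): since expit(t) + expit(-t) == 1,
#   p[0]*p[1] + p[0]*q[1] + q[0]*p[2] + q[0]*q[2]
#     = p[0]*(p[1] + q[1]) + q[0]*(p[2] + q[2]) = p[0] + q[0] = 1,
# so the four entries form a proper distribution for any unconstrained input.
import numpy as np
from scipy.special import expit
t = np.array([0.3, -1.2, 2.0])  # arbitrary unconstrained values
p, q = expit(t), expit(-t)
prior = np.array([p[0]*p[1], p[0]*q[1], q[0]*p[2], q[0]*q[2]])
assert np.isclose(prior.sum(), 1.0)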
def generic_objective(get_s, X):
    s_in = get_s(X)
    neg_ll = partial(generic_neg_ll, get_s)
    # compute negative log likelihood
    s_out = arbplf_ll(s_in)
    df = pd.read_json(StringIO(s_out), orient='split', precise_float=True)
    y = -df.value.values[0]
    # explicitly compute derivatives with respect to edge rate coefficients
    s_out = arbplf_deriv(s_in)
    df = pd.read_json(StringIO(s_out), orient='split', precise_float=True)
    d = -df.set_index('edge').value.values
    edge_count = len(_edges)
    D = np.concatenate((d, [0]*3))
    request = np.array([0]*edge_count + [1]*3)
    _finite_diffs(neg_ll, request, D, X, y)
    print(y, D)
    return y, D
def main():
    xs = np.linspace(1e-5, 1, 100)
    ts = -2 * np.log(xs)
    arr = []
    for i, t in enumerate(ts):
        s = arbplf_ll(get_json_input(t))
        df = pd.read_json(StringIO(s), orient='split', precise_float=True)
        arr.append(df.value.values[0])
    lines = plt.plot(xs, arr, 'blue')
    plt.ylabel("log likelihood")
    plt.xlabel("x = exp(-0.5 t)")
    plt.savefig('out00.svg', transparent=True)
    # local optima
    for i, t in enumerate((0.1, 6.0)):
        s = arbplf_newton_refine(get_json_input(t))
        df = pd.read_json(StringIO(s), orient='split', precise_float=True)
        u = df.value.values[0]
        print('local optimum', i, ':')
        print('  initial guess:', t)
        print('  refined isolated interior local optimum:')
        print('    t = {:.16}'.format(u))
        print('    x =', np.exp(-0.5 * u))
def myll(d):
    s = arbplf_ll(json.dumps(d))
    df = pd.read_json(StringIO(s), orient="split", precise_float=True)
    return df
def run(assumed_kappa):
    state_count = 4
    node_count = 5
    true_kappa = 4
    assumed_m, assumed_denom = get_rate_matrix(assumed_kappa)
    true_m, true_denom = get_rate_matrix(true_kappa)
    edges = [[0, 2], [0, 1], [1, 3], [1, 4]]
    assumed_coeffs = [28, 21, 12, 9]
    true_coeffs = [30, 20, 10, 10]

    # There are five nodes.
    # Three of them have unobserved states.
    # Use one site for each of the 4^3 = 64 possible observations.
    X = [-1]
    U = range(4)
    all_site_patterns = list(itertools.product(X, X, U, U, U))
    prior_array = [[
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]]]
    probability_array = []
    for pattern in all_site_patterns:
        arr = []
        for i, p in enumerate(pattern):
            if p == -1:
                row = [1] * state_count
            else:
                row = [0] * state_count
                row[p] = 1
            arr.append(row)
        probability_array.append(arr)
    model_and_data = {
        "edges": edges,
        "edge_rate_coefficients": true_coeffs,
        "root_prior": "equilibrium_distribution",
        "rate_matrix": true_m,
        "rate_divisor": true_denom * 100,
        "probability_array": probability_array}
    d = {"model_and_data": model_and_data}
    s = arbplf_ll(json.dumps(d))
    df = pd.read_json(StringIO(s), orient='split', precise_float=True)
    log_likelihoods = df.value.values
    print('log likelihood sum:', sum(log_likelihoods))

    # compute ts and tv using the likelihoods as observation weights
    weights = [math.exp(ll) for ll in log_likelihoods]
    total = sum(weights)
    weights = [w / total for w in weights]
    ts_pairs, tv_pairs = get_ts_tv_pairs()
    model_and_data = {
        "edges": edges,
        "edge_rate_coefficients": assumed_coeffs,
        "root_prior": "equilibrium_distribution",
        "rate_matrix": assumed_m,
        "rate_divisor": assumed_denom * 100,
        "probability_array": probability_array}
    d = {
        "model_and_data": model_and_data,
        "site_reduction": {"aggregation": weights},
        "edge_reduction": {"aggregation": "sum"},
        "trans_reduction": {"aggregation": "sum"}}

    d['trans_reduction']['selection'] = ts_pairs
    d['trans_reduction']['aggregation'] = [1000] * len(ts_pairs)
    d['site_reduction']['aggregation'] = "sum"
    d['model_and_data']['probability_array'] = prior_array
    s = arbplf_trans(json.dumps(d))
    df = pd.read_json(StringIO(s), orient='split', precise_float=True)
    print("prior ts expectation:")
    print(df.value.values[0])
    print(s)

    d['trans_reduction']['selection'] = ts_pairs
    d['trans_reduction']['aggregation'] = [1000] * len(ts_pairs)
    d['site_reduction']['aggregation'] = weights
    d['model_and_data']['probability_array'] = probability_array
    s = arbplf_trans(json.dumps(d))
    df = pd.read_json(StringIO(s), orient='split', precise_float=True)
    print("conditional ts expectation:")
    print(df.value.values[0])
    print(s)

    d['trans_reduction']['selection'] = tv_pairs
    d['trans_reduction']['aggregation'] = [1000] * len(tv_pairs)
    d['site_reduction']['aggregation'] = "sum"
    d['model_and_data']['probability_array'] = prior_array
    s = arbplf_trans(json.dumps(d))
    df = pd.read_json(StringIO(s), orient='split', precise_float=True)
    print("prior tv expectation:")
    print(df.value.values[0])
    print(s)

    d['trans_reduction']['selection'] = tv_pairs
    d['trans_reduction']['aggregation'] = [1000] * len(tv_pairs)
    d['site_reduction']['aggregation'] = weights
    d['model_and_data']['probability_array'] = probability_array
    s = arbplf_trans(json.dumps(d))
    df = pd.read_json(StringIO(s), orient='split', precise_float=True)
    print("conditional tv expectation:")
    print(df.value.values[0])
    print(s)
AGAACTGCTAACTCACTACCCATGTATAACAACATGGCTTTCTCAACTTTTAAAGGATA
ACAGCTATCCATTGGTCTTAGGACCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATA
GCAATGTACACCACCATAGCCATTCTAACGCTAACCTCCCTAATTCCCCCCATTACAGCC
ACCCTTATTAACCCCAATAAAAAGAACTTATACCCGCACTACGTAAAAATGACCATTGCC
TCTACCTTTATAATCAGCCTATTTCCCACAATAATATTCATGTGCACAGACCAAGAAACC
ATTATTTCAAACTGACACTGAACTGCAACCCAAACGCTAGAACTCTCCCTAAGCTT
""")

def elem(i):
    x = [0]*4
    x[i] = 1
    return x

probability_array = []
sequences = [''.join(s.split()) for s in brown_nuc_sequences]
state_map = dict(zip('TCAG', (0, 1, 2, 3)))
for column in zip(*sequences):
    rows = []
    for nt in column:
        rows.append(elem(state_map[nt]))
    # 3 internal nodes
    for i in range(3):
        rows.append([1, 1, 1, 1])
    probability_array.append(rows)
model_and_data['probability_array'] = probability_array
d = dict(model_and_data=model_and_data, site_reduction=dict(aggregation='sum'))
s = arbplf_ll(json.dumps(d))
df = pd.read_json(StringIO(s), orient='split', precise_float=True)
print(df)
def test_ok():
    arbplf_ll(json.dumps(good_input))
def myll(d):
    s = arbplf_ll(json.dumps(d))
    df = pd.read_json(StringIO(s), orient='split', precise_float=True)
    return df
def test_ok_reduction_deleted():
    x = copy.deepcopy(good_input)
    del x['site_reduction']
    arbplf_ll(json.dumps(x))
def run():
    state_count = 4
    edge_count = 5
    node_count = edge_count + 1

    # Define the tree used in the phyl transition mapping example.
    edges = [[4, 0], [4, 1], [5, 4], [5, 2], [5, 3]]
    inference_rates = [0.001, 0.002, 0.008, 0.01, 0.1]
    simulation_rates = [0.001 * (9 / 20), 0.002, 0.008, 0.01, 0.1]

    """
    # Define the poisson rate matrix with expected exit rate 1
    rate_divisor = 3
    rate_matrix = [
        [0, 1, 1, 1],
        [1, 0, 1, 1],
        [1, 1, 0, 1],
        [1, 1, 1, 0]]
    """

    # use a GTR rate matrix
    a, b, c, d, e, pA, pC, pG, pT = (
        1, 0.2, 0.3, 0.4, 0.4, 0.1, 0.35, 0.35, 0.2)
    rate_matrix = make_rate_matrix(a, b, c, d, e, pA, pC, pG, pT)

    # Use one site for each of the 4^4 = 256 possible observations.
    X = [-1]
    U = range(4)
    all_site_patterns = list(itertools.product(U, U, U, U, X, X))
    prior_array = [[
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]]]
    probability_array = []
    for pattern in all_site_patterns:
        arr = []
        for i, p in enumerate(pattern):
            if p == -1:
                row = [1]*state_count
            else:
                row = [0]*state_count
                row[p] = 1
            arr.append(row)
        probability_array.append(arr)
    model_and_data = {
        "edges" : edges,
        "edge_rate_coefficients" : simulation_rates,
        "rate_divisor" : "equilibrium_exit_rate",
        "root_prior" : "equilibrium_distribution",
        "rate_matrix" : rate_matrix,
        "probability_array" : probability_array}
    d = {"model_and_data" : model_and_data}
    s = json.dumps(d)
    s = arbplf_ll(s)
    df = pd.read_json(StringIO(s), orient='split', precise_float=True)
    log_likelihoods = df.value.values

    # compute expectations using the likelihoods as observation weights
    weights = [math.exp(ll) for ll in log_likelihoods]
    total = sum(weights)
    weights = [(20000 * w) / total for w in weights]
    model_and_data = {
        "edges" : edges,
        "edge_rate_coefficients" : inference_rates,
        "rate_divisor" : "equilibrium_exit_rate",
        "root_prior" : "equilibrium_distribution",
        "rate_matrix" : rate_matrix,
        "probability_array" : probability_array}
    d = {
        "model_and_data" : model_and_data,
        "site_reduction" : {"aggregation" : weights},
        "trans_reduction" : {"aggregation" : "sum"}}

    d['model_and_data']['probability_array'] = prior_array
    d['trans_reduction']['selection'] = [
        [i, j] for i in range(4) for j in range(4) if i != j]
    d['site_reduction'] = {"aggregation" : "sum"}
    s = arbplf_trans(json.dumps(d))
    df = pd.read_json(StringIO(s), orient='split', precise_float=True)
    print("prior expectation:")
    print(20000 * df.value.values)

    d['model_and_data']['probability_array'] = probability_array
    d['trans_reduction']['selection'] = [
        [i, j] for i in range(4) for j in range(4) if i != j]
    d['site_reduction'] = {"aggregation" : weights}
    s = arbplf_trans(json.dumps(d))
    df = pd.read_json(StringIO(s), orient='split', precise_float=True)
    print("conditional expectation:")
    print(df.value.values)
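# make_rate_matrix is defined elsewhere in the original script. A hypothetical
# construction, consistent with the explicit GTR matrix built in the
# objective() snippet earlier in this collection (exchangeabilities a..e with
# the last pair fixed to 1, columns scaled by the stationary probabilities),
# might look like this sketch; it is an assumption, not the library's helper.
def make_rate_matrix_sketch(a, b, c, d, e, pA, pC, pG, pT):
    exch = [
        [0, a, b, c],
        [a, 0, d, e],
        [b, d, 0, 1],
        [c, e, 1, 0]]
    pi = [pA, pC, pG, pT]
    return [[exch[i][j] * pi[j] for j in range(4)] for i in range(4)]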
def test_ok_reduction_empty():
    x = copy.deepcopy(good_input)
    x['site_reduction'] = {}
    arbplf_ll(json.dumps(x))
def test_ok_reduction_selection_aggregation():
    x = copy.deepcopy(good_input)
    x['site_reduction'] = {"selection" : [0], "aggregation" : "sum"}
    arbplf_ll(json.dumps(x))
def test_ok_reduction_avg_aggregation():
    x = copy.deepcopy(good_input)
    x['site_reduction'] = {"aggregation" : "avg"}
    arbplf_ll(json.dumps(x))