def hard_coded_analysis():
    """
    Print an R table of distance estimates from simulated sequences.

    All settings are fixed in the function body.
    Each replicate samples sequence changes at three levels of
    informativeness and turns them into one distance estimate per level.
    """
    branch_length = 5.0
    sequence_length = 1000
    nsequences = 1000
    column_headers = ('most.info', 'less.info', 'least.info')
    # one estimate triple per replicate; the sampled changes are
    # unpacked as the positional arguments of sample_distance
    estimate_triple_list = [
            sample_distance(*sample_sequence_changes(
                branch_length, sequence_length))
            for _ in range(nsequences)]
    # a single parenthesized expression prints exactly like the
    # bare print statement
    print(RUtil.get_table_string(estimate_triple_list, column_headers))
def get_table_string_and_scripts_from_logs(
        start_stop_pairs, log_paths, nsamples):
    """
    Summarize per-segment log files as an R table plus ggplot2 scripts.

    This is for analysis of remote execution.
    Each (start, stop) pair is matched with the log path at the same
    index.  A table row holds the sequence length, the segment midpoint
    and, for each of the three value lists read from the log, the lower
    bound from mcmc.get_hpd_interval(0.95, ...), the mean attribute of
    the mcmc.Correlation analysis and the upper bound.
    Returns the pair (table_string, scripts).
    """
    rows = []
    lengths = []
    centers = []
    for (start_pos, stop_pos), log_path in zip(start_stop_pairs, log_paths):
        length = stop_pos - start_pos + 1
        means, variations, covs = read_log(log_path, nsamples)
        center = (start_pos + stop_pos) / 2.0
        row = [length, center]
        for values in (means, variations, covs):
            analysis = mcmc.Correlation()
            analysis.analyze(values)
            low, high = mcmc.get_hpd_interval(0.95, values)
            row += [low, analysis.mean, high]
        rows.append(row)
        lengths.append(length)
        centers.append(center)
    # the table is built before the scripts
    table_string = RUtil.get_table_string(rows, g_headers)
    scripts = get_ggplot2_scripts(nsamples, lengths, centers)
    return table_string, scripts
def get_response_content(fs):
    """
    Plot mutual information over a grid of log probability ratios.

    The grid has 101 evenly spaced points from fs.x_min to fs.x_max.
    At each point an evozoo.AlternatingHypercube_d_1(3) process supplies
    the distribution and rate matrix that are passed, together with fs.t,
    to ctmcmi.get_mutual_info_known_distn.
    Returns the image data produced by the R plotter.
    Raises RUtil.RError carrying the R stderr when the plotter
    reports a nonzero return code.
    """
    f_info = ctmcmi.get_mutual_info_known_distn
    headers = ['log.probability.ratio', 'mutual.information']
    arr = []
    for x in np.linspace(fs.x_min, fs.x_max, 101):
        # a fresh process object is built for every grid point
        proc = evozoo.AlternatingHypercube_d_1(3)
        param = np.array([x])
        distn = proc.get_distn(param)
        rate_matrix = proc.get_rate_matrix(param)
        arr.append([x, f_info(rate_matrix, distn, fs.t)])
    # arguments are evaluated left to right: table, script, device name
    plot_result = RUtil.run_plotter(
            RUtil.get_table_string(arr, headers),
            get_ggplot(),
            Form.g_imageformat_to_r_function[fs.imageformat])
    retcode, r_out, r_err, image_data = plot_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_table_string_and_scripts_from_logs(
        start_stop_pairs, log_paths, nsamples):
    """
    Build the R table string and plotting scripts from remote-run logs.

    This is for analysis of remote execution.
    start_stop_pairs and log_paths are walked in parallel; each log is
    read with read_log(log_path, nsamples) and must yield three value
    lists.  Every table row starts with the sequence length and the
    midpoint of the segment, followed by a (low, mean, high) triple for
    each value list.
    Returns the pair (table_string, scripts).
    """

    def summarize(values):
        # interval bounds around the mean of one value list
        corr_info = mcmc.Correlation()
        corr_info.analyze(values)
        hpd_low, hpd_high = mcmc.get_hpd_interval(0.95, values)
        return [hpd_low, corr_info.mean, hpd_high]

    data_arr, sequence_lengths, midpoints = [], [], []
    for start_stop_pair, log_path in zip(start_stop_pairs, log_paths):
        start_pos, stop_pos = start_stop_pair
        sequence_length = stop_pos - start_pos + 1
        means, variations, covs = read_log(log_path, nsamples)
        midpoint = (start_pos + stop_pos) / 2.0
        row = [sequence_length, midpoint]
        for values in (means, variations, covs):
            row.extend(summarize(values))
        data_arr.append(row)
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    table_string = RUtil.get_table_string(data_arr, g_headers)
    scripts = get_ggplot2_scripts(nsamples, sequence_lengths, midpoints)
    return table_string, scripts
def get_table_string_and_scripts_par(start_stop_pairs, nsamples):
    """
    Build the R table string and scripts using a pool of worker processes.

    Local command-line multi-process only.
    Each (start, stop) pair is sent, together with nsamples, to
    forked_function in a pool of four worker processes.  Each worker
    result is iterated as (corr_info, hpd_interval) pairs, and every
    pair contributes (hpd_low, corr_info.mean, hpd_high) to the row
    that begins with the sequence length and the segment midpoint.
    Returns the pair (table_string, scripts).
    """
    # materialize the pairs because they are traversed twice below
    start_stop_pairs = list(start_stop_pairs)
    start_stop_n_triples = [(a, b, nsamples) for a, b in start_stop_pairs]
    # the pool size is fixed at four processes
    mypool = Pool(processes=4)
    try:
        post_pairs_list = mypool.map(forked_function, start_stop_n_triples)
    finally:
        # always release the worker processes, even if a task failed;
        # close() stops new submissions and join() waits for the exit
        mypool.close()
        mypool.join()
    # build the array for the R table
    data_arr = []
    sequence_lengths = []
    midpoints = []
    for (start_pos, stop_pos), post_pairs in zip(
            start_stop_pairs, post_pairs_list):
        sequence_length = stop_pos - start_pos + 1
        midpoint = (start_pos + stop_pos) / 2.0
        row = [sequence_length, midpoint]
        for corr_info, (hpd_low, hpd_high) in post_pairs:
            row.extend([hpd_low, corr_info.mean, hpd_high])
        data_arr.append(row)
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    # build the table string
    table_string = RUtil.get_table_string(data_arr, g_headers)
    # get the scripts
    scripts = get_ggplot2_scripts(nsamples, sequence_lengths, midpoints)
    return table_string, scripts
def get_table_string_and_scripts(start_stop_pairs, nsamples):
    """
    Build the R table string and ggplot2 scripts for sequence segments.

    Command-line only.
    For every (start, stop) pair three value lists are obtained from
    get_value_lists.  A table row holds the sequence length, the segment
    midpoint and, for each value list, the lower bound from
    mcmc.get_hpd_interval(0.95, ...), the mean attribute of the
    mcmc.Correlation analysis and the upper bound.
    Returns the pair (table_string, scripts).
    """
    rows = []
    lengths = []
    centers = []
    for start_pos, stop_pos in start_stop_pairs:
        length = stop_pos - start_pos + 1
        means, variations, covs = get_value_lists(
                start_pos, stop_pos, nsamples)
        center = (start_pos + stop_pos) / 2.0
        row = [length, center]
        for values in (means, variations, covs):
            analysis = mcmc.Correlation()
            analysis.analyze(values)
            low, high = mcmc.get_hpd_interval(0.95, values)
            row += [low, analysis.mean, high]
        rows.append(row)
        lengths.append(length)
        centers.append(center)
    # the table is built before the scripts
    table_string = RUtil.get_table_string(rows, g_headers)
    scripts = get_ggplot2_scripts(nsamples, lengths, centers)
    return table_string, scripts
def get_response_content(fs):
    """
    Plot Fisher information over a grid of log probability ratios.

    The grid has 101 evenly spaced points from fs.x_min to fs.x_max.
    At each point an evozoo.DistinguishedCornerPairHypercube_d_1(3)
    process supplies the distribution and rate matrix that are passed,
    together with fs.t, to divtime.get_fisher_info_known_distn_fast.
    Returns the image data produced by the R plotter.
    Raises ValueError when fs.x_max does not exceed fs.x_min, and
    RUtil.RError carrying the R stderr when the plotter reports a
    nonzero return code.
    """
    # the upper limit must be strictly greater than the lower limit
    if not (fs.x_max > fs.x_min):
        raise ValueError('check the min and max logs')
    f_info = divtime.get_fisher_info_known_distn_fast
    headers = ['log.probability.ratio', 'fisher.information']
    arr = []
    for x in np.linspace(fs.x_min, fs.x_max, 101):
        # a fresh process object is built for every grid point
        proc = evozoo.DistinguishedCornerPairHypercube_d_1(3)
        param = np.array([x])
        distn = proc.get_distn(param)
        rate_matrix = proc.get_rate_matrix(param)
        arr.append([x, f_info(rate_matrix, distn, fs.t)])
    # arguments are evaluated left to right: table, script, device name
    plot_result = RUtil.run_plotter(
            RUtil.get_table_string(arr, headers),
            get_ggplot(),
            Form.g_imageformat_to_r_function[fs.imageformat])
    retcode, r_out, r_err, image_data = plot_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_latex_documentbody(fs):
    """
    Return a latex document body made of tikz plots generated by R.

    This is obsolete because pure R output is used now.
    One tikz piece is produced per statistic column; the pieces are
    written one after another, each followed by a newline.
    The R stderr is printed even when R succeeds.
    Raises RUtil.RError carrying the R stderr when R reports a
    nonzero return code.
    """
    Q_mut, Q_sels = get_qmut_qsels(fs)
    # compute the statistics, one row per selection sample
    ER_ratios, NSR_ratios, ER_NSR_ratios = get_statistic_ratios(Q_mut, Q_sels)
    M = zip(ER_ratios, NSR_ratios, ER_NSR_ratios)
    column_headers = ('ER.ratio', 'NSR.ratio', 'ER.times.NSR.ratio')
    table_string = RUtil.get_table_string(M, column_headers)
    nsels = len(Q_sels)
    # one R script per column
    scripts = [get_r_tikz_script(nsels, name) for name in column_headers]
    plot_result = RUtil.run_plotter_multiple_scripts(
            table_string, scripts, 'tikz', width=3, height=2)
    retcode, r_out, r_err, tikz_code_list = plot_result
    if retcode:
        raise RUtil.RError(r_err)
    # show some timings
    print('R did not fail, but here is its stderr:')
    print(r_err)
    # the latex code consists mainly of a bunch of tikz plots
    out = StringIO()
    for tikz_code in tikz_code_list:
        print >> out, tikz_code
    return out.getvalue()
def get_response_content(fs):
    """
    Plot mutual information and an analogous statistic over time.

    The two matrices from get_input_matrices(fs) are evaluated at 100
    evenly spaced times from 0.0 to 5.0 with
    ctmcmi.get_mutual_information and ctmcmi.get_ll_ratio_wrong.
    Returns the image data produced by the R plotter.
    Raises RUtil.RError carrying the R stderr when the plotter
    reports a nonzero return code.
    """
    M, R = get_input_matrices(fs)
    headers = [
            't',
            'mi.true.mut', 'mi.true.mutsel',
            'mi.analog.mut', 'mi.analog.mutsel']
    npoints = 100
    t_low = 0.0
    t_high = 5.0
    t_incr = (t_high - t_low) / (npoints - 1)
    arr = []
    for i in range(npoints):
        t = t_low + t_incr * i
        # list items are evaluated left to right, in header order
        arr.append([
            t,
            ctmcmi.get_mutual_information(M, t),
            ctmcmi.get_mutual_information(R, t),
            ctmcmi.get_ll_ratio_wrong(M, t),
            ctmcmi.get_ll_ratio_wrong(R, t)])
    # arguments are evaluated left to right: table, script, device name
    plot_result = RUtil.run_plotter(
            RUtil.get_table_string(arr, headers),
            get_ggplot(),
            Form.g_imageformat_to_r_function[fs.imageformat])
    retcode, r_out, r_err, image_data = plot_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_table_string_and_scripts(stop_positions, nsamples):
    """
    Build the R table string and ggplot2 scripts for growing segments.

    Command-line only.
    Every segment starts at position 1 and ends at one of the given
    stop positions.  A table row holds the sequence length and, for
    each of the three value lists from get_value_lists, the lower bound
    from mcmc.get_hpd_interval(0.95, ...), the mean attribute of the
    mcmc.Correlation analysis and the upper bound.
    Returns the pair (table_string, scripts).
    """
    first = 1
    rows = []
    for last in stop_positions:
        row = [last - first + 1]
        means, variations, covs = get_value_lists(first, last, nsamples)
        for values in (means, variations, covs):
            analysis = mcmc.Correlation()
            analysis.analyze(values)
            low, high = mcmc.get_hpd_interval(0.95, values)
            row += [low, analysis.mean, high]
        rows.append(row)
    table_string = RUtil.get_table_string(rows, g_headers)
    # the lengths are recomputed from the stop positions in a second pass
    lengths = [last - first + 1 for last in stop_positions]
    scripts = get_ggplot2_scripts(lengths)
    return table_string, scripts
def get_response_content(fs):
    """
    Plot one sampled endpoint-conditioned path of mutant counts.

    The transition matrix is the product of the drift-selection matrix
    and the mutation matrix, each checked with
    MatrixUtil.assert_transition_matrix.  A path from fs.nmutants_initial
    to fs.nmutants_final over fs.ngenerations is sampled and tabulated
    as (generation, number of mutants) rows.
    Returns the image data produced by the R plotter.
    Raises RUtil.RError carrying the R stderr when the plotter
    reports a nonzero return code.
    """
    # drift and selection in one step
    drift_selection = pgmsinglesite.create_drift_selection_transition_matrix(
            fs.npop, fs.selection_ratio)
    MatrixUtil.assert_transition_matrix(drift_selection)
    # mutation in one step
    mutation = pgmsinglesite.create_mutation_transition_matrix(
            fs.npop, fs.mutation_ab, fs.mutation_ba)
    MatrixUtil.assert_transition_matrix(mutation)
    headers = ['generation', 'number.of.mutants']
    P = np.dot(drift_selection, mutation)
    mypath = PathSampler.sample_endpoint_conditioned_path(
            fs.nmutants_initial, fs.nmutants_final, fs.ngenerations, P)
    # one row per generation index
    arr = [list(pair) for pair in enumerate(mypath)]
    # arguments are evaluated left to right: table, script, device name
    plot_result = RUtil.run_plotter(
            RUtil.get_table_string(arr, headers),
            get_ggplot(),
            Form.g_imageformat_to_r_function[fs.imageformat])
    retcode, r_out, r_err, image_data = plot_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_table_string_and_scripts(start_stop_pairs, nsamples):
    """
    Return the R table string and plotting scripts for the segments.

    Command-line only.
    get_value_lists(start, stop, nsamples) must yield three value lists
    for every (start, stop) pair.  Every table row starts with the
    sequence length and the midpoint of the segment, followed by a
    (low, mean, high) triple for each value list.
    Returns the pair (table_string, scripts).
    """

    def summarize(values):
        # interval bounds around the mean of one value list
        corr_info = mcmc.Correlation()
        corr_info.analyze(values)
        hpd_low, hpd_high = mcmc.get_hpd_interval(0.95, values)
        return [hpd_low, corr_info.mean, hpd_high]

    data_arr, sequence_lengths, midpoints = [], [], []
    for start_pos, stop_pos in start_stop_pairs:
        sequence_length = stop_pos - start_pos + 1
        means, variations, covs = get_value_lists(
                start_pos, stop_pos, nsamples)
        midpoint = (start_pos + stop_pos) / 2.0
        row = [sequence_length, midpoint]
        for values in (means, variations, covs):
            row.extend(summarize(values))
        data_arr.append(row)
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    table_string = RUtil.get_table_string(data_arr, g_headers)
    scripts = get_ggplot2_scripts(nsamples, sequence_lengths, midpoints)
    return table_string, scripts
def get_response_content(fs):
    """
    Plot four information curves for the two input matrices.

    The matrices come from get_input_matrices(fs).  Each is evaluated
    with ctmcmi.get_mutual_information and ctmcmi.get_ll_ratio_wrong on
    a grid of 100 evenly spaced times between 0.0 and 5.0.
    Returns the image data produced by the R plotter.
    Raises RUtil.RError carrying the R stderr when the plotter
    reports a nonzero return code.
    """
    M, R = get_input_matrices(fs)
    headers = [
            't',
            'mi.true.mut', 'mi.true.mutsel',
            'mi.analog.mut', 'mi.analog.mutsel']
    # the time grid
    npoints = 100
    t_low = 0.0
    t_high = 5.0
    t_incr = (t_high - t_low) / (npoints - 1)
    t_values = [t_low + t_incr * i for i in range(npoints)]
    # (function, matrix) pairs in the column order of the headers
    evaluations = (
            (ctmcmi.get_mutual_information, M),
            (ctmcmi.get_mutual_information, R),
            (ctmcmi.get_ll_ratio_wrong, M),
            (ctmcmi.get_ll_ratio_wrong, R))
    arr = []
    for t in t_values:
        arr.append([t] + [f(matrix, t) for f, matrix in evaluations])
    table_string = RUtil.get_table_string(arr, headers)
    script = get_ggplot()
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_table_string_and_scripts(stop_positions, nsamples):
    """
    Return the R table string and plotting scripts for growing segments.

    Command-line only.
    All segments begin at position 1.  get_value_lists must yield three
    value lists per segment; each contributes a (low, mean, high)
    triple to the row that begins with the sequence length.
    Returns the pair (table_string, scripts).
    """

    def summarize(values):
        # interval bounds around the mean of one value list
        corr_info = mcmc.Correlation()
        corr_info.analyze(values)
        hpd_low, hpd_high = mcmc.get_hpd_interval(0.95, values)
        return [hpd_low, corr_info.mean, hpd_high]

    start_position = 1
    data_arr = []
    for stop_position in stop_positions:
        sequence_length = stop_position - start_position + 1
        means, variations, covs = get_value_lists(
                start_position, stop_position, nsamples)
        row = [sequence_length]
        for values in (means, variations, covs):
            row.extend(summarize(values))
        data_arr.append(row)
    table_string = RUtil.get_table_string(data_arr, g_headers)
    # second pass over the stop positions, after the table is built
    sequence_lengths = [
            stop - start_position + 1 for stop in stop_positions]
    scripts = get_ggplot2_scripts(sequence_lengths)
    return table_string, scripts
def get_table_string_and_scripts(fs):
    """
    Build the R table string and R scripts for sampled selection regimes.

    fs supplies nresidues, nsites, nselections, the variance flags
    (low_var, medium_var, high_var, really_high_var), the skew flags
    (neg_skew, no_skew, pos_skew) and the time grid (t_low, t_high,
    ntimes).  For every sample a mutation-selection rate matrix is
    derived from the mutation rate matrix, then each time point is
    summarized with get_time_point_summary and the number of sign
    changes over time is counted per selection sample.
    Returns the pair (table_string, scripts).
    Raises ValueError if there are more than 256 states, or if no
    variance flag or no skew flag is set when a sample is drawn.
    """
    nstates = fs.nresidues ** fs.nsites
    if nstates > 256:
        raise ValueError('the mutation rate matrix is too big')
    # get the mutation matrix
    Q_mut = mrate.get_sparse_sequence_rate_matrix(fs.nresidues, fs.nsites)
    # sample a bunch of mutation-selection rate matrices
    Q_sels = []
    for selection_index in range(fs.nselections):
        # sample the selection parameters
        if fs.low_var:
            v = 0.2
        elif fs.medium_var:
            v = 1
        elif fs.high_var:
            v = 5.0
        elif fs.really_high_var:
            v = 25.0
        else:
            # previously this fell through and failed on an unbound name
            raise ValueError('no selection variance option was chosen')
        s = math.sqrt(v)
        if fs.neg_skew:
            sels = [-random.expovariate(1 / s) for i in range(nstates)]
        elif fs.no_skew:
            sels = [random.gauss(0, s) for i in range(nstates)]
        elif fs.pos_skew:
            sels = [random.expovariate(1 / s) for i in range(nstates)]
        else:
            raise ValueError('no selection skew option was chosen')
        # define the mutation-selection rate matrix using Halpern-Bruno
        Q = np.zeros_like(Q_mut)
        for i in range(nstates):
            for j in range(nstates):
                if i != j:
                    tau = math.exp(-(sels[j] - sels[i]))
                    denom = 1 - 1 / tau
                    # equal selection values give 0/0;
                    # the limit of the ratio as tau approaches 1 is 1
                    if denom:
                        coeff = math.log(tau) / denom
                    else:
                        coeff = 1.0
                    Q[i, j] = Q_mut[i, j] * coeff
        # the diagonal is filled last, while it is still zero,
        # so that every row sums to zero
        for i in range(nstates):
            Q[i, i] = -np.sum(Q[i])
        Q_sels.append(Q)
    # define the time points;
    # float() guards against integer division under Python 2
    incr = (fs.t_high - fs.t_low) / float(fs.ntimes - 1)
    times = [fs.t_low + i * incr for i in range(fs.ntimes)]
    # compute the statistics
    nsels = len(Q_sels)
    pairs = [get_time_point_summary(Q_mut, Q_sels, t) for t in times]
    mi_sign_lists, time_stats = zip(*pairs)
    # look at how the signs change over time for each selection sample
    ncrossing_list = []
    for signs in zip(*mi_sign_lists):
        ncrossing_list.append(sum(
            1 for sign_a, sign_b in iterutils.pairwise(signs)
            if sign_a != sign_b))
    # get the R scripts
    scripts = [
            get_r_band_script(nsels, time_stats),
            get_r_prop_script(nsels, time_stats),
            get_r_cross_script(ncrossing_list)]
    table_string = RUtil.get_table_string(time_stats, g_time_stats_headers)
    return table_string, scripts
def get_table_string_and_scripts(fs):
    """
    Build the R table string and R scripts for sampled selection regimes.

    fs supplies nresidues, nsites, nselections, the variance flags
    (low_var, medium_var, high_var, really_high_var), the skew flags
    (neg_skew, no_skew, pos_skew) and the time grid (t_low, t_high,
    ntimes).  For every sample a mutation-selection rate matrix is
    derived from the mutation rate matrix, then each time point is
    summarized with get_time_point_summary and the number of sign
    changes over time is counted per selection sample.
    Returns the pair (table_string, scripts).
    Raises ValueError if there are more than 256 states, or if no
    variance flag or no skew flag is set when a sample is drawn.
    """
    nstates = fs.nresidues ** fs.nsites
    if nstates > 256:
        raise ValueError('the mutation rate matrix is too big')
    # get the mutation matrix
    Q_mut = mrate.get_sparse_sequence_rate_matrix(fs.nresidues, fs.nsites)
    # sample a bunch of mutation-selection rate matrices
    Q_sels = []
    for selection_index in range(fs.nselections):
        # sample the selection parameters
        if fs.low_var:
            v = 0.2
        elif fs.medium_var:
            v = 1
        elif fs.high_var:
            v = 5.0
        elif fs.really_high_var:
            v = 25.0
        else:
            # previously this fell through and failed on an unbound name
            raise ValueError('no selection variance option was chosen')
        s = math.sqrt(v)
        if fs.neg_skew:
            sels = [-random.expovariate(1/s) for i in range(nstates)]
        elif fs.no_skew:
            sels = [random.gauss(0, s) for i in range(nstates)]
        elif fs.pos_skew:
            sels = [random.expovariate(1/s) for i in range(nstates)]
        else:
            raise ValueError('no selection skew option was chosen')
        # define the mutation-selection rate matrix using Halpern-Bruno
        Q = np.zeros_like(Q_mut)
        for i in range(nstates):
            for j in range(nstates):
                if i != j:
                    tau = math.exp(-(sels[j] - sels[i]))
                    denom = 1 - 1/tau
                    # equal selection values give 0/0;
                    # the limit of the ratio as tau approaches 1 is 1
                    if denom:
                        coeff = math.log(tau) / denom
                    else:
                        coeff = 1.0
                    Q[i, j] = Q_mut[i, j] * coeff
        # the diagonal is filled last, while it is still zero,
        # so that every row sums to zero
        for i in range(nstates):
            Q[i, i] = -np.sum(Q[i])
        Q_sels.append(Q)
    # define the time points;
    # float() guards against integer division under Python 2
    incr = (fs.t_high - fs.t_low) / float(fs.ntimes - 1)
    times = [fs.t_low + i*incr for i in range(fs.ntimes)]
    # compute the statistics
    nsels = len(Q_sels)
    pairs = [get_time_point_summary(Q_mut, Q_sels, t) for t in times]
    mi_sign_lists, time_stats = zip(*pairs)
    # look at how the signs change over time for each selection sample
    ncrossing_list = []
    for signs in zip(*mi_sign_lists):
        ncrossing_list.append(sum(
            1 for sign_a, sign_b in iterutils.pairwise(signs)
            if sign_a != sign_b))
    # get the R scripts
    scripts = [
            get_r_band_script(nsels, time_stats),
            get_r_prop_script(nsels, time_stats),
            get_r_cross_script(ncrossing_list)]
    table_string = RUtil.get_table_string(time_stats, g_time_stats_headers)
    return table_string, scripts
def get_response_content(fs):
    """
    Plot an optimized information value over time for selected processes.

    A process from g_process_triples is included when the attribute of
    fs named after it is true.  For each time produced by the throttled
    progrid.gen_binary(fs.start_time, fs.stop_time) generator and each
    process, the OptDep objective built on
    divtime.get_fisher_info_known_distn_fast is minimized with
    scipy.optimize.fmin from a random start, and the negated objective
    at the returned point is recorded.  Processes whose get_df() is
    false are evaluated at an empty parameter array instead.
    Returns the image data produced by the R plotter.
    Raises ValueError when no process is selected, and RUtil.RError
    carrying the R stderr when the plotter reports a nonzero return code.
    """
    f_info = divtime.get_fisher_info_known_distn_fast
    requested_triples = [
            (name, desc, zoo_class)
            for name, desc, zoo_class in g_process_triples
            if getattr(fs, name)]
    if not requested_triples:
        raise ValueError('nothing to plot')
    # R column names use dots where the python names use underscores
    headers = ['t']
    headers += [name.replace('_', '.') for name, _, _ in requested_triples]
    # Spend a lot of time doing the optimizations
    # to construct the points for the R table.
    time_points = cbreaker.throttled(
            progrid.gen_binary(fs.start_time, fs.stop_time),
            nseconds=5, ncount=200)
    arr = []
    for t in time_points:
        row = [t]
        for python_name, desc, zoo_class in requested_triples:
            zoo_obj = zoo_class(fs.d)
            df = zoo_obj.get_df()
            opt_dep = OptDep(zoo_obj, t, f_info)
            if not df:
                # nothing to optimize
                xopt = np.array([])
            else:
                # fmin is used because scipy.optimize.minimize requires
                # a newer scipy than the one packaged for ubuntu, and
                # fmin_bfgs sometimes hangs or maxiter=10K is too big
                xopt = scipy.optimize.fmin(
                        opt_dep, np.random.randn(df),
                        maxiter=10000, maxfun=10000)
            row.append(-opt_dep(xopt))
        arr.append(row)
    # the time generator does not yield times in increasing order
    arr.sort()
    table_string = RUtil.get_table_string(arr, headers)
    script = get_ggplot()
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    plot_result = RUtil.run_plotter(table_string, script, device_name)
    retcode, r_out, r_err, image_data = plot_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_response_content(fs):
    """
    Plot an information criterion over time for two processes.

    The table comes from make_table(fs); every column after the first
    is drawn as a line.  The y axis ends at the first multiple of 0.2
    above the log of the larger state count, with a dotted gray line at
    every multiple of 0.2.
    Returns the image data produced by the R plotter.
    Raises RUtil.RError carrying the R stderr when the plotter
    reports a nonzero return code.
    """
    # legend labels
    label_format = 'N=%d mu=%f'
    label_a = label_format % (fs.nstates_a, fs.mu_a)
    label_b = label_format % (fs.nstates_b, fs.mu_b)
    arr, headers = make_table(fs)
    # number of 0.2-wide steps needed to exceed the largest value
    largest = math.log(max(fs.nstates_a, fs.nstates_b))
    nfifths = int(math.floor(largest * 5.0)) + 1
    ylim = RUtil.mk_call_str('c', 0, 0.2 * nfifths)
    out = StringIO()
    # an empty plot that only sets up the axes
    print >> out, RUtil.mk_call_str(
            'plot', 'my.table$t', 'my.table$alpha',
            type='"n"', ylim=ylim,
            xlab='"time"', ylab='"information"',
            main='"comparison of an information criterion for two processes"')
    # horizontal guide lines
    for step in range(nfifths + 1):
        print >> out, RUtil.mk_call_str(
                'abline', h=0.2*step, col='"lightgray"', lty='"dotted"')
    # one colored line per data column
    colors = ('darkblue', 'darkred')
    for color, header in zip(colors, headers[1:]):
        print >> out, RUtil.mk_call_str(
                'lines', 'my.table$t', 'my.table$%s' % header,
                col='"%s"' % color)
    quoted_names = ['"%s"' % s for s in (label_a, label_b)]
    quoted_colors = ['"%s"' % s for s in colors]
    legend_name_str = 'c(' + ', '.join(quoted_names) + ')'
    legend_col_str = 'c(' + ', '.join(quoted_colors) + ')'
    legend_lty_str = 'c(' + ', '.join(['1'] * len(colors)) + ')'
    print >> out, RUtil.mk_call_str(
            'legend', '"%s"' % fs.legend_placement, legend_name_str,
            col=legend_col_str, lty=legend_lty_str)
    script_body = out.getvalue()
    # create the R plot image
    plot_result = RUtil.run_plotter(
            RUtil.get_table_string(arr, headers),
            script_body,
            Form.g_imageformat_to_r_function[fs.imageformat])
    retcode, r_out, r_err, image_data = plot_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_response_content(fs):
    """
    Plot endpoint-conditioned state probabilities for every generation.

    The transition matrix P is the product of the drift-selection matrix
    and the mutation matrix.  The first and last generations put all
    mass on fs.nmutants_initial and fs.nmutants_final.  For a generation
    g in between, the weight of state k is the (initial, k) entry of P
    to the power g times the (k, final) entry of P to the power of the
    remaining number of steps, normalized over k.
    The table has one row per (generation, state) with the probability
    and its log, using -inf for a zero probability.
    Returns the image data produced by the R plotter.
    Raises RUtil.RError carrying the R stderr when the plotter
    reports a nonzero return code.
    """
    # drift and selection in one step
    drift_selection = pgmsinglesite.create_drift_selection_transition_matrix(
            fs.npop, fs.selection_ratio)
    MatrixUtil.assert_transition_matrix(drift_selection)
    # mutation in one step
    mutation = pgmsinglesite.create_mutation_transition_matrix(
            fs.npop, fs.mutation_ab, fs.mutation_ba)
    MatrixUtil.assert_transition_matrix(mutation)
    headers = [
            'generation',
            'number.of.mutants',
            'probability',
            'log.prob']
    P = np.dot(drift_selection, mutation)
    nstates = fs.npop + 1
    last = fs.ngenerations - 1
    # rows are states and columns are generations
    M = np.zeros((nstates, fs.ngenerations))
    M[fs.nmutants_initial, 0] = 1.0
    M[fs.nmutants_final, last] = 1.0
    for g in range(1, last):
        forward = np.linalg.matrix_power(P, g)
        backward = np.linalg.matrix_power(P, last - g)
        weights = np.array([
            forward[fs.nmutants_initial, k] * backward[k, fs.nmutants_final]
            for k in range(nstates)], dtype=float)
        M[:, g] = weights / np.sum(weights)
    arr = []
    for g in range(fs.ngenerations):
        for k in range(nstates):
            p = M[k, g]
            logp = math.log(p) if p else float('-inf')
            arr.append([g, k, p, logp])
    # arguments are evaluated left to right: table, script, device name
    plot_result = RUtil.run_plotter(
            RUtil.get_table_string(arr, headers),
            get_ggplot(),
            Form.g_imageformat_to_r_function[fs.imageformat])
    retcode, r_out, r_err, image_data = plot_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_table_string_and_scripts(fs):
    """
    Build the R table string and the R tikz scripts.

    The latex documentbody should have a bunch of tikz pieces in it.
    Each tikz piece should have been generated from R.
    fs supplies nresidues, nsites, nselections, the variance flags
    (low_var, medium_var, high_var, really_high_var), the skew flags
    (neg_skew, no_skew, pos_skew) and the time grid (t_low, t_high,
    ntimes).  For every sample a mutation-selection rate matrix is
    derived from the mutation rate matrix, then each time point is
    summarized with get_time_point_summary.
    Returns the pair (table_string, scripts).
    Raises ValueError if there are more than 256 states, or if no
    variance flag or no skew flag is set when a sample is drawn.
    """
    nstates = fs.nresidues ** fs.nsites
    if nstates > 256:
        raise ValueError("the mutation rate matrix is too big")
    # get the mutation matrix
    Q_mut = mrate.get_sparse_sequence_rate_matrix(fs.nresidues, fs.nsites)
    # sample a bunch of mutation-selection rate matrices
    Q_sels = []
    for selection_index in range(fs.nselections):
        # sample the selection parameters
        if fs.low_var:
            v = 0.2
        elif fs.medium_var:
            v = 1
        elif fs.high_var:
            v = 5.0
        elif fs.really_high_var:
            v = 25.0
        else:
            # previously this fell through and failed on an unbound name
            raise ValueError('no selection variance option was chosen')
        s = math.sqrt(v)
        if fs.neg_skew:
            sels = [-random.expovariate(1 / s) for i in range(nstates)]
        elif fs.no_skew:
            sels = [random.gauss(0, s) for i in range(nstates)]
        elif fs.pos_skew:
            sels = [random.expovariate(1 / s) for i in range(nstates)]
        else:
            raise ValueError('no selection skew option was chosen')
        # define the mutation-selection rate matrix using Halpern-Bruno
        Q = np.zeros_like(Q_mut)
        for i in range(nstates):
            for j in range(nstates):
                if i != j:
                    tau = math.exp(-(sels[j] - sels[i]))
                    denom = 1 - 1 / tau
                    # equal selection values give 0/0;
                    # the limit of the ratio as tau approaches 1 is 1
                    if denom:
                        coeff = math.log(tau) / denom
                    else:
                        coeff = 1.0
                    Q[i, j] = Q_mut[i, j] * coeff
        # the diagonal is filled last, while it is still zero,
        # so that every row sums to zero
        for i in range(nstates):
            Q[i, i] = -np.sum(Q[i])
        Q_sels.append(Q)
    # define the time points;
    # float() guards against integer division under Python 2
    incr = (fs.t_high - fs.t_low) / float(fs.ntimes - 1)
    times = [fs.t_low + i * incr for i in range(fs.ntimes)]
    # compute the statistics
    nsels = len(Q_sels)
    time_stats = [get_time_point_summary(Q_mut, Q_sels, t) for t in times]
    # get the R scripts
    scripts = [
        get_r_tikz_corr_plot(nsels, time_stats),
        get_r_tikz_prop_plot(nsels, time_stats),
        get_r_tikz_info_plot(nsels, time_stats),
    ]
    table_string = RUtil.get_table_string(time_stats, g_time_stats_headers)
    return table_string, scripts
def get_response_content(fs):
    """
    Draw the information criterion of two processes on one plot.

    make_table(fs) provides the rows and headers; the columns after the
    first are drawn in dark blue and dark red and labeled with the state
    count and mu of the corresponding process.  The y range runs from 0
    to 0.2 times the number of guide steps, which is one more than the
    floor of five times the log of the larger state count.
    Returns the image data produced by the R plotter.
    Raises RUtil.RError carrying the R stderr when the plotter
    reports a nonzero return code.
    """

    def r_vector(items):
        # format the items as an R c(...) call
        return 'c(' + ', '.join(items) + ')'

    # legend labels
    label_a = 'N=%d mu=%f' % (fs.nstates_a, fs.mu_a)
    label_b = 'N=%d mu=%f' % (fs.nstates_b, fs.mu_b)
    arr, headers = make_table(fs)
    # compute the y range
    ymax = math.log(max(fs.nstates_a, fs.nstates_b))
    nfifths = int(math.floor(ymax * 5.0)) + 1
    ylim = RUtil.mk_call_str('c', 0, 0.2 * nfifths)
    # write the R script body
    out = StringIO()
    plot_options = dict(
            type='"n"',
            ylim=ylim,
            xlab='"time"',
            ylab='"information"',
            main='"comparison of an information criterion for two processes"')
    print >> out, RUtil.mk_call_str(
            'plot', 'my.table$t', 'my.table$alpha', **plot_options)
    # draw some horizontal lines
    for i in range(nfifths + 1):
        print >> out, RUtil.mk_call_str(
                'abline', h=0.2 * i, col='"lightgray"', lty='"dotted"')
    colors = ('darkblue', 'darkred')
    for c, header in zip(colors, headers[1:]):
        column = 'my.table$%s' % header
        print >> out, RUtil.mk_call_str(
                'lines', 'my.table$t', column, col='"%s"' % c)
    legend_names = (label_a, label_b)
    legend_name_str = r_vector('"%s"' % s for s in legend_names)
    legend_col_str = r_vector('"%s"' % s for s in colors)
    legend_lty_str = r_vector('1' for s in colors)
    print >> out, RUtil.mk_call_str(
            'legend', '"%s"' % fs.legend_placement, legend_name_str,
            col=legend_col_str, lty=legend_lty_str)
    script_body = out.getvalue()
    # create the R plot image
    table_string = RUtil.get_table_string(arr, headers)
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    plot_result = RUtil.run_plotter(table_string, script_body, device_name)
    retcode, r_out, r_err, image_data = plot_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_response_content(fs):
    """
    Plot relaxation time for each requested distribution mode.

    The modes are those of g_ordered_modes that appear in
    fs.distribution, kept in the order of g_ordered_modes.  The table
    from make_table(fs, distn_modes) has one column per mode after the
    first column; each is drawn in the color given by g_mode_to_color.
    The y axis runs from 0 to 0.1 above the largest value of those
    columns.
    Returns the image data produced by the R plotter.
    Raises ValueError when no mode is requested, KeyError when
    fs.selection is neither BALANCED nor HALPERN_BRUNO, and RUtil.RError
    carrying the R stderr when the plotter reports a nonzero return code.
    """

    def r_vector(items):
        # format the items as an R c(...) call
        return 'c(' + ', '.join(items) + ')'

    distn_modes = [
            mode for mode in g_ordered_modes if mode in fs.distribution]
    if not distn_modes:
        raise ValueError('no distribution mode was specified')
    colors = [g_mode_to_color[mode] for mode in distn_modes]
    arr, headers = make_table(fs, distn_modes)
    distn_headers = headers[1:]
    # largest value in the array, skipping the first column
    arrmax = np.max(arr[:, 1:])
    # write the R script body
    out = StringIO()
    ylim = RUtil.mk_call_str('c', 0, arrmax + 0.1)
    selection_names = {
            BALANCED: 'balanced',
            HALPERN_BRUNO: 'Halpern-Bruno'}
    sel_str = selection_names[fs.selection]
    title = '"Effect of selection (%s) on relaxation time for %d states"' % (
            sel_str, fs.nstates)
    # an empty plot that only sets up the axes
    print >> out, RUtil.mk_call_str(
            'plot', 'my.table$t', 'my.table$%s' % distn_headers[0],
            type='"n"', ylim=ylim,
            xlab='""', ylab='"relaxation time"',
            main=title)
    for color, header in zip(colors, distn_headers):
        print >> out, RUtil.mk_call_str(
                'lines', 'my.table$t', 'my.table$%s' % header,
                col='"%s"' % color)
    # legend names show spaces where the mode names have underscores
    mode_names = [mode.replace('_', ' ') for mode in distn_modes]
    legend_name_str = r_vector('"%s"' % s for s in mode_names)
    legend_col_str = r_vector('"%s"' % s for s in colors)
    legend_lty_str = r_vector(['1'] * len(distn_modes))
    print >> out, RUtil.mk_call_str(
            'legend', '"%s"' % fs.legend_placement, legend_name_str,
            col=legend_col_str, lty=legend_lty_str)
    script_body = out.getvalue()
    # create the R plot image
    plot_result = RUtil.run_plotter(
            RUtil.get_table_string(arr, headers),
            script_body,
            Form.g_imageformat_to_r_function[fs.imageformat])
    retcode, r_out, r_err, image_data = plot_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_response_content(fs):
    """
    Plot, per selected process, the optimized information at each time.

    The processes are the entries of g_process_triples whose name is a
    true attribute of fs.  Times come from progrid.gen_binary between
    fs.start_time and fs.stop_time, throttled by cbreaker.  For every
    time and process an OptDep objective is created with
    divtime.get_fisher_info_known_distn_fast; when the process reports
    a nonzero get_df() the objective is minimized by scipy.optimize.fmin
    starting from a random point, otherwise an empty parameter array is
    used.  The tabulated value is the negated objective.
    Returns the image data produced by the R plotter.
    Raises ValueError when no process is selected, and RUtil.RError
    carrying the R stderr when the plotter reports a nonzero return code.
    """
    f_info = divtime.get_fisher_info_known_distn_fast

    def optimized_info(zoo_class, t):
        # negated objective at the optimizer result for one process
        zoo_obj = zoo_class(fs.d)
        df = zoo_obj.get_df()
        opt_dep = OptDep(zoo_obj, t, f_info)
        if df:
            # scipy.optimize.minimize would be preferable, but it needs
            # a newer scipy than is packaged for ubuntu right now;
            # fmin_bfgs seems to have problems sometimes,
            # either hanging or maxiter=10K is too big
            X0 = np.random.randn(df)
            xopt = scipy.optimize.fmin(
                    opt_dep, X0, maxiter=10000, maxfun=10000)
        else:
            xopt = np.array([])
        return -opt_dep(xopt)

    requested_triples = []
    for name, desc, zoo_class in g_process_triples:
        if getattr(fs, name):
            requested_triples.append((name, desc, zoo_class))
    if not requested_triples:
        raise ValueError('nothing to plot')
    # define the R table headers
    headers = ['t'] + [
            name.replace('_', '.') for name, desc, zoo_class
            in requested_triples]
    # Spend a lot of time doing the optimizations
    # to construct the points for the R table.
    arr = []
    for t in cbreaker.throttled(
            progrid.gen_binary(fs.start_time, fs.stop_time),
            nseconds=5, ncount=200):
        row = [t]
        for python_name, desc, zoo_class in requested_triples:
            row.append(optimized_info(zoo_class, t))
        arr.append(row)
    # put the rows in increasing order of time
    arr.sort()
    table_string = RUtil.get_table_string(arr, headers)
    script = get_ggplot()
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    plot_result = RUtil.run_plotter(table_string, script, device_name)
    retcode, r_out, r_err, image_data = plot_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_response_content(fs):
    """
    Draw one relaxation time curve per requested distribution mode.

    Only the entries of g_ordered_modes found in fs.distribution are
    used.  make_table(fs, distn_modes) supplies the array and headers;
    the first column is the x axis and every later column is a curve
    colored through g_mode_to_color.  The upper y limit is the maximum
    of the curve columns plus 0.1.
    Returns the image data produced by the R plotter.
    Raises ValueError when no mode is requested, KeyError when
    fs.selection is neither BALANCED nor HALPERN_BRUNO, and RUtil.RError
    carrying the R stderr when the plotter reports a nonzero return code.
    """
    distn_modes = [x for x in g_ordered_modes if x in fs.distribution]
    if not distn_modes:
        raise ValueError('no distribution mode was specified')
    colors = [g_mode_to_color[m] for m in distn_modes]
    arr, headers = make_table(fs, distn_modes)
    distn_headers = headers[1:]
    # Get the largest value in the array,
    # skipping the first column.
    arrmax = np.max(arr[:, 1:])
    # write the R script body
    out = StringIO()
    ylim = RUtil.mk_call_str('c', 0, arrmax + 0.1)
    sel_str = {
            BALANCED: 'balanced',
            HALPERN_BRUNO: 'Halpern-Bruno',
            }[fs.selection]
    plot_options = dict(
            type='"n"',
            ylim=ylim,
            xlab='""',
            ylab='"relaxation time"',
            main='"Effect of selection (%s) on relaxation time for %d states"'
            % (sel_str, fs.nstates))
    print >> out, RUtil.mk_call_str(
            'plot', 'my.table$t', 'my.table$%s' % distn_headers[0],
            **plot_options)
    for c, header in zip(colors, distn_headers):
        column = 'my.table$%s' % header
        print >> out, RUtil.mk_call_str(
                'lines', 'my.table$t', column, col='"%s"' % c)
    # quoted legend entries, with underscores shown as spaces
    quoted_names = ['"%s"' % s.replace('_', ' ') for s in distn_modes]
    quoted_colors = ['"%s"' % s for s in colors]
    line_types = ['1'] * len(distn_modes)
    legend_name_str = 'c(' + ', '.join(quoted_names) + ')'
    legend_col_str = 'c(' + ', '.join(quoted_colors) + ')'
    legend_lty_str = 'c(' + ', '.join(line_types) + ')'
    print >> out, RUtil.mk_call_str(
            'legend', '"%s"' % fs.legend_placement, legend_name_str,
            col=legend_col_str, lty=legend_lty_str)
    script_body = out.getvalue()
    # create the R plot image
    table_string = RUtil.get_table_string(arr, headers)
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    plot_result = RUtil.run_plotter(table_string, script_body, device_name)
    retcode, r_out, r_err, image_data = plot_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_table_string_and_scripts(fs):
    """
    Build the R table string and the R tikz scripts.

    The latex documentbody should have a bunch of tikz pieces in it.
    Each tikz piece should have been generated from R.
    fs supplies nresidues, nsites, nselections, the variance flags
    (low_var, medium_var, high_var, really_high_var), the skew flags
    (neg_skew, no_skew, pos_skew) and the time grid (t_low, t_high,
    ntimes).  For every sample a mutation-selection rate matrix is
    derived from the mutation rate matrix, then each time point is
    summarized with get_time_point_summary.
    Returns the pair (table_string, scripts).
    Raises ValueError if there are more than 256 states, or if no
    variance flag or no skew flag is set when a sample is drawn.
    """
    nstates = fs.nresidues ** fs.nsites
    if nstates > 256:
        raise ValueError('the mutation rate matrix is too big')
    # get the mutation matrix
    Q_mut = mrate.get_sparse_sequence_rate_matrix(fs.nresidues, fs.nsites)
    # sample a bunch of mutation-selection rate matrices
    Q_sels = []
    for selection_index in range(fs.nselections):
        # sample the selection parameters
        if fs.low_var:
            v = 0.2
        elif fs.medium_var:
            v = 1
        elif fs.high_var:
            v = 5.0
        elif fs.really_high_var:
            v = 25.0
        else:
            # previously this fell through and failed on an unbound name
            raise ValueError('no selection variance option was chosen')
        s = math.sqrt(v)
        if fs.neg_skew:
            sels = [-random.expovariate(1/s) for i in range(nstates)]
        elif fs.no_skew:
            sels = [random.gauss(0, s) for i in range(nstates)]
        elif fs.pos_skew:
            sels = [random.expovariate(1/s) for i in range(nstates)]
        else:
            raise ValueError('no selection skew option was chosen')
        # define the mutation-selection rate matrix using Halpern-Bruno
        Q = np.zeros_like(Q_mut)
        for i in range(nstates):
            for j in range(nstates):
                if i != j:
                    tau = math.exp(-(sels[j] - sels[i]))
                    denom = 1 - 1/tau
                    # equal selection values give 0/0;
                    # the limit of the ratio as tau approaches 1 is 1
                    if denom:
                        coeff = math.log(tau) / denom
                    else:
                        coeff = 1.0
                    Q[i, j] = Q_mut[i, j] * coeff
        # the diagonal is filled last, while it is still zero,
        # so that every row sums to zero
        for i in range(nstates):
            Q[i, i] = -np.sum(Q[i])
        Q_sels.append(Q)
    # define the time points;
    # float() guards against integer division under Python 2
    incr = (fs.t_high - fs.t_low) / float(fs.ntimes - 1)
    times = [fs.t_low + i*incr for i in range(fs.ntimes)]
    # compute the statistics
    nsels = len(Q_sels)
    time_stats = [get_time_point_summary(Q_mut, Q_sels, t) for t in times]
    # get the R scripts
    scripts = [
            get_r_tikz_corr_plot(nsels, time_stats),
            get_r_tikz_prop_plot(nsels, time_stats),
            get_r_tikz_info_plot(nsels, time_stats)]
    table_string = RUtil.get_table_string(time_stats, g_time_stats_headers)
    return table_string, scripts
def get_response_content(fs):
    """
    Plot mutual information over time for the selected processes.

    A process from g_process_triples is included when the attribute of
    fs named after it is true.  Each included process object supplies a
    rate matrix and a distribution for an empty parameter array, which
    are passed to ctmcmi.get_mutual_info_known_distn at 101 evenly
    spaced times from fs.start_time to fs.stop_time.  Constant columns
    holding log(4) and log(3) are added when fs.log4 and fs.log3 are set.
    Returns the image data produced by the R plotter.
    Raises ValueError when no process is selected, and RUtil.RError
    carrying the R stderr when the plotter reports a nonzero return code.
    """
    f_info = ctmcmi.get_mutual_info_known_distn
    requested_triples = [
            (name, desc, zoo_obj)
            for name, desc, zoo_obj in g_process_triples
            if getattr(fs, name)]
    if not requested_triples:
        raise ValueError('nothing to plot')
    # header names and constant values of the optional reference columns
    headers = ['t']
    reference_values = []
    if fs.log4:
        headers.append('log.4')
        reference_values.append(math.log(4))
    if fs.log3:
        headers.append('log.3')
        reference_values.append(math.log(3))
    # R column names use dots where the python names use underscores
    headers += [name.replace('_', '.') for name, _, _ in requested_triples]
    arr = []
    for t in np.linspace(fs.start_time, fs.stop_time, 101):
        row = [t] + reference_values
        for python_name, desc, zoo_obj in requested_triples:
            # these processes take no free parameters
            X = np.array([])
            row.append(f_info(
                zoo_obj.get_rate_matrix(X), zoo_obj.get_distn(X), t))
        arr.append(row)
    # arguments are evaluated left to right: table, script, device name
    plot_result = RUtil.run_plotter(
            RUtil.get_table_string(arr, headers),
            get_ggplot(),
            Form.g_imageformat_to_r_function[fs.imageformat])
    retcode, r_out, r_err, image_data = plot_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_response_content(fs):
    """
    Plot three ratio statistics of the sampled selection processes.
    @param fs: user input; passed to get_qmut_qsels, imageformat is read
    @return: the image data returned by the R plotter
    @raise RUtil.RError: if the R plotter gives a nonzero return code
    """
    Q_mut, Q_sels = get_qmut_qsels(fs)
    nsels = len(Q_sels)
    # each statistic is a column of the R table
    column_headers = ('ER.ratio', 'NSR.ratio', 'ER.times.NSR.ratio')
    ER_ratios, NSR_ratios, ER_NSR_ratios = get_statistic_ratios(Q_mut, Q_sels)
    table_rows = zip(ER_ratios, NSR_ratios, ER_NSR_ratios)
    table_string = RUtil.get_table_string(table_rows, column_headers)
    # the R script needs to know the number of sampled processes
    comboscript = get_r_comboscript(nsels, column_headers)
    # let R draw the image on the requested device
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    plotter_result = RUtil.run_plotter(
            table_string, comboscript, device_name)
    retcode, r_out, r_err, image_data = plotter_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_response_content(fs):
    """
    Plot information over time for each requested process.
    @param fs: user input; one flag per entry of g_process_triples is read,
    along with log4, log3, start_time, stop_time, and imageformat
    @return: the image data returned by the R plotter
    @raise ValueError: if no process was requested
    @raise RUtil.RError: if the R plotter gives a nonzero return code
    """
    f_info = ctmcmi.get_mutual_info_known_distn
    requested_triples = []
    for triple in g_process_triples:
        name, desc, zoo_obj = triple
        if getattr(fs, name):
            requested_triples.append(triple)
    if not requested_triples:
        raise ValueError('nothing to plot')
    # define the R table headers
    headers = ['t']
    if fs.log4:
        headers.append('log.4')
    if fs.log3:
        headers.append('log.3')
    headers.extend(a.replace('_', '.') for a, b, c in requested_triples)
    # Neither the rate matrix nor the distribution of a process
    # depends on the time point, so each is built only once
    # rather than being rebuilt for every row of the table.
    X = np.array([])
    Q_distn_pairs = [
            (zoo_obj.get_rate_matrix(X), zoo_obj.get_distn(X))
            for python_name, desc, zoo_obj in requested_triples]
    # construct the points for the R table
    arr = []
    for t in np.linspace(fs.start_time, fs.stop_time, 101):
        row = [t]
        if fs.log4:
            row.append(math.log(4))
        if fs.log3:
            row.append(math.log(3))
        for Q, distn in Q_distn_pairs:
            row.append(f_info(Q, distn, t))
        arr.append(row)
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_response_content(fs):
    """
    Scatter plot of entropy against a log ratio of power sums.
    Random distributions over four states are sampled,
    and the plot title shows the sampled distribution whose
    (entropy, analog) pair is nearest to (1.0, 0.4).
    @param fs: user input; only imageformat is read
    @return: the image data returned by the R plotter
    @raise RUtil.RError: if the R plotter gives a nonzero return code
    """
    nstates = 4
    npoints = 5000
    headers = ['entropy', 'analog']
    arr = []
    # track the sampled distribution nearest to the target point
    best_dist = None
    best_distn = None
    for i in range(npoints):
        # normalize exponential samples to get a random distribution
        weights = [random.expovariate(1) for j in range(nstates)]
        total = sum(weights)
        distn = [x / total for x in weights]
        entropy = -sum(p * math.log(p) for p in distn)
        sum_squares = sum(p*p for p in distn)
        sum_cubes = sum(p*p*p for p in distn)
        analog = math.log(sum_squares / sum_cubes)
        arr.append([entropy, analog])
        dist = (entropy - 1.0)**2 + (analog - 0.4)**2
        if best_dist is None or dist < best_dist:
            best_dist, best_distn = dist, distn
    # build the R table
    table_string = RUtil.get_table_string(arr, headers)
    # build the R script
    title = ', '.join(map(str, best_distn))
    out = StringIO()
    print >> out, RUtil.mk_call_str(
            'plot',
            'my.table$entropy',
            'my.table$analog',
            pch='20',
            main='"%s"' % title)
    script = out.getvalue()
    # let R draw the image on the requested device
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    plotter_result = RUtil.run_plotter(table_string, script, device_name)
    retcode, r_out, r_err, image_data = plotter_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_response_content(fs):
    """
    Plot one line per requested distribution mode.
    @param fs: user input; distribution, selection, nstates,
    legend_placement, and imageformat are read,
    and fs is also passed to make_table
    @return: the image data returned by the R plotter
    @raise ValueError: if no distribution mode was requested
    @raise RUtil.RError: if the R plotter gives a nonzero return code
    """
    def r_quote(s):
        # wrap the text in double quotes so that R sees a string
        return '"%s"' % s

    def r_vector(items):
        # write an R vector literal from already formatted items
        return "c(" + ", ".join(items) + ")"

    # the requested modes are kept in the global mode order
    distn_modes = [x for x in g_ordered_modes if x in fs.distribution]
    if not distn_modes:
        raise ValueError("no distribution mode was specified")
    colors = [g_mode_to_color[m] for m in distn_modes]
    arr, headers = make_table(fs, distn_modes)
    # the first column is skipped, both for the headers
    # and when looking for the largest value in the array
    distn_headers = headers[1:]
    arrmax = np.max(arr[:, 1:])
    # write the R script body
    out = StringIO()
    ylim = RUtil.mk_call_str("c", 0, arrmax + 0.1)
    sel_str = {
            BALANCED: "f=1/2",
            HALPERN_BRUNO: "Halpern-Bruno",
            }[fs.selection]
    title = '"Effect of selection (%s) on log L-ratio for %d states"' % (
            sel_str, fs.nstates)
    # the first call draws empty axes and the lines are added afterwards
    print >> out, RUtil.mk_call_str(
            "plot",
            "my.table$t",
            "my.table$%s" % distn_headers[0],
            type='"n"',
            ylim=ylim,
            xlab='"time"',
            ylab='"expected log L-ratio"',
            main=title)
    for c, header in zip(colors, distn_headers):
        print >> out, RUtil.mk_call_str(
                "lines",
                "my.table$t",
                "my.table$%s" % header,
                col=r_quote(c))
    # the legend shows the mode names with spaces instead of underscores
    mode_names = [s.replace("_", " ") for s in distn_modes]
    legend_name_str = r_vector([r_quote(s) for s in mode_names])
    legend_col_str = r_vector([r_quote(s) for s in colors])
    legend_lty_str = r_vector(["1"] * len(distn_modes))
    print >> out, RUtil.mk_call_str(
            "legend",
            r_quote(fs.legend_placement),
            legend_name_str,
            col=legend_col_str,
            lty=legend_lty_str)
    script_body = out.getvalue()
    # let R draw the image on the requested device
    table_string = RUtil.get_table_string(arr, headers)
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    plotter_result = RUtil.run_plotter(
            table_string, script_body, device_name)
    retcode, r_out, r_err, image_data = plotter_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_response_content(fs):
    """
    Sample points using the spiral sampler and format them as text.
    @param fs: user input; npoints, stddev, add_labels, raw,
    and table are read
    @return: tab separated rows if raw output was requested,
    or an R table string if table output was requested
    @raise ValueError: if neither raw nor table output was requested
    """
    # unpack some options
    npoints = fs.npoints
    stddev = fs.stddev
    # define the data rows and the headers
    if fs.add_labels:
        headers = ('x', 'y', 'label')
        data_rows = list(SpiralSampler.gen_labeled_points(npoints, stddev))
    else:
        headers = ('x', 'y')
        data_rows = list(SpiralSampler.gen_points(npoints, stddev))
    # return the response
    if fs.raw:
        return '\n'.join(
                '\t'.join(str(x) for x in data_row)
                for data_row in data_rows)
    elif fs.table:
        return RUtil.get_table_string(data_rows, headers)
    # Without this the function used to fail with an unbound local
    # variable error which said nothing about the actual problem.
    raise ValueError('no output format was specified')
def get_response_content(fs):
    """
    Scatter plot of entropy against a log ratio of power sums.
    Random distributions over four states are sampled,
    and the plot title shows the sampled distribution whose
    (entropy, analog) pair is nearest to (1.0, 0.4).
    @param fs: user input; only imageformat is read
    @return: the image data returned by the R plotter
    @raise RUtil.RError: if the R plotter gives a nonzero return code
    """
    nstates = 4
    npoints = 5000
    headers = ['entropy', 'analog']
    arr = []
    # remember the nearest distribution and its squared distance
    nearest_dist = None
    nearest_distn = None
    for i in range(npoints):
        # normalize exponential samples to get a random distribution
        weights = [random.expovariate(1) for j in range(nstates)]
        total = sum(weights)
        distn = [x / total for x in weights]
        entropy = -sum(p * math.log(p) for p in distn)
        sum_squares = sum(p * p for p in distn)
        sum_cubes = sum(p * p * p for p in distn)
        analog = math.log(sum_squares / sum_cubes)
        arr.append([entropy, analog])
        dist = (entropy - 1.0)**2 + (analog - 0.4)**2
        if nearest_dist is None or dist < nearest_dist:
            nearest_dist = dist
            nearest_distn = distn
    # build the R table
    table_string = RUtil.get_table_string(arr, headers)
    # build the R script
    title = ', '.join([str(x) for x in nearest_distn])
    out = StringIO()
    print >> out, RUtil.mk_call_str(
            'plot',
            'my.table$entropy',
            'my.table$analog',
            pch='20',
            main='"%s"' % title)
    script = out.getvalue()
    # let R draw the image on the requested device
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    plotter_result = RUtil.run_plotter(table_string, script, device_name)
    retcode, r_out, r_err, image_data = plotter_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_table_string_and_scripts(start_stop_pairs, nsamples, header_seq_pairs):
    """
    Command-line only.
    @param start_stop_pairs: pairs of interval bounds
    @param nsamples: passed to the log analysis and to the script builder
    @param header_seq_pairs: passed to the log analysis
    @return: an R table string, and the ggplot2 scripts
    """
    # Positions within the log analysis array.
    # Judging by how the entries are used,
    # rows 1, 2, 3 are the mean, variation, and covariance statistics,
    # and columns 4, 1, 5 are the low, mean, and high summaries.
    stat_rows = (1, 2, 3)
    summary_cols = (4, 1, 5)
    # build the array for the R table
    data_arr = []
    sequence_lengths = []
    midpoints = []
    for start_pos, stop_pos in start_stop_pairs:
        sequence_length = stop_pos - start_pos + 1
        midpoint = (start_pos + stop_pos) / 2.0
        arr = get_loganalysis_array(
                start_pos, stop_pos, nsamples, header_seq_pairs)
        # each table row has low, mean, high for each statistic
        row = [sequence_length, midpoint]
        for stat_row in stat_rows:
            for summary_col in summary_cols:
                row.append(arr[stat_row][summary_col])
        data_arr.append(row)
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    # build the table string and the scripts
    table_string = RUtil.get_table_string(data_arr, g_headers)
    scripts = beasttut.get_ggplot2_scripts(
            nsamples, sequence_lengths, midpoints)
    return table_string, scripts
def get_response_content(fs):
    """
    Plot reciprocals of kimrecessive.denom_piecewise over a grid of z.
    There is one line per value of c.
    @param fs: user input; only imageformat is read
    @return: the image data returned by the R plotter
    @raise RUtil.RError: if the R plotter gives a nonzero return code
    """
    # there is one column for z and then one column per value of c
    headers = ['z', 'c.neg2.0', 'c.neg0.5', 'c.0.5', 'c.2.0']
    # smaller c values such as (-0.5, -0.2, 0.2, 0.5)
    # and (-1.0, -0.4, 0.4, 1.0) have also been tried here
    C = numpy.array([-2.0, -0.5, 0.5, 2.0], dtype=float)
    Z = numpy.linspace(-5, 5, 101)
    # get the data for the R table
    arr = []
    for z in Z:
        # the sign of z is flipped where c is negative
        rates = [
                1.0 / kimrecessive.denom_piecewise(c, z*numpy.sign(c))
                for c in C]
        arr.append([z] + rates)
    # build the R table and the R script
    table_string = RUtil.get_table_string(arr, headers)
    script = get_ggplot()
    # let R draw the image on the requested device
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    plotter_result = RUtil.run_plotter(table_string, script, device_name)
    retcode, r_out, r_err, image_data = plotter_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_response_content(fs):
    """
    Plot reciprocals of kimrecessive.denom_piecewise over a grid of z.
    There is one line per value of c.
    @param fs: user input; only imageformat is read
    @return: the image data returned by the R plotter
    @raise RUtil.RError: if the R plotter gives a nonzero return code
    """
    # the first column is z and the others are named after the c values
    headers = ['z', 'c.neg2.0', 'c.neg0.5', 'c.0.5', 'c.2.0']
    # smaller c values such as (-0.5, -0.2, 0.2, 0.5)
    # and (-1.0, -0.4, 0.4, 1.0) have also been tried here
    C = numpy.array([-2.0, -0.5, 0.5, 2.0], dtype=float)
    Z = numpy.linspace(-5, 5, 101)
    # get the data for the R table
    arr = []
    for z in Z:
        row = [z]
        # the sign of z is flipped where c is negative
        row.extend(
                1.0 / kimrecessive.denom_piecewise(c, z * numpy.sign(c))
                for c in C)
        arr.append(row)
    # build the R table and the R script
    table_string = RUtil.get_table_string(arr, headers)
    script = get_ggplot()
    # let R draw the image on the requested device
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    plotter_result = RUtil.run_plotter(table_string, script, device_name)
    retcode, r_out, r_err, image_data = plotter_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_table_strings_and_scripts(
        xmldata, alignment_id, start_stop_pairs, nsamples):
    """
    Command-line only.
    The short table has one row per alignment interval.
    The long table has one row per interval, statistic, and summary.
    @param xmldata: xml data already adjusted for nsamples and log filename
    @param alignment_id: xml element id
    @param start_stop_pairs: alignment interval bounds
    @param nsamples: an extra parameter for script generation
    @return: short table string, long table string, scripts (for short table)
    """
    long_headers = [
            'sequence.length',
            'midpoint',
            'statistic.name',
            'posterior.analysis',
            'value',
            ]
    # rows of the short table and rows of the long table
    data_arr = []
    full_data_arr = []
    # utility arrays for the script generation
    sequence_lengths = []
    midpoints = []
    for start_pos, stop_pos in start_stop_pairs:
        sequence_length = stop_pos - start_pos + 1
        midpoint = (start_pos + stop_pos) / 2.0
        interval_xml_data = beast.set_alignment_interval(
                xmldata, alignment_id, start_pos, stop_pos)
        row_labels, col_labels, arr = get_loganalysis_labeled_array(
                interval_xml_data)
        # map the labels to their positions in the array
        row_index_by_name = dict(
                (label, index) for index, label in enumerate(row_labels))
        col_index_by_name = dict(
                (label, index) for index, label in enumerate(col_labels))
        # define row indices of interest
        stat_row_indices = (
                row_index_by_name['meanRate'],
                row_index_by_name['coefficientOfVariation'],
                row_index_by_name['covariance'],
                )
        # define column indices of interest
        mean_col_index = col_index_by_name['mean']
        low_col_index = col_index_by_name['hpdLower']
        high_col_index = col_index_by_name['hpdUpper']
        summary_col_indices = (low_col_index, mean_col_index, high_col_index)
        # the short row has low, mean, high for each statistic of interest
        short_row = [sequence_length, midpoint]
        for stat_row_index in stat_row_indices:
            for summary_col_index in summary_col_indices:
                short_row.append(arr[stat_row_index][summary_col_index])
        data_arr.append(short_row)
        # the long table gets every entry of the labeled array,
        # with the labels quoted because they are strings in the R table
        for row_index, row_label in enumerate(row_labels):
            for col_index, col_label in enumerate(col_labels):
                full_data_arr.append([
                    sequence_length,
                    midpoint,
                    '"' + row_label + '"',
                    '"' + col_label + '"',
                    arr[row_index][col_index],
                    ])
        # add entries to some utility arrays
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    # build the table strings
    table_string = RUtil.get_table_string(data_arr, g_headers)
    full_table_string = RUtil.get_table_string(
            full_data_arr, long_headers, force_float=False)
    # get the scripts
    scripts = beasttut.get_ggplot2_scripts(
            nsamples, sequence_lengths, midpoints)
    return table_string, full_table_string, scripts
def get_response_content(fs): M = get_input_matrix(fs) nstates = len(M) nsites = fs.nsites if nstates ** nsites > 16: raise ValueError('the site dependent rate matrix is too big') # precompute some stuff M_site_indep = get_site_independent_process(M, nsites) v = mrate.R_to_distn(M) v_site_indep = get_site_independent_distn(v, nsites) if fs.info_fis: f_info = divtime.get_fisher_info_known_distn_fast elif fs.info_mut: f_info = ctmcmi.get_mutual_info_known_distn else: raise ValueError('no info type specified') f_selection = mrate.to_gtr_hb_known_energies # Spend a lot of time doing the optimizations # to construct the points for the R table. arr = [] for t in cbreaker.throttled( progrid.gen_binary(fs.start_time, fs.stop_time), nseconds=4, ncount=100): row = [t] # get the site-dependent mutation selection balance information if fs.dep_balance: dep_balance = OptDep( M_site_indep, v_site_indep, t, f_info, f_selection) X0 = np.random.randn(nstates ** nsites - 1) xopt = scipy.optimize.fmin(dep_balance, X0) max_dep_balance_info = -dep_balance(xopt) row.append(max_dep_balance_info) # for debug Q_bal, v_bal = dep_balance.get_process(xopt) print 'dependent balance:' print max_dep_balance_info print v_bal print Q_bal print # get the site-independent mutation selection balance information if fs.indep_balance: indep_balance = OptIndep( M, v, nsites, t, f_info, f_selection) X0 = np.random.randn(nstates-1) xopt = scipy.optimize.fmin(indep_balance, X0) max_indep_balance_info = -indep_balance(xopt) row.append(max_indep_balance_info) # for debug Q_bal, v_bal = indep_balance.get_process(xopt) print 'independent balance:' print max_indep_balance_info print v_bal print Q_bal print # get the site-independent mutation process information if fs.indep_mutation: indep_mut_info = f_info(M_site_indep, v_site_indep, t) row.append(indep_mut_info) # add the data row to the table arr.append(row) arr.sort() npoints = len(arr) # create the R table string and scripts headers = ['t'] if fs.dep_balance: 
headers.append('max.site.dep.balance') if fs.indep_balance: headers.append('max.site.indep.balance') if fs.indep_mutation: headers.append('site.indep.mutation') # get the R table table_string = RUtil.get_table_string(arr, headers) # get the R script script = get_ggplot() # create the R plot image device_name = Form.g_imageformat_to_r_function[fs.imageformat] retcode, r_out, r_err, image_data = RUtil.run_plotter( table_string, script, device_name) if retcode: raise RUtil.RError(r_err) return image_data
def do_hard_coded_analysis_b(tree, tree_remark): """ Do a hardcoded analysis of tree reconstruction methods. Make R files of ordered reconstruction losses. @param tree: a tree object @param tree_remark: a string that is a comment about the tree """ # define an arbitrary order for the names of the leaves of the tree ordered_names = list(node.name for node in tree.gen_tips()) # use some replicates reconstruction_count = 100 # Make R files for reconstruction results from sequences # of some number of nucleotides in length. sequence_length = 2000 # define the tree reconstruction methods to be used sims = [ Simulation(Clustering.NeighborJoiningDMS(), 'nj', 'neighbor joining'), Simulation(Clustering.StoneSpectralSignDMS(), 'nj', 'spectral sign') ] # set tree reconstruction parameters for sim in sims: sim.set_original_tree(tree) # initialize the distance matrix sampler sampler = DMSampler.InfiniteAllelesSampler(tree, ordered_names, sequence_length) sampler.set_inf_replacement(20.0) sampler.set_zero_replacement(0.0) # start the progress bar pbar = Progress.Bar(1.0) # sample some distance matrices distance_matrix_start_time = time.time() distance_matrices = [] for result in sampler.gen_samples_or_none(): # if we got a result then update the distance matrix list if result: sequence_list, D = result distance_matrices.append(D) # Update the progressbar regardless of whether or not # the proposal was accepted. 
remaining_acceptances = reconstruction_count - len(distance_matrices) numerator = sampler.get_completed_proposals() denominator = numerator + sampler.get_remaining_proposals( remaining_acceptances) dms_fraction = float(numerator) / float(denominator) dms_total = 1.0 / (1 + len(sims)) pbar.update(dms_fraction * dms_total) # if we have enough samples then break the loop if not remaining_acceptances: break distance_matrix_seconds = time.time() - distance_matrix_start_time # reconstruct trees using various methods reconstruction_seconds = [] for i, sim in enumerate(sims): reconstruction_start_time = time.time() print 'reconstructing', len(distance_matrices), 'trees' print 'using', sim.description sim.run(distance_matrices, ordered_names) pbar.update(float(i + 2) / float(1 + len(sims))) reconstruction_seconds.append(time.time() - reconstruction_start_time) # stop the progress bar pbar.finish() # consider the neighbor joining and the spectral sign results nj_sim, ss_sim = sims # extract the simulation data label_list_pairs = [ ('nj.unweighted', nj_sim.get_normalized_error_counts()), ('ss.unweighted', ss_sim.get_normalized_error_counts()), ('nj.weighted', nj_sim.get_normalized_loss_values()), ('ss.weighted', ss_sim.get_normalized_loss_values()) ] labels, transposed_table = zip(*label_list_pairs) table = zip(*transposed_table) table_string = RUtil.get_table_string(table, labels) # write the table filename = 'out3.table' with open(filename, 'w') as fout: print >> fout, '# tree source:', tree_remark print >> fout, '# number of taxa:', len(ordered_names) print >> fout, '# sampled distance matrices:', len(distance_matrices) print >> fout, '# sampling seconds elapsed:', distance_matrix_seconds print >> fout, '# sites per sequence:', sequence_length for sim, seconds in zip(sims, reconstruction_seconds): msg_a = '# seconds elapsed for tree reconstruction using ' msg_b = sim.description + ': ' + str(seconds) print >> fout, msg_a + msg_b print >> fout, table_string print 'wrote', 
filename
def get_response_content(fs): M = get_input_matrix(fs) nstates = len(M) nsites = fs.nsites if nstates**nsites > 16: raise ValueError('the site dependent rate matrix is too big') # precompute some stuff M_site_indep = get_site_independent_process(M, nsites) v = mrate.R_to_distn(M) v_site_indep = get_site_independent_distn(v, nsites) if fs.info_fis: f_info = divtime.get_fisher_info_known_distn_fast elif fs.info_mut: f_info = ctmcmi.get_mutual_info_known_distn else: raise ValueError('no info type specified') f_selection = mrate.to_gtr_hb_known_energies # Spend a lot of time doing the optimizations # to construct the points for the R table. arr = [] for t in cbreaker.throttled(progrid.gen_binary(fs.start_time, fs.stop_time), nseconds=4, ncount=100): row = [t] # get the site-dependent mutation selection balance information if fs.dep_balance: dep_balance = OptDep(M_site_indep, v_site_indep, t, f_info, f_selection) X0 = np.random.randn(nstates**nsites - 1) xopt = scipy.optimize.fmin(dep_balance, X0) max_dep_balance_info = -dep_balance(xopt) row.append(max_dep_balance_info) # for debug Q_bal, v_bal = dep_balance.get_process(xopt) print 'dependent balance:' print max_dep_balance_info print v_bal print Q_bal print # get the site-independent mutation selection balance information if fs.indep_balance: indep_balance = OptIndep(M, v, nsites, t, f_info, f_selection) X0 = np.random.randn(nstates - 1) xopt = scipy.optimize.fmin(indep_balance, X0) max_indep_balance_info = -indep_balance(xopt) row.append(max_indep_balance_info) # for debug Q_bal, v_bal = indep_balance.get_process(xopt) print 'independent balance:' print max_indep_balance_info print v_bal print Q_bal print # get the site-independent mutation process information if fs.indep_mutation: indep_mut_info = f_info(M_site_indep, v_site_indep, t) row.append(indep_mut_info) # add the data row to the table arr.append(row) arr.sort() npoints = len(arr) # create the R table string and scripts headers = ['t'] if fs.dep_balance: 
headers.append('max.site.dep.balance') if fs.indep_balance: headers.append('max.site.indep.balance') if fs.indep_mutation: headers.append('site.indep.mutation') # get the R table table_string = RUtil.get_table_string(arr, headers) # get the R script script = get_ggplot() # create the R plot image device_name = Form.g_imageformat_to_r_function[fs.imageformat] retcode, r_out, r_err, image_data = RUtil.run_plotter( table_string, script, device_name) if retcode: raise RUtil.RError(r_err) return image_data
def get_response_content(fs):
    """
    Plot the state distribution at each generation of a path
    whose initial and final states are both known.
    @param fs: user input; pop, pop_gran, mutation_ab, mutation_ba,
    additive_selection, ngenerations, initial_freq, final_freq,
    and imageformat are read
    @return: the image data returned by the R plotter
    @raise Exception: if a transformed mutation probability exceeds one
    @raise RUtil.RError: if the R plotter gives a nonzero return code
    """
    # define the R table headers
    headers = [
            'generation',
            'allele.frequency',
            'probability',
            'log.density',
            ]
    # transform the arguments according to the diffusion approximation
    mutation_ab = (fs.pop * fs.mutation_ab) / fs.pop_gran
    mutation_ba = (fs.pop * fs.mutation_ba) / fs.pop_gran
    if mutation_ab > 1 or mutation_ba > 1:
        raise Exception(
                'the mutation probability is not small enough '
                'for the diffusion approximation to be meaningful')
    selection_ratio = 1 + (fs.pop * fs.additive_selection) / fs.pop_gran
    npop = fs.pop_gran
    ngenerations = fs.ngenerations
    nmutants_initial = int(fs.initial_freq * fs.pop_gran)
    nmutants_final = int(fs.final_freq * fs.pop_gran)
    # precompute some transition matrices
    P_drift_selection = pgmsinglesite.create_drift_selection_transition_matrix(
            npop, selection_ratio)
    MatrixUtil.assert_transition_matrix(P_drift_selection)
    P_mutation = pgmsinglesite.create_mutation_transition_matrix(
            npop, mutation_ab, mutation_ba)
    MatrixUtil.assert_transition_matrix(P_mutation)
    # compute the transition matrix
    P = np.dot(P_drift_selection, P_mutation)
    # Compute the endpoint conditional probabilities for various states
    # along the unobserved path.
    # Column g of M is the distribution over states at generation g.
    nstates = npop + 1
    M = np.zeros((nstates, ngenerations))
    M[nmutants_initial, 0] = 1.0
    M[nmutants_final, ngenerations-1] = 1.0
    for generation in range(1, ngenerations-1):
        # go from the initial state to state k in this many steps,
        # and then from state k to the final state in the remaining steps
        A = np.linalg.matrix_power(P, generation)
        B = np.linalg.matrix_power(P, ngenerations - 1 - generation)
        weights = np.array([
            A[nmutants_initial, k] * B[k, nmutants_final]
            for k in range(nstates)], dtype=float)
        M[:, generation] = weights / np.sum(weights)
    # there is one table row per generation and state
    arr = []
    for g in range(ngenerations):
        for k in range(nstates):
            p = M[k, g]
            # Finer gridding needs larger scaling for the density
            # because each interval has a smaller support.
            density = p * nstates
            log_density = math.log(density) if density else float('-inf')
            arr.append([g, k / float(npop), p, log_density])
    # get the R table and the R script
    table_string = RUtil.get_table_string(arr, headers)
    script = get_ggplot(nstates)
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    plotter_result = RUtil.run_plotter(table_string, script, device_name)
    retcode, r_out, r_err, image_data = plotter_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_response_content(fs):
    """
    Plot the state distribution at each generation of a path
    whose initial and final states are both known.
    @param fs: user input; pop, pop_gran, mutation_ab, mutation_ba,
    additive_selection, ngenerations, initial_freq, final_freq,
    and imageformat are read
    @return: the image data returned by the R plotter
    @raise Exception: if a transformed mutation probability exceeds one
    @raise RUtil.RError: if the R plotter gives a nonzero return code
    """
    # define the R table headers
    headers = [
            'generation',
            'allele.frequency',
            'probability',
            'log.density',
            ]
    # transform the arguments according to the diffusion approximation
    mutation_ab = (fs.pop * fs.mutation_ab) / fs.pop_gran
    mutation_ba = (fs.pop * fs.mutation_ba) / fs.pop_gran
    if mutation_ab > 1 or mutation_ba > 1:
        raise Exception('the mutation probability is not small enough '
                        'for the diffusion approximation to be meaningful')
    selection_ratio = 1 + (fs.pop * fs.additive_selection) / fs.pop_gran
    npop = fs.pop_gran
    ngenerations = fs.ngenerations
    nmutants_initial = int(fs.initial_freq * fs.pop_gran)
    nmutants_final = int(fs.final_freq * fs.pop_gran)
    # precompute some transition matrices
    P_drift_selection = pgmsinglesite.create_drift_selection_transition_matrix(
            npop, selection_ratio)
    MatrixUtil.assert_transition_matrix(P_drift_selection)
    P_mutation = pgmsinglesite.create_mutation_transition_matrix(
            npop, mutation_ab, mutation_ba)
    MatrixUtil.assert_transition_matrix(P_mutation)
    # compute the transition matrix
    P = np.dot(P_drift_selection, P_mutation)
    # Compute the endpoint conditional probabilities for various states
    # along the unobserved path.
    # Column g of M is the distribution over states at generation g.
    nstates = npop + 1
    last_generation = ngenerations - 1
    M = np.zeros((nstates, ngenerations))
    M[nmutants_initial, 0] = 1.0
    M[nmutants_final, last_generation] = 1.0
    for generation in range(1, last_generation):
        # go from the initial state to state k in this many steps,
        # and then from state k to the final state in the remaining steps
        A = np.linalg.matrix_power(P, generation)
        B = np.linalg.matrix_power(P, last_generation - generation)
        weights = np.array([
            A[nmutants_initial, k] * B[k, nmutants_final]
            for k in range(nstates)], dtype=float)
        M[:, generation] = weights / np.sum(weights)
    # there is one table row per generation and state
    arr = []
    for g in range(ngenerations):
        for k in range(nstates):
            p = M[k, g]
            allele_frequency = k / float(npop)
            # Finer gridding needs larger scaling for the density
            # because each interval has a smaller support.
            density = p * nstates
            if not density:
                log_density = float('-inf')
            else:
                log_density = math.log(density)
            arr.append([g, allele_frequency, p, log_density])
    # get the R table and the R script
    table_string = RUtil.get_table_string(arr, headers)
    script = get_ggplot(nstates)
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    plotter_result = RUtil.run_plotter(table_string, script, device_name)
    retcode, r_out, r_err, image_data = plotter_result
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
def get_table_strings_and_scripts(xmldata, alignment_id, start_stop_pairs,
                                  nsamples):
    """
    Command-line only.
    The short table has one row per alignment interval.
    The long table has one row per interval, statistic, and summary.
    @param xmldata: xml data already adjusted for nsamples and log filename
    @param alignment_id: xml element id
    @param start_stop_pairs: alignment interval bounds
    @param nsamples: an extra parameter for script generation
    @return: short table string, long table string, scripts (for short table)
    """
    full_headers = [
            'sequence.length',
            'midpoint',
            'statistic.name',
            'posterior.analysis',
            'value',
            ]
    # rows of the short table and rows of the long table
    data_arr = []
    full_data_arr = []
    # utility arrays for the script generation
    sequence_lengths = []
    midpoints = []
    for start_pos, stop_pos in start_stop_pairs:
        sequence_length = stop_pos - start_pos + 1
        midpoint = (start_pos + stop_pos) / 2.0
        interval_xml_data = beast.set_alignment_interval(
                xmldata, alignment_id, start_pos, stop_pos)
        row_labels, col_labels, arr = get_loganalysis_labeled_array(
                interval_xml_data)
        # map the labels to their positions in the array
        stat_to_row = dict(
                (label, index) for index, label in enumerate(row_labels))
        summary_to_col = dict(
                (label, index) for index, label in enumerate(col_labels))
        # define row indices of interest
        rows_of_interest = (
                stat_to_row['meanRate'],
                stat_to_row['coefficientOfVariation'],
                stat_to_row['covariance'],
                )
        # define column indices of interest
        mean_col = summary_to_col['mean']
        low_col = summary_to_col['hpdLower']
        high_col = summary_to_col['hpdUpper']
        # the short row has low, mean, high for each statistic of interest
        short_row = [sequence_length, midpoint]
        for stat_row in rows_of_interest:
            short_row.extend([
                arr[stat_row][low_col],
                arr[stat_row][mean_col],
                arr[stat_row][high_col],
                ])
        data_arr.append(short_row)
        # the long table gets every entry of the labeled array,
        # with the labels quoted because they are strings in the R table
        for row_index, row_label in enumerate(row_labels):
            for col_index, col_label in enumerate(col_labels):
                full_data_arr.append([
                    sequence_length,
                    midpoint,
                    '"' + row_label + '"',
                    '"' + col_label + '"',
                    arr[row_index][col_index],
                    ])
        # add entries to some utility arrays
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    # build the table strings
    table_string = RUtil.get_table_string(data_arr, g_headers)
    full_table_string = RUtil.get_table_string(
            full_data_arr, full_headers, force_float=False)
    # get the scripts
    scripts = beasttut.get_ggplot2_scripts(
            nsamples, sequence_lengths, midpoints)
    return table_string, full_table_string, scripts
def do_hard_coded_analysis_b(tree, tree_remark): """ Do a hardcoded analysis of tree reconstruction methods. Make R files of ordered reconstruction losses. @param tree: a tree object @param tree_remark: a string that is a comment about the tree """ # define an arbitrary order for the names of the leaves of the tree ordered_names = list(node.name for node in tree.gen_tips()) # use some replicates reconstruction_count = 100 # Make R files for reconstruction results from sequences # of some number of nucleotides in length. sequence_length = 2000 # define the tree reconstruction methods to be used sims = [ Simulation(Clustering.NeighborJoiningDMS(), "nj", "neighbor joining"), Simulation(Clustering.StoneSpectralSignDMS(), "nj", "spectral sign"), ] # set tree reconstruction parameters for sim in sims: sim.set_original_tree(tree) # initialize the distance matrix sampler sampler = DMSampler.InfiniteAllelesSampler(tree, ordered_names, sequence_length) sampler.set_inf_replacement(20.0) sampler.set_zero_replacement(0.0) # start the progress bar pbar = Progress.Bar(1.0) # sample some distance matrices distance_matrix_start_time = time.time() distance_matrices = [] for result in sampler.gen_samples_or_none(): # if we got a result then update the distance matrix list if result: sequence_list, D = result distance_matrices.append(D) # Update the progressbar regardless of whether or not # the proposal was accepted. 
remaining_acceptances = reconstruction_count - len(distance_matrices) numerator = sampler.get_completed_proposals() denominator = numerator + sampler.get_remaining_proposals(remaining_acceptances) dms_fraction = float(numerator) / float(denominator) dms_total = 1.0 / (1 + len(sims)) pbar.update(dms_fraction * dms_total) # if we have enough samples then break the loop if not remaining_acceptances: break distance_matrix_seconds = time.time() - distance_matrix_start_time # reconstruct trees using various methods reconstruction_seconds = [] for i, sim in enumerate(sims): reconstruction_start_time = time.time() print "reconstructing", len(distance_matrices), "trees" print "using", sim.description sim.run(distance_matrices, ordered_names) pbar.update(float(i + 2) / float(1 + len(sims))) reconstruction_seconds.append(time.time() - reconstruction_start_time) # stop the progress bar pbar.finish() # consider the neighbor joining and the spectral sign results nj_sim, ss_sim = sims # extract the simulation data label_list_pairs = [ ("nj.unweighted", nj_sim.get_normalized_error_counts()), ("ss.unweighted", ss_sim.get_normalized_error_counts()), ("nj.weighted", nj_sim.get_normalized_loss_values()), ("ss.weighted", ss_sim.get_normalized_loss_values()), ] labels, transposed_table = zip(*label_list_pairs) table = zip(*transposed_table) table_string = RUtil.get_table_string(table, labels) # write the table filename = "out3.table" with open(filename, "w") as fout: print >> fout, "# tree source:", tree_remark print >> fout, "# number of taxa:", len(ordered_names) print >> fout, "# sampled distance matrices:", len(distance_matrices) print >> fout, "# sampling seconds elapsed:", distance_matrix_seconds print >> fout, "# sites per sequence:", sequence_length for sim, seconds in zip(sims, reconstruction_seconds): msg_a = "# seconds elapsed for tree reconstruction using " msg_b = sim.description + ": " + str(seconds) print >> fout, msg_a + msg_b print >> fout, table_string print "wrote", 
filename
def get_response_content(fs): M = get_input_matrix(fs) # create the R table string and scripts headers = ['t'] if fs.show_entropy: headers.append('ub.entropy') headers.extend([ 'ub.jc.spectral', 'ub.f81.spectral', 'mutual.information', 'lb.2.state.spectral', 'lb.2.state', 'lb.f81', ]) npoints = 100 t_low = fs.start_time t_high = fs.stop_time t_incr = (t_high - t_low) / (npoints - 1) t_values = [t_low + t_incr*i for i in range(npoints)] # define some extra stuff v = mrate.R_to_distn(M) entropy = -np.dot(v, np.log(v)) n = len(M) gap = sorted(abs(x) for x in np.linalg.eigvals(M))[1] print 'stationary distn:', v print 'entropy:', entropy print 'spectral gap:', gap M_slow_jc = gap * (1.0 / n) * (np.ones((n,n)) - n*np.eye(n)) M_slow_f81 = gap * np.outer(np.ones(n), v) M_slow_f81 -= np.diag(np.sum(M_slow_f81, axis=1)) M_f81 = msimpl.get_fast_f81(M) M_2state = msimpl.get_fast_two_state_autobarrier(M) M_2state_spectral = -gap * M_2state / np.trace(M_2state) # get the data for the R table arr = [] for u in t_values: # experiment with log time #t = math.exp(u) t = u mi_slow_jc = ctmcmi.get_mutual_information(M_slow_jc, t) mi_slow_f81 = ctmcmi.get_mutual_information(M_slow_f81, t) mi_mut = ctmcmi.get_mutual_information(M, t) mi_2state_spectral = ctmcmi.get_mutual_information(M_2state_spectral, t) mi_f81 = ctmcmi.get_mutual_information(M_f81, t) mi_2state = ctmcmi.get_mutual_information(M_2state, t) row = [u] if fs.show_entropy: row.append(entropy) row.extend([mi_slow_jc, mi_slow_f81, mi_mut, mi_2state_spectral, mi_2state, mi_f81]) arr.append(row) # get the R table table_string = RUtil.get_table_string(arr, headers) # get the R script script = get_ggplot() # create the R plot image device_name = Form.g_imageformat_to_r_function[fs.imageformat] retcode, r_out, r_err, image_data = RUtil.run_plotter( table_string, script, device_name) if retcode: raise RUtil.RError(r_err) return image_data