Example #1
def hard_coded_analysis():
    branch_length = 5.0
    sequence_length = 1000
    nsequences = 1000
    estimate_triple_list = []
    column_headers = ('most.info', 'less.info', 'least.info')
    for i in range(nsequences):
        # sample sequence changes at three levels of informativeness
        sequence_changes = sample_sequence_changes(
                branch_length, sequence_length)
        # get a distance estimate for each level of informativeness
        estimate_triple = sample_distance(*sequence_changes)
        estimate_triple_list.append(estimate_triple)
    print RUtil.get_table_string(estimate_triple_list, column_headers)
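Note: every example in this listing passes a list of rows plus a tuple or list of column headers to RUtil.get_table_string and hands the result to R. The real formatting lives in the project's RUtil module and is not reproduced here; the following is only a minimal sketch, under the assumption that the output is a header row followed by whitespace-separated data rows that R's read.table(..., header=TRUE) can parse, with a hypothetical function name.

def sketch_get_table_string(rows, headers):
    # assumption: one header line, then one tab-separated line per data row
    lines = ['\t'.join(str(h) for h in headers)]
    for row in rows:
        lines.append('\t'.join(str(x) for x in row))
    return '\n'.join(lines)

# hypothetical usage mirroring Example #1:
# print sketch_get_table_string(estimate_triple_list, column_headers)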
Example #2
def get_table_string_and_scripts_from_logs(start_stop_pairs, log_paths,
                                           nsamples):
    """
    This is for analysis of remote execution.
    """
    # build the array for the R table
    data_arr = []
    sequence_lengths = []
    midpoints = []
    for start_stop_pair, log_path in zip(start_stop_pairs, log_paths):
        start_pos, stop_pos = start_stop_pair
        sequence_length = stop_pos - start_pos + 1
        means, variations, covs = read_log(log_path, nsamples)
        midpoint = (start_pos + stop_pos) / 2.0
        row = [sequence_length, midpoint]
        for values in means, variations, covs:
            corr_info = mcmc.Correlation()
            corr_info.analyze(values)
            hpd_low, hpd_high = mcmc.get_hpd_interval(0.95, values)
            row.extend([hpd_low, corr_info.mean, hpd_high])
        data_arr.append(row)
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    # build the table string
    table_string = RUtil.get_table_string(data_arr, g_headers)
    # get the scripts
    scripts = get_ggplot2_scripts(nsamples, sequence_lengths, midpoints)
    # return the table string and scripts
    return table_string, scripts
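Note: each row built in this example holds the sequence length, the midpoint, and an (hpd.low, mean, hpd.high) triple for each of the three value lists, so the module-level g_headers it references must name 2 + 3*3 = 11 columns; Example #40 later in this listing builds the same 11-value rows explicitly. A purely hypothetical header tuple matching that layout:

# hypothetical illustration only; the real g_headers is defined elsewhere
g_headers_sketch = (
        'sequence.length', 'midpoint',
        'mean.low', 'mean.mean', 'mean.high',
        'var.low', 'var.mean', 'var.high',
        'cov.low', 'cov.mean', 'cov.high')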
Example #3
def get_response_content(fs):
    f_info = ctmcmi.get_mutual_info_known_distn
    # define the R table headers
    headers = ['log.probability.ratio', 'mutual.information']
    # make the array
    arr = []
    for x in np.linspace(fs.x_min, fs.x_max, 101):
        row = [x]
        proc = evozoo.AlternatingHypercube_d_1(3)
        X = np.array([x])
        distn = proc.get_distn(X)
        Q = proc.get_rate_matrix(X)
        info = f_info(Q, distn, fs.t)
        row.append(info)
        arr.append(row)
    # create the R table string and scripts
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #4
def get_table_string_and_scripts_from_logs(
        start_stop_pairs, log_paths, nsamples):
    """
    This is for analysis of remote execution.
    """
    # build the array for the R table
    data_arr = []
    sequence_lengths = []
    midpoints = []
    for start_stop_pair, log_path in zip(
            start_stop_pairs, log_paths):
        start_pos, stop_pos = start_stop_pair
        sequence_length = stop_pos - start_pos + 1
        means, variations, covs = read_log(log_path, nsamples)
        midpoint = (start_pos + stop_pos) / 2.0
        row = [sequence_length, midpoint]
        for values in means, variations, covs:
            corr_info = mcmc.Correlation()
            corr_info.analyze(values)
            hpd_low, hpd_high = mcmc.get_hpd_interval(0.95, values)
            row.extend([hpd_low, corr_info.mean, hpd_high])
        data_arr.append(row)
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    # build the table string
    table_string = RUtil.get_table_string(data_arr, g_headers)
    # get the scripts
    scripts = get_ggplot2_scripts(nsamples, sequence_lengths, midpoints)
    # return the table string and scripts
    return table_string, scripts
Example #5
def get_table_string_and_scripts_par(start_stop_pairs, nsamples):
    """
    Local command-line multi-process only.
    """
    # define the pool of processes corresponding to the number of cores
    mypool = Pool(processes=4)
    # do the multiprocessing
    start_stop_n_triples = [(a, b, nsamples) for a, b in start_stop_pairs]
    post_pairs_list = mypool.map(forked_function, start_stop_n_triples)
    # build the array for the R table
    data_arr = []
    sequence_lengths = []
    midpoints = []
    for start_stop_pair, post_pairs in zip(start_stop_pairs, post_pairs_list):
        start_pos, stop_pos = start_stop_pair
        sequence_length = stop_pos - start_pos + 1
        midpoint = (start_pos + stop_pos) / 2.0
        row = [sequence_length, midpoint]
        for corr_info, hpd_interval in post_pairs:
            hpd_low, hpd_high = hpd_interval
            row.extend([hpd_low, corr_info.mean, hpd_high])
        data_arr.append(row)
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    # build the table string
    table_string = RUtil.get_table_string(data_arr, g_headers)
    # get the scripts
    scripts = get_ggplot2_scripts(nsamples, sequence_lengths, midpoints)
    # return the table string and scripts
    return table_string, scripts
Example #6
def get_table_string_and_scripts(start_stop_pairs, nsamples):
    """
    Command-line only.
    """
    # build the array for the R table
    data_arr = []
    sequence_lengths = []
    midpoints = []
    for start_pos, stop_pos in start_stop_pairs:
        sequence_length = stop_pos - start_pos + 1
        means, variations, covs = get_value_lists(
                start_pos, stop_pos, nsamples)
        midpoint = (start_pos + stop_pos) / 2.0
        row = [sequence_length, midpoint]
        for values in means, variations, covs:
            corr_info = mcmc.Correlation()
            corr_info.analyze(values)
            hpd_low, hpd_high = mcmc.get_hpd_interval(0.95, values)
            row.extend([hpd_low, corr_info.mean, hpd_high])
        data_arr.append(row)
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    # build the table string
    table_string = RUtil.get_table_string(data_arr, g_headers)
    # get the scripts
    scripts = get_ggplot2_scripts(nsamples, sequence_lengths, midpoints)
    # return the table string and scripts
    return table_string, scripts
Example #7
def get_response_content(fs):
    # validate and store user input
    if fs.x_max <= fs.x_min:
        raise ValueError('check the min and max logs')
    f_info = divtime.get_fisher_info_known_distn_fast
    # define the R table headers
    headers = ['log.probability.ratio', 'fisher.information']
    # make the array
    arr = []
    for x in np.linspace(fs.x_min, fs.x_max, 101):
        row = [x]
        proc = evozoo.DistinguishedCornerPairHypercube_d_1(3)
        X = np.array([x])
        distn = proc.get_distn(X)
        Q = proc.get_rate_matrix(X)
        info = f_info(Q, distn, fs.t)
        row.append(info)
        arr.append(row)
    # create the R table string and scripts
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #8
def get_latex_documentbody(fs):
    """
    This is obsolete because I am now using pure R output.
    The latex documentbody should have a bunch of tikz pieces in it.
    Each tikz piece should have been generated from R.
    """
    Q_mut, Q_sels = get_qmut_qsels(fs)
    # compute the statistics
    ER_ratios, NSR_ratios, ER_NSR_ratios = get_statistic_ratios(Q_mut, Q_sels)
    M = zip(*(ER_ratios, NSR_ratios, ER_NSR_ratios))
    column_headers = ('ER.ratio', 'NSR.ratio', 'ER.times.NSR.ratio')
    table_string = RUtil.get_table_string(M, column_headers)
    nsels = len(Q_sels)
    # define the R scripts
    scripts = []
    for name in column_headers:
        scripts.append(get_r_tikz_script(nsels, name))
    # get the tikz codes from R, for each histogram
    retcode, r_out, r_err, tikz_code_list = RUtil.run_plotter_multiple_scripts(
        table_string, scripts, 'tikz', width=3, height=2)
    if retcode:
        raise RUtil.RError(r_err)
    #
    # show some timings
    print 'R did not fail, but here is its stderr:'
    print r_err
    #
    # write the latex code
    out = StringIO()
    #print >> out, '\\pagestyle{empty}'
    for tikz_code in tikz_code_list:
        print >> out, tikz_code
    # return the latex code, consisting mainly of a bunch of tikz plots
    return out.getvalue()
Example #9
def get_latex_documentbody(fs):
    """
    This is obsolete because I am now using pure R output.
    The latex documentbody should have a bunch of tikz pieces in it.
    Each tikz piece should have been generated from R.
    """
    Q_mut, Q_sels = get_qmut_qsels(fs)
    # compute the statistics
    ER_ratios, NSR_ratios, ER_NSR_ratios = get_statistic_ratios(Q_mut, Q_sels)
    M = zip(*(ER_ratios, NSR_ratios, ER_NSR_ratios))
    column_headers = ('ER.ratio', 'NSR.ratio', 'ER.times.NSR.ratio')
    table_string = RUtil.get_table_string(M, column_headers)
    nsels = len(Q_sels)
    # define the R scripts
    scripts = []
    for name in column_headers:
        scripts.append(get_r_tikz_script(nsels, name))
    # get the tikz codes from R, for each histogram
    retcode, r_out, r_err, tikz_code_list = RUtil.run_plotter_multiple_scripts(
            table_string, scripts, 'tikz',
            width=3, height=2)
    if retcode:
        raise RUtil.RError(r_err)
    #
    # show some timings
    print 'R did not fail, but here is its stderr:'
    print r_err
    #
    # write the latex code
    out = StringIO()
    #print >> out, '\\pagestyle{empty}'
    for tikz_code in tikz_code_list:
        print >> out, tikz_code
    # return the latex code, consisting mainly of a bunch of tikz plots
    return out.getvalue()
Example #10
def get_response_content(fs):
    M, R = get_input_matrices(fs)
    # create the R table string and scripts
    headers = [
            't',
            'mi.true.mut',
            'mi.true.mutsel',
            'mi.analog.mut',
            'mi.analog.mutsel']
    npoints = 100
    t_low = 0.0
    t_high = 5.0
    t_incr = (t_high - t_low) / (npoints - 1)
    t_values = [t_low + t_incr*i for i in range(npoints)]
    # get the data for the R table
    arr = []
    for t in t_values:
        mi_mut = ctmcmi.get_mutual_information(M, t)
        mi_mutsel = ctmcmi.get_mutual_information(R, t)
        mi_analog_mut = ctmcmi.get_ll_ratio_wrong(M, t)
        mi_analog_mutsel = ctmcmi.get_ll_ratio_wrong(R, t)
        row = [t, mi_mut, mi_mutsel, mi_analog_mut, mi_analog_mutsel]
        arr.append(row)
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
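Note: the manual grid above (npoints, t_low, t_high, t_incr) produces 100 evenly spaced times from 0.0 to 5.0 with both endpoints included, which is the same grid numpy.linspace builds in one call; several other examples in this listing already use that form.

import numpy as np

# equivalent to the t_incr loop in the example above
t_values = np.linspace(0.0, 5.0, 100)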
Example #11
def get_table_string_and_scripts(stop_positions, nsamples):
    """
    Command-line only.
    """
    start_position = 1
    # build the array for the R table
    data_arr = []
    for stop_position in stop_positions:
        sequence_length = stop_position - start_position + 1
        means, variations, covs = get_value_lists(start_position,
                                                  stop_position, nsamples)
        row = [sequence_length]
        for values in means, variations, covs:
            corr_info = mcmc.Correlation()
            corr_info.analyze(values)
            hpd_low, hpd_high = mcmc.get_hpd_interval(0.95, values)
            row.extend([hpd_low, corr_info.mean, hpd_high])
        data_arr.append(row)
    # build the table string
    table_string = RUtil.get_table_string(data_arr, g_headers)
    # get the scripts
    sequence_lengths = [x - start_position + 1 for x in stop_positions]
    scripts = get_ggplot2_scripts(sequence_lengths)
    # return the table string and scripts
    return table_string, scripts
Example #12
def get_response_content(fs):
    # precompute some transition matrices
    P_drift_selection = pgmsinglesite.create_drift_selection_transition_matrix(
        fs.npop, fs.selection_ratio)
    MatrixUtil.assert_transition_matrix(P_drift_selection)
    P_mutation = pgmsinglesite.create_mutation_transition_matrix(
        fs.npop, fs.mutation_ab, fs.mutation_ba)
    MatrixUtil.assert_transition_matrix(P_mutation)
    # define the R table headers
    headers = ['generation', 'number.of.mutants']
    # compute the path samples
    P = np.dot(P_drift_selection, P_mutation)
    mypath = PathSampler.sample_endpoint_conditioned_path(
        fs.nmutants_initial, fs.nmutants_final, fs.ngenerations, P)
    arr = [[i, nmutants] for i, nmutants in enumerate(mypath)]
    # create the R table string and scripts
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #13
def get_response_content(fs):
    # validate and store user input
    if fs.x_max <= fs.x_min:
        raise ValueError('check the min and max logs')
    f_info = divtime.get_fisher_info_known_distn_fast
    # define the R table headers
    headers = ['log.probability.ratio', 'fisher.information']
    # make the array
    arr = []
    for x in np.linspace(fs.x_min, fs.x_max, 101):
        row = [x]
        proc = evozoo.DistinguishedCornerPairHypercube_d_1(3)
        X = np.array([x])
        distn = proc.get_distn(X)
        Q = proc.get_rate_matrix(X)
        info = f_info(Q, distn, fs.t)
        row.append(info)
        arr.append(row)
    # create the R table string and scripts
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #14
def get_response_content(fs):
    f_info = ctmcmi.get_mutual_info_known_distn
    # define the R table headers
    headers = ['log.probability.ratio', 'mutual.information']
    # make the array
    arr = []
    for x in np.linspace(fs.x_min, fs.x_max, 101):
        row = [x]
        proc = evozoo.AlternatingHypercube_d_1(3)
        X = np.array([x])
        distn = proc.get_distn(X)
        Q = proc.get_rate_matrix(X)
        info = f_info(Q, distn, fs.t)
        row.append(info)
        arr.append(row)
    # create the R table string and scripts
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #15
def get_table_string_and_scripts(start_stop_pairs, nsamples):
    """
    Command-line only.
    """
    # build the array for the R table
    data_arr = []
    sequence_lengths = []
    midpoints = []
    for start_pos, stop_pos in start_stop_pairs:
        sequence_length = stop_pos - start_pos + 1
        means, variations, covs = get_value_lists(start_pos, stop_pos,
                                                  nsamples)
        midpoint = (start_pos + stop_pos) / 2.0
        row = [sequence_length, midpoint]
        for values in means, variations, covs:
            corr_info = mcmc.Correlation()
            corr_info.analyze(values)
            hpd_low, hpd_high = mcmc.get_hpd_interval(0.95, values)
            row.extend([hpd_low, corr_info.mean, hpd_high])
        data_arr.append(row)
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    # build the table string
    table_string = RUtil.get_table_string(data_arr, g_headers)
    # get the scripts
    scripts = get_ggplot2_scripts(nsamples, sequence_lengths, midpoints)
    # return the table string and scripts
    return table_string, scripts
Example #16
def get_response_content(fs):
    M, R = get_input_matrices(fs)
    # create the R table string and scripts
    headers = [
        't', 'mi.true.mut', 'mi.true.mutsel', 'mi.analog.mut',
        'mi.analog.mutsel'
    ]
    npoints = 100
    t_low = 0.0
    t_high = 5.0
    t_incr = (t_high - t_low) / (npoints - 1)
    t_values = [t_low + t_incr * i for i in range(npoints)]
    # get the data for the R table
    arr = []
    for t in t_values:
        mi_mut = ctmcmi.get_mutual_information(M, t)
        mi_mutsel = ctmcmi.get_mutual_information(R, t)
        mi_analog_mut = ctmcmi.get_ll_ratio_wrong(M, t)
        mi_analog_mutsel = ctmcmi.get_ll_ratio_wrong(R, t)
        row = [t, mi_mut, mi_mutsel, mi_analog_mut, mi_analog_mutsel]
        arr.append(row)
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #17
def get_table_string_and_scripts_par(start_stop_pairs, nsamples):
    """
    Local command-line multi-process only.
    """
    # define the pool of processes corresponding to the number of cores
    mypool = Pool(processes=4)
    # do the multiprocessing
    start_stop_n_triples = [(a, b, nsamples) for a, b in start_stop_pairs]
    post_pairs_list = mypool.map(forked_function, start_stop_n_triples)
    # build the array for the R table
    data_arr = []
    sequence_lengths = []
    midpoints = []
    for start_stop_pair, post_pairs in zip(start_stop_pairs, post_pairs_list):
        start_pos, stop_pos = start_stop_pair
        sequence_length = stop_pos - start_pos + 1
        midpoint = (start_pos + stop_pos) / 2.0
        row = [sequence_length, midpoint]
        for corr_info, hpd_interval in post_pairs:
            hpd_low, hpd_high = hpd_interval
            row.extend([hpd_low, corr_info.mean, hpd_high])
        data_arr.append(row)
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    # build the table string
    table_string = RUtil.get_table_string(data_arr, g_headers)
    # get the scripts
    scripts = get_ggplot2_scripts(nsamples, sequence_lengths, midpoints)
    # return the table string and scripts
    return table_string, scripts
Example #18
def get_table_string_and_scripts(stop_positions, nsamples):
    """
    Command-line only.
    """
    start_position = 1
    # build the array for the R table
    data_arr = []
    for stop_position in stop_positions:
        sequence_length = stop_position - start_position + 1
        means, variations, covs = get_value_lists(
                start_position, stop_position, nsamples)
        row = [sequence_length]
        for values in means, variations, covs:
            corr_info = mcmc.Correlation()
            corr_info.analyze(values)
            hpd_low, hpd_high = mcmc.get_hpd_interval(0.95, values)
            row.extend([hpd_low, corr_info.mean, hpd_high])
        data_arr.append(row)
    # build the table string
    table_string = RUtil.get_table_string(data_arr, g_headers)
    # get the scripts
    sequence_lengths = [x - start_position + 1 for x in stop_positions]
    scripts = get_ggplot2_scripts(sequence_lengths)
    # return the table string and scripts
    return table_string, scripts
Example #19
def get_response_content(fs):
    # precompute some transition matrices
    P_drift_selection = pgmsinglesite.create_drift_selection_transition_matrix(
            fs.npop, fs.selection_ratio)
    MatrixUtil.assert_transition_matrix(P_drift_selection)
    P_mutation = pgmsinglesite.create_mutation_transition_matrix(
            fs.npop, fs.mutation_ab, fs.mutation_ba)
    MatrixUtil.assert_transition_matrix(P_mutation)
    # define the R table headers
    headers = ['generation', 'number.of.mutants']
    # compute the path samples
    P = np.dot(P_drift_selection, P_mutation)
    mypath = PathSampler.sample_endpoint_conditioned_path(
            fs.nmutants_initial, fs.nmutants_final, fs.ngenerations, P)
    arr = [[i, nmutants] for i, nmutants in enumerate(mypath)]
    # create the R table string and scripts
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #20
def get_table_string_and_scripts(fs):
    nstates = fs.nresidues**fs.nsites
    if nstates > 256:
        raise ValueError('the mutation rate matrix is too big')
    # get the mutation matrix
    Q_mut = mrate.get_sparse_sequence_rate_matrix(fs.nresidues, fs.nsites)
    # sample a bunch of mutation-selection rate matrices
    Q_sels = []
    for selection_index in range(fs.nselections):
        # sample the selection parameters
        if fs.low_var:
            v = 0.2
        elif fs.medium_var:
            v = 1
        elif fs.high_var:
            v = 5.0
        elif fs.really_high_var:
            v = 25.0
        s = math.sqrt(v)
        if fs.neg_skew:
            sels = [-random.expovariate(1 / s) for i in range(nstates)]
        elif fs.no_skew:
            sels = [random.gauss(0, s) for i in range(nstates)]
        elif fs.pos_skew:
            sels = [random.expovariate(1 / s) for i in range(nstates)]
        # define the mutation-selection rate matrix using Halpern-Bruno
        Q = np.zeros_like(Q_mut)
        for i in range(nstates):
            for j in range(nstates):
                if i != j:
                    tau = math.exp(-(sels[j] - sels[i]))
                    coeff = math.log(tau) / (1 - 1 / tau)
                    Q[i, j] = Q_mut[i, j] * coeff
        for i in range(nstates):
            Q[i, i] = -np.sum(Q[i])
        Q_sels.append(Q)
    # define the time points
    incr = (fs.t_high - fs.t_low) / (fs.ntimes - 1)
    times = [fs.t_low + i * incr for i in range(fs.ntimes)]
    # compute the statistics
    nsels = len(Q_sels)
    pairs = [get_time_point_summary(Q_mut, Q_sels, t) for t in times]
    mi_sign_lists, time_stats = zip(*pairs)
    ncrossing_list = []
    # look at how the signs change over time for each selection sample
    for signs in zip(*mi_sign_lists):
        count = 0
        for sign_a, sign_b in iterutils.pairwise(signs):
            if sign_a != sign_b:
                count += 1
        ncrossing_list.append(count)
    # get the R scripts
    scripts = [
        get_r_band_script(nsels, time_stats),
        get_r_prop_script(nsels, time_stats),
        get_r_cross_script(ncrossing_list)
    ]
    table_string = RUtil.get_table_string(time_stats, g_time_stats_headers)
    return table_string, scripts
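Note: the Halpern-Bruno coefficient above, math.log(tau) / (1 - 1/tau) with tau = exp(-(sels[j] - sels[i])), simplifies to d / (exp(d) - 1) where d = sels[j] - sels[i]; it tends to 1 as d approaches 0, and the expression as written divides by zero when sels[j] equals sels[i] exactly. A small numerically safer alternative (a sketch, not the project's code):

import math

def hb_coeff(d):
    # d = sels[j] - sels[i]; returns d / (exp(d) - 1), using the d -> 0 limit of 1.0
    if abs(d) < 1e-12:
        return 1.0
    return d / math.expm1(d)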
Example #21
def get_table_string_and_scripts(fs):
    nstates = fs.nresidues ** fs.nsites
    if nstates > 256:
        raise ValueError('the mutation rate matrix is too big')
    # get the mutation matrix
    Q_mut = mrate.get_sparse_sequence_rate_matrix(fs.nresidues, fs.nsites)
    # sample a bunch of mutation-selection rate matrices
    Q_sels = []
    for selection_index in range(fs.nselections):
        # sample the selection parameters
        if fs.low_var:
            v = 0.2
        elif fs.medium_var:
            v = 1
        elif fs.high_var:
            v = 5.0
        elif fs.really_high_var:
            v = 25.0
        s = math.sqrt(v)
        if fs.neg_skew:
            sels = [-random.expovariate(1/s) for i in range(nstates)]
        elif fs.no_skew:
            sels = [random.gauss(0, s) for i in range(nstates)]
        elif fs.pos_skew:
            sels = [random.expovariate(1/s) for i in range(nstates)]
        # define the mutation-selection rate matrix using Halpern-Bruno
        Q = np.zeros_like(Q_mut)
        for i in range(nstates):
            for j in range(nstates):
                if i != j:
                    tau = math.exp(-(sels[j] - sels[i]))
                    coeff = math.log(tau) / (1 - 1/tau)
                    Q[i, j] = Q_mut[i, j] * coeff
        for i in range(nstates):
            Q[i, i] = -np.sum(Q[i])
        Q_sels.append(Q)
    # define the time points
    incr = (fs.t_high - fs.t_low) / (fs.ntimes - 1)
    times = [fs.t_low + i*incr for i in range(fs.ntimes)]
    # compute the statistics
    nsels = len(Q_sels)
    pairs = [get_time_point_summary(Q_mut, Q_sels, t) for t in times]
    mi_sign_lists, time_stats = zip(*pairs)
    ncrossing_list = []
    # look at how the signs change over time for each selection sample
    for signs in zip(*mi_sign_lists):
        count = 0
        for sign_a, sign_b in iterutils.pairwise(signs):
            if sign_a != sign_b:
                count += 1
        ncrossing_list.append(count)
    # get the R scripts
    scripts = [
            get_r_band_script(nsels, time_stats),
            get_r_prop_script(nsels, time_stats),
            get_r_cross_script(ncrossing_list)]
    table_string = RUtil.get_table_string(time_stats, g_time_stats_headers)
    return table_string, scripts
Example #22
def get_response_content(fs):
    f_info = divtime.get_fisher_info_known_distn_fast
    requested_triples = []
    for triple in g_process_triples:
        name, desc, zoo_obj = triple
        if getattr(fs, name):
            requested_triples.append(triple)
    if not requested_triples:
        raise ValueError('nothing to plot')
    # define the R table headers
    r_names = [a.replace('_', '.') for a, b, c in requested_triples]
    headers = ['t'] + r_names
    # Spend a lot of time doing the optimizations
    # to construct the points for the R table.
    arr = []
    for t in cbreaker.throttled(progrid.gen_binary(fs.start_time,
                                                   fs.stop_time),
                                nseconds=5,
                                ncount=200):
        row = [t]
        for python_name, desc, zoo_class in requested_triples:
            zoo_obj = zoo_class(fs.d)
            df = zoo_obj.get_df()
            opt_dep = OptDep(zoo_obj, t, f_info)
            if df:
                X0 = np.random.randn(df)
                xopt = scipy.optimize.fmin(opt_dep,
                                           X0,
                                           maxiter=10000,
                                           maxfun=10000)
                # I would like to use scipy.optimize.minimize
                # except that this requires a newer version of
                # scipy than is packaged for ubuntu right now.
                # fmin_bfgs seems to have problems sometimes
                # either hanging or maxiter=10K is too big.
                """
                xopt = scipy.optimize.fmin_bfgs(opt_dep, X0,
                        gtol=1e-8, maxiter=10000)
                """
            else:
                xopt = np.array([])
            info_value = -opt_dep(xopt)
            row.append(info_value)
        arr.append(row)
    arr.sort()
    npoints = len(arr)
    # create the R table string and scripts
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
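Note: the comment block above mentions scipy.optimize.minimize as the preferred interface once a recent enough SciPy is available. Under that assumption, the fmin call could be written as below, reusing the opt_dep callable and X0 starting point from the example; Nelder-Mead is the simplex method that fmin wraps.

import scipy.optimize

# opt_dep and X0 are as defined in the example above
res = scipy.optimize.minimize(
        opt_dep, X0, method='Nelder-Mead',
        options={'maxiter': 10000, 'maxfev': 10000})
xopt = res.x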
Example #23
def get_response_content(fs):
    # legend labels
    label_a = 'N=%d mu=%f' % (fs.nstates_a, fs.mu_a)
    label_b = 'N=%d mu=%f' % (fs.nstates_b, fs.mu_b)
    arr, headers = make_table(fs)
    # compute the max value
    ymax = math.log(max(fs.nstates_a, fs.nstates_b))
    nfifths = int(math.floor(ymax * 5.0)) + 1
    ylim = RUtil.mk_call_str('c', 0, 0.2 * nfifths)
    # write the R script body
    out = StringIO()
    print >> out, RUtil.mk_call_str(
            'plot',
            'my.table$t',
            'my.table$alpha',
            type='"n"',
            ylim=ylim,
            xlab='"time"',
            ylab='"information"',
            main='"comparison of an information criterion for two processes"',
            )
    # draw some horizontal lines
    for i in range(nfifths+1):
        print >> out, RUtil.mk_call_str(
                'abline',
                h=0.2*i,
                col='"lightgray"',
                lty='"dotted"')
    colors = ('darkblue', 'darkred')
    for c, header in zip(colors, headers[1:]):
        print >> out, RUtil.mk_call_str(
                'lines',
                'my.table$t',
                'my.table$%s' % header,
                col='"%s"' % c,
                )
    legend_names = (label_a, label_b)
    legend_name_str = 'c(' + ', '.join('"%s"' % s for s in legend_names) + ')'
    legend_col_str = 'c(' + ', '.join('"%s"' % s for s in colors) + ')'
    legend_lty_str = 'c(' + ', '.join('1' for s in colors) + ')'
    print >> out, RUtil.mk_call_str(
            'legend',
            '"%s"' % fs.legend_placement,
            legend_name_str,
            col=legend_col_str,
            lty=legend_lty_str,
            )
    script_body = out.getvalue()
    # create the R plot image
    table_string = RUtil.get_table_string(arr, headers)
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script_body, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
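Note: RUtil.mk_call_str is used in this and several later examples to assemble R calls such as plot(...), abline(...), lines(...) and legend(...) as strings. Its implementation is not shown in this listing; the sketch below only illustrates the assumed behavior of joining positional and keyword arguments into a single R call, with a hypothetical name.

def sketch_mk_call_str(name, *args, **kwargs):
    # assumption: positional arguments first, then keyword arguments as name=value
    parts = [str(a) for a in args]
    parts.extend('%s=%s' % (k, v) for k, v in sorted(kwargs.items()))
    return '%s(%s)' % (name, ', '.join(parts))

# e.g. sketch_mk_call_str('abline', h=0.2, col='"lightgray"', lty='"dotted"')
# -> 'abline(col="lightgray", h=0.2, lty="dotted")'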
Example #24
def get_response_content(fs):
    # precompute some transition matrices
    P_drift_selection = pgmsinglesite.create_drift_selection_transition_matrix(
            fs.npop, fs.selection_ratio)
    MatrixUtil.assert_transition_matrix(P_drift_selection)
    P_mutation = pgmsinglesite.create_mutation_transition_matrix(
            fs.npop, fs.mutation_ab, fs.mutation_ba)
    MatrixUtil.assert_transition_matrix(P_mutation)
    # define the R table headers
    headers = [
            'generation',
            'number.of.mutants',
            'probability',
            'log.prob',
            ]
    # compute the transition matrix
    P = np.dot(P_drift_selection, P_mutation)
    # Compute the endpoint conditional probabilities for various states
    # along the unobserved path.
    nstates = fs.npop + 1
    M = np.zeros((nstates, fs.ngenerations))
    M[fs.nmutants_initial, 0] = 1.0
    M[fs.nmutants_final, fs.ngenerations-1] = 1.0
    for i in range(fs.ngenerations-2):
        A_exponent = i + 1
        B_exponent = fs.ngenerations - 1 - A_exponent
        A = np.linalg.matrix_power(P, A_exponent)
        B = np.linalg.matrix_power(P, B_exponent)
        weights = np.zeros(nstates)
        for k in range(nstates):
            weights[k] = A[fs.nmutants_initial, k] * B[k, fs.nmutants_final]
        weights /= np.sum(weights)
        for k, p in enumerate(weights):
            M[k, i+1] = p
    arr = []
    for g in range(fs.ngenerations):
        for k in range(nstates):
            p = M[k, g]
            if p:
                logp = math.log(p)
            else:
                logp = float('-inf')
            row = [g, k, p, logp]
            arr.append(row)
    # create the R table string and scripts
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
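Note: the inner loop over k above builds the endpoint-conditioned distribution over states at an interior generation from two matrix powers; the same computation can be written in vectorized form as the product of one row of A and one column of B, normalized to sum to one. A self-contained sketch mirroring the example's logic:

import numpy as np

def endpoint_conditioned_column(P, a, b, i, ngenerations):
    # distribution over states at generation i+1, given state a at generation 0
    # and state b at generation ngenerations-1
    A = np.linalg.matrix_power(P, i + 1)
    B = np.linalg.matrix_power(P, ngenerations - 2 - i)
    weights = A[a, :] * B[:, b]
    return weights / weights.sum()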
Example #25
def get_table_string_and_scripts(fs):
    """
    The latex documentbody should have a bunch of tikz pieces in it.
    Each tikz piece should have been generated from R.
    """
    nstates = fs.nresidues ** fs.nsites
    if nstates > 256:
        raise ValueError("the mutation rate matrix is too big")
    # get the mutation matrix
    Q_mut = mrate.get_sparse_sequence_rate_matrix(fs.nresidues, fs.nsites)
    # sample a bunch of mutation-selection rate matrices
    Q_sels = []
    for selection_index in range(fs.nselections):
        # sample the selection parameters
        if fs.low_var:
            v = 0.2
        elif fs.medium_var:
            v = 1
        elif fs.high_var:
            v = 5.0
        elif fs.really_high_var:
            v = 25.0
        s = math.sqrt(v)
        if fs.neg_skew:
            sels = [-random.expovariate(1 / s) for i in range(nstates)]
        elif fs.no_skew:
            sels = [random.gauss(0, s) for i in range(nstates)]
        elif fs.pos_skew:
            sels = [random.expovariate(1 / s) for i in range(nstates)]
        # define the mutation-selection rate matrix using Halpern-Bruno
        Q = np.zeros_like(Q_mut)
        for i in range(nstates):
            for j in range(nstates):
                if i != j:
                    tau = math.exp(-(sels[j] - sels[i]))
                    coeff = math.log(tau) / (1 - 1 / tau)
                    Q[i, j] = Q_mut[i, j] * coeff
        for i in range(nstates):
            Q[i, i] = -np.sum(Q[i])
        Q_sels.append(Q)
    # define the time points
    incr = (fs.t_high - fs.t_low) / (fs.ntimes - 1)
    times = [fs.t_low + i * incr for i in range(fs.ntimes)]
    # compute the statistics
    nsels = len(Q_sels)
    time_stats = [get_time_point_summary(Q_mut, Q_sels, t) for t in times]
    # get the R scripts
    scripts = [
        # get_r_tikz_mi_plot(nsels, time_stats),
        get_r_tikz_corr_plot(nsels, time_stats),
        get_r_tikz_prop_plot(nsels, time_stats),
        get_r_tikz_info_plot(nsels, time_stats),
    ]
    table_string = RUtil.get_table_string(time_stats, g_time_stats_headers)
    return table_string, scripts
Example #26
def get_response_content(fs):
    # legend labels
    label_a = 'N=%d mu=%f' % (fs.nstates_a, fs.mu_a)
    label_b = 'N=%d mu=%f' % (fs.nstates_b, fs.mu_b)
    arr, headers = make_table(fs)
    # compute the max value
    ymax = math.log(max(fs.nstates_a, fs.nstates_b))
    nfifths = int(math.floor(ymax * 5.0)) + 1
    ylim = RUtil.mk_call_str('c', 0, 0.2 * nfifths)
    # write the R script body
    out = StringIO()
    print >> out, RUtil.mk_call_str(
        'plot',
        'my.table$t',
        'my.table$alpha',
        type='"n"',
        ylim=ylim,
        xlab='"time"',
        ylab='"information"',
        main='"comparison of an information criterion for two processes"',
    )
    # draw some horizontal lines
    for i in range(nfifths + 1):
        print >> out, RUtil.mk_call_str('abline',
                                        h=0.2 * i,
                                        col='"lightgray"',
                                        lty='"dotted"')
    colors = ('darkblue', 'darkred')
    for c, header in zip(colors, headers[1:]):
        print >> out, RUtil.mk_call_str(
            'lines',
            'my.table$t',
            'my.table$%s' % header,
            col='"%s"' % c,
        )
    legend_names = (label_a, label_b)
    legend_name_str = 'c(' + ', '.join('"%s"' % s for s in legend_names) + ')'
    legend_col_str = 'c(' + ', '.join('"%s"' % s for s in colors) + ')'
    legend_lty_str = 'c(' + ', '.join('1' for s in colors) + ')'
    print >> out, RUtil.mk_call_str(
        'legend',
        '"%s"' % fs.legend_placement,
        legend_name_str,
        col=legend_col_str,
        lty=legend_lty_str,
    )
    script_body = out.getvalue()
    # create the R plot image
    table_string = RUtil.get_table_string(arr, headers)
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, script_body, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #27
def get_response_content(fs):
    distn_modes = [x for x in g_ordered_modes if x in fs.distribution]
    if not distn_modes:
        raise ValueError('no distribution mode was specified')
    colors = [g_mode_to_color[m] for m in distn_modes]
    arr, headers = make_table(fs, distn_modes)
    distn_headers = headers[1:]
    # Get the largest value in the array,
    # skipping the first column.
    arrmax = np.max(arr[:, 1:])
    # write the R script body
    out = StringIO()
    ylim = RUtil.mk_call_str('c', 0, arrmax + 0.1)
    sel_str = {
        BALANCED: 'balanced',
        HALPERN_BRUNO: 'Halpern-Bruno',
    }[fs.selection]
    print >> out, RUtil.mk_call_str(
        'plot',
        'my.table$t',
        'my.table$%s' % distn_headers[0],
        type='"n"',
        ylim=ylim,
        xlab='""',
        ylab='"relaxation time"',
        main='"Effect of selection (%s) on relaxation time for %d states"' %
        (sel_str, fs.nstates),
    )
    for c, header in zip(colors, distn_headers):
        print >> out, RUtil.mk_call_str(
            'lines',
            'my.table$t',
            'my.table$%s' % header,
            col='"%s"' % c,
        )
    mode_names = [s.replace('_', ' ') for s in distn_modes]
    legend_name_str = 'c(' + ', '.join('"%s"' % s for s in mode_names) + ')'
    legend_col_str = 'c(' + ', '.join('"%s"' % s for s in colors) + ')'
    legend_lty_str = 'c(' + ', '.join(['1'] * len(distn_modes)) + ')'
    print >> out, RUtil.mk_call_str(
        'legend',
        '"%s"' % fs.legend_placement,
        legend_name_str,
        col=legend_col_str,
        lty=legend_lty_str,
    )
    script_body = out.getvalue()
    # create the R plot image
    table_string = RUtil.get_table_string(arr, headers)
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, script_body, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #28
def get_response_content(fs):
    f_info = divtime.get_fisher_info_known_distn_fast
    requested_triples = []
    for triple in g_process_triples:
        name, desc, zoo_obj = triple
        if getattr(fs, name):
            requested_triples.append(triple)
    if not requested_triples:
        raise ValueError('nothing to plot')
    # define the R table headers
    r_names = [a.replace('_', '.') for a, b, c in requested_triples]
    headers = ['t'] + r_names
    # Spend a lot of time doing the optimizations
    # to construct the points for the R table.
    arr = []
    for t in cbreaker.throttled(
            progrid.gen_binary(fs.start_time, fs.stop_time),
            nseconds=5, ncount=200):
        row = [t]
        for python_name, desc, zoo_class in requested_triples:
            zoo_obj = zoo_class(fs.d)
            df = zoo_obj.get_df()
            opt_dep = OptDep(zoo_obj, t, f_info)
            if df:
                X0 = np.random.randn(df)
                xopt = scipy.optimize.fmin(
                        opt_dep, X0, maxiter=10000, maxfun=10000)
                # I would like to use scipy.optimize.minimize
                # except that this requires a newer version of
                # scipy than is packaged for ubuntu right now.
                # fmin_bfgs seems to have problems sometimes
                # either hanging or maxiter=10K is too big.
                """
                xopt = scipy.optimize.fmin_bfgs(opt_dep, X0,
                        gtol=1e-8, maxiter=10000)
                """
            else:
                xopt = np.array([])
            info_value = -opt_dep(xopt)
            row.append(info_value)
        arr.append(row)
    arr.sort()
    npoints = len(arr)
    # create the R table string and scripts
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #29
def get_response_content(fs):
    distn_modes = [x for x in g_ordered_modes if x in fs.distribution]
    if not distn_modes:
        raise ValueError('no distribution mode was specified')
    colors = [g_mode_to_color[m] for m in distn_modes]
    arr, headers = make_table(fs, distn_modes)
    distn_headers = headers[1:]
    # Get the largest value in the array,
    # skipping the first column.
    arrmax = np.max(arr[:,1:])
    # write the R script body
    out = StringIO()
    ylim = RUtil.mk_call_str('c', 0, arrmax + 0.1)
    sel_str = {
            BALANCED : 'balanced',
            HALPERN_BRUNO : 'Halpern-Bruno',
            }[fs.selection]
    print >> out, RUtil.mk_call_str(
            'plot',
            'my.table$t',
            'my.table$%s' % distn_headers[0],
            type='"n"',
            ylim=ylim,
            xlab='""',
            ylab='"relaxation time"',
            main='"Effect of selection (%s) on relaxation time for %d states"' % (sel_str, fs.nstates),
            )
    for c, header in zip(colors, distn_headers):
        print >> out, RUtil.mk_call_str(
                'lines',
                'my.table$t',
                'my.table$%s' % header,
                col='"%s"' % c,
                )
    mode_names = [s.replace('_', ' ') for s in distn_modes]
    legend_name_str = 'c(' + ', '.join('"%s"' % s for s in mode_names) + ')'
    legend_col_str = 'c(' + ', '.join('"%s"' % s for s in colors) + ')'
    legend_lty_str = 'c(' + ', '.join(['1']*len(distn_modes)) + ')'
    print >> out, RUtil.mk_call_str(
            'legend',
            '"%s"' % fs.legend_placement,
            legend_name_str,
            col=legend_col_str,
            lty=legend_lty_str,
            )
    script_body = out.getvalue()
    # create the R plot image
    table_string = RUtil.get_table_string(arr, headers)
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script_body, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #30
def get_table_string_and_scripts(fs):
    """
    The latex documentbody should have a bunch of tikz pieces in it.
    Each tikz piece should have been generated from R.
    """
    nstates = fs.nresidues ** fs.nsites
    if nstates > 256:
        raise ValueError('the mutation rate matrix is too big')
    # get the mutation matrix
    Q_mut = mrate.get_sparse_sequence_rate_matrix(fs.nresidues, fs.nsites)
    # sample a bunch of mutation-selection rate matrices
    Q_sels = []
    for selection_index in range(fs.nselections):
        # sample the selection parameters
        if fs.low_var:
            v = 0.2
        elif fs.medium_var:
            v = 1
        elif fs.high_var:
            v = 5.0
        elif fs.really_high_var:
            v = 25.0
        s = math.sqrt(v)
        if fs.neg_skew:
            sels = [-random.expovariate(1/s) for i in range(nstates)]
        elif fs.no_skew:
            sels = [random.gauss(0, s) for i in range(nstates)]
        elif fs.pos_skew:
            sels = [random.expovariate(1/s) for i in range(nstates)]
        # define the mutation-selection rate matrix using Halpern-Bruno
        Q = np.zeros_like(Q_mut)
        for i in range(nstates):
            for j in range(nstates):
                if i != j:
                    tau = math.exp(-(sels[j] - sels[i]))
                    coeff = math.log(tau) / (1 - 1/tau)
                    Q[i, j] = Q_mut[i, j] * coeff
        for i in range(nstates):
            Q[i, i] = -np.sum(Q[i])
        Q_sels.append(Q)
    # define the time points
    incr = (fs.t_high - fs.t_low) / (fs.ntimes - 1)
    times = [fs.t_low + i*incr for i in range(fs.ntimes)]
    # compute the statistics
    nsels = len(Q_sels)
    time_stats = [get_time_point_summary(Q_mut, Q_sels, t) for t in times]
    # get the R scripts
    scripts = [
            #get_r_tikz_mi_plot(nsels, time_stats),
            get_r_tikz_corr_plot(nsels, time_stats),
            get_r_tikz_prop_plot(nsels, time_stats),
            get_r_tikz_info_plot(nsels, time_stats)]
    table_string = RUtil.get_table_string(time_stats, g_time_stats_headers)
    return table_string, scripts
Example #31
def get_response_content(fs):
    f_info = ctmcmi.get_mutual_info_known_distn
    requested_triples = []
    for triple in g_process_triples:
        name, desc, zoo_obj = triple
        if getattr(fs, name):
            requested_triples.append(triple)
    if not requested_triples:
        raise ValueError('nothing to plot')
    # define the R table headers
    headers = ['t']
    if fs.log4:
        headers.append('log.4')
    if fs.log3:
        headers.append('log.3')
    r_names = [a.replace('_', '.') for a, b, c in requested_triples]
    headers.extend(r_names)
    # Spend a lot of time doing the optimizations
    # to construct the points for the R table.
    times = np.linspace(fs.start_time, fs.stop_time, 101)
    arr = []
    for t in times:
        row = [t]
        if fs.log4:
            row.append(math.log(4))
        if fs.log3:
            row.append(math.log(3))
        for python_name, desc, zoo_obj in requested_triples:
            X = np.array([])
            info_value = f_info(
                    zoo_obj.get_rate_matrix(X),
                    zoo_obj.get_distn(X),
                    t)
            row.append(info_value)
        arr.append(row)
    # create the R table string and scripts
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #32
def get_response_content(fs):
    Q_mut, Q_sels = get_qmut_qsels(fs)
    # compute the statistics
    ER_ratios, NSR_ratios, ER_NSR_ratios = get_statistic_ratios(Q_mut, Q_sels)
    M = zip(*(ER_ratios, NSR_ratios, ER_NSR_ratios))
    column_headers = ('ER.ratio', 'NSR.ratio', 'ER.times.NSR.ratio')
    table_string = RUtil.get_table_string(M, column_headers)
    nsels = len(Q_sels)
    # get the R script
    comboscript = get_r_comboscript(nsels, column_headers)
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, comboscript, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #33
def get_response_content(fs):
    Q_mut, Q_sels = get_qmut_qsels(fs)
    # compute the statistics
    ER_ratios, NSR_ratios, ER_NSR_ratios = get_statistic_ratios(Q_mut, Q_sels)
    M = zip(*(ER_ratios, NSR_ratios, ER_NSR_ratios))
    column_headers = ('ER.ratio', 'NSR.ratio', 'ER.times.NSR.ratio')
    table_string = RUtil.get_table_string(M, column_headers)
    nsels = len(Q_sels)
    # get the R script
    comboscript = get_r_comboscript(nsels, column_headers)
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, comboscript, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #34
def get_response_content(fs):
    f_info = ctmcmi.get_mutual_info_known_distn
    requested_triples = []
    for triple in g_process_triples:
        name, desc, zoo_obj = triple
        if getattr(fs, name):
            requested_triples.append(triple)
    if not requested_triples:
        raise ValueError('nothing to plot')
    # define the R table headers
    headers = ['t']
    if fs.log4:
        headers.append('log.4')
    if fs.log3:
        headers.append('log.3')
    r_names = [a.replace('_', '.') for a, b, c in requested_triples]
    headers.extend(r_names)
    # Spend a lot of time doing the optimizations
    # to construct the points for the R table.
    times = np.linspace(fs.start_time, fs.stop_time, 101)
    arr = []
    for t in times:
        row = [t]
        if fs.log4:
            row.append(math.log(4))
        if fs.log3:
            row.append(math.log(3))
        for python_name, desc, zoo_obj in requested_triples:
            X = np.array([])
            info_value = f_info(zoo_obj.get_rate_matrix(X),
                                zoo_obj.get_distn(X), t)
            row.append(info_value)
        arr.append(row)
    # create the R table string and scripts
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #35
def get_response_content(fs):
    # create the R table string and scripts
    headers = [
            'entropy',
            'analog']
    distributions = []
    nstates = 4
    npoints = 5000
    arr = []
    best_pair = None
    for i in range(npoints):
        weights = [random.expovariate(1) for j in range(nstates)]
        total = sum(weights)
        distn = [x / total for x in weights]
        entropy = -sum(p * math.log(p) for p in distn)
        sum_squares = sum(p*p for p in distn)
        sum_cubes = sum(p*p*p for p in distn)
        analog = math.log(sum_squares / sum_cubes)
        row = [entropy, analog]
        arr.append(row)
        dist = (entropy - 1.0)**2 + (analog - 0.4)**2
        if (best_pair is None) or (dist < best_pair[0]):
            best_pair = (dist, distn)
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    out = StringIO()
    title = ', '.join(str(x) for x in best_pair[1])
    print >> out, RUtil.mk_call_str(
            'plot',
            'my.table$entropy',
            'my.table$analog',
            pch='20',
            main='"%s"' % title)
    script = out.getvalue()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #36
def get_response_content(fs):
    distn_modes = [x for x in g_ordered_modes if x in fs.distribution]
    if not distn_modes:
        raise ValueError("no distribution mode was specified")
    colors = [g_mode_to_color[m] for m in distn_modes]
    arr, headers = make_table(fs, distn_modes)
    distn_headers = headers[1:]
    # Get the largest value in the array,
    # skipping the first column.
    arrmax = np.max(arr[:, 1:])
    # write the R script body
    out = StringIO()
    ylim = RUtil.mk_call_str("c", 0, arrmax + 0.1)
    sel_str = {BALANCED: "f=1/2", HALPERN_BRUNO: "Halpern-Bruno"}[fs.selection]
    print >> out, RUtil.mk_call_str(
        "plot",
        "my.table$t",
        "my.table$%s" % distn_headers[0],
        type='"n"',
        ylim=ylim,
        xlab='"time"',
        ylab='"expected log L-ratio"',
        main='"Effect of selection (%s) on log L-ratio for %d states"' % (sel_str, fs.nstates),
    )
    for c, header in zip(colors, distn_headers):
        print >> out, RUtil.mk_call_str("lines", "my.table$t", "my.table$%s" % header, col='"%s"' % c)
    mode_names = [s.replace("_", " ") for s in distn_modes]
    legend_name_str = "c(" + ", ".join('"%s"' % s for s in mode_names) + ")"
    legend_col_str = "c(" + ", ".join('"%s"' % s for s in colors) + ")"
    legend_lty_str = "c(" + ", ".join(["1"] * len(distn_modes)) + ")"
    print >> out, RUtil.mk_call_str(
        "legend", '"%s"' % fs.legend_placement, legend_name_str, col=legend_col_str, lty=legend_lty_str
    )
    script_body = out.getvalue()
    # create the R plot image
    table_string = RUtil.get_table_string(arr, headers)
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(table_string, script_body, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #37
def get_response_content(fs):
    # unpack some options
    npoints = fs.npoints
    stddev = fs.stddev
    # define the data rows and the headers
    if fs.add_labels:
        headers = ('x', 'y', 'label')
        data_rows = list(SpiralSampler.gen_labeled_points(npoints, stddev))
    else:
        headers = ('x', 'y')
        data_rows = list(SpiralSampler.gen_points(npoints, stddev))
    # begin the response
    if fs.raw:
        lines = []
        for data_row in data_rows:
            line = '\t'.join(str(x) for x in data_row)
            lines.append(line)
        response_text = '\n'.join(lines)
    elif fs.table:
        response_text = RUtil.get_table_string(data_rows, headers)
    # return the response
    return response_text
Example #38
def get_response_content(fs):
    # create the R table string and scripts
    headers = ['entropy', 'analog']
    distributions = []
    nstates = 4
    npoints = 5000
    arr = []
    best_pair = None
    for i in range(npoints):
        weights = [random.expovariate(1) for j in range(nstates)]
        total = sum(weights)
        distn = [x / total for x in weights]
        entropy = -sum(p * math.log(p) for p in distn)
        sum_squares = sum(p * p for p in distn)
        sum_cubes = sum(p * p * p for p in distn)
        analog = math.log(sum_squares / sum_cubes)
        row = [entropy, analog]
        arr.append(row)
        dist = (entropy - 1.0)**2 + (analog - 0.4)**2
        if (best_pair is None) or (dist < best_pair[0]):
            best_pair = (dist, distn)
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    out = StringIO()
    title = ', '.join(str(x) for x in best_pair[1])
    print >> out, RUtil.mk_call_str('plot',
                                    'my.table$entropy',
                                    'my.table$analog',
                                    pch='20',
                                    main='"%s"' % title)
    script = out.getvalue()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #39
def get_response_content(fs):
    # unpack some options
    npoints = fs.npoints
    stddev = fs.stddev
    # define the data rows and the headers
    if fs.add_labels:
        headers = ('x', 'y', 'label')
        data_rows = list(SpiralSampler.gen_labeled_points(npoints, stddev))
    else:
        headers = ('x', 'y')
        data_rows = list(SpiralSampler.gen_points(npoints, stddev))
    # begin the response
    if fs.raw:
        lines = []
        for data_row in data_rows:
            line = '\t'.join(str(x) for x in data_row)
            lines.append(line)
        response_text = '\n'.join(lines)
    elif fs.table:
        response_text = RUtil.get_table_string(data_rows, headers)
    # return the response
    return response_text
Example #40
0
def get_table_string_and_scripts(start_stop_pairs, nsamples, header_seq_pairs):
    """
    Command-line only.
    """
    # build the array for the R table
    data_arr = []
    sequence_lengths = []
    midpoints = []
    for start_pos, stop_pos in start_stop_pairs:
        sequence_length = stop_pos - start_pos + 1
        midpoint = (start_pos + stop_pos) / 2.0
        arr = get_loganalysis_array(
                start_pos, stop_pos, nsamples, header_seq_pairs)
        mean_low = arr[1][4]
        mean_mean = arr[1][1]
        mean_high = arr[1][5]
        var_low = arr[2][4]
        var_mean = arr[2][1]
        var_high = arr[2][5]
        cov_low = arr[3][4]
        cov_mean = arr[3][1]
        cov_high = arr[3][5]
        row = [
                sequence_length, midpoint,
                mean_low, mean_mean, mean_high,
                var_low, var_mean, var_high,
                cov_low, cov_mean, cov_high]
        data_arr.append(row)
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    # build the table string
    table_string = RUtil.get_table_string(data_arr, g_headers)
    # get the scripts
    scripts = beasttut.get_ggplot2_scripts(
            nsamples, sequence_lengths, midpoints)
    # return the table string and scripts
    return table_string, scripts
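The hard-coded subscripts above (arr[1][4], arr[1][1], arr[1][5], and so on) imply that rows 1, 2, 3 of the loganalysis array hold the mean-rate, variation, and covariance statistics and that columns 1, 4, 5 hold the posterior mean and the lower and upper HPD bounds. That layout is an assumption read off from the indexing; naming the indices makes the row construction easier to audit.
# assumed layout of the loganalysis summary array
MEAN_ROW, VAR_ROW, COV_ROW = 1, 2, 3
MEAN_COL, HPD_LOW_COL, HPD_HIGH_COL = 1, 4, 5
def summarize_stat(arr, row_index):
    # (hpd.low, mean, hpd.high) for one statistic row,
    # matching the ordering used in the data rows above
    return (arr[row_index][HPD_LOW_COL],
            arr[row_index][MEAN_COL],
            arr[row_index][HPD_HIGH_COL])
def build_row(sequence_length, midpoint, arr):
    row = [sequence_length, midpoint]
    for row_index in (MEAN_ROW, VAR_ROW, COV_ROW):
        row.extend(summarize_stat(arr, row_index))
    return row
# usage with a made-up 4x7 loganalysis summary array
arr = [[i + 0.1 * j for j in range(7)] for i in range(4)]
print(build_row(500, 250.5, arr))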
Example #41
0
def get_response_content(fs):
    # create the R table string and scripts
    headers = [
            'z',
            'c.neg2.0',
            'c.neg0.5',
            'c.0.5',
            'c.2.0',
            #'c.a',
            #'c.b',
            #'c.c',
            #'c.d',
            ]
    #C = numpy.array([-0.5, -0.2, 0.2, 0.5], dtype=float)
    #C = numpy.array([-1.0, -0.4, 0.4, 1.0], dtype=float)
    C = numpy.array([-2.0, -0.5, 0.5, 2.0], dtype=float)
    Z = numpy.linspace(-5, 5, 101)
    # get the data for the R table
    arr = []
    for z in Z:
        row = [z]
        for c in C:
            rate = 1.0 / kimrecessive.denom_piecewise(c, z*numpy.sign(c))
            row.append(rate)
        arr.append(row)
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #42
0
def get_response_content(fs):
    # create the R table string and scripts
    headers = [
        'z',
        'c.neg2.0',
        'c.neg0.5',
        'c.0.5',
        'c.2.0',
        #'c.a',
        #'c.b',
        #'c.c',
        #'c.d',
    ]
    #C = numpy.array([-0.5, -0.2, 0.2, 0.5], dtype=float)
    #C = numpy.array([-1.0, -0.4, 0.4, 1.0], dtype=float)
    C = numpy.array([-2.0, -0.5, 0.5, 2.0], dtype=float)
    Z = numpy.linspace(-5, 5, 101)
    # get the data for the R table
    arr = []
    for z in Z:
        row = [z]
        for c in C:
            rate = 1.0 / kimrecessive.denom_piecewise(c, z * numpy.sign(c))
            row.append(rate)
        arr.append(row)
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #43
0
def get_table_strings_and_scripts(
        xmldata, alignment_id, start_stop_pairs,
        nsamples):
    """
    Command-line only.
    @param xmldata: xml data already adjusted for nsamples and log filename
    @param alignment_id: xml element id
    @param start_stop_pairs: alignment interval bounds
    @param nsamples: an extra parameter for script generation
    @return: short table string, long table string, scripts (for short table)
    """
    # init the array for the full R table
    full_data_arr = []
    # build the array for the R table
    data_arr = []
    sequence_lengths = []
    midpoints = []
    for start_pos, stop_pos in start_stop_pairs:
        sequence_length = stop_pos - start_pos + 1
        midpoint = (start_pos + stop_pos) / 2.0
        interval_xml_data = beast.set_alignment_interval(
                xmldata, alignment_id, start_pos, stop_pos)
        row_labels, col_labels, arr = get_loganalysis_labeled_array(
                interval_xml_data)
        stat_name_to_row_index = dict(
                (x, i) for i, x in enumerate(row_labels))
        summary_name_to_col_index = dict(
                (x, i) for i, x in enumerate(col_labels))
        # define row indices of interest
        mean_row_index = stat_name_to_row_index['meanRate']
        var_row_index = stat_name_to_row_index['coefficientOfVariation']
        cov_row_index = stat_name_to_row_index['covariance']
        # define column indices of interest
        mean_col_index = summary_name_to_col_index['mean']
        low_col_index = summary_name_to_col_index['hpdLower']
        high_col_index = summary_name_to_col_index['hpdUpper']
        row = [
                sequence_length,
                midpoint,
                arr[mean_row_index][low_col_index],
                arr[mean_row_index][mean_col_index],
                arr[mean_row_index][high_col_index],
                arr[var_row_index][low_col_index],
                arr[var_row_index][mean_col_index],
                arr[var_row_index][high_col_index],
                arr[cov_row_index][low_col_index],
                arr[cov_row_index][mean_col_index],
                arr[cov_row_index][high_col_index],
                ]
        data_arr.append(row)
        # add rows to the full data array
        for row_index, row_label in enumerate(row_labels):
            for col_index, col_label in enumerate(col_labels):
                row = [
                        sequence_length,
                        midpoint,
                        '"' + row_label + '"',
                        '"' + col_label + '"',
                        arr[row_index][col_index],
                        ]
                full_data_arr.append(row)
        # add entries to some utility arrays
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    # build the table strings
    table_string = RUtil.get_table_string(data_arr, g_headers)
    full_table_string = RUtil.get_table_string(
            full_data_arr,
            [
                'sequence.length',
                'midpoint',
                'statistic.name',
                'posterior.analysis',
                'value'
                ],
            force_float=False,
            )
    # get the scripts
    scripts = beasttut.get_ggplot2_scripts(
            nsamples, sequence_lengths, midpoints)
    # return the table string and scripts
    return table_string, full_table_string, scripts
Example #44
0
def get_response_content(fs):
    M = get_input_matrix(fs)
    nstates = len(M)
    nsites = fs.nsites
    if nstates ** nsites > 16:
        raise ValueError('the site dependent rate matrix is too big')
    # precompute some stuff
    M_site_indep = get_site_independent_process(M, nsites)
    v = mrate.R_to_distn(M)
    v_site_indep = get_site_independent_distn(v, nsites)
    if fs.info_fis:
        f_info = divtime.get_fisher_info_known_distn_fast
    elif fs.info_mut:
        f_info = ctmcmi.get_mutual_info_known_distn
    else:
        raise ValueError('no info type specified')
    f_selection = mrate.to_gtr_hb_known_energies
    # Spend a lot of time doing the optimizations
    # to construct the points for the R table.
    arr = []
    for t in cbreaker.throttled(
            progrid.gen_binary(fs.start_time, fs.stop_time),
            nseconds=4, ncount=100):
        row = [t]
        # get the site-dependent mutation selection balance information
        if fs.dep_balance:
            dep_balance = OptDep(
                    M_site_indep, v_site_indep, t, f_info, f_selection)
            X0 = np.random.randn(nstates ** nsites - 1)
            xopt = scipy.optimize.fmin(dep_balance, X0)
            max_dep_balance_info = -dep_balance(xopt)
            row.append(max_dep_balance_info)
            # for debug
            Q_bal, v_bal = dep_balance.get_process(xopt)
            print 'dependent balance:'
            print max_dep_balance_info
            print v_bal
            print Q_bal
            print
        # get the site-independent mutation selection balance information
        if fs.indep_balance:
            indep_balance = OptIndep(
                    M, v, nsites, t, f_info, f_selection)
            X0 = np.random.randn(nstates-1)
            xopt = scipy.optimize.fmin(indep_balance, X0)
            max_indep_balance_info = -indep_balance(xopt)
            row.append(max_indep_balance_info)
            # for debug
            Q_bal, v_bal = indep_balance.get_process(xopt)
            print 'independent balance:'
            print max_indep_balance_info
            print v_bal
            print Q_bal
            print
        # get the site-independent mutation process information
        if fs.indep_mutation:
            indep_mut_info = f_info(M_site_indep, v_site_indep, t)
            row.append(indep_mut_info)
        # add the data row to the table
        arr.append(row)
    arr.sort()
    npoints = len(arr)
    # create the R table string and scripts
    headers = ['t']
    if fs.dep_balance:
        headers.append('max.site.dep.balance')
    if fs.indep_balance:
        headers.append('max.site.indep.balance')
    if fs.indep_mutation:
        headers.append('site.indep.mutation')
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
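OptDep and OptIndep are callables that return the negative of the information criterion, so scipy.optimize.fmin minimizes them and the maximized information is recovered by negating the value at the optimum. The toy sketch below shows only that maximize-by-minimizing-the-negative pattern; the class name and objective are made up.
import numpy as np
import scipy.optimize
class NegativeObjective(object):
    # toy stand-in for OptDep/OptIndep: calling it returns the negative
    # of the quantity being maximized, so fmin can minimize it
    def __init__(self, target):
        self.target = target
    def __call__(self, X):
        return float(np.sum((X - self.target) ** 2))
obj = NegativeObjective(np.array([1.0, -2.0, 0.5]))
X0 = np.random.randn(3)
xopt = scipy.optimize.fmin(obj, X0, disp=False)
max_value = -obj(xopt)  # mirrors max_dep_balance_info = -dep_balance(xopt)
print(max_value)  # approximately 0.0, attained near the target point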
Example #45
0
def do_hard_coded_analysis_b(tree, tree_remark):
    """
    Do a hardcoded analysis of tree reconstruction methods.
    Make R files of ordered reconstruction losses.
    @param tree: a tree object
    @param tree_remark: a string that is a comment about the tree
    """
    # define an arbitrary order for the names of the leaves of the tree
    ordered_names = list(node.name for node in tree.gen_tips())
    # use some replicates
    reconstruction_count = 100
    # Make R files for reconstruction results from sequences
    # of some number of nucleotides in length.
    sequence_length = 2000
    # define the tree reconstruction methods to be used
    sims = [
        Simulation(Clustering.NeighborJoiningDMS(), 'nj', 'neighbor joining'),
        Simulation(Clustering.StoneSpectralSignDMS(), 'nj', 'spectral sign')
    ]
    # set tree reconstruction parameters
    for sim in sims:
        sim.set_original_tree(tree)
    # initialize the distance matrix sampler
    sampler = DMSampler.InfiniteAllelesSampler(tree, ordered_names,
                                               sequence_length)
    sampler.set_inf_replacement(20.0)
    sampler.set_zero_replacement(0.0)
    # start the progress bar
    pbar = Progress.Bar(1.0)
    # sample some distance matrices
    distance_matrix_start_time = time.time()
    distance_matrices = []
    for result in sampler.gen_samples_or_none():
        # if we got a result then update the distance matrix list
        if result:
            sequence_list, D = result
            distance_matrices.append(D)
        # Update the progressbar regardless of whether or not
        # the proposal was accepted.
        remaining_acceptances = reconstruction_count - len(distance_matrices)
        numerator = sampler.get_completed_proposals()
        denominator = numerator + sampler.get_remaining_proposals(
            remaining_acceptances)
        dms_fraction = float(numerator) / float(denominator)
        dms_total = 1.0 / (1 + len(sims))
        pbar.update(dms_fraction * dms_total)
        # if we have enough samples then break the loop
        if not remaining_acceptances:
            break
    distance_matrix_seconds = time.time() - distance_matrix_start_time
    # reconstruct trees using various methods
    reconstruction_seconds = []
    for i, sim in enumerate(sims):
        reconstruction_start_time = time.time()
        print 'reconstructing', len(distance_matrices), 'trees'
        print 'using', sim.description
        sim.run(distance_matrices, ordered_names)
        pbar.update(float(i + 2) / float(1 + len(sims)))
        reconstruction_seconds.append(time.time() - reconstruction_start_time)
    # stop the progress bar
    pbar.finish()
    # consider the neighbor joining and the spectral sign results
    nj_sim, ss_sim = sims
    # extract the simulation data
    label_list_pairs = [
        ('nj.unweighted', nj_sim.get_normalized_error_counts()),
        ('ss.unweighted', ss_sim.get_normalized_error_counts()),
        ('nj.weighted', nj_sim.get_normalized_loss_values()),
        ('ss.weighted', ss_sim.get_normalized_loss_values())
    ]
    labels, transposed_table = zip(*label_list_pairs)
    table = zip(*transposed_table)
    table_string = RUtil.get_table_string(table, labels)
    # write the table
    filename = 'out3.table'
    with open(filename, 'w') as fout:
        print >> fout, '# tree source:', tree_remark
        print >> fout, '# number of taxa:', len(ordered_names)
        print >> fout, '# sampled distance matrices:', len(distance_matrices)
        print >> fout, '# sampling seconds elapsed:', distance_matrix_seconds
        print >> fout, '# sites per sequence:', sequence_length
        for sim, seconds in zip(sims, reconstruction_seconds):
            msg_a = '# seconds elapsed for tree reconstruction using '
            msg_b = sim.description + ': ' + str(seconds)
            print >> fout, msg_a + msg_b
        print >> fout, table_string
    print 'wrote', filename
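The pair of zip(*...) calls near the end is a transpose: the first splits the (label, column) pairs into labels and columns, and the second turns the columns into rows suitable for RUtil.get_table_string. A tiny example with made-up numbers:
label_list_pairs = [
    ('nj.unweighted', [0.1, 0.2, 0.3]),
    ('ss.unweighted', [0.4, 0.5, 0.6]),
]
labels, columns = zip(*label_list_pairs)
rows = list(zip(*columns))
print(labels)  # ('nj.unweighted', 'ss.unweighted')
print(rows)    # [(0.1, 0.4), (0.2, 0.5), (0.3, 0.6)]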
Example #46
0
def get_response_content(fs):
    M = get_input_matrix(fs)
    nstates = len(M)
    nsites = fs.nsites
    if nstates**nsites > 16:
        raise ValueError('the site dependent rate matrix is too big')
    # precompute some stuff
    M_site_indep = get_site_independent_process(M, nsites)
    v = mrate.R_to_distn(M)
    v_site_indep = get_site_independent_distn(v, nsites)
    if fs.info_fis:
        f_info = divtime.get_fisher_info_known_distn_fast
    elif fs.info_mut:
        f_info = ctmcmi.get_mutual_info_known_distn
    else:
        raise ValueError('no info type specified')
    f_selection = mrate.to_gtr_hb_known_energies
    # Spend a lot of time doing the optimizations
    # to construct the points for the R table.
    arr = []
    for t in cbreaker.throttled(progrid.gen_binary(fs.start_time,
                                                   fs.stop_time),
                                nseconds=4,
                                ncount=100):
        row = [t]
        # get the site-dependent mutation selection balance information
        if fs.dep_balance:
            dep_balance = OptDep(M_site_indep, v_site_indep, t, f_info,
                                 f_selection)
            X0 = np.random.randn(nstates**nsites - 1)
            xopt = scipy.optimize.fmin(dep_balance, X0)
            max_dep_balance_info = -dep_balance(xopt)
            row.append(max_dep_balance_info)
            # for debug
            Q_bal, v_bal = dep_balance.get_process(xopt)
            print 'dependent balance:'
            print max_dep_balance_info
            print v_bal
            print Q_bal
            print
        # get the site-independent mutation selection balance information
        if fs.indep_balance:
            indep_balance = OptIndep(M, v, nsites, t, f_info, f_selection)
            X0 = np.random.randn(nstates - 1)
            xopt = scipy.optimize.fmin(indep_balance, X0)
            max_indep_balance_info = -indep_balance(xopt)
            row.append(max_indep_balance_info)
            # for debug
            Q_bal, v_bal = indep_balance.get_process(xopt)
            print 'independent balance:'
            print max_indep_balance_info
            print v_bal
            print Q_bal
            print
        # get the site-independent mutation process information
        if fs.indep_mutation:
            indep_mut_info = f_info(M_site_indep, v_site_indep, t)
            row.append(indep_mut_info)
        # add the data row to the table
        arr.append(row)
    arr.sort()
    npoints = len(arr)
    # create the R table string and scripts
    headers = ['t']
    if fs.dep_balance:
        headers.append('max.site.dep.balance')
    if fs.indep_balance:
        headers.append('max.site.indep.balance')
    if fs.indep_mutation:
        headers.append('site.indep.mutation')
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #47
0
def get_response_content(fs):
    # transform the arguments according to the diffusion approximation
    mutation_ab = (fs.pop * fs.mutation_ab) / fs.pop_gran
    mutation_ba = (fs.pop * fs.mutation_ba) / fs.pop_gran
    if mutation_ab > 1 or mutation_ba > 1:
        raise Exception(
                'the mutation probability is not small enough '
                'for the diffusion approximation to be meaningful')
    selection_ratio = 1 + (fs.pop * fs.additive_selection) / fs.pop_gran
    npop = fs.pop_gran
    ngenerations = fs.ngenerations
    nmutants_initial = int(fs.initial_freq * fs.pop_gran)
    nmutants_final = int(fs.final_freq * fs.pop_gran)
    # precompute some transition matrices
    P_drift_selection = pgmsinglesite.create_drift_selection_transition_matrix(
            npop, selection_ratio)
    MatrixUtil.assert_transition_matrix(P_drift_selection)
    P_mutation = pgmsinglesite.create_mutation_transition_matrix(
            npop, mutation_ab, mutation_ba)
    MatrixUtil.assert_transition_matrix(P_mutation)
    # define the R table headers
    headers = [
            'generation',
            'allele.frequency',
            'probability',
            'log.density',
            ]
    # compute the transition matrix
    P = np.dot(P_drift_selection, P_mutation)
    # Compute the endpoint conditional probabilities for various states
    # along the unobserved path.
    nstates = npop + 1
    M = np.zeros((nstates, ngenerations))
    M[nmutants_initial, 0] = 1.0
    M[nmutants_final, ngenerations-1] = 1.0
    for i in range(ngenerations-2):
        A_exponent = i + 1
        B_exponent = ngenerations - 1 - A_exponent
        A = np.linalg.matrix_power(P, A_exponent)
        B = np.linalg.matrix_power(P, B_exponent)
        weights = np.zeros(nstates)
        for k in range(nstates):
            weights[k] = A[nmutants_initial, k] * B[k, nmutants_final]
        weights /= np.sum(weights)
        for k, p in enumerate(weights):
            M[k, i+1] = p
    arr = []
    for g in range(ngenerations):
        for k in range(nstates):
            p = M[k, g]
            allele_frequency = k / float(npop)
            # Finer gridding needs larger scaling for the density
            # because each interval has a smaller support.
            density = p * nstates
            if density:
                log_density = math.log(density)
            else:
                log_density = float('-inf')
            row = [g, allele_frequency, p, log_density]
            arr.append(row)
    # create the R table string and scripts
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot(nstates)
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
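The matrix M above holds endpoint-conditioned state probabilities: at interior generation i+1 the probability of state k is proportional to A[nmutants_initial, k] * B[k, nmutants_final], with A = P**(i+1) and B = P**(ngenerations-1-(i+1)). The standalone sketch below repeats that computation for an arbitrary transition matrix so the indexing can be checked on a small example.
import numpy as np
def endpoint_conditioned_distns(P, ngenerations, initial, final):
    """
    Mirror of the loop above for an arbitrary transition matrix.
    Column g holds the distribution of the state at generation g,
    conditioned on the observed initial and final states.
    """
    nstates = P.shape[0]
    M = np.zeros((nstates, ngenerations))
    M[initial, 0] = 1.0
    M[final, ngenerations - 1] = 1.0
    for g in range(1, ngenerations - 1):
        A = np.linalg.matrix_power(P, g)
        B = np.linalg.matrix_power(P, ngenerations - 1 - g)
        weights = A[initial, :] * B[:, final]
        M[:, g] = weights / weights.sum()
    return M
# usage: a two-state chain observed in state 0 at both endpoints
P = np.array([[0.9, 0.1], [0.2, 0.8]])
print(endpoint_conditioned_distns(P, 4, 0, 0))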
Example #48
0
def get_response_content(fs):
    # transform the arguments according to the diffusion approximation
    mutation_ab = (fs.pop * fs.mutation_ab) / fs.pop_gran
    mutation_ba = (fs.pop * fs.mutation_ba) / fs.pop_gran
    if mutation_ab > 1 or mutation_ba > 1:
        raise Exception('the mutation probability is not small enough '
                        'for the diffusion approximation to be meaningful')
    selection_ratio = 1 + (fs.pop * fs.additive_selection) / fs.pop_gran
    npop = fs.pop_gran
    ngenerations = fs.ngenerations
    nmutants_initial = int(fs.initial_freq * fs.pop_gran)
    nmutants_final = int(fs.final_freq * fs.pop_gran)
    # precompute some transition matrices
    P_drift_selection = pgmsinglesite.create_drift_selection_transition_matrix(
        npop, selection_ratio)
    MatrixUtil.assert_transition_matrix(P_drift_selection)
    P_mutation = pgmsinglesite.create_mutation_transition_matrix(
        npop, mutation_ab, mutation_ba)
    MatrixUtil.assert_transition_matrix(P_mutation)
    # define the R table headers
    headers = [
        'generation',
        'allele.frequency',
        'probability',
        'log.density',
    ]
    # compute the transition matrix
    P = np.dot(P_drift_selection, P_mutation)
    # Compute the endpoint conditional probabilities for various states
    # along the unobserved path.
    nstates = npop + 1
    M = np.zeros((nstates, ngenerations))
    M[nmutants_initial, 0] = 1.0
    M[nmutants_final, ngenerations - 1] = 1.0
    for i in range(ngenerations - 2):
        A_exponent = i + 1
        B_exponent = ngenerations - 1 - A_exponent
        A = np.linalg.matrix_power(P, A_exponent)
        B = np.linalg.matrix_power(P, B_exponent)
        weights = np.zeros(nstates)
        for k in range(nstates):
            weights[k] = A[nmutants_initial, k] * B[k, nmutants_final]
        weights /= np.sum(weights)
        for k, p in enumerate(weights):
            M[k, i + 1] = p
    arr = []
    for g in range(ngenerations):
        for k in range(nstates):
            p = M[k, g]
            allele_frequency = k / float(npop)
            # Finer gridding needs larger scaling for the density
            # because each interval has a smaller support.
            density = p * nstates
            if density:
                log_density = math.log(density)
            else:
                log_density = float('-inf')
            row = [g, allele_frequency, p, log_density]
            arr.append(row)
    # create the R table string and scripts
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot(nstates)
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
        table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
Example #49
0
def get_table_strings_and_scripts(xmldata, alignment_id, start_stop_pairs,
                                  nsamples):
    """
    Command-line only.
    @param xmldata: xml data already adjusted for nsamples and log filename
    @param alignment_id: xml element id
    @param start_stop_pairs: alignment interval bounds
    @param nsamples: an extra parameter for script generation
    @return: short table string, long table string, scripts (for short table)
    """
    # init the array for the full R table
    full_data_arr = []
    # build the array for the R table
    data_arr = []
    sequence_lengths = []
    midpoints = []
    for start_pos, stop_pos in start_stop_pairs:
        sequence_length = stop_pos - start_pos + 1
        midpoint = (start_pos + stop_pos) / 2.0
        interval_xml_data = beast.set_alignment_interval(
            xmldata, alignment_id, start_pos, stop_pos)
        row_labels, col_labels, arr = get_loganalysis_labeled_array(
            interval_xml_data)
        stat_name_to_row_index = dict((x, i) for i, x in enumerate(row_labels))
        summary_name_to_col_index = dict(
            (x, i) for i, x in enumerate(col_labels))
        # define row indices of interest
        mean_row_index = stat_name_to_row_index['meanRate']
        var_row_index = stat_name_to_row_index['coefficientOfVariation']
        cov_row_index = stat_name_to_row_index['covariance']
        # define column indices of interest
        mean_col_index = summary_name_to_col_index['mean']
        low_col_index = summary_name_to_col_index['hpdLower']
        high_col_index = summary_name_to_col_index['hpdUpper']
        row = [
            sequence_length,
            midpoint,
            arr[mean_row_index][low_col_index],
            arr[mean_row_index][mean_col_index],
            arr[mean_row_index][high_col_index],
            arr[var_row_index][low_col_index],
            arr[var_row_index][mean_col_index],
            arr[var_row_index][high_col_index],
            arr[cov_row_index][low_col_index],
            arr[cov_row_index][mean_col_index],
            arr[cov_row_index][high_col_index],
        ]
        data_arr.append(row)
        # add rows to the full data array
        for row_index, row_label in enumerate(row_labels):
            for col_index, col_label in enumerate(col_labels):
                row = [
                    sequence_length,
                    midpoint,
                    '"' + row_label + '"',
                    '"' + col_label + '"',
                    arr[row_index][col_index],
                ]
                full_data_arr.append(row)
        # add entries to some utility arrays
        sequence_lengths.append(sequence_length)
        midpoints.append(midpoint)
    # build the table strings
    table_string = RUtil.get_table_string(data_arr, g_headers)
    full_table_string = RUtil.get_table_string(
        full_data_arr,
        [
            'sequence.length', 'midpoint', 'statistic.name',
            'posterior.analysis', 'value'
        ],
        force_float=False,
    )
    # get the scripts
    scripts = beasttut.get_ggplot2_scripts(nsamples, sequence_lengths,
                                           midpoints)
    # return the table string and scripts
    return table_string, full_table_string, scripts
Example #50
0
def do_hard_coded_analysis_b(tree, tree_remark):
    """
    Do a hardcoded analysis of tree reconstruction methods.
    Make R files of ordered reconstruction losses.
    @param tree: a tree object
    @param tree_remark: a string that is a comment about the tree
    """
    # define an arbitrary order for the names of the leaves of the tree
    ordered_names = list(node.name for node in tree.gen_tips())
    # use some replicates
    reconstruction_count = 100
    # Make R files for reconstruction results from sequences
    # of some number of nucleotides in length.
    sequence_length = 2000
    # define the tree reconstruction methods to be used
    sims = [
        Simulation(Clustering.NeighborJoiningDMS(), "nj", "neighbor joining"),
        Simulation(Clustering.StoneSpectralSignDMS(), "nj", "spectral sign"),
    ]
    # set tree reconstruction parameters
    for sim in sims:
        sim.set_original_tree(tree)
    # initialize the distance matrix sampler
    sampler = DMSampler.InfiniteAllelesSampler(tree, ordered_names, sequence_length)
    sampler.set_inf_replacement(20.0)
    sampler.set_zero_replacement(0.0)
    # start the progress bar
    pbar = Progress.Bar(1.0)
    # sample some distance matrices
    distance_matrix_start_time = time.time()
    distance_matrices = []
    for result in sampler.gen_samples_or_none():
        # if we got a result then update the distance matrix list
        if result:
            sequence_list, D = result
            distance_matrices.append(D)
        # Update the progressbar regardless of whether or not
        # the proposal was accepted.
        remaining_acceptances = reconstruction_count - len(distance_matrices)
        numerator = sampler.get_completed_proposals()
        denominator = numerator + sampler.get_remaining_proposals(remaining_acceptances)
        dms_fraction = float(numerator) / float(denominator)
        dms_total = 1.0 / (1 + len(sims))
        pbar.update(dms_fraction * dms_total)
        # if we have enough samples then break the loop
        if not remaining_acceptances:
            break
    distance_matrix_seconds = time.time() - distance_matrix_start_time
    # reconstruct trees using various methods
    reconstruction_seconds = []
    for i, sim in enumerate(sims):
        reconstruction_start_time = time.time()
        print "reconstructing", len(distance_matrices), "trees"
        print "using", sim.description
        sim.run(distance_matrices, ordered_names)
        pbar.update(float(i + 2) / float(1 + len(sims)))
        reconstruction_seconds.append(time.time() - reconstruction_start_time)
    # stop the progress bar
    pbar.finish()
    # consider the neighbor joining and the spectral sign results
    nj_sim, ss_sim = sims
    # extract the simulation data
    label_list_pairs = [
        ("nj.unweighted", nj_sim.get_normalized_error_counts()),
        ("ss.unweighted", ss_sim.get_normalized_error_counts()),
        ("nj.weighted", nj_sim.get_normalized_loss_values()),
        ("ss.weighted", ss_sim.get_normalized_loss_values()),
    ]
    labels, transposed_table = zip(*label_list_pairs)
    table = zip(*transposed_table)
    table_string = RUtil.get_table_string(table, labels)
    # write the table
    filename = "out3.table"
    with open(filename, "w") as fout:
        print >> fout, "# tree source:", tree_remark
        print >> fout, "# number of taxa:", len(ordered_names)
        print >> fout, "# sampled distance matrices:", len(distance_matrices)
        print >> fout, "# sampling seconds elapsed:", distance_matrix_seconds
        print >> fout, "# sites per sequence:", sequence_length
        for sim, seconds in zip(sims, reconstruction_seconds):
            msg_a = "# seconds elapsed for tree reconstruction using "
            msg_b = sim.description + ": " + str(seconds)
            print >> fout, msg_a + msg_b
        print >> fout, table_string
    print "wrote", filename
Example #51
0
def get_response_content(fs):
    M = get_input_matrix(fs)
    # create the R table string and scripts
    headers = ['t']
    if fs.show_entropy:
        headers.append('ub.entropy')
    headers.extend([
            'ub.jc.spectral',
            'ub.f81.spectral',
            'mutual.information',
            'lb.2.state.spectral',
            'lb.2.state',
            'lb.f81',
            ])
    npoints = 100
    t_low = fs.start_time
    t_high = fs.stop_time
    t_incr = (t_high - t_low) / (npoints - 1)
    t_values = [t_low + t_incr*i for i in range(npoints)]
    # define some extra stuff
    v = mrate.R_to_distn(M)
    entropy = -np.dot(v, np.log(v))
    n = len(M)
    gap = sorted(abs(x) for x in np.linalg.eigvals(M))[1]
    print 'stationary distn:', v
    print 'entropy:', entropy
    print 'spectral gap:', gap
    M_slow_jc = gap * (1.0 / n) * (np.ones((n,n)) - n*np.eye(n))
    M_slow_f81 = gap * np.outer(np.ones(n), v)
    M_slow_f81 -= np.diag(np.sum(M_slow_f81, axis=1))
    M_f81 = msimpl.get_fast_f81(M)
    M_2state = msimpl.get_fast_two_state_autobarrier(M)
    M_2state_spectral = -gap * M_2state / np.trace(M_2state)
    # get the data for the R table
    arr = []
    for u in t_values:
        # experiment with log time
        #t = math.exp(u)
        t = u
        mi_slow_jc = ctmcmi.get_mutual_information(M_slow_jc, t)
        mi_slow_f81 = ctmcmi.get_mutual_information(M_slow_f81, t)
        mi_mut = ctmcmi.get_mutual_information(M, t)
        mi_2state_spectral = ctmcmi.get_mutual_information(M_2state_spectral, t)
        mi_f81 = ctmcmi.get_mutual_information(M_f81, t)
        mi_2state = ctmcmi.get_mutual_information(M_2state, t)
        row = [u]
        if fs.show_entropy:
            row.append(entropy)
        row.extend([mi_slow_jc, mi_slow_f81,
                mi_mut, mi_2state_spectral, mi_2state, mi_f81])
        arr.append(row)
    # get the R table
    table_string = RUtil.get_table_string(arr, headers)
    # get the R script
    script = get_ggplot()
    # create the R plot image
    device_name = Form.g_imageformat_to_r_function[fs.imageformat]
    retcode, r_out, r_err, image_data = RUtil.run_plotter(
            table_string, script, device_name)
    if retcode:
        raise RUtil.RError(r_err)
    return image_data
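ctmcmi.get_mutual_information is not shown; presumably it reports the mutual information between the chain's state at time 0, drawn from the stationary distribution, and its state at time t. The sketch below computes that standard quantity with scipy.linalg.expm, taking the distribution explicitly rather than deriving it from the rate matrix, so it illustrates the quantity rather than the project's implementation.
import numpy as np
import scipy.linalg
def mutual_information(Q, distn, t):
    # mutual information between X(0) ~ distn and X(t) for the chain
    # with rate matrix Q; assumed to match what ctmcmi reports when
    # distn is the stationary distribution of Q
    P = scipy.linalg.expm(Q * t)
    marginal = np.dot(distn, P)
    mi = 0.0
    for i in range(len(Q)):
        for j in range(len(Q)):
            joint = distn[i] * P[i, j]
            if joint > 0:
                mi += joint * np.log(P[i, j] / marginal[j])
    return mi
# usage: a symmetric two-state process with uniform stationary distn;
# the mutual information decays toward zero as t grows
Q = np.array([[-1.0, 1.0], [1.0, -1.0]])
v = np.array([0.5, 0.5])
print([mutual_information(Q, v, t) for t in (0.1, 1.0, 10.0)])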