Example #1
0
def plot_cts_and_outputs(expt_name,
                         expt_dir,
                         cts_by_codon_fname,
                         outputs_fname,
                         process_label,
                         tr_codons_fname,
                         cts_by_codon_xlims=False,
                         outputs_xlims=False,
                         log_plot=False):
    """
    Makes frequency plots of the counts and of the outputs, restricted to
    the training codon set. Each plot helper is handed an output path under
    <expt_dir>/plots/ whose name includes process_label.

    Args:
        expt_name - experiment name, inserted at the start of each plot title
        expt_dir (str) - name of experiment directory
        cts_by_codon_fname - name of file read by proc.load_cts_by_codon
        outputs_fname - name of file read by proc.load_outputs
        process_label: string that denotes parameters of the processing used
            for tr_codons_fname, including fp size, # genes, and type/# of psct
        tr_codons_fname - name of file read by proc.load_codon_set_bounds
        cts_by_codon_xlims - passed through as xlims to the counts plot
        outputs_xlims - passed through as xlims to the outputs plot
        log_plot - if truthy, the log outputs plot is made as well

    Returns:
        void, just calls the plotting helpers
    """
    # Read in the full counts and outputs
    all_cts = proc.load_cts_by_codon(cts_by_codon_fname)
    all_outputs = proc.load_outputs(outputs_fname)

    # Expand the training codon bounds into the training codon set
    tr_codon_set = proc.expand_codon_set(
        proc.load_codon_set_bounds(tr_codons_fname))

    # Pull out the training codons only; counts are cast to int
    tr_cts = proc.get_y(tr_codon_set, all_cts).astype("int")
    tr_outputs = proc.get_y(tr_codon_set, all_outputs)

    # Every plot shares the same y axis label
    codon_ylab = "# Codons"

    # Plot 1: counts by codon
    cts_title = "{0} Counts by Codon Frequencies".format(expt_name)
    cts_plot_fname = expt_dir + (
        "/plots/cts_by_codon_freqs.{0}.pdf".format(process_label))
    plot.make_cts_by_codon_freq_plot(
        tr_cts, cts_title, "FP counts", codon_ylab, cts_plot_fname,
        xlims=cts_by_codon_xlims)

    # Plot 2: scaled counts (outputs)
    out_title = "{0} Scaled Counts Frequencies".format(expt_name)
    out_plot_fname = expt_dir + (
        "/plots/output_freqs.{0}.pdf".format(process_label))
    plot.make_outputs_freq_plot(
        tr_outputs, out_title, "Scaled counts", codon_ylab, out_plot_fname,
        xlims=outputs_xlims, num_bins=500)

    # Plot 3 (optional): log scaled counts
    if not log_plot:
        return
    log_title = "{0} Log Scaled Counts Frequencies".format(expt_name)
    log_plot_fname = expt_dir + (
        "/plots/log_output_freqs.{0}.pdf".format(process_label))
    plot.make_log_outputs_freq_plot(
        tr_outputs, log_title, "Log scaled counts", codon_ylab,
        log_plot_fname, num_bins=500)
Example #2
0
def make_bin_outputs(expt_dir, outputs_fname, cutoff):
    """
    Writes a binary transformed copy of an outputs file
        output = 1 if output >= cutoff else 0

    The result goes to
        <expt_dir>/process/bin_outputs.cutoff_<cutoff>.txt

    Args:
        expt_dir (str) - name of experiment directory
        outputs_fname (str) - name of original outputs file
        cutoff (float) - scaled counts cutoff to transform to 1, else 0

    Returns:
        void, just makes binary transformed outputs file w. scaled cts cutoff
    """
    # Read the original outputs and binarize them against the cutoff
    raw_outputs = proc.load_outputs(outputs_fname)
    thresholded = proc.bin_transform_outputs(raw_outputs, cutoff)

    # The cutoff value is embedded in the destination file name
    dest_suffix = "/process/bin_outputs.cutoff_{0}.txt".format(cutoff)
    proc.write_outputs(thresholded, expt_dir + dest_suffix)
Example #3
0
def make_log_outputs(expt_dir, outputs_fname, scaled_psct):
    """
    Writes a log transformed copy of an outputs file, using a scaled
    pseudocount

    The result goes to
        <expt_dir>/process/log_outputs.scaled_psct_<scaled_psct>.txt

    Args:
        expt_dir (str) - name of experiment directory
        outputs_fname (str) - name of original outputs file
        scaled_psct (float) - scaled pseudocount to add before log transform

    Returns:
        void, just makes log transformed outputs file w. scaled psct.
    """
    # Read the original outputs, then apply pseudocount + log transform
    raw_outputs = proc.load_outputs(outputs_fname)
    transformed = proc.log_transform_outputs(raw_outputs, scaled_psct)

    # The pseudocount value is embedded in the destination file name
    dest_suffix = "/process/log_outputs.scaled_psct_{0}.txt".format(
        scaled_psct)
    proc.write_outputs(transformed, expt_dir + dest_suffix)
Example #4
0
        nn_dir = nn_parent_dir + "/" + model_name +\
            "_rep{0}".format(model_rep)

        cod_trunc_5p = 20
        cod_trunc_3p = 20
        min_tot_cts = 200
        min_cod_w_cts = 100

        # Load transcriptome dicts
        len_dict = proc.get_len_dict(gene_len_fname)
        cds_dict = proc.get_cds_dict(gene_seq_fname, len_dict)
        struc_dict = proc.get_struc_dict(struc_fname)

        # Compute cts_by_codon and outputs
        cts_by_codon = proc.load_cts_by_codon(cts_by_codon_fname)
        outputs = proc.load_outputs(outputs_fname)
        paralog_groups = proc.load_paralog_groups(paralog_groups_fname)

        # Get gene list to compute performance metrics
        gene_set = cts_by_codon.keys()
        # Filter genes that are shorter than truncation regions
        gene_set = filter(lambda gene: len(cts_by_codon[gene]) > (cod_trunc_5p +
            cod_trunc_3p), gene_set)
        # Filter genes that don't have enough counts to meet cutoffs
        gene_set = filter(lambda gene: proc.has_enough_cts( \
            cts_by_codon[gene][cod_trunc_5p:-cod_trunc_3p], min_tot_cts, \
            min_cod_w_cts), gene_set)
        # Sort genes by footprint density
        genes_by_density = proc.sort_genes_by_density(
            gene_set, cts_by_codon, cod_trunc_5p, cod_trunc_3p, descend=True)
        # NOTE: I'm not sure if I want to filter out paralogs here.