def plot_cts_and_outputs(
        expt_name, expt_dir, cts_by_codon_fname, outputs_fname, process_label,
        tr_codons_fname, cts_by_codon_xlims=False, outputs_xlims=False,
        log_plot=False):
    """
    Plots frequency histograms of the raw counts and the scaled outputs,
    restricted to the training codon set

    Args:
        expt_name - experiment name, used as the prefix of each plot title
        expt_dir - experiment directory; plots are written under
            expt_dir + "/plots/"
        cts_by_codon_fname - file passed to proc.load_cts_by_codon
        outputs_fname - file passed to proc.load_outputs
        process_label - string that denotes parameters of the processing
            used for tr_codons_fname, including fp size, # genes, and
            type/# of psct; embedded in every plot file name
        tr_codons_fname - file passed to proc.load_codon_set_bounds
        cts_by_codon_xlims - forwarded as xlims to the counts plot
            (default False)
        outputs_xlims - forwarded as xlims to the scaled counts plot
            (default False)
        log_plot - if truthy, also make the log scaled counts plot

    Returns:
        void, just writes the plot files
    """
    # Read the full counts and outputs tables
    all_cts = proc.load_cts_by_codon(cts_by_codon_fname)
    all_outputs = proc.load_outputs(outputs_fname)

    # Read the training codon bounds and expand them into the codon set
    tr_bounds = proc.load_codon_set_bounds(tr_codons_fname)
    tr_codons = proc.expand_codon_set(tr_bounds)

    # Pull out the training counts (cast to int) and training outputs
    tr_cts = proc.get_y(tr_codons, all_cts).astype("int")
    tr_outputs = proc.get_y(tr_codons, all_outputs)

    codon_axis_label = "# Codons"

    # Plot 1: raw footprint counts
    cts_plot_fname = (
        expt_dir
        + "/plots/cts_by_codon_freqs.{0}.pdf".format(process_label))
    plot.make_cts_by_codon_freq_plot(
        tr_cts,
        "{0} Counts by Codon Frequencies".format(expt_name),
        "FP counts",
        codon_axis_label,
        cts_plot_fname,
        xlims=cts_by_codon_xlims)

    # Plot 2: scaled counts
    outputs_plot_fname = (
        expt_dir + "/plots/output_freqs.{0}.pdf".format(process_label))
    plot.make_outputs_freq_plot(
        tr_outputs,
        "{0} Scaled Counts Frequencies".format(expt_name),
        "Scaled counts",
        codon_axis_label,
        outputs_plot_fname,
        xlims=outputs_xlims,
        num_bins=500)

    # Plot 3 (optional): log scaled counts; no xlims are passed here
    if not log_plot:
        return
    log_plot_fname = (
        expt_dir
        + "/plots/log_output_freqs.{0}.pdf".format(process_label))
    plot.make_log_outputs_freq_plot(
        tr_outputs,
        "{0} Log Scaled Counts Frequencies".format(expt_name),
        "Log scaled counts",
        codon_axis_label,
        log_plot_fname,
        num_bins=500)
def make_bin_outputs(expt_dir, outputs_fname, cutoff):
    """
    Writes a binary transformed copy of an outputs file

    The transform is delegated to proc.bin_transform_outputs; the intended
    rule is output = 1 if output >= cutoff else 0

    Args:
        expt_dir (str) - name of experiment directory
        outputs_fname (str) - name of original outputs file
        cutoff (float) - scaled counts cutoff to transform to 1, else 0

    Returns:
        void, just writes
        expt_dir + "/process/bin_outputs.cutoff_<cutoff>.txt"
    """
    # Read the original outputs and binarize them against the cutoff
    raw_outputs = proc.load_outputs(outputs_fname)
    binarized = proc.bin_transform_outputs(raw_outputs, cutoff)

    # The cutoff value is embedded in the destination file name
    dest_fname = (
        expt_dir + "/process/bin_outputs.cutoff_{0}.txt".format(cutoff))
    proc.write_outputs(binarized, dest_fname)
def make_log_outputs(expt_dir, outputs_fname, scaled_psct):
    """
    Writes a log transformed copy of an outputs file, using a scaled
    pseudocount

    The transform is delegated to proc.log_transform_outputs, which receives
    the loaded outputs and the scaled pseudocount

    Args:
        expt_dir (str) - name of experiment directory
        outputs_fname (str) - name of original outputs file
        scaled_psct (float) - scaled pseudocount to add before log transform

    Returns:
        void, just writes
        expt_dir + "/process/log_outputs.scaled_psct_<scaled_psct>.txt"
    """
    # Read the original outputs and apply the pseudocount + log transform
    raw_outputs = proc.load_outputs(outputs_fname)
    logged = proc.log_transform_outputs(raw_outputs, scaled_psct)

    # The pseudocount value is embedded in the destination file name
    dest_fname = (
        expt_dir
        + "/process/log_outputs.scaled_psct_{0}.txt".format(scaled_psct))
    proc.write_outputs(logged, dest_fname)
nn_dir = nn_parent_dir + "/" + model_name +\ "_rep{0}".format(model_rep) cod_trunc_5p = 20 cod_trunc_3p = 20 min_tot_cts = 200 min_cod_w_cts = 100 # Load transcriptome dicts len_dict = proc.get_len_dict(gene_len_fname) cds_dict = proc.get_cds_dict(gene_seq_fname, len_dict) struc_dict = proc.get_struc_dict(struc_fname) # Compute cts_by_codon and outputs cts_by_codon = proc.load_cts_by_codon(cts_by_codon_fname) outputs = proc.load_outputs(outputs_fname) paralog_groups = proc.load_paralog_groups(paralog_groups_fname) # Get gene list to compute performance metrics gene_set = cts_by_codon.keys() # Filter genes that are shorter than truncation regions gene_set = filter(lambda gene: len(cts_by_codon[gene]) > (cod_trunc_5p + cod_trunc_3p), gene_set) # Filter genes that don't have enough counts to meet cutoffs gene_set = filter(lambda gene: proc.has_enough_cts( \ cts_by_codon[gene][cod_trunc_5p:-cod_trunc_3p], min_tot_cts, \ min_cod_w_cts), gene_set) # Sort genes by footprint density genes_by_density = proc.sort_genes_by_density( gene_set, cts_by_codon, cod_trunc_5p, cod_trunc_3p, descend=True) # NOTE: I'm not sure if I want to filter out paralogs here.