def make_bin_outputs(expt_dir, outputs_fname, cutoff):
    """
    Writes a binarized copy of an outputs file under expt_dir/process

    The transform itself is delegated to proc.bin_transform_outputs, which is
        expected to map each output to 1 if output >= cutoff, else 0

    Args:
        expt_dir (str) - name of experiment directory
        outputs_fname (str) - name of original outputs file
        cutoff (float) - scaled counts cutoff; also embedded in the name of
            the file that gets written

    Returns:
        void, writes expt_dir/process/bin_outputs.cutoff_<cutoff>.txt
    """
    # read the original outputs and binarize them against the cutoff
    raw_outputs = proc.load_outputs(outputs_fname)
    binarized = proc.bin_transform_outputs(raw_outputs, cutoff)
    # destination file name records the cutoff that was applied
    fname_tail = "/process/bin_outputs.cutoff_{0}.txt".format(cutoff)
    dest_fname = expt_dir + fname_tail
    # NOTE(review): process_sam_file in this file calls
    #   proc.write_outputs(fname, outputs), i.e. the opposite argument
    #   order -- confirm which order proc.write_outputs expects
    proc.write_outputs(binarized, dest_fname)
def make_log_outputs(expt_dir, outputs_fname, scaled_psct):
    """
    Writes a log transformed copy of an outputs file under expt_dir/process

    The transform itself is delegated to proc.log_transform_outputs, which
        receives the outputs together with the scaled pseudocount

    Args:
        expt_dir (str) - name of experiment directory
        outputs_fname (str) - name of original outputs file
        scaled_psct (float) - scaled pseudocount to add before log transform;
            also embedded in the name of the file that gets written

    Returns:
        void, writes
            expt_dir/process/log_outputs.scaled_psct_<scaled_psct>.txt
    """
    # read the original outputs, then pseudocount + log transform them
    raw_outputs = proc.load_outputs(outputs_fname)
    transformed = proc.log_transform_outputs(raw_outputs, scaled_psct)
    # destination file name records the pseudocount that was applied
    fname_tail = "/process/log_outputs.scaled_psct_{0}.txt".format(
        scaled_psct)
    dest_fname = expt_dir + fname_tail
    # NOTE(review): process_sam_file in this file calls
    #   proc.write_outputs(fname, outputs), i.e. the opposite argument
    #   order -- confirm which order proc.write_outputs expects
    proc.write_outputs(transformed, dest_fname)
def process_sam_file(expt_dir, sam_fname, gene_seq_fname, gene_len_fname,
                     shift_dict, cod_trunc_5p, cod_trunc_3p, min_fp_size,
                     max_fp_size, num_tr_genes, num_te_genes,
                     min_cts_per_gene, min_cod_w_data, raw_psct=0,
                     paralog_groups_fname=False, overwrite=False,
                     folds=False):
    """
    Processes an RP sam file for an experiment.

    Makes in expt_dir/process:
        cts_by_codon file (sum sam map wts per codon)
        outputs file (scaled cts_by_codon, each gene mean centered at 1)
        tr_set_bounds file (first and last codon idxs per training set gene)
        te_set_bounds file (first and last codon idxs per test set gene)
    The tr/te files are produced by proc.make_codon_set_files; any further
        files (e.g. data tables) would have to come from that helper and
        are not created directly here.

    Args:
        expt_dir (str) - name of experiment directory
        sam_fname (str) - name of input sam file
        gene_seq_fname (str) - name of transcriptome fasta file
        gene_len_fname (str) - name of gene lengths file
        shift_dict (dict):
            {fp_size (int):
                {frame (int): shift (int, or False)
                    for frame in range(2)}
                for fp_size in range(min_fp_size, max_fp_size + 1) }
        cod_trunc_5p (int): number of codons to exclude at start of each CDS
        cod_trunc_3p (int): number of codons to exclude at end of each CDS
        min_fp_size (int): minimum size footprint to accept in sam file
        max_fp_size (int): maximum size footprint to accept in sam file
        num_tr_genes (int): number of genes to sort into training set
        num_te_genes (int): number of genes to sort into test set
        min_cts_per_gene (int): cutoff for total cts on gene to include gene
            in tr/te sets
        min_cod_w_data (int): cutoff for codons with data to include gene
            in tr/te sets
        raw_psct (float): psct to add to raw cts_by_codon values; when
            truthy it is also recorded in the outputs file name
        paralog_groups_fname (str): file containing one group of mutually
            paralogous genes on each line
            results in filtering out all but one paralog (NOTE which one?)
        overwrite (bool): flag to overwrite processed files, default False.
            Applies to the cts_by_codon and outputs files written here and
            is also forwarded to proc.make_codon_set_files
        folds (bool): currently unused; might implement this later to divide
            genes into folds rather than explicit training/test sets

    Returns:
        void, makes files listed above
    """
    # Load CDS dict, len dict
    len_dict = proc.get_len_dict(gene_len_fname)
    cds_dict = proc.get_cds_dict(gene_seq_fname, len_dict)

    # Process and write cts by codon file
    cts_by_codon = proc.get_cts_by_codon(
        sam_fname, cds_dict, len_dict, shift_dict, min_fp_size, max_fp_size)
    cts_by_codon_fname = expt_dir + \
        "/process/cts_by_codon.size.{0}.{1}.txt".format(
            min_fp_size, max_fp_size)
    # An existing file is kept unless the caller asked for an overwrite;
    #   the freshly computed counts are still used in memory either way
    if overwrite or not os.path.isfile(cts_by_codon_fname):
        print("making file " + cts_by_codon_fname)
        proc.write_cts_by_codon(cts_by_codon_fname, cts_by_codon)
    else:
        print("file " + cts_by_codon_fname + " already exists")

    # Process and write outputs file
    outputs = proc.get_outputs(
        cts_by_codon, cod_trunc_5p, cod_trunc_3p, raw_psct=raw_psct)
    outputs_fname = expt_dir + \
        "/process/outputs.size.{0}.{1}.txt".format(min_fp_size, max_fp_size)
    if raw_psct:
        # swap the trailing ".txt" for a suffix recording the raw psct
        outputs_fname = outputs_fname[:-4] + \
            ".raw_psct.{0}.txt".format(raw_psct)
    if overwrite or not os.path.isfile(outputs_fname):
        print("making file " + outputs_fname)
        proc.write_outputs(outputs_fname, outputs)
    else:
        print("file " + outputs_fname + " already exists")

    # Make training and test set codon files
    # Both file names share the same parameter-encoding suffix
    set_suffix = \
        "{0}.{1}.trunc.{2}.{3}.min_cts.{4}.min_cod.{5}.top.{6}.txt".format(
            min_fp_size, max_fp_size, cod_trunc_5p, cod_trunc_3p,
            min_cts_per_gene, min_cod_w_data, num_tr_genes + num_te_genes)
    tr_set_fname = expt_dir + "/process/tr_set_bounds.size." + set_suffix
    te_set_fname = expt_dir + "/process/te_set_bounds.size." + set_suffix
    print("making file " + tr_set_fname)
    print("making file " + te_set_fname)
    # The helper reads the cts_by_codon and outputs files by name, so they
    #   must already be on disk at this point
    proc.make_codon_set_files(
        cds_dict, cts_by_codon_fname, outputs_fname,
        tr_set_fname, te_set_fname, num_tr_genes, num_te_genes,
        cod_trunc_5p, cod_trunc_3p, min_cts_per_gene, min_cod_w_data,
        paralog_groups_fname=paralog_groups_fname,
        overwrite=overwrite)