Exemple #1
0
def make_bin_outputs(expt_dir, outputs_fname, cutoff):
    """
    Write a binarized copy of an outputs file into expt_dir/process.

    The thresholding itself is delegated to proc.bin_transform_outputs;
    per the original author's notes it maps
        output = 1 if output >= cutoff else 0

    Args:
        expt_dir (str) - experiment directory; the result is written
            beneath its "process" subdirectory
        outputs_fname (str) - path of the original outputs file to load
        cutoff (float) - scaled counts cutoff passed to the transform;
            it is also embedded in the result file name

    Returns:
        None. The transformed outputs are written to
        expt_dir/process/bin_outputs.cutoff_<cutoff>.txt
    """
    # Load the original outputs and threshold them in one step.
    thresholded = proc.bin_transform_outputs(
        proc.load_outputs(outputs_fname), cutoff)
    # The file name records which cutoff produced this file.
    name_suffix = "/process/bin_outputs.cutoff_{0}.txt".format(cutoff)
    # NOTE(review): process_sam_file calls proc.write_outputs(fname, outputs),
    # i.e. the opposite argument order from the call below. One of the two
    # is presumably wrong -- confirm against proc.write_outputs' signature.
    proc.write_outputs(thresholded, expt_dir + name_suffix)
Exemple #2
0
def make_log_outputs(expt_dir, outputs_fname, scaled_psct):
    """
    Write a log transformed copy of an outputs file into expt_dir/process.

    The pseudocount addition and log transform are delegated to
    proc.log_transform_outputs.

    Args:
        expt_dir (str) - experiment directory; the result is written
            beneath its "process" subdirectory
        outputs_fname (str) - path of the original outputs file to load
        scaled_psct (float) - scaled pseudocount passed to the transform
            (added before taking the log, per the original author's notes);
            it is also embedded in the result file name

    Returns:
        None. The transformed outputs are written to
        expt_dir/process/log_outputs.scaled_psct_<scaled_psct>.txt
    """
    # Load the original outputs and log transform them in one step.
    transformed = proc.log_transform_outputs(
        proc.load_outputs(outputs_fname), scaled_psct)
    # The file name records which pseudocount produced this file.
    name_suffix = "/process/log_outputs.scaled_psct_{0}.txt".format(
        scaled_psct)
    # NOTE(review): process_sam_file calls proc.write_outputs(fname, outputs),
    # i.e. the opposite argument order from the call below. One of the two
    # is presumably wrong -- confirm against proc.write_outputs' signature.
    proc.write_outputs(transformed, expt_dir + name_suffix)
Exemple #3
0
def _write_processed_file(fname, write_fn, data, overwrite):
    # Call write_fn(fname, data) unless fname already exists and
    # overwrite is falsy; report on stdout which of the two happened.
    if overwrite or not os.path.isfile(fname):
        print("making file " + fname)
        write_fn(fname, data)
    else:
        print("file " + fname + " already exists")


def process_sam_file(expt_dir,
                     sam_fname,
                     gene_seq_fname,
                     gene_len_fname,
                     shift_dict,
                     cod_trunc_5p,
                     cod_trunc_3p,
                     min_fp_size,
                     max_fp_size,
                     num_tr_genes,
                     num_te_genes,
                     min_cts_per_gene,
                     min_cod_w_data,
                     raw_psct=0,
                     paralog_groups_fname=False,
                     overwrite=False,
                     folds=False):
    """
    Processes an RP sam file for an experiment.

    Files handled directly by this function, all under expt_dir/process:
        cts_by_codon.size.<min>.<max>.txt
            cts by codon (sum sam map wts per codon, per the original notes)
        outputs.size.<min>.<max>[.raw_psct.<raw_psct>].txt
            outputs (scaled cts_by_codon, per the original notes)
    File names handed to proc.make_codon_set_files, which creates them:
        tr_set_bounds.size.<...>.txt  (training set bounds)
        te_set_bounds.size.<...>.txt  (test set bounds)
    The original notes also list tr/te data table files; if they are made,
    it happens inside proc.make_codon_set_files, which is not visible here.

    Args:
        expt_dir (str) - name of experiment directory
        sam_fname (str) - name of input sam file
        gene_seq_fname (str) - name of transcriptome fasta file
        gene_len_fname (str) - name of gene lengths file
        shift_dict (dict):
            {fp_size (int):
                {frame (int): shift (int, or False) for frame in range(2)}
                for fp_size in range(min_fp_size, max_fp_size + 1) }
        cod_trunc_5p (int): number of codons to exclude at start of each CDS
        cod_trunc_3p (int): number of codons to exclude at end of each CDS
        min_fp_size (int): minimum size footprint to accept in sam file
        max_fp_size (int): maximum size footprint to accept in sam file
        num_tr_genes (int): number of genes to sort into training set
        num_te_genes (int): number of genes to sort into test set
        min_cts_per_gene (int):
            cutoff for total cts on gene to include gene in tr/te sets
        min_cod_w_data (int):
            cutoff for codons with data to include gene in tr/te sets
        raw_psct (float): psct to add to raw cts_by_codon values; when
            truthy it is also appended to the outputs file name
        paralog_groups_fname (str):
            file containing one group of mutually paralogous genes on each
            line; forwarded unchanged to proc.make_codon_set_files
        overwrite (bool): flag to overwrite processed files, default False.
            Applies to the cts_by_codon and outputs files written here and
            is also forwarded to proc.make_codon_set_files.
        folds (bool): currently unused; reserved for dividing genes into
            folds rather than explicit training/test sets

    Returns:
        None, makes files listed above
    """
    # Load CDS dict, len dict
    len_dict = proc.get_len_dict(gene_len_fname)
    cds_dict = proc.get_cds_dict(gene_seq_fname, len_dict)

    # Footprint size range shared by the cts_by_codon and outputs file names
    size_tag = "{0}.{1}".format(min_fp_size, max_fp_size)

    # Process and write cts by codon file.
    # The counts are always computed because the outputs below depend on
    # them, even when the file on disk is left untouched.
    cts_by_codon = proc.get_cts_by_codon(sam_fname, cds_dict, len_dict,
                                         shift_dict, min_fp_size, max_fp_size)
    cts_by_codon_fname = expt_dir + \
        "/process/cts_by_codon.size." + size_tag + ".txt"
    _write_processed_file(cts_by_codon_fname, proc.write_cts_by_codon,
                          cts_by_codon, overwrite)

    # Process and write outputs file
    outputs = proc.get_outputs(cts_by_codon,
                               cod_trunc_5p,
                               cod_trunc_3p,
                               raw_psct=raw_psct)
    outputs_fname = expt_dir + "/process/outputs.size." + size_tag + ".txt"
    if raw_psct:
        # Swap the trailing ".txt" for a suffix recording the pseudocount
        outputs_fname = outputs_fname[:-4] + \
            ".raw_psct.{0}.txt".format(raw_psct)
    # NOTE(review): make_bin_outputs / make_log_outputs call
    # proc.write_outputs(outputs, fname); here the order is (fname, outputs),
    # kept as in the original. Confirm against proc.write_outputs' signature.
    _write_processed_file(outputs_fname, proc.write_outputs,
                          outputs, overwrite)

    # Make training and test set codon files.
    # Both names share one suffix encoding every filtering parameter.
    set_suffix = \
        "{0}.{1}.trunc.{2}.{3}.min_cts.{4}.min_cod.{5}.top.{6}.txt".format(
            min_fp_size, max_fp_size, cod_trunc_5p, cod_trunc_3p,
            min_cts_per_gene, min_cod_w_data, num_tr_genes + num_te_genes)
    tr_set_fname = expt_dir + "/process/tr_set_bounds.size." + set_suffix
    te_set_fname = expt_dir + "/process/te_set_bounds.size." + set_suffix
    print("making file " + tr_set_fname)
    print("making file " + te_set_fname)
    proc.make_codon_set_files(cds_dict,
                              cts_by_codon_fname,
                              outputs_fname,
                              tr_set_fname,
                              te_set_fname,
                              num_tr_genes,
                              num_te_genes,
                              cod_trunc_5p,
                              cod_trunc_3p,
                              min_cts_per_gene,
                              min_cod_w_data,
                              paralog_groups_fname=paralog_groups_fname,
                              overwrite=overwrite)