def make_temp_chosen_genes_filepaths(temp_sampled_genes_dirpath, pdf, cdf, num_diff_expr, gene_names, FPCAtest_output_genenames):
    """Write the mock 'differentially expressed genes' files for one patient.

    Two kinds of files are written into ``temp_sampled_genes_dirpath``:

    1. A mock 'FPCAtest_output_as_table' file holding one line per name in
       ``FPCAtest_output_genenames`` (the names that were found in the real
       'FPCAtest_output_as_table' file), written as ``<name>\\tblah``.
    2. One empty '<gene>_plot.pdf' file for each of ``num_diff_expr`` genes
       sampled without replacement from the cdf (which was made from the
       gene frequencies at day0).

    Args:
        temp_sampled_genes_dirpath: Directory prefix the file names are
            appended to by plain string concatenation, so it has to end with
            a path separator. ``open`` does not create directories, so the
            directory must already exist.
        pdf: List of per-gene probabilities, parallel to ``gene_names``. The
            sampled entry is deleted from the list that is passed in, so
            callers should hand over a copy.
        cdf: Cumulative distribution used for the first draw; it is
            recomputed from ``pdf`` after every draw.
        num_diff_expr: Number of genes to sample.
        gene_names: List of gene names, parallel to ``pdf``. Sampled names are
            deleted in place.
        FPCAtest_output_genenames: Iterable of gene-name strings for the mock
            table file.
    """
    # 'with' guarantees the handle is flushed and closed even if a write
    # fails; the function is called once per patient per trial, so leaked
    # handles would accumulate quickly.
    table_filepath = temp_sampled_genes_dirpath + 'FPCAtest_output_as_table'
    with open(table_filepath, 'w') as fileout:
        for gene_name in FPCAtest_output_genenames:
            fileout.write(gene_name + '\tblah\n')

    # range (not xrange) keeps this loop valid on both Python 2 and Python 3.
    for _ in range(num_diff_expr):
        chosen_index = simulate_removing_reads.inverse_transform_sampling(cdf)

        # Remove the drawn gene so it cannot be drawn again, then rebuild the
        # distributions over the genes that are left.
        del pdf[chosen_index]
        # NOTE(review): get_day0_pdf is given a file path elsewhere in this
        # file but a list here -- presumably it re-normalises the list;
        # confirm against its definition.
        pdf = get_day0_pdf(pdf)
        cdf = simulate_removing_reads.convert_pdf2cdf(pdf)

        chosen_gene = gene_names.pop(chosen_index)
        # '/' cannot appear inside a file name.
        chosen_gene = chosen_gene.replace('/', '_')
        temp_chosen_gene_filepath = temp_sampled_genes_dirpath + chosen_gene + '_plot.pdf'

        # Only the existence of the file matters; create it empty and close
        # it straight away.
        with open(temp_chosen_gene_filepath, 'w'):
            pass
def make_temp_chosen_genes_filepaths(temp_sampled_genes_dirpath, pdf, cdf,
                                     num_diff_expr, gene_names,
                                     FPCAtest_output_genenames):
    """Write the mock 'differentially expressed genes' files for one patient.

    Two kinds of files are written into ``temp_sampled_genes_dirpath``:

    1. A mock 'FPCAtest_output_as_table' file holding one line per name in
       ``FPCAtest_output_genenames`` (the names that were found in the real
       'FPCAtest_output_as_table' file), written as ``<name>\\tblah``.
    2. One empty '<gene>_plot.pdf' file for each of ``num_diff_expr`` genes
       sampled without replacement from the cdf (which was made from the
       gene frequencies at day0).

    Args:
        temp_sampled_genes_dirpath: Directory prefix the file names are
            appended to by plain string concatenation, so it has to end with
            a path separator. ``open`` does not create directories, so the
            directory must already exist.
        pdf: List of per-gene probabilities, parallel to ``gene_names``. The
            sampled entry is deleted from the list that is passed in, so
            callers should hand over a copy.
        cdf: Cumulative distribution used for the first draw; it is
            recomputed from ``pdf`` after every draw.
        num_diff_expr: Number of genes to sample.
        gene_names: List of gene names, parallel to ``pdf``. Sampled names are
            deleted in place.
        FPCAtest_output_genenames: Iterable of gene-name strings for the mock
            table file.
    """
    # 'with' guarantees the handle is flushed and closed even if a write
    # fails; the function is called once per patient per trial, so leaked
    # handles would accumulate quickly.
    table_filepath = temp_sampled_genes_dirpath + 'FPCAtest_output_as_table'
    with open(table_filepath, 'w') as fileout:
        for gene_name in FPCAtest_output_genenames:
            fileout.write(gene_name + '\tblah\n')

    # range (not xrange) keeps this loop valid on both Python 2 and Python 3.
    for _ in range(num_diff_expr):
        chosen_index = simulate_removing_reads.inverse_transform_sampling(cdf)

        # Remove the drawn gene so it cannot be drawn again, then rebuild the
        # distributions over the genes that are left.
        del pdf[chosen_index]
        # NOTE(review): get_day0_pdf is given a file path elsewhere in this
        # file but a list here -- presumably it re-normalises the list;
        # confirm against its definition.
        pdf = get_day0_pdf(pdf)
        cdf = simulate_removing_reads.convert_pdf2cdf(pdf)

        chosen_gene = gene_names.pop(chosen_index)
        # '/' cannot appear inside a file name.
        chosen_gene = chosen_gene.replace('/', '_')
        temp_chosen_gene_filepath = temp_sampled_genes_dirpath + chosen_gene + '_plot.pdf'

        # Only the existence of the file matters; create it empty and close
        # it straight away.
        with open(temp_chosen_gene_filepath, 'w'):
            pass
def run(freq_prog_dirpath, gene_name_order_dirpath, diff_expr_genes_dirpath,
        output_dirpath, sim_row_sums_dirpath, trials, gene_classes):
    """Simulate null sets of genes sampled by day0 frequency and test them for independence.

    For each patient a group of genes is sampled based upon the frequency of
    the genes at day0. The number of genes selected for each patient equals
    the number of genes that were significantly differentially expressed
    according to the FPCAtest. The idea is to simulate a set of genes that is
    analogous to the set of differentially expressed genes, except that the
    simulated set is selected purely on frequency before the vaccine
    response (day0). Once this null set is selected, the
    'each_gene_sig_or_not' file is created from it, and the tests for
    independence (both the G test and Fisher's exact test) are run on that
    file. The sampling and the downstream tests are repeated 'trials' times.

    The function is long because, within a sample type, it first has to cycle
    through every patient to get the gene frequencies at day0, and only then
    can it build the simulated gene sets. That leads to a fairly deep loop
    structure.

    NOTE(review): several statements in this function are incomplete as it
    stands (empty 'if' bodies, calls whose callee is missing). They are
    flagged inline and need to be restored before the function can run.

    Args:
        freq_prog_dirpath: Directory that contains the frequency progressions
            for each of the genes.
        gene_name_order_dirpath: Directory that contains the gene name order
            files corresponding to the frequency progression files.
        diff_expr_genes_dirpath: Directory that contains the plots of gene
            expression functions for the differentially expressed genes.
        output_dirpath: Where the output goes: a long list of the p values
            from the tests for independence.
        sim_row_sums_dirpath: Output directory where the simulated
            'sig_or_not' information is written. This was added later; the
            information is a simulated null distribution that is saved to
            disk so it can be compared with the observed one afterwards.
        trials: Number of times to sample the genes and run the tests.
        gene_classes: List of names, each one a gene class to run this
            module on.
    """
    # Paths below are built by plain string concatenation, so every directory
    # path is made to end with '/'.
    if freq_prog_dirpath[-1] != '/':
        freq_prog_dirpath += '/'
    if gene_name_order_dirpath[-1] != '/':
        gene_name_order_dirpath += '/'
    # NOTE(review): the next two checks compare the whole path with '/'
    # instead of its last character, so '/' is appended even when the path
    # already ends with one (unless the path is exactly '/'). Probably meant
    # to index [-1] like the checks above -- confirm.
    if diff_expr_genes_dirpath != '/':
        diff_expr_genes_dirpath += '/'
    if output_dirpath != '/':
        output_dirpath += '/'
    # NOTE(review): the body of this 'if' is missing; presumably it created
    # the output directory -- confirm.
    if not os.path.exists(output_dirpath):

    if sim_row_sums_dirpath[-1] != '/':
        sim_row_sums_dirpath += '/'
    # NOTE(review): the body of this 'if' is missing; presumably it created
    # the directory -- confirm.
    if not os.path.exists(sim_row_sums_dirpath):

    # The two scratch directories are siblings of the output directory:
    # dirname() of the output path without its trailing '/' is its parent.
    # NOTE(review): in both 'if' statements below the callee in front of the
    # ['rm', '-r', ...] argument list is missing; it looks like a command
    # invocation that deletes a stale scratch directory -- confirm.
    temp_samplegenes_dirpath = os.path.dirname(output_dirpath[:-1]) + '/temp/'
    if os.path.exists(temp_samplegenes_dirpath):['rm', '-r', temp_samplegenes_dirpath])
    temp_is_sig_or_not_dirpath = os.path.dirname(
        output_dirpath[:-1]) + '/sig_or_not/'
    if os.path.exists(temp_is_sig_or_not_dirpath):['rm', '-r', temp_is_sig_or_not_dirpath])
    # For each sample type (one directory entry of freq_prog_dirpath each).
    for i in os.listdir(freq_prog_dirpath):
        # NOTE(review): the body of this 'if' is missing; presumably hidden
        # files and the README are skipped -- confirm.
        if i[0] == '.' or i == 'README':
        print '--------------------', i, '--------------------'
        output_sampletype_dirpath = output_dirpath + i + '/'
        # NOTE(review): 'if' body missing (presumably creates the directory).
        if not os.path.exists(output_sampletype_dirpath):

        sim_row_sums_sampletype_dirpath = sim_row_sums_dirpath + i + '/'
        # NOTE(review): 'if' body missing (presumably creates the directory).
        if not os.path.exists(sim_row_sums_sampletype_dirpath):

        # For each gene class.
        for gene_class in gene_classes:
            print '============='
            print gene_class
            print '============='
            # Per-patient accumulators; the sampling loop further down reads
            # them by patient index.
            # NOTE(review): no statement that appends to these lists is
            # visible in the patient loop below -- confirm they are filled.
            pdfs = []
            gene_name_lists = []
            num_diff_expr_list = []
            FPCAtest_output_genename_lists = []
            cdfs = []
            patient_ids = []
            # Set to a truthy value as soon as one patient has an empty pdf.
            no_genes = None
            # Cycle through the patients to get each patient's cdf and such.
            for j in os.listdir(freq_prog_dirpath + i):
                # NOTE(review): 'if' body missing (presumably skips the entry).
                if j[0] == '.' or j == 'README':
                output_patient_dirpath = output_sampletype_dirpath + j + '/'
                # NOTE(review): 'if' body missing (presumably creates the
                # directory).
                if not os.path.exists(output_patient_dirpath):
                freq_prog_filepath = freq_prog_dirpath + i + '/' + j + '/' + gene_class
                pdf = get_day0_pdf(freq_prog_filepath)
                if len(pdf) == 0:
                    no_genes = 'yup'
                gene_name_order_filepath = gene_name_order_dirpath + i + '/' + j + '/' + gene_class
                gene_names = get_gene_names(gene_name_order_filepath)
                diff_expr_genes_subdirpath = diff_expr_genes_dirpath + i + '/' + j + '/' + gene_class + '/'
                # NOTE(review): this call is cut off after the opening
                # parenthesis; the argument is presumably
                # diff_expr_genes_subdirpath -- confirm.
                num_diff_expr, FPCAtest_output_genenames = get_num_diff_expr(
                cdf = simulate_removing_reads.convert_pdf2cdf(pdf)
            # NOTE(review): the message says the whole gene class is aborted,
            # but no statement that skips the rest of this iteration is
            # visible here.
            if no_genes:
                print 'No genes found for', gene_class, 'in', j, 'so whole gene class aborted for', i
            # Now it's time to sample from the cdfs (without replacement) to
            # create lists of genes sampled from day 0.
            # First prime the p values dictionaries with an empty list for
            # each patient.
            pvalues_dic_G = {}
            pvalues_dic_fisher = {}
            for j in patient_ids:
                pvalues_dic_G[j] = []
                pvalues_dic_fisher[j] = []

            # The index number in this list corresponds to each possible row
            # sum, and the value corresponds to the count for that row sum.
            row_sums = [0 for j in xrange(len(patient_ids) + 1)]

            # For each trial.
            for j in xrange(trials):
                # Make the tree of temp input files for each patient.
                temp_samplegenes_sampletype_dirpath = temp_samplegenes_dirpath + i + '/'
                # Cycle through the patients to sample from each cdf.
                for k in xrange(len(cdfs)):
                    temp_samplegenes_patient_dirpath = temp_samplegenes_sampletype_dirpath + patient_ids[
                        k] + '/'
                    temp_samplegenes_final_dirpath = temp_samplegenes_patient_dirpath + gene_class + '/'
                    # Now sample genes from day0. Slice copies are taken so
                    # the stored per-patient lists are not modified.
                    pdf = pdfs[k][:]
                    cdf = cdfs[k][:]
                    gene_names = gene_name_lists[k][:]
                    # This is what samples the genes from the cdf.
                    # NOTE(review): the callee name and the end of the
                    # argument list are missing from the next two lines; the
                    # arguments match the leading parameters of
                    # make_temp_chosen_genes_filepaths -- confirm.
                        temp_samplegenes_final_dirpath, pdf, cdf,
                        num_diff_expr_list[k], gene_names,
                    # NOTE(review): 'ds' is not read anywhere in the visible
                    # code.
                    ds = os.listdir(temp_samplegenes_final_dirpath)
                # Now assess independence between patients (i.e. get p values).

                # NOTE(review): the callee that produces genename_dic is
                # missing; only its argument list is left.
                genename_dic =
                    temp_samplegenes_dirpath, temp_is_sig_or_not_dirpath,
                    [gene_class], 'yes')
                # Update the row sum counts.
                row_sums = update_row_sums(genename_dic, row_sums)

                sig_or_not_filepath = temp_is_sig_or_not_dirpath + i + '/' + gene_class
                for k in xrange(
                        len(patient_ids)):  # get p values for each patient
                    # NOTE(review): the callee that returns the two p values
                    # is missing, and nothing visible stores them in
                    # pvalues_dic_G / pvalues_dic_fisher.
                    pvalue_G, pvalue_fisher =
                        sig_or_not_filepath, 0, 0, k)
                # Now remove the temp files.
                # NOTE(review): the callee in front of both argument lists
                # below is missing and their indentation is off.
      ['rm', '-r', temp_samplegenes_dirpath + i])
      ['rm', '-r', temp_is_sig_or_not_dirpath])

            # Now write the row sum counts to file for this gene_class.
            sim_row_sums_filepath = sim_row_sums_sampletype_dirpath + gene_class
            write_row_sum_counts(row_sums, sim_row_sums_filepath)

            # These are the P values for the Gtest for independence results of the
            # genes that are actually differentially expressed
            # NOTE(review): a second, callee-less ['rm', '-r', ...] argument
            # list is fused onto the end of the write_output call below.
            write_output(pvalues_dic_G, pvalues_dic_fisher,
                         output_sampletype_dirpath, patient_ids, gene_class)['rm', '-r', temp_samplegenes_dirpath])
def run(freq_prog_dirpath, gene_name_order_dirpath, diff_expr_genes_dirpath, output_dirpath, sim_row_sums_dirpath, trials, gene_classes):
    """Simulate null sets of genes sampled by day0 frequency and test them for independence.

    For each patient a group of genes is sampled based upon the frequency of
    the genes at day0. The number of genes selected for each patient equals
    the number of genes that were significantly differentially expressed
    according to the FPCAtest. The idea is to simulate a set of genes that is
    analogous to the set of differentially expressed genes, except that the
    simulated set is selected purely on frequency before the vaccine
    response (day0). Once this null set is selected, the
    'each_gene_sig_or_not' file is created from it, and the tests for
    independence (both the G test and Fisher's exact test) are run on that
    file. The sampling and the downstream tests are repeated 'trials' times.

    The function is long because, within a sample type, it first has to cycle
    through every patient to get the gene frequencies at day0, and only then
    can it build the simulated gene sets. That leads to a fairly deep loop
    structure.

    NOTE(review): several statements in this function are incomplete as it
    stands (empty 'if' bodies, calls whose callee is missing). They are
    flagged inline and need to be restored before the function can run.

    Args:
        freq_prog_dirpath: Directory that contains the frequency progressions
            for each of the genes.
        gene_name_order_dirpath: Directory that contains the gene name order
            files corresponding to the frequency progression files.
        diff_expr_genes_dirpath: Directory that contains the plots of gene
            expression functions for the differentially expressed genes.
        output_dirpath: Where the output goes: a long list of the p values
            from the tests for independence.
        sim_row_sums_dirpath: Output directory where the simulated
            'sig_or_not' information is written. This was added later; the
            information is a simulated null distribution that is saved to
            disk so it can be compared with the observed one afterwards.
        trials: Number of times to sample the genes and run the tests.
        gene_classes: List of names, each one a gene class to run this
            module on.
    """
    # Paths below are built by plain string concatenation, so every directory
    # path is made to end with '/'.
    if freq_prog_dirpath[-1] != '/':
        freq_prog_dirpath += '/'
    if gene_name_order_dirpath[-1] != '/':
        gene_name_order_dirpath += '/'
    # NOTE(review): the next two checks compare the whole path with '/'
    # instead of its last character, so '/' is appended even when the path
    # already ends with one (unless the path is exactly '/'). Probably meant
    # to index [-1] like the checks above -- confirm.
    if diff_expr_genes_dirpath != '/':
        diff_expr_genes_dirpath += '/'
    if output_dirpath != '/':
        output_dirpath += '/'
    # NOTE(review): the body of this 'if' is missing; presumably it created
    # the output directory -- confirm.
    if not os.path.exists(output_dirpath):

    if sim_row_sums_dirpath[-1] != '/':
        sim_row_sums_dirpath += '/'
    # NOTE(review): the body of this 'if' is missing; presumably it created
    # the directory -- confirm.
    if not os.path.exists(sim_row_sums_dirpath):

    # The two scratch directories are siblings of the output directory:
    # dirname() of the output path without its trailing '/' is its parent.
    # NOTE(review): in both 'if' statements below the callee in front of the
    # ['rm', '-r', ...] argument list is missing; it looks like a command
    # invocation that deletes a stale scratch directory -- confirm.
    temp_samplegenes_dirpath = os.path.dirname(output_dirpath[:-1]) + '/temp/'
    if os.path.exists(temp_samplegenes_dirpath):['rm', '-r', temp_samplegenes_dirpath])
    temp_is_sig_or_not_dirpath = os.path.dirname(output_dirpath[:-1]) + '/sig_or_not/'
    if os.path.exists(temp_is_sig_or_not_dirpath):['rm', '-r', temp_is_sig_or_not_dirpath])
    # For each sample type (one directory entry of freq_prog_dirpath each).
    for i in os.listdir(freq_prog_dirpath):
        # NOTE(review): the body of this 'if' is missing; presumably hidden
        # files and the README are skipped -- confirm.
        if i[0] == '.' or i == 'README':
        print '--------------------', i, '--------------------'
        output_sampletype_dirpath = output_dirpath + i + '/'
        # NOTE(review): 'if' body missing (presumably creates the directory).
        if not os.path.exists(output_sampletype_dirpath):

        sim_row_sums_sampletype_dirpath = sim_row_sums_dirpath + i + '/'
        # NOTE(review): 'if' body missing (presumably creates the directory).
        if not os.path.exists(sim_row_sums_sampletype_dirpath):

        # For each gene class.
        for gene_class in gene_classes:
            print '============='
            print gene_class
            print '============='
            # Per-patient accumulators; the sampling loop further down reads
            # them by patient index.
            # NOTE(review): no statement that appends to these lists is
            # visible in the patient loop below -- confirm they are filled.
            pdfs = []
            gene_name_lists = []
            num_diff_expr_list = []
            FPCAtest_output_genename_lists = []
            cdfs = []
            patient_ids = []
            # Set to a truthy value as soon as one patient has an empty pdf.
            no_genes = None
            # Cycle through the patients to get each patient's cdf and such.
            for j in os.listdir(freq_prog_dirpath + i):
                # NOTE(review): 'if' body missing (presumably skips the entry).
                if j[0] == '.' or j == 'README':
                output_patient_dirpath = output_sampletype_dirpath + j + '/'
                # NOTE(review): 'if' body missing (presumably creates the
                # directory).
                if not os.path.exists(output_patient_dirpath):
                freq_prog_filepath = freq_prog_dirpath + i + '/' + j + '/' + gene_class
                pdf = get_day0_pdf(freq_prog_filepath)
                if len(pdf) == 0:
                    no_genes = 'yup'
                gene_name_order_filepath = gene_name_order_dirpath + i + '/' + j + '/' + gene_class
                gene_names = get_gene_names(gene_name_order_filepath)
                diff_expr_genes_subdirpath = diff_expr_genes_dirpath + i + '/' + j + '/' + gene_class + '/'
                num_diff_expr, FPCAtest_output_genenames = get_num_diff_expr(diff_expr_genes_subdirpath)
                cdf = simulate_removing_reads.convert_pdf2cdf(pdf)
            # NOTE(review): the message says the whole gene class is aborted,
            # but no statement that skips the rest of this iteration is
            # visible here.
            if no_genes:
                print 'No genes found for', gene_class, 'in', j, 'so whole gene class aborted for', i
            # Now it's time to sample from the cdfs (without replacement) to
            # create lists of genes sampled from day 0.
            # First prime the p values dictionaries with an empty list for
            # each patient.
            pvalues_dic_G = {}
            pvalues_dic_fisher = {}
            for j in patient_ids:
                pvalues_dic_G[j] = []
                pvalues_dic_fisher[j] = []

            # The index number in this list corresponds to each possible row
            # sum, and the value corresponds to the count for that row sum.
            row_sums = [0 for j in xrange(len(patient_ids) + 1)]

            # For each trial.
            for j in xrange(trials):
                # Make the tree of temp input files for each patient.
                temp_samplegenes_sampletype_dirpath = temp_samplegenes_dirpath + i + '/'
                # Cycle through the patients to sample from each cdf.
                for k in xrange(len(cdfs)):
                    temp_samplegenes_patient_dirpath = temp_samplegenes_sampletype_dirpath + patient_ids[k] + '/'
                    temp_samplegenes_final_dirpath = temp_samplegenes_patient_dirpath + gene_class + '/'
                    # Now sample genes from day0. Slice copies are taken so
                    # the stored per-patient lists are not modified.
                    pdf = pdfs[k][:]
                    cdf = cdfs[k][:]
                    gene_names = gene_name_lists[k][:]
                    # This is what samples the genes from the cdf.
                    make_temp_chosen_genes_filepaths(temp_samplegenes_final_dirpath, pdf, cdf, num_diff_expr_list[k], gene_names, FPCAtest_output_genename_lists[k])
                    # NOTE(review): 'ds' is not read anywhere in the visible
                    # code.
                    ds = os.listdir(temp_samplegenes_final_dirpath)
                # Now assess independence between patients (i.e. get p values).
                # NOTE(review): the callee that produces genename_dic and its
                # first argument are missing; only the tail of the argument
                # list is left.
                genename_dic =, temp_is_sig_or_not_dirpath, [gene_class], 'yes')
                # Update the row sum counts.
                row_sums = update_row_sums(genename_dic, row_sums)
                sig_or_not_filepath = temp_is_sig_or_not_dirpath + i + '/' + gene_class
                for k in xrange(len(patient_ids)): #get p values for each patient
                    # NOTE(review): the callee that returns the two p values
                    # and its first argument are missing, and nothing visible
                    # stores the values in pvalues_dic_G / pvalues_dic_fisher.
                    pvalue_G, pvalue_fisher =, 0, 0, k)
                # Now remove the temp files.
                # NOTE(review): the callee in front of both argument lists
                # below is missing and their indentation is off.
      ['rm', '-r', temp_samplegenes_dirpath + i])
      ['rm', '-r', temp_is_sig_or_not_dirpath])

            # Now write the row sum counts to file for this gene_class.
            sim_row_sums_filepath = sim_row_sums_sampletype_dirpath + gene_class
            write_row_sum_counts(row_sums, sim_row_sums_filepath)

            # These are the P values for the Gtest for independence results of the
            # genes that are actually differentially expressed
            # NOTE(review): a second, callee-less ['rm', '-r', ...] argument
            # list is fused onto the end of the write_output call below.
            write_output(pvalues_dic_G, pvalues_dic_fisher, output_sampletype_dirpath, patient_ids, gene_class)['rm', '-r', temp_samplegenes_dirpath])