Example #1
    def preprocess_data(self, position, data):
        (K,N) = data.shape

        if self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data, factors) = norm_tools.normalize_data(data, self.normalization, self.ctrldata+self.expdata, self.annotation_path)

        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            for j in range(K):
                data[j] = stat_tools.loess_correction(position, data[j])

        return data
Example #2
    def Run(self):

        #if not self.wxobj:
        #    # Force matplotlib to use good backend for png.
        #    import matplotlib.pyplot as plt
        #elif "matplotlib.pyplot" not in sys.modules:
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            print("Error: cannot do histograms")
            self.doHistogram = False

        self.transit_message("Starting resampling Method")
        start_time = time.time()

        if self.doHistogram:
            histPath = os.path.join(
                os.path.dirname(self.output.name),
                transit_tools.fetch_name(self.output.name) + "_histograms")
            if not os.path.isdir(histPath):
                os.makedirs(histPath)
        else:
            histPath = ""

        Kctrl = len(self.ctrldata)
        Kexp = len(self.expdata)
        #Get orf data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata +
                                                            self.expdata,
                                                            wxobj=self.wxobj)

        (K, N) = data.shape

        if self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data,
             factors) = norm_tools.normalize_data(data, self.normalization,
                                                  self.ctrldata + self.expdata,
                                                  self.annotation_path)

        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            for j in range(K):
                data[j] = stat_tools.loess_correction(position, data[j])

        G = tnseq_tools.Genes(self.ctrldata + self.expdata,
                              self.annotation_path,
                              ignoreCodon=self.ignoreCodon,
                              nterm=self.NTerminus,
                              cterm=self.CTerminus,
                              data=data,
                              position=position)

        #G = tnseq_tools.Genes(self.ctrldata+self.expdata, self.annotation_path, norm=self.normalization, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus)

        #Resampling
        data = []
        N = len(G)
        count = 0
        self.progress_range(N)
        for gene in G:
            count += 1
            if gene.k == 0 or gene.n == 0:
                (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,
                 pval_2tail, testlist, data1, data2) = (0, 0, 0, 0, 1.00, 1.00,
                                                        1.00, [], [0], [0])
            else:

                if not self.includeZeros:
                    ii = numpy.sum(gene.reads, 0) > 0
                else:
                    ii = numpy.ones(gene.n) == 1
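                # ii is a boolean mask over this gene's TA sites: by default only
                # sites with at least one read in some dataset are kept, while
                # includeZeros keeps every site.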

                data1 = gene.reads[:Kctrl, ii].flatten() + self.pseudocount
                data2 = gene.reads[Kctrl:, ii].flatten() + self.pseudocount

                (test_obs, mean1, mean2, log2FC, pval_ltail, pval_utail,
                 pval_2tail, testlist) = stat_tools.resampling(
                     data1,
                     data2,
                     S=self.samples,
                     testFunc=stat_tools.F_mean_diff_flat,
                     adaptive=self.adaptive)
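                # stat_tools.resampling performs a permutation test on the difference
                # of means between the two conditions over self.samples permutations;
                # test_obs is the observed statistic and testlist holds the permuted
                # statistics (the "adaptive" flag presumably allows early stopping).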

            if self.doHistogram:
                import matplotlib.pyplot as plt
                if testlist:
                    n, bins, patches = plt.hist(testlist,
                                                density=1,
                                                facecolor='c',
                                                alpha=0.75,
                                                bins=100)
                else:
                    n, bins, patches = plt.hist([0, 0],
                                                density=1,
                                                facecolor='c',
                                                alpha=0.75,
                                                bins=100)
                plt.xlabel('Delta Mean')
                plt.ylabel('Probability')
                plt.title('%s - Histogram of Delta Mean' % gene.orf)
                plt.axvline(test_obs,
                            color='r',
                            linestyle='dashed',
                            linewidth=3)
                plt.grid(True)
                genePath = os.path.join(histPath, gene.orf + ".png")
                if not os.path.exists(histPath):
                    os.makedirs(histPath)
                plt.savefig(genePath)
                plt.clf()

            sum1 = numpy.sum(data1)
            sum2 = numpy.sum(data2)
            data.append([
                gene.orf, gene.name, gene.desc, gene.n, mean1, mean2, sum1,
                sum2, test_obs, log2FC, pval_2tail
            ])

            # Update progress
            text = "Running Resampling Method... %5.1f%%" % (100.0 * count / N)
            self.progress_update(text, count)

        #
        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Performing Benjamini-Hochberg Correction")
        data.sort()
        qval = stat_tools.BH_fdr_correction([row[-1] for row in data])

        self.output.write("#Resampling\n")
        if self.wxobj:
            members = sorted([
                attr for attr in dir(self) if not callable(getattr(self, attr))
                and not attr.startswith("__")
            ])
            memberstr = ""
            for m in members:
                memberstr += "%s = %s, " % (m, getattr(self, m))
            self.output.write(
                "#GUI with: norm=%s, samples=%s, pseudocounts=%1.2f, adaptive=%s, histogram=%s, includeZeros=%s, output=%s\n"
                % (self.normalization, self.samples, self.pseudocount,
                   self.adaptive, self.doHistogram, self.includeZeros,
                   self.output.name.encode('utf-8')))
        else:
            self.output.write("#Console: python %s\n" % " ".join(sys.argv))
        self.output.write("#Control Data: %s\n" %
                          (",".join(self.ctrldata).encode('utf-8')))
        self.output.write("#Experimental Data: %s\n" %
                          (",".join(self.expdata).encode('utf-8')))
        self.output.write("#Annotation path: %s\n" %
                          (self.annotation_path.encode('utf-8')))
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        self.output.write("#%s\n" % "\t".join(columns))

        for i, row in enumerate(data):
            (orf, name, desc, n, mean1, mean2, sum1, sum2, test_obs, log2FC,
             pval_2tail) = row
            self.output.write(
                "%s\t%s\t%s\t%d\t%1.1f\t%1.1f\t%1.2f\t%1.1f\t%1.2f\t%1.1f\t%1.5f\t%1.5f\n"
                % (orf, name, desc, n, mean1, mean2, log2FC, sum1, sum2,
                   test_obs, pval_2tail, qval[i]))
        self.output.close()

        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="Resampling")
        self.finish()
        self.transit_message("Finished resampling Method")
Example #3
    def Run(self):

        self.transit_message("Starting HMM Method")
        start_time = time.time()

        #Get data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata,
                                                            wxobj=self.wxobj)
        (K, N) = data.shape

        # Normalize data
        if self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data,
             factors) = norm_tools.normalize_data(data, self.normalization,
                                                  self.ctrldata,
                                                  self.annotation_path)

        # Do LOESS
        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            for j in range(K):
                data[j] = stat_tools.loess_correction(position, data[j])

        hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        if len(self.ctrldata) > 1:
            self.transit_message("Combining Replicates as '%s'" %
                                 self.replicates)
        O = tnseq_tools.combine_replicates(
            data, method=self.replicates
        ) + 1  # Adding 1 to because of shifted geometric in scipy

        #Parameters
        Nstates = 4
        label = {0: "ES", 1: "GD", 2: "NE", 3: "GA"}

        reads = O - 1
        reads_nz = sorted(reads[reads != 0])
        size = len(reads_nz)
        mean_r = numpy.average(reads_nz[:int(0.95 * size)])
        mu = numpy.array([1 / 0.99, 0.01 * mean_r + 2, mean_r, mean_r * 5.0])
        #mu = numpy.array([1/0.99, 0.1 * mean_r + 2,  mean_r, mean_r*5.0])
        L = 1.0 / mu
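        # scipy.stats.geom(p) is supported on {1, 2, ...} with mean 1/p, so p = 1/mu
        # gives each state an emission distribution over the shifted counts O with
        # mean mu[i] (ES near 1, GD a small fraction of the mean, NE the mean,
        # GA five times the mean).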
        B = []  # Emission Probability Distributions
        for i in range(Nstates):
            B.append(scipy.stats.geom(L[i]).pmf)

        pins = self.calculate_pins(O - 1)
        pins_obs = sum([1 for rd in O if rd >= 2]) / float(len(O))
        pnon = 1.0 - pins
        pnon_obs = 1.0 - pins_obs
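        # pins/pins_obs are the estimated and observed insertion densities
        # (pnon/pnon_obs their complements); O >= 2 corresponds to a raw count >= 1
        # because O was shifted by +1.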

        for r in range(100):
            if pnon**r < 0.01: break
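        # r = smallest run length at which r consecutive non-insertion sites becomes
        # unlikely (< 1%) under the estimated insertion density; it is used below to
        # set the (log) transition probabilities.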

        A = numpy.zeros((Nstates, Nstates))
        a = math.log1p(-B[int(Nstates / 2)](1)**r)
        b = r * math.log(B[int(Nstates / 2)](1)) + math.log(
            1.0 / 3)  # change to Nstates-1?
        for i in range(Nstates):
            A[i] = [b] * Nstates
            A[i][i] = a
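
        # A is filled with log-probabilities and strongly favors self-transitions;
        # the forward/backward procedures below are handed numpy.exp(A), while the
        # Viterbi call receives A as-is (presumably working in log space).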

        PI = numpy.zeros(Nstates)  # Initial state distribution
        PI[0] = 0.7
        PI[1:] = 0.3 / (Nstates - 1)

        self.progress_range(self.maxiterations)

        ###############
        ### VITERBI ###
        (Q_opt, delta, Q) = self.viterbi(A, B, PI, O)
        ###############

        ##################
        ### ALPHA PASS ###
        (log_Prob_Obs, alpha,
         C) = self.forward_procedure(numpy.exp(A), B, PI, O)
        ##################

        #################
        ### BETA PASS ###
        beta = self.backward_procedure(numpy.exp(A), B, PI, O, C)
        #################

        T = len(O)
        total = 0
        state2count = dict.fromkeys(range(Nstates), 0)
        for t in range(T):
            state = Q_opt[t]
            state2count[state] += 1
            total += 1

        self.output.write("#HMM - Sites\n")
        self.output.write("# Tn-HMM\n")

        if self.wxobj:
            members = sorted([
                attr for attr in dir(self) if not callable(getattr(self, attr))
                and not attr.startswith("__")
            ])
            memberstr = ""
            for m in members:
                memberstr += "%s = %s, " % (m, getattr(self, m))
            self.output.write(
                "#GUI with: ctrldata=%s, annotation=%s, output=%s\n" %
                (",".join(self.ctrldata).encode('utf-8'),
                 self.annotation_path.encode('utf-8'),
                 self.output.name.encode('utf-8')))
        else:
            self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))

        self.output.write("# \n")
        self.output.write("# Mean:\t%2.2f\n" % (numpy.average(reads_nz)))
        self.output.write("# Median:\t%2.2f\n" % numpy.median(reads_nz))
        self.output.write("# Normalization:\t%s\n" % self.normalization)
        self.output.write("# LOESS Correction:\t%s\n" % str(self.LOESS))
        self.output.write("# pins (obs):\t%f\n" % pins_obs)
        self.output.write("# pins (est):\t%f\n" % pins)
        self.output.write("# Run length (r):\t%d\n" % r)
        self.output.write("# State means:\n")
        self.output.write("#    %s\n" % "   ".join(
            ["%s: %8.4f" % (label[i], mu[i]) for i in range(Nstates)]))
        self.output.write("# Self-Transition Prob:\n")
        self.output.write("#    %s\n" % "   ".join(
            ["%s: %2.4e" % (label[i], A[i][i]) for i in range(Nstates)]))
        self.output.write("# State Emission Parameters (theta):\n")
        self.output.write("#    %s\n" % "   ".join(
            ["%s: %1.4f" % (label[i], L[i]) for i in range(Nstates)]))
        self.output.write("# State Distributions:")
        self.output.write("#    %s\n" % "   ".join([
            "%s: %2.2f%%" % (label[i], state2count[i] * 100.0 / total)
            for i in range(Nstates)
        ]))

        states = [int(Q_opt[t]) for t in range(T)]
        last_orf = ""
        for t in range(T):
            s_lab = label.get(states[t], "Unknown State")
            gamma_t = (alpha[:, t] * beta[:, t]) / numpy.sum(
                alpha[:, t] * beta[:, t])
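            # gamma_t[i]: posterior probability of state i at site t, obtained by
            # normalizing the product of the forward (alpha) and backward (beta) terms.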
            genes_at_site = hash.get(position[t], [""])
            genestr = ""
            if not (len(genes_at_site) == 1 and not genes_at_site[0]):
                genestr = ",".join([
                    "%s_(%s)" % (g, rv2info.get(g, "-")[0])
                    for g in genes_at_site
                ])

            self.output.write("%s\t%s\t%s\t%s\t%s\n" %
                              (int(position[t]), int(O[t]) - 1, "\t".join(
                                  ["%-9.2e" % g
                                   for g in gamma_t]), s_lab, genestr))

        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.transit_message("Finished HMM - Sites Method")
        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="HMM - Sites")

        #Gene Files
        self.transit_message("Creating HMM Genes Level Output")
        genes_path = ".".join(self.output.name.split(
            ".")[:-1]) + "_genes." + self.output.name.split(".")[-1]

        tempObs = numpy.zeros((1, len(O)))
        tempObs[0, :] = O - 1
        self.post_process_genes(tempObs, position, states, genes_path)

        self.transit_message("Adding File: %s" % (genes_path))
        self.add_file(path=genes_path, filetype="HMM - Genes")
        self.finish()
        self.transit_message("Finished HMM Method")
Example #4
    def Run(self):

        self.transit_message("Starting Mann-Whitney U-test Method")
        start_time = time.time()



        Kctrl = len(self.ctrldata)
        Kexp = len(self.expdata)
        #Get orf data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(self.ctrldata+self.expdata, wxobj=self.wxobj)

        (K,N) = data.shape


        if self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data, factors) = norm_tools.normalize_data(data, self.normalization, self.ctrldata+self.expdata, self.annotation_path)

        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            for j in range(K):
                data[j] = stat_tools.loess_correction(position, data[j])


        G = tnseq_tools.Genes(self.ctrldata + self.expdata, self.annotation_path, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus, data=data, position=position)


        #u-test
        data = []
        N = len(G)
        count = 0
        self.progress_range(N)
        for gene in G:
            count+=1
            if gene.k == 0 or gene.n == 0:
                (test_obs, mean1, mean2, log2FC, u_stat, pval_2tail) = (0, 0, 0, 0, 0.0, 1.00)
            else:

                if not self.includeZeros:
                    ii = numpy.sum(gene.reads,0) > 0
                else:
                    ii = numpy.ones(gene.n) == 1


                data1 = gene.reads[:Kctrl,ii].flatten()
                data2 = gene.reads[Kctrl:,ii].flatten()
                try:
                    u_stat, pval_2tail = scipy.stats.mannwhitneyu(data1, data2,
                        alternative="two-sided")
                except ValueError as e:
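                    # e.g. scipy.stats.mannwhitneyu raises ValueError when all
                    # observations are identical; report a null result instead.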
                    u_stat, pval_2tail = 0.0, 1.00

                n1 = len(data1)
                n2 = len(data2)

                mean1 = 0
                if n1 > 0:
                    mean1 = numpy.mean(data1)
                mean2 = 0
                if n2 > 0:
                    mean2 = numpy.mean(data2)

                try:
                    # Only adjust log2FC if one of the means is zero
                    if mean1 > 0 and mean2 > 0:
                        log2FC = math.log((mean2)/(mean1),2)
                    else:
                        log2FC = math.log((mean2+1.0)/(mean1+1.0),2)
                except:
                    log2FC = 0.0


            #["Orf","Name","Desc","Sites","Mean Ctrl","Mean Exp","log2FC", "U-Statistic","p-value","Adj. p-value"]


            data.append([gene.orf, gene.name, gene.desc, gene.n, mean1, mean2, log2FC, u_stat, pval_2tail])

            # Update Progress
            text = "Running Mann-Whitney U-test Method... %1.1f%%" % (100.0*count/N)
            self.progress_update(text, count)


        #
        self.transit_message("") # Printing empty line to flush stdout
        self.transit_message("Performing Benjamini-Hochberg Correction")
        data.sort()
        qval = stat_tools.BH_fdr_correction([row[-1] for row in data])


        self.output.write("#utest\n")
        if self.wxobj:
            members = sorted([attr for attr in dir(self) if not callable(getattr(self,attr)) and not attr.startswith("__")])
            memberstr = ""
            for m in members:
                memberstr += "%s = %s, " % (m, getattr(self, m))
            self.output.write("#GUI with: norm=%s, includeZeros=%s, output=%s\n" % (self.normalization, self.includeZeros, self.output.name.encode('utf-8')))
        else:
            self.output.write("#Console: python %s\n" % " ".join(sys.argv))
        self.output.write("#Control Data: %s\n" % (",".join(self.ctrldata).encode('utf-8')))
        self.output.write("#Experimental Data: %s\n" % (",".join(self.expdata).encode('utf-8')))
        self.output.write("#Annotation path: %s\n" % (self.annotation_path.encode('utf-8')))
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        self.output.write("#%s\n" % "\t".join(columns))

        for i,row in enumerate(data):
            (orf, name, desc, n, mean1, mean2, log2FC, u_stat, pval_2tail) = row
            self.output.write("%s\t%s\t%s\t%d\t%1.1f\t%1.1f\t%1.2f\t%1.2f\t%1.5f\t%1.5f\n" % (orf, name, desc, n, mean1, mean2, log2FC, u_stat, pval_2tail, qval[i]))
        self.output.close()

        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="utest")
        self.finish()
        self.transit_message("Finished Mann-Whitney U-test Method")
Example #5
    def Run(self):


        self.transit_message("Starting Genetic Interactions Method")
        start_time = time.time()
        self.output.write("#GI\n")

        wiglist = self.ctrldataA + self.expdataA + self.ctrldataB + self.expdataB

        Nwig = len(wiglist)
        Na1 = len(self.ctrldataA)
        Nb1 = len(self.expdataA)
        Na2 = len(self.ctrldataB)
        Nb2 = len(self.expdataB)


        # Get data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(wiglist, wxobj=self.wxobj)
        (K, N) = data.shape  # K datasets x N TA sites; K is needed by the LOESS loop below

        # Normalize data if specified
        if self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data, factors) = norm_tools.normalize_data(data, self.normalization, wiglist, self.annotation_path)

        # Do LOESS correction if specified
        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            for j in range(K):
                data[j] = stat_tools.loess_correction(position, data[j])

        # Get Gene objects for each condition
        G_A1 = tnseq_tools.Genes([], self.annotation_path, data=data[:Na1], position=position,nterm=self.NTerminus,cterm=self.CTerminus)
        G_B1 = tnseq_tools.Genes([], self.annotation_path, data=data[Na1:(Na1+Nb1)], position=position,nterm=self.NTerminus,cterm=self.CTerminus)
        G_A2 = tnseq_tools.Genes([], self.annotation_path, data=data[(Na1+Nb1):(Na1+Nb1+Na2)], position=position,nterm=self.NTerminus,cterm=self.CTerminus)
        G_B2 = tnseq_tools.Genes([], self.annotation_path, data=data[(Na1+Nb1+Na2):], position=position,nterm=self.NTerminus,cterm=self.CTerminus)

        means_list_a1 = []
        means_list_b1 = []
        means_list_a2 = []
        means_list_b2 = []

        var_list_a1 = []
        var_list_a2 = []
        var_list_b1 = []
        var_list_b2 = []


        # Base priors on empirical observations across genes.
        for gene in sorted(G_A1):
            if gene.n > 1:
                A1_data = G_A1[gene.orf].reads.flatten()
                B1_data = G_B1[gene.orf].reads.flatten()
                A2_data = G_A2[gene.orf].reads.flatten()
                B2_data = G_B2[gene.orf].reads.flatten()

                means_list_a1.append(numpy.mean(A1_data))
                var_list_a1.append(numpy.var(A1_data))

                means_list_b1.append(numpy.mean(B1_data))
                var_list_b1.append(numpy.var(B1_data))

                means_list_a2.append(numpy.mean(A2_data))
                var_list_a2.append(numpy.var(A2_data))

                means_list_b2.append(numpy.mean(B2_data))
                var_list_b2.append(numpy.var(B2_data))

        # Priors
        mu0_A1 = scipy.stats.trim_mean(means_list_a1, 0.01)
        mu0_B1 = scipy.stats.trim_mean(means_list_b1, 0.01)
        mu0_A2 = scipy.stats.trim_mean(means_list_a2, 0.01)
        mu0_B2 = scipy.stats.trim_mean(means_list_b2, 0.01)

        s20_A1 = scipy.stats.trim_mean(var_list_a1, 0.01)
        s20_B1 = scipy.stats.trim_mean(var_list_b1, 0.01)
        s20_A2 = scipy.stats.trim_mean(var_list_a2, 0.01)
        s20_B2 = scipy.stats.trim_mean(var_list_b2, 0.01)

        k0=1.0
        nu0=1.0
        data = []

        postprob = []
        count = 0
        N = len(G_A1)
        self.progress_range(N)
        # Perform actual analysis
        for gene in G_A1:

            # If there is some data
            if gene.n > 0:
                A1_data = G_A1[gene.orf].reads.flatten()
                B1_data = G_B1[gene.orf].reads.flatten()
                A2_data = G_A2[gene.orf].reads.flatten()
                B2_data = G_B2[gene.orf].reads.flatten()


            #            Time-1   Time-2
            #
            #  Strain-A     A       C
            #
            #  Strain-B     B       D

                try:
                    muA1_post, varA1_post = stat_tools.sample_trunc_norm_post(A1_data, self.samples,
                        mu0_A1, s20_A1, k0, nu0)
                    muB1_post, varB1_post = stat_tools.sample_trunc_norm_post(B1_data, self.samples,
                        mu0_B1, s20_B1, k0, nu0)
                    muA2_post, varA2_post = stat_tools.sample_trunc_norm_post(A2_data, self.samples,
                        mu0_A2, s20_A2, k0, nu0)
                    muB2_post, varB2_post = stat_tools.sample_trunc_norm_post(B2_data, self.samples,
                        mu0_B2, s20_B2, k0, nu0)

                except Exception as e:
                    muA1_post = varA1_post = numpy.ones(self.samples)
                    muB1_post = varB1_post = numpy.ones(self.samples)
                    muA2_post = varA2_post = numpy.ones(self.samples)
                    muB2_post = varB2_post = numpy.ones(self.samples)

                logFC_A_post = numpy.log2(muA2_post/muA1_post)
                logFC_B_post = numpy.log2(muB2_post/muB1_post)
                delta_logFC_post = logFC_B_post - logFC_A_post

                alpha = 0.05

                # Get Bounds of the HDI
                l_logFC_A, u_logFC_A = stat_tools.HDI_from_MCMC(logFC_A_post, 1-alpha)

                l_logFC_B, u_logFC_B = stat_tools.HDI_from_MCMC(logFC_B_post, 1-alpha)

                l_delta_logFC, u_delta_logFC = stat_tools.HDI_from_MCMC(delta_logFC_post, 1-alpha)


                mean_logFC_A = numpy.mean(logFC_A_post)
                mean_logFC_B = numpy.mean(logFC_B_post)
                mean_delta_logFC = numpy.mean(delta_logFC_post)

                # Is HDI significantly different than ROPE?
                not_HDI_overlap_bit = l_delta_logFC > self.rope or u_delta_logFC < -self.rope

                # Probability of posterior overlapping with ROPE
                probROPE = numpy.mean(numpy.logical_and(delta_logFC_post>=0.0-self.rope,  delta_logFC_post<=0.0+self.rope))
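                # ROPE = "region of practical equivalence" around zero: delta-logFC
                # samples inside [-rope, +rope] are treated as showing no interaction.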

            # If there is no data, assume empty defaults
            else:
                A1_data = [0,0]
                B1_data = [0,0]
                A2_data = [0,0]
                B2_data = [0,0]
                muA1_post = varA1_post = numpy.ones(self.samples)
                muB1_post = varB1_post = numpy.ones(self.samples)
                muA2_post = varA2_post = numpy.ones(self.samples)
                muB2_post = varB2_post = numpy.ones(self.samples)
                logFC_A_post = numpy.log2(muA2_post/muA1_post)
                logFC_B_post = numpy.log2(muB2_post/muB1_post)
                delta_logFC_post = logFC_B_post - logFC_A_post

                mean_logFC_A = 0
                mean_logFC_B = 0
                mean_delta_logFC = 0
                l_logFC_A = 0
                u_logFC_A = 0
                l_logFC_B = 0
                u_logFC_B = 0
                l_delta_logFC = 0
                u_delta_logFC = 0
                probROPE = 1.0
                not_HDI_overlap_bit = False  # no data, so no evidence of an interaction


            if numpy.isnan(l_logFC_A):
                l_logFC_A = -10
                u_logFC_A = 10
            if numpy.isnan(l_logFC_B):
                l_logFC_B = -10
                u_logFC_B = 10
            if numpy.isnan(l_delta_logFC):
                l_delta_logFC = -10
                u_delta_logFC = 10


            postprob.append(probROPE)
            data.append((gene.orf, gene.name, gene.n, numpy.mean(muA1_post), numpy.mean(muA2_post), numpy.mean(muB1_post), numpy.mean(muB2_post), mean_logFC_A, mean_logFC_B, mean_delta_logFC, l_delta_logFC, u_delta_logFC, probROPE, not_HDI_overlap_bit))


            text = "Running GI Method... %2.0f%%" % (100.0*(count+1)/N)
            self.progress_update(text, count)
            self.transit_message_inplace("Running GI Method... %1.1f%%" % (100.0*count/(N-1)))
            count+=1

        data.sort(key=lambda x: x[-2])

        if self.doBFDR or not self.doFWER:
            postprob = numpy.array(postprob)
            postprob.sort()
            bfdr = numpy.cumsum(postprob)/numpy.arange(1, len(postprob)+1)
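            # BFDR: running mean of the sorted posterior probabilities, i.e. the
            # Bayesian FDR estimate of Newton et al. (2004).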
            adjusted_prob = bfdr
            adjusted_label = "BFDR"
        elif self.doFWER:
            fwer = stat_tools.FWER_Bayes(postprob)
            fwer.sort()
            adjusted_prob = fwer
            adjusted_label = "FWER"

        # If not using adjustment for classification, sort correctly
        if not self.doBFDR and not self.doFWER:
            sorted_index = numpy.argsort([d[-1] for d in data])[::-1][:len(data)]
            adjusted_prob = [adjusted_prob[ii] for ii in sorted_index]
            data = [data[ii] for ii in sorted_index]



        # Print output
        if self.wxobj:
            members = sorted([attr for attr in dir(self) if not callable(getattr(self,attr)) and not attr.startswith("__")])
            memberstr = ""
            for m in members:
                memberstr += "%s = %s, " % (m, getattr(self, m))
            self.output.write("#GUI with: norm=%s, samples=%s, includeZeros=%s, output=%s\n" % (self.normalization, self.samples, self.includeZeros, self.output.name.encode('utf-8')))
        else:
            self.output.write("#Console: python %s\n" % " ".join(sys.argv))


        self.output.write("#Control Data-A: %s\n" % (",".join(self.ctrldataA).encode('utf-8')))
        self.output.write("#Control Data-B: %s\n" % (",".join(self.ctrldataB).encode('utf-8')))
        self.output.write("#Experimental Data-A: %s\n" % (",".join(self.expdataA).encode('utf-8')))
        self.output.write("#Experimental Data-B: %s\n" % (",".join(self.expdataB).encode('utf-8')))
        self.output.write("#Annotation path: %s\n" % (self.annotation_path.encode('utf-8')))
        self.output.write("#Time: %s\n" % (time.time() - start_time))
        self.output.write("#%s\n" % "\t".join(columns))


        if self.doBFDR or self.doFWER:
            self.output.write("# Significant interactions are those whose adjusted probability of the delta-logFC falling within ROPE is < 0.05 (Adjusted using %s)\n" % (adjusted_label))
        else:
            self.output.write("# Significant interactions are those genes whose delta-logFC HDI does not overlap the ROPE\n")
        self.output.write("#\n")

        # Write column names
        self.output.write("#ORF\tName\tNumber of TA Sites\tMean count (Strain A Time 1)\tMean count (Strain A Time 2)\tMean count (Strain B Time 1)\tMean count (Strain B Time 2)\tMean logFC (Strain A)\tMean logFC (Strain B) \tMean delta logFC\tLower Bound delta logFC\tUpper Bound delta logFC\tProb. of delta-logFC being within ROPE\tAdjusted Probability (%s)\tIs HDI outside ROPE?\tType of Interaction\n" % adjusted_label)

        # Write gene results
        for i,row in enumerate(data):
        #1   2    3        4                5              6               7                8            9            10              11             12            13         14
            orf, name, n, mean_muA1_post, mean_muA2_post, mean_muB1_post, mean_muB2_post, mean_logFC_A, mean_logFC_B, mean_delta_logFC, l_delta_logFC, u_delta_logFC, probROPE, not_HDI_overlap_bit = row
            type_of_interaction = "No Interaction"
            if ((self.doBFDR or self.doFWER) and adjusted_prob[i] < 0.05):
                type_of_interaction = self.classify_interaction(mean_delta_logFC, mean_logFC_B, mean_logFC_A)
            elif not (self.doBFDR or self.doFWER) and not_HDI_overlap_bit:
                type_of_interaction = self.classify_interaction(mean_delta_logFC, mean_logFC_B, mean_logFC_A)

            new_row = tuple(list(row[:-1])+[adjusted_prob[i], not_HDI_overlap_bit, type_of_interaction])
            self.output.write("%s\t%s\t%d\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.8f\t%1.8f\t%s\t%s\n" % new_row)


        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="GI")
        self.finish()
        self.transit_message("Finished Genetic Interactions Method")
Example #6
    def Run(self):

        self.transit_message("Starting Genetic Interactions Method")
        start_time = time.time()
        self.output.write("#GI\n")

        wiglist = self.ctrldataA + self.ctrldataB + self.expdataA + self.expdataB

        Nwig = len(wiglist)
        Na1 = len(self.ctrldataA)
        Nb1 = len(self.ctrldataB)
        Na2 = len(self.expdataA)
        Nb2 = len(self.expdataB)

        # Get data
        self.transit_message("Getting Data")
        (data, position) = transit_tools.get_validated_data(wiglist,
                                                            wxobj=self.wxobj)
        (K, N) = data.shape  # K datasets x N TA sites; K is needed by the LOESS loop below

        # Normalize data if specified
        if self.normalization != "nonorm":
            self.transit_message("Normalizing using: %s" % self.normalization)
            (data,
             factors) = norm_tools.normalize_data(data, self.normalization,
                                                  wiglist,
                                                  self.annotation_path)

        # Do LOESS correction if specified
        if self.LOESS:
            self.transit_message("Performing LOESS Correction")
            for j in range(K):
                data[j] = stat_tools.loess_correction(position, data[j])

        # Get Gene objects for each condition
        G_A1 = tnseq_tools.Genes([],
                                 self.annotation_path,
                                 data=data[:Na1],
                                 position=position,
                                 nterm=self.NTerminus,
                                 cterm=self.CTerminus)
        G_B1 = tnseq_tools.Genes([],
                                 self.annotation_path,
                                 data=data[Na1:(Na1 + Nb1)],
                                 position=position,
                                 nterm=self.NTerminus,
                                 cterm=self.CTerminus)
        G_A2 = tnseq_tools.Genes([],
                                 self.annotation_path,
                                 data=data[(Na1 + Nb1):(Na1 + Nb1 + Na2)],
                                 position=position,
                                 nterm=self.NTerminus,
                                 cterm=self.CTerminus)
        G_B2 = tnseq_tools.Genes([],
                                 self.annotation_path,
                                 data=data[(Na1 + Nb1 + Na2):],
                                 position=position,
                                 nterm=self.NTerminus,
                                 cterm=self.CTerminus)

        means_list_a1 = []
        means_list_b1 = []
        means_list_a2 = []
        means_list_b2 = []

        var_list_a1 = []
        var_list_a2 = []
        var_list_b1 = []
        var_list_b2 = []

        # Base priors on empirical observations across genes.
        for gene in sorted(G_A1):
            if gene.n > 1:
                A1_data = G_A1[gene.orf].reads.flatten()
                B1_data = G_B1[gene.orf].reads.flatten()
                A2_data = G_A2[gene.orf].reads.flatten()
                B2_data = G_B2[gene.orf].reads.flatten()

                means_list_a1.append(numpy.mean(A1_data))
                var_list_a1.append(numpy.var(A1_data))

                means_list_b1.append(numpy.mean(B1_data))
                var_list_b1.append(numpy.var(B1_data))

                means_list_a2.append(numpy.mean(A2_data))
                var_list_a2.append(numpy.var(A2_data))

                means_list_b2.append(numpy.mean(B2_data))
                var_list_b2.append(numpy.var(B2_data))

        # Priors
        mu0_A1 = scipy.stats.trim_mean(means_list_a1, 0.01)
        mu0_B1 = scipy.stats.trim_mean(means_list_b1, 0.01)
        mu0_A2 = scipy.stats.trim_mean(means_list_a2, 0.01)
        mu0_B2 = scipy.stats.trim_mean(means_list_b2, 0.01)

        s20_A1 = scipy.stats.trim_mean(var_list_a1, 0.01)
        s20_B1 = scipy.stats.trim_mean(var_list_b1, 0.01)
        s20_A2 = scipy.stats.trim_mean(var_list_a2, 0.01)
        s20_B2 = scipy.stats.trim_mean(var_list_b2, 0.01)
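        # Empirical-Bayes priors: 1%-trimmed means and variances across genes provide
        # the prior means (mu0_*) and prior variances (s20_*) for each condition.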

        k0 = 1.0
        nu0 = 1.0
        data = []

        postprob = []
        count = 0
        N = len(G_A1)
        self.progress_range(N)
        # Perform actual analysis
        for gene in G_A1:

            # If there is some data
            if gene.n > 0:
                A1_data = G_A1[gene.orf].reads.flatten()
                B1_data = G_B1[gene.orf].reads.flatten()
                A2_data = G_A2[gene.orf].reads.flatten()
                B2_data = G_B2[gene.orf].reads.flatten()

                #            Time-1   Time-2
                #
                #  Strain-A     A       C
                #
                #  Strain-B     B       D

                try:
                    muA1_post, varA1_post = stat_tools.sample_trunc_norm_post(
                        A1_data, self.samples, mu0_A1, s20_A1, k0, nu0)
                    muB1_post, varB1_post = stat_tools.sample_trunc_norm_post(
                        B1_data, self.samples, mu0_B1, s20_B1, k0, nu0)
                    muA2_post, varA2_post = stat_tools.sample_trunc_norm_post(
                        A2_data, self.samples, mu0_A2, s20_A2, k0, nu0)
                    muB2_post, varB2_post = stat_tools.sample_trunc_norm_post(
                        B2_data, self.samples, mu0_B2, s20_B2, k0, nu0)

                except Exception as e:
                    muA1_post = varA1_post = numpy.ones(self.samples)
                    muB1_post = varB1_post = numpy.ones(self.samples)
                    muA2_post = varA2_post = numpy.ones(self.samples)
                    muB2_post = varB2_post = numpy.ones(self.samples)

                logFC_A_post = numpy.log2(muA2_post / muA1_post)
                logFC_B_post = numpy.log2(muB2_post / muB1_post)
                delta_logFC_post = logFC_B_post - logFC_A_post
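                # delta-logFC = logFC(strain B) - logFC(strain A); posterior mass far
                # from zero suggests the strains respond differently (an interaction).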

                alpha = 0.05

                # Get Bounds of the HDI
                l_logFC_A, u_logFC_A = stat_tools.HDI_from_MCMC(
                    logFC_A_post, 1 - alpha)

                l_logFC_B, u_logFC_B = stat_tools.HDI_from_MCMC(
                    logFC_B_post, 1 - alpha)

                l_delta_logFC, u_delta_logFC = stat_tools.HDI_from_MCMC(
                    delta_logFC_post, 1 - alpha)

                mean_logFC_A = numpy.mean(logFC_A_post)
                mean_logFC_B = numpy.mean(logFC_B_post)
                mean_delta_logFC = numpy.mean(delta_logFC_post)

                # Is HDI significantly different than ROPE? (i.e. no overlap)
                not_HDI_overlap_bit = l_delta_logFC > self.rope or u_delta_logFC < -self.rope

                # Probability of posterior overlapping with ROPE
                probROPE = numpy.mean(
                    numpy.logical_and(delta_logFC_post >= 0.0 - self.rope,
                                      delta_logFC_post <= 0.0 + self.rope))

            # If there is no data, assume empty defaults
            else:
                A1_data = [0, 0]
                B1_data = [0, 0]
                A2_data = [0, 0]
                B2_data = [0, 0]
                muA1_post = varA1_post = numpy.ones(self.samples)
                muB1_post = varB1_post = numpy.ones(self.samples)
                muA2_post = varA2_post = numpy.ones(self.samples)
                muB2_post = varB2_post = numpy.ones(self.samples)
                logFC_A_post = numpy.log2(muA2_post / muA1_post)
                logFC_B_post = numpy.log2(muB2_post / muB1_post)
                delta_logFC_post = logFC_B_post - logFC_A_post

                mean_logFC_A = 0
                mean_logFC_B = 0
                mean_delta_logFC = 0
                l_logFC_A = 0
                u_logFC_A = 0
                l_logFC_B = 0
                u_logFC_B = 0
                l_delta_logFC = 0
                u_delta_logFC = 0
                probROPE = 1.0
                not_HDI_overlap_bit = False  # no data, so no evidence of an interaction

            if numpy.isnan(l_logFC_A):
                l_logFC_A = -10
                u_logFC_A = 10
            if numpy.isnan(l_logFC_B):
                l_logFC_B = -10
                u_logFC_B = 10
            if numpy.isnan(l_delta_logFC):
                l_delta_logFC = -10
                u_delta_logFC = 10

            postprob.append(probROPE)
            data.append((gene.orf, gene.name, gene.n, numpy.mean(muA1_post),
                         numpy.mean(muA2_post), numpy.mean(muB1_post),
                         numpy.mean(muB2_post), mean_logFC_A, mean_logFC_B,
                         mean_delta_logFC, l_delta_logFC, u_delta_logFC,
                         probROPE, not_HDI_overlap_bit))

            text = "Running GI Method... %2.0f%%" % (100.0 * (count + 1) / N)
            self.progress_update(text, count)
            self.transit_message_inplace("Running GI Method... %1.1f%%" %
                                         (100.0 * count / (N - 1)))
            count += 1

        # for HDI, maybe I should sort on abs(mean_delta_logFC); however, need to sort by prob to calculate BFDR
        probcol = -2  # probROPEs
        data.sort(key=lambda x: x[probcol])
        sortedprobs = numpy.array([x[probcol] for x in data])

        # BFDR method: Newton M.A., Noueiry A., Sarkar D., Ahlquist P. (2004). Detecting differential gene expression with a semiparametric hierarchical mixture method. Biostatistics, 5:155–176.

        if self.signif == "BFDR":
            sortedprobs = numpy.array(sortedprobs)
            #sortedprobs.sort() # why, since already sorted?
            bfdr = numpy.cumsum(sortedprobs) / numpy.arange(
                1,
                len(sortedprobs) + 1)
            adjusted_prob = bfdr  # should be same order as sorted above by probROPE
            adjusted_label = "BFDR"

        elif self.signif == "FWER":
            fwer = stat_tools.FWER_Bayes(sortedprobs)
            #fwer.sort() # should not need this if monotonic
            adjusted_prob = fwer
            adjusted_label = "FWER"

        # If not using adjustment for classification, sort correctly
        else:
            adjusted_prob = sortedprobs
            adjusted_label = "un"
            # should I stable-sort by overlap_bit?


            # sorted_index = numpy.argsort([d[-1] for d in data])[::-1][:len(data)]
            # adjusted_prob = [adjusted_prob[ii] for ii in sorted_index]
            # data = [data[ii] for ii in sorted_index]

        # Print output
        if self.wxobj:
            members = sorted([
                attr for attr in dir(self) if not callable(getattr(self, attr))
                and not attr.startswith("__")
            ])
            memberstr = ""
            for m in members:
                memberstr += "%s = %s, " % (m, getattr(self, m))
            self.output.write(
                "#GUI with: norm=%s, samples=%s, includeZeros=%s, output=%s\n"
                % (self.normalization, self.samples, self.includeZeros,
                   self.output.name.encode('utf-8')))
        else:
            self.output.write("#Console: python3 %s\n" % " ".join(sys.argv))

        now = str(datetime.datetime.now())
        now = now[:now.rfind('.')]
        self.output.write("#Date: " + now + "\n")
        #self.output.write("#Runtime: %s s\n" % (time.time() - start_time))

        self.output.write("#Control Data-A: %s\n" %
                          (",".join(self.ctrldataA).encode('utf-8')))
        self.output.write("#Control Data-B: %s\n" %
                          (",".join(self.ctrldataB).encode('utf-8')))
        self.output.write("#Experimental Data-A: %s\n" %
                          (",".join(self.expdataA).encode('utf-8')))
        self.output.write("#Experimental Data-B: %s\n" %
                          (",".join(self.expdataB).encode('utf-8')))
        self.output.write("#Annotation path: %s\n" %
                          (self.annotation_path.encode('utf-8')))
        self.output.write("#ROPE=%s, method for significance=%s\n" %
                          (self.rope, self.signif))
        #self.output.write("#%s\n" % "\t".join(columns))

        if self.signif == "HDI":
            self.output.write(
                "#Significant interactions are those genes whose delta-logFC HDI does not overlap the ROPE\n"
            )
        elif self.signif in "prob BFDR FWER":
            self.output.write(
                "#Significant interactions are those whose %s-adjusted probability of the delta-logFC falling within ROPE is < 0.05.\n"
                % (adjusted_label))

        # Write column names (redundant with self.columns)
        self.output.write(
            "#ORF\tName\tNumber of TA Sites\tMean count (Strain A Condition 1)\tMean count (Strain A Condition 2)\tMean count (Strain B Condition 1)\tMean count (Strain B Condition 2)\tMean logFC (Strain A)\tMean logFC (Strain B) \tMean delta logFC\tLower Bound delta logFC\tUpper Bound delta logFC\tIs HDI outside ROPE?\tProb. of delta-logFC being within ROPE\t%s-Adjusted Probability\tType of Interaction\n"
            % adjusted_label)

        # Write gene results
        for i, row in enumerate(data):
            #1   2    3        4                5              6               7                8            9            10              11             12            13         14
            orf, name, n, mean_muA1_post, mean_muA2_post, mean_muB1_post, mean_muB2_post, mean_logFC_A, mean_logFC_B, mean_delta_logFC, l_delta_logFC, u_delta_logFC, probROPE, not_HDI_overlap_bit = row

            interaction = self.classify_interaction(mean_delta_logFC,
                                                    mean_logFC_B, mean_logFC_A)
            type_of_interaction = "No Interaction"
            if self.signif in "prob BFDR FWER" and adjusted_prob[i] < 0.05:
                type_of_interaction = interaction
            if self.signif == "HDI" and not_HDI_overlap_bit:
                type_of_interaction = interaction

            new_row = tuple(
                list(row[:-2]) + [
                    not_HDI_overlap_bit, probROPE, adjusted_prob[i],
                    type_of_interaction
                ])
            self.output.write(
                "%s\t%s\t%d\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%1.2f\t%s\t%1.8f\t%1.8f\t%s\n"
                % new_row)

        self.transit_message("Adding File: %s" % (self.output.name))
        self.add_file(filetype="GI")
        self.finish()
        self.transit_message("Finished Genetic Interactions Method")