Example 1
    def Run(self):

        self.transit_message("Starting Normalization")
        start_time = time.time()

        infile = self.infile
        outputPath = self.outfile # output file exists, should I require -overwrite flag?

        # determine ref genome from first; assume they are all the same; assume wigs have 2 header lines
        line2 = "variableStep chrom=" # unknown
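        # A wig file is expected to look roughly like this (values are illustrative):
        #   # <header/comment line>
        #   variableStep chrom=<refgenome>
        #   60 0
        #   137 5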
        for line in open(infile):
            if line.startswith("variableStep"):
                line2 = line.rstrip()
                break

        if self.combined_wig:
            (sites, data, files) = tnseq_tools.read_combined_wig(self.ctrldata[0])
        else:
            (data, sites) = tnseq_tools.get_data(self.ctrldata)
        (data,factors) = norm_tools.normalize_data(data,self.normalization)

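        # write the normalized counts: combined-wig input yields a combined-wig style
        # table (one column per sample); otherwise the first dataset is written back
        # out in wig format under the original variableStep header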
        print("writing", outputPath)
        file = open(outputPath,"w")
        file.write("# %s normalization of %s\n" % (self.normalization,infile))
        if self.combined_wig:
            for f in files:
                file.write("#File: %s\n" % f)
            for i in range(len(sites)):
                file.write('\t'.join([str(sites[i])] + ["%0.1f" % x for x in list(data[..., i])]) + "\n")
        else:
            file.write(line2 + "\n")
            for j in range(len(sites)):
                file.write("%s %s\n" % (sites[j], int(data[0, j])))
        file.close()

        self.finish()
        self.transit_message("Finished Normalization") 
Example 2
    def Run(self):

        self.transit_message("Starting Normalization")
        start_time = time.time()

        infile = self.infile
        outputPath = self.outfile # output file exists, should I require -overwrite flag?

        # determine ref genome from first; assume they are all the same; assume wigs have 2 header lines
        line2 = "variableStep chrom=" # unknown
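        # A wig file is expected to look roughly like this (values are illustrative):
        #   # <header/comment line>
        #   variableStep chrom=<refgenome>
        #   60 0
        #   137 5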
        for line in open(infile):
            if line.startswith("variableStep"):
                line2 = line.rstrip()
                break

        if self.combined_wig:
            (sites, data, files) = tnseq_tools.read_combined_wig(self.ctrldata[0])
        else:
            (data, sites) = tnseq_tools.get_data(self.ctrldata)
        (data,factors) = norm_tools.normalize_data(data,self.normalization)

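        # write the normalized counts: combined-wig input yields a combined-wig style
        # table (one column per sample); otherwise the first dataset is written back
        # out in wig format under the original variableStep header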
        print("writing", outputPath)
        file = open(outputPath,"w")
        file.write("# %s normalization of %s\n" % (self.normalization,infile))
        if self.combined_wig:
            for f in files:
                file.write("#File: %s\n" % f)
            for i in range(len(sites)):
                file.write('\t'.join([str(sites[i])] + ["%0.1f" % x for x in list(data[..., i])]) + "\n")
        else:
            file.write(line2 + "\n")
            for j in range(len(sites)):
                file.write("%s %s\n" % (sites[j], int(data[0, j])))
        file.close()

        self.finish()
        self.transit_message("Finished Normalization")
Example 3
    def Run(self):
        self.transit_message("Starting Anova analysis")
        start_time = time.time()

        self.transit_message("Getting Data")
        (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization)

        conditionsByFile, _, _, orderingMetadata = tnseq_tools.read_samples_metadata(self.metadata)
        conditions = self.wigs_to_conditions(
            conditionsByFile,
            filenamesInCombWig)

        conditionsList = self.select_conditions(conditions,self.included_conditions,self.ignored_conditions,orderingMetadata)
        data, conditions, _, _ = self.filter_wigs_by_conditions2(data, conditions, conditionsList)

        genes = tnseq_tools.read_genes(self.annotation_path)

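        # map each TA coordinate to its column index in the data matrix, then collect
        # the column indexes belonging to each gene (Rv), trimming N-/C-terminal TAs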
        TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
        RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes, TASiteindexMap, nterm=self.NTerminus, cterm=self.CTerminus)
        MeansByRv = self.means_by_rv(data, RvSiteindexesMap, genes, conditions)

        self.transit_message("Running Anova")
        pvals,qvals,run_status = self.run_anova(data, genes, MeansByRv, RvSiteindexesMap, conditions)

        self.transit_message("Adding File: %s" % (self.output))
        file = open(self.output,"w")

        heads = ("Rv Gene TAs".split() +
                ["Mean_%s" % x for x in conditionsList] +
                ["LFC_%s" % x for x in conditionsList] +
                "pval padj".split() + ["status"])
        file.write("#Console: python3 %s\n" % " ".join(sys.argv))
        file.write("#parameters: normalization=%s, trimming=%s/%s%% (N/C), pseudocounts=%s\n" % (self.normalization,self.NTerminus,self.CTerminus,self.PC))
        file.write('#'+'\t'.join(heads)+EOL)
        for gene in genes:
            Rv = gene["rv"]
            if Rv in MeansByRv:
              means = [MeansByRv[Rv][c] for c in conditionsList]
              LFCs = self.calcLFCs(means,self.PC)
              vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] +
                      ["%0.2f" % x for x in means] + 
                      ["%0.3f" % x for x in LFCs] + 
                      ["%f" % x for x in [pvals[Rv], qvals[Rv]]] + [run_status[Rv]])
              file.write('\t'.join(vals)+EOL)
        file.close()
        self.transit_message("Finished Anova analysis")
        self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))
Example 4
    def Run(self):

        self.transit_message("Starting TnseqStats")
        start_time = time.time()

        datasets = self.wigs
        if self.combined_wig is None:
            (data, sites) = tnseq_tools.get_data(self.wigs)
        else:
            (sites, data,
             datasets) = tnseq_tools.read_combined_wig(self.combined_wig)

        # write table of stats (saturation,NZmean)
        file = sys.stdout
        if self.outfile is not None: file = open(self.outfile, "w")
        PTI = True
        if PTI:
            file.write(
                "dataset\tdensity\tmean_ct\tNZmean\tNZmedian\tmax_ct\ttotal_cts\tskewness\tkurtosis\tpickands_tail_index\n"
            )
        else:
            file.write(
                "dataset\tdensity\tmean_ct\tNZmean\tNZmedian\tmax_ct\ttotal_cts\tskewness\tkurtosis\n"
            )
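        # one row per dataset: density, mean/NZ-mean/NZ-median counts, max, total,
        # and higher-moment statistics computed over that sample's TA sites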
        for i in range(data.shape[0]):
            density, meanrd, nzmeanrd, nzmedianrd, maxrd, totalrd, skew, kurtosis = tnseq_tools.get_data_stats(
                data[i, :])
            nzmedianrd = int(nzmedianrd) if not numpy.isnan(nzmedianrd) else 0
            pti = self.pickands_tail_index(data[i, :])
            vals = [
                datasets[i],
                "%0.3f" % density,
                "%0.1f" % meanrd,
                "%0.1f" % nzmeanrd,
                "%d" % nzmedianrd, maxrd,
                int(totalrd),
                "%0.1f" % skew,
                "%0.1f" % kurtosis
            ]
            if PTI: vals.append("%0.3f" % pti)
            file.write('\t'.join([str(x) for x in vals]) + '\n')
        if self.outfile is not None: file.close()

        self.finish()
        self.transit_message("Finished TnseqStats")
Example 5
    def Run(self):
        self.transit_message("Starting Anova analysis")
        start_time = time.time()

        self.transit_message("Getting Data")
        (sites, data,
         filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization)

        conditions = self.wigs_to_conditions(
            self.read_samples_metadata(self.metadata), filenamesInCombWig)
        data, conditions = self.filter_by_conditions_blacklist(
            data, conditions, self.ignored_conditions)

        genes = tnseq_tools.read_genes(self.annotation_path)

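        # map TA coordinates to data-matrix columns and group them per gene (Rv)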
        TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
        RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(
            genes, TASiteindexMap)
        MeansByRv = self.means_by_rv(data, RvSiteindexesMap, genes, conditions)

        self.transit_message("Running Anova")
        pvals, qvals = self.run_anova(data, genes, MeansByRv, RvSiteindexesMap,
                                      conditions)

        self.transit_message("Adding File: %s" % (self.output))
        file = open(self.output, "w")
        conditionsList = list(set(conditions))
        vals = "Rv Gene TAs".split() + conditionsList + "pval padj".split()
        file.write('\t'.join(vals) + EOL)
        for gene in genes:
            Rv = gene["rv"]
            if Rv in MeansByRv:
                vals = ([Rv, gene["gene"],
                         str(len(RvSiteindexesMap[Rv]))] +
                        ["%0.1f" % MeansByRv[Rv][c] for c in conditionsList] +
                        ["%f" % x for x in [pvals[Rv], qvals[Rv]]])
                file.write('\t'.join(vals) + EOL)
        file.close()
        self.transit_message("Finished Anova analysis")
Example 6
    def Run(self):

        #if not self.wxobj:
        #    # Force matplotlib to use good backend for png.
        #    import matplotlib.pyplot as plt
        #elif "matplotlib.pyplot" not in sys.modules:
        try:
            import matplotlib.pyplot as plt
        except Exception:
            print("Error: cannot do histograms")
            self.doHistogram = False


        self.transit_message("Starting resampling Method")
        start_time = time.time()

        histPath = ""
        if self.doHistogram:
            histPath = os.path.join(os.path.dirname(self.output.name), transit_tools.fetch_name(self.output.name)+"_histograms")
            if not os.path.isdir(histPath):
                os.makedirs(histPath)

        #Get orf data
        self.transit_message("Getting Data")
        if self.diffStrains:
            self.transit_message("Multiple annotation files found")
            self.transit_message("Mapping ctrl data to {0}, exp data to {1}".format(self.annotation_path, self.annotation_path_exp))

        if self.combinedWigParams:
            (position, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combinedWigParams['combined_wig'])
            conditionsByFile, _, _, _ = tnseq_tools.read_samples_metadata(self.combinedWigParams['samples_metadata'])
            conditions = self.wigs_to_conditions(conditionsByFile, filenamesInCombWig)
            data, conditions = self.filter_wigs_by_conditions(data, conditions, self.combinedWigParams['conditions'])
            data_ctrl = numpy.array([d for i, d in enumerate(data) if conditions[i].lower() == self.combinedWigParams['conditions'][0]])
            data_exp = numpy.array([d for i, d in enumerate(data) if conditions[i].lower() == self.combinedWigParams['conditions'][1]])
            position_ctrl, position_exp = position, position
        else:
            (data_ctrl, position_ctrl) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
            (data_exp, position_exp) = transit_tools.get_validated_data(self.expdata, wxobj=self.wxobj)
        (K_ctrl, N_ctrl) = data_ctrl.shape
        (K_exp, N_exp) = data_exp.shape

        if not self.diffStrains and (N_ctrl != N_exp):
            self.transit_error("Error: Ctrl and Exp wig files don't have the same number of sites.")
            self.transit_error("Make sure all .wig files come from the same strain.")
            return
        # (data, position) = transit_tools.get_validated_data(self.ctrldata+self.expdata, wxobj=self.wxobj)

        self.transit_message("Preprocessing Ctrl data...")
        data_ctrl = self.preprocess_data(position_ctrl, data_ctrl)

        self.transit_message("Preprocessing Exp data...")
        data_exp = self.preprocess_data(position_exp, data_exp)

        G_ctrl = tnseq_tools.Genes(self.ctrldata, self.annotation_path, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus, data=data_ctrl, position=position_ctrl)
        G_exp = tnseq_tools.Genes(self.expdata, self.annotation_path_exp, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus, data=data_exp, position=position_exp)

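        # Each character of self.ctrl_lib_str / self.exp_lib_str assigns the corresponding
        # dataset to a library; library-aware resampling is only enabled when both strings
        # use the same (multi-letter) set of letters, otherwise standard resampling is used.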
        doLibraryResampling = False
        # If library string not empty
        if self.ctrl_lib_str or self.exp_lib_str:
            letters_ctrl = set(self.ctrl_lib_str)
            letters_exp = set(self.exp_lib_str)

            # Check if exactly one letter is used; i.e. no distinct libraries
            if len(letters_ctrl) == 1 and len(letters_exp) == 1:
                pass
            # If more than one letter is used, check that both sets contain the same letters
            else:
                lib_diff = letters_ctrl ^ letters_exp
                # Only treat the libraries as matched if the sets have no differences
                if not lib_diff:
                    doLibraryResampling = True
                else:
                    transit_tools.transit_error("Error: Library Strings (Ctrl = %s, Exp = %s) do not use the same letters. Make sure every letter / library is represented in both Control and Experimental Conditions. Proceeding with resampling assuming all datasets belong to the same library." % (self.ctrl_lib_str, self.exp_lib_str))
                    self.ctrl_lib_str = ""
                    self.exp_lib_str = ""

        (data, qval) = self.run_resampling(G_ctrl, G_exp, doLibraryResampling, histPath)
        self.write_output(data, qval, start_time)

        self.finish()
        self.transit_message("Finished resampling Method")
Example 7
    def Run(self):

        #if not self.wxobj:
        #    # Force matplotlib to use good backend for png.
        #    import matplotlib.pyplot as plt
        #elif "matplotlib.pyplot" not in sys.modules:
        try:
            import matplotlib.pyplot as plt
        except Exception:
            print("Error: cannot do histograms")
            self.doHistogram = False


        self.transit_message("Starting resampling Method")
        start_time = time.time()

        histPath = ""
        if self.doHistogram:
            histPath = os.path.join(os.path.dirname(self.output.name), transit_tools.fetch_name(self.output.name)+"_histograms")
            if not os.path.isdir(histPath):
                os.makedirs(histPath)

        #Get orf data
        self.transit_message("Getting Data")
        if self.diffStrains:
            self.transit_message("Multiple annotation files found")
            self.transit_message("Mapping ctrl data to {0}, exp data to {1}".format(self.annotation_path, self.annotation_path_exp))

        if self.combinedWigParams:
            (position, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combinedWigParams['combined_wig'])
            conditionsByFile, _, _, _ = tnseq_tools.read_samples_metadata(self.combinedWigParams['samples_metadata'])
            conditions = self.wigs_to_conditions(conditionsByFile, filenamesInCombWig)
            data, conditions = self.filter_wigs_by_conditions(data, conditions, self.combinedWigParams['conditions'])
            data_ctrl = numpy.array([d for i, d in enumerate(data) if conditions[i].lower() == self.combinedWigParams['conditions'][0]])
            data_exp = numpy.array([d for i, d in enumerate(data) if conditions[i].lower() == self.combinedWigParams['conditions'][1]])
            position_ctrl, position_exp = position, position
        else:
            (data_ctrl, position_ctrl) = transit_tools.get_validated_data(self.ctrldata, wxobj=self.wxobj)
            (data_exp, position_exp) = transit_tools.get_validated_data(self.expdata, wxobj=self.wxobj)
        (K_ctrl, N_ctrl) = data_ctrl.shape
        (K_exp, N_exp) = data_exp.shape

        if not self.diffStrains and (N_ctrl != N_exp):
            self.transit_error("Error: Ctrl and Exp wig files don't have the same number of sites.")
            self.transit_error("Make sure all .wig files come from the same strain.")
            return
        # (data, position) = transit_tools.get_validated_data(self.ctrldata+self.expdata, wxobj=self.wxobj)

        self.transit_message("Preprocessing Ctrl data...")
        data_ctrl = self.preprocess_data(position_ctrl, data_ctrl)

        self.transit_message("Preprocessing Exp data...")
        data_exp = self.preprocess_data(position_exp, data_exp)

        G_ctrl = tnseq_tools.Genes(self.ctrldata, self.annotation_path, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus, data=data_ctrl, position=position_ctrl)
        G_exp = tnseq_tools.Genes(self.expdata, self.annotation_path_exp, ignoreCodon=self.ignoreCodon, nterm=self.NTerminus, cterm=self.CTerminus, data=data_exp, position=position_exp)

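        # Each character of self.ctrl_lib_str / self.exp_lib_str assigns the corresponding
        # dataset to a library; library-aware resampling is only enabled when both strings
        # use the same (multi-letter) set of letters, otherwise standard resampling is used.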
        doLibraryResampling = False
        # If library string not empty
        if self.ctrl_lib_str or self.exp_lib_str:
            letters_ctrl = set(self.ctrl_lib_str)
            letters_exp = set(self.exp_lib_str)

            # Check if exactly one letter is used; i.e. no distinct libraries
            if len(letters_ctrl) == 1 and len(letters_exp) == 1:
                pass
            # If more than one letter is used, check that both sets contain the same letters
            else:
                lib_diff = letters_ctrl ^ letters_exp
                # Only treat the libraries as matched if the sets have no differences
                if not lib_diff:
                    doLibraryResampling = True
                else:
                    transit_tools.transit_error("Error: Library Strings (Ctrl = %s, Exp = %s) do not use the same letters. Make sure every letter / library is represented in both Control and Experimental Conditions. Proceeding with resampling assuming all datasets belong to the same library." % (self.ctrl_lib_str, self.exp_lib_str))
                    self.ctrl_lib_str = ""
                    self.exp_lib_str = ""

        (data, qval) = self.run_resampling(G_ctrl, G_exp, doLibraryResampling, histPath)
        self.write_output(data, qval, start_time)

        self.finish()
        self.transit_message("Finished resampling Method")
Example 8
    def Run(self):
        self.transit_message("Starting ZINB analysis")
        start_time = time.time()
        packnames = ("MASS", "pscl")
        r_packages_needed = [
            x for x in packnames if not rpackages.isinstalled(x)
        ]
        if (len(r_packages_needed) > 0):
            self.transit_error(
                "Error: the following R packages are required: %(0)s. From the R console, you can install them using install.packages(c(%(0)s))"
                % ({
                    '0': '"{0}"'.format('", "'.join(r_packages_needed))
                }))
            sys.exit(1)

        self.transit_message("Getting Data")
        (sites, data,
         filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization)

        condition_name = self.condition
        # if a covar is not found, this crashes; check for it?
        conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata(
            self.metadata,
            self.covars,
            self.interactions,
            condition_name=condition_name)

        ## [Condition] in the order of files in combined wig
        conditions = self.wigs_to_conditions(conditionsByFile,
                                             filenamesInCombWig)
        ## [Covariate] in the order of files in combined wig
        covariates = self.wigs_to_covariates(covariatesByFileList,
                                             filenamesInCombWig)
        ## [Interaction] in the order of files in combined wig
        interactions = self.wigs_to_interactions(interactionsByFileList,
                                                 filenamesInCombWig)

        conditionsList = self.select_conditions(conditions,
                                                self.included_conditions,
                                                self.ignored_conditions,
                                                orderingMetadata)
        data, conditions, covariates, interactions = self.filter_wigs_by_conditions2(
            data,
            conditions,
            conditionsList,
            covariates=covariates,
            interactions=interactions)

        # show the samples associated with each condition (and covariates or interactions, if defined), and count samples in each cross-product of vars

        filesByCondition = self.invertDict(conditionsByFile)
        samples_used = set()
        for cond in conditionsList:
            samples_used.update(filesByCondition[cond])
        vars = [condition_name] + self.covars + self.interactions
        vars2vals = {}
        vars2vals[condition_name] = list(set(conditions))
        for i, var in enumerate(self.covars):
            vars2vals[var] = list(set(covariates[i]))
        for i, var in enumerate(self.interactions):
            vars2vals[var] = list(set(interactions[i]))
        varsByFileList = [conditionsByFile
                          ] + covariatesByFileList + interactionsByFileList
        for i, var in enumerate(vars):
            print("\nCondition/Covariate/Interaction: %s" % vars[i])
            filesByVar = self.invertDict(varsByFileList[i])
            for k, v in filesByVar.items():
                samples = list(samples_used.intersection(set(v)))
                if k in vars2vals.get(var, []):
                    print("%s: %s" % (k, ' '.join(samples)))
        pairs = []
        print("\nsamples in cross-product:")
        any_empty = self.expandVar([], vars, varsByFileList, vars2vals,
                                   set(samples_used))
        if any_empty:
            print("Warning: ZINB requires samples in all combinations of conditions; an empty combination can cause model errors")

        genes = tnseq_tools.read_genes(self.annotation_path)

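        # map TA coordinates to data-matrix columns and group them per gene (Rv),
        # trimming N-/C-terminal TAs as configured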
        TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
        RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes,
                                                          TASiteindexMap,
                                                          nterm=self.NTerminus,
                                                          cterm=self.CTerminus)
        statsByRv, statGroupNames = self.stats_by_rv(data, RvSiteindexesMap,
                                                     genes, conditions,
                                                     interactions)
        LogZPercByRep, NZMeanByRep = self.global_stats_for_rep(data)

        self.transit_message("Running ZINB")
        pvals, qvals, run_status = self.run_zinb(data, genes, NZMeanByRep,
                                                 LogZPercByRep,
                                                 RvSiteindexesMap, conditions,
                                                 covariates, interactions)

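        # comparator for stat-group names (condition plus optional interaction, joined by
        # SEPARATOR): order by condition, following self.included_conditions when given and
        # otherwise the samples-metadata order, breaking ties by interaction order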
        def orderStats(x, y):
            ic1 = x.split(SEPARATOR)
            ic2 = y.split(SEPARATOR)
            c1, i1 = (ic1[0], ic1[1]) if len(ic1) > 1 else (ic1[0], None)
            c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None)

            if len(self.included_conditions) > 0:
                condDiff = (self.included_conditions.index(c1) -
                            self.included_conditions.index(c2))
                ## Order by interaction, if stat belongs to same condition
                if condDiff == 0 and i1 is not None and i2 is not None:
                    return (orderingMetadata['interaction'].index(i1) -
                            orderingMetadata['interaction'].index(i2))
                return condDiff

            ## Order by samples metadata, if include flag not provided.
            condDiff = (orderingMetadata['condition'].index(c1) -
                        orderingMetadata['condition'].index(c2))
            if condDiff == 0 and i1 is not None and i2 is not None:
                return (orderingMetadata['interaction'].index(i1) -
                        orderingMetadata['interaction'].index(i2))
            return condDiff

        orderedStatGroupNames = sorted(statGroupNames,
                                       key=functools.cmp_to_key(orderStats))
        headersStatGroupNames = [
            x.replace(SEPARATOR, '_') for x in orderedStatGroupNames
        ]

        self.transit_message("Adding File: %s" % (self.output))
        file = open(self.output, "w")
        if len(headersStatGroupNames) == 2: lfcNames = ["LFC"]
        else: lfcNames = list(map(lambda v: "LFC_" + v, headersStatGroupNames))
        head = ("Rv Gene TAs".split() +
                list(map(lambda v: "Mean_" + v, headersStatGroupNames)) +
                lfcNames +
                list(map(lambda v: "NZmean_" + v, headersStatGroupNames)) +
                list(map(lambda v: "NZperc_" + v, headersStatGroupNames)) +
                "pval padj".split() + ["status"])

        file.write("#Console: python3 %s\n" % " ".join(sys.argv))
        file.write(
            "#parameters: normalization=%s, trimming=%s/%s%% (N/C), pseudocounts=%s\n"
            % (self.normalization, self.NTerminus, self.CTerminus, self.PC))
        file.write('#' + '\t'.join(head) + EOL)
        for gene in genes:
            Rv = gene["rv"]
            means = [
                statsByRv[Rv]['mean'][group] for group in orderedStatGroupNames
            ]
            PC = self.PC
            if len(means) == 2:
                LFCs = [numpy.log2((means[1] + PC) / (means[0] + PC))]
            else:
                m = numpy.mean(means)
                LFCs = [numpy.log2((x + PC) / (m + PC)) for x in means]
            vals = ([Rv, gene["gene"],
                     str(len(RvSiteindexesMap[Rv]))] + [
                         "%0.1f" % statsByRv[Rv]['mean'][group]
                         for group in orderedStatGroupNames
                     ] + ["%0.3f" % x for x in LFCs] + [
                         "%0.1f" % statsByRv[Rv]['nz_mean'][group]
                         for group in orderedStatGroupNames
                     ] + [
                         "%0.2f" % statsByRv[Rv]['nz_perc'][group]
                         for group in orderedStatGroupNames
                     ] + ["%f" % x
                          for x in [pvals[Rv], qvals[Rv]]]) + [run_status[Rv]]
            file.write('\t'.join(vals) + EOL)
        file.close()
        self.transit_message("Finished ZINB analysis")
        self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))
Example 9
    def Run(self):

        self.transit_message("Starting Gene Mean Counts Export")
        start_time = time.time()

        #Get orf data
        self.transit_message("Getting Data")
        if self.combined_wig:
            (position, fulldata,
             datasets) = tnseq_tools.read_combined_wig(self.ctrldata[0])
        else:
            (fulldata, position) = tnseq_tools.get_data(self.ctrldata)
        (fulldata,
         factors) = norm_tools.normalize_data(fulldata, self.normalization,
                                              self.ctrldata,
                                              self.annotation_path)
        position = position.astype(int)

        hash = transit_tools.get_pos_hash(self.annotation_path)
        rv2info = transit_tools.get_gene_info(self.annotation_path)

        self.transit_message("Normalizing")
        self.output.write("#Summarized to Mean Gene Counts with TRANSIT.\n")
        if self.normalization != "nonorm":
            self.output.write("#Reads normalized using '%s'\n" %
                              self.normalization)
            if type(factors[0]) == type(0.0):
                self.output.write(
                    "#Normalization Factors: %s\n" %
                    "\t".join(["%s" % f for f in factors.flatten()]))
            else:
                self.output.write("#Normalization Factors: %s\n" % " ".join(
                    [",".join(["%s" % bx for bx in b]) for b in factors]))

        self.output.write("#Files:\n")
        names = datasets if self.combined_wig else self.ctrldata
        for f in names:
            self.output.write("#%s\n" % f)

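        # K = number of datasets (wig files), Nsites = number of TA sites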
        K, Nsites = fulldata.shape
        # Get Gene objects
        if self.combined_wig:
            G = tnseq_tools.Genes(self.ctrldata,
                                  self.annotation_path,
                                  norm=self.normalization,
                                  data=fulldata,
                                  position=position)
        else:
            G = tnseq_tools.Genes(self.ctrldata,
                                  self.annotation_path,
                                  norm=self.normalization)
        N = len(G)
        self.progress_range(N)
        if self.combined_wig:
            dataset_header = '\t'.join(datasets)
        else:
            dataset_header = "\t".join(
                [transit_tools.fetch_name(D) for D in self.ctrldata])
        self.output.write("#Orf\tName\tNumber of TA sites\t%s\n" %
                          dataset_header)
        for i, gene in enumerate(G):
            if gene.n > 0:
                data_str = "\t".join(
                    ["%1.2f" % (M) for M in numpy.mean(gene.reads, 1)])
            else:
                data_str = "\t".join(["%1.2f" % (Z) for Z in numpy.zeros(K)])
            self.output.write("%s\t%s\t%s\t%s\n" %
                              (gene.orf, gene.name, gene.n, data_str))

            # Update progress
            text = "Running Export Method... %5.1f%%" % (100.0 * i / N)
            self.progress_update(text, i)
        self.output.close()

        self.transit_message("")  # Printing empty line to flush stdout
        self.finish()
        self.transit_message("Finished Export")
Example 10
    def Run(self):
        self.transit_message("Starting ZINB analysis")
        start_time = time.time()
        packnames = ("MASS", "pscl")
        r_packages_needed = [x for x in packnames if not rpackages.isinstalled(x)]
        if (len(r_packages_needed) > 0):
            self.transit_error(
                    "Error: the following R packages are required: %(0)s. From the R console, you can install them using install.packages(c(%(0)s))"
                    % ({'0': '"{0}"'.format('", "'.join(r_packages_needed))}))
            sys.exit(1)


        self.transit_message("Getting Data")
        (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization)

        condition_name = self.condition
        conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata(self.metadata, self.covars, self.interactions, condition_name=condition_name)

        ## [Condition] in the order of files in combined wig
        conditions = self.wigs_to_conditions(
            conditionsByFile,
            filenamesInCombWig)
        ## [Covariate] in the order of files in combined wig
        covariates = self.wigs_to_covariates(
            covariatesByFileList,
            filenamesInCombWig)
        ## [Interaction] in the order of files in combined wig
        interactions = self.wigs_to_interactions(
            interactionsByFileList,
            filenamesInCombWig)
        data, conditions, covariates, interactions = self.filter_wigs_by_conditions(
                data,
                conditions,
                covariates = covariates,
                interactions = interactions,
                ignored_conditions = self.ignored_conditions,
                included_conditions = self.included_conditions)

        genes = tnseq_tools.read_genes(self.annotation_path)

        TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
        RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes, TASiteindexMap, nterm=self.NTerminus, cterm=self.CTerminus)
        statsByRv, statGroupNames = self.stats_by_rv(data, RvSiteindexesMap, genes, conditions, interactions)
        LogZPercByRep, NZMeanByRep = self.global_stats_for_rep(data)

        self.transit_message("Running ZINB")
        pvals, qvals, run_status = self.run_zinb(data, genes, NZMeanByRep, LogZPercByRep, RvSiteindexesMap, conditions, covariates, interactions)

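        # comparator for stat-group names (condition plus optional interaction, joined by
        # '_'): order by condition, following self.included_conditions when given and
        # otherwise the samples-metadata order, breaking ties by interaction order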
        def orderStats(x, y):
            ic1 = x.split("_")
            ic2 = y.split("_")
            c1, i1 = (ic1[0], ic1[1]) if len(ic1) > 1 else (ic1[0], None)
            c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None)

            if len(self.included_conditions) > 0:
                condDiff = (self.included_conditions.index(c1) - self.included_conditions.index(c2))
                ## Order by interaction, if stat belongs to same condition
                if condDiff == 0 and i1 is not None and i2 is not None:
                    return (orderingMetadata['interaction'].index(i1) - orderingMetadata['interaction'].index(i2))
                return condDiff

            ## Order by samples metadata, if include flag not provided.
            condDiff = (orderingMetadata['condition'].index(c1) - orderingMetadata['condition'].index(c2))
            if condDiff == 0 and i1 is not None and i2 is not None:
                return (orderingMetadata['interaction'].index(i1) - orderingMetadata['interaction'].index(i2))
            return condDiff

        orderedStatGroupNames = sorted(statGroupNames, orderStats)

        self.transit_message("Adding File: %s" % (self.output))
        file = open(self.output,"w")
        head = ("Rv Gene TAs".split() +
                map(lambda v: "Mean_" + v, orderedStatGroupNames) +
                map(lambda v: "NZmean_" + v, orderedStatGroupNames) +
                map(lambda v: "NZperc_" + v, orderedStatGroupNames) +
                "pval padj".split() + ["status"])

        file.write("#Console: python %s\n" % " ".join(sys.argv))
        file.write('\t'.join(head)+EOL)
        for gene in genes:
            Rv = gene["rv"]
            vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] +
                    ["%0.2f" % statsByRv[Rv]['mean'][group] for group in orderedStatGroupNames] +
                    ["%0.2f" % statsByRv[Rv]['nz_mean'][group] for group in orderedStatGroupNames] +
                    ["%0.2f" % statsByRv[Rv]['nz_perc'][group] for group in orderedStatGroupNames] +
                    ["%f" % x for x in [pvals[Rv], qvals[Rv]]]) + [run_status[Rv]]
            file.write('\t'.join(vals)+EOL)
        file.close()
        self.transit_message("Finished ZINB analysis")
        self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))
Example 11
    def Run(self):
        self.transit_message("Starting ZINB analysis")
        start_time = time.time()
        packnames = ("MASS", "pscl")
        r_packages_needed = [x for x in packnames if not rpackages.isinstalled(x)]
        if (len(r_packages_needed) > 0):
            self.transit_error(
                    "Error: the following R packages are required: %(0)s. From the R console, you can install them using install.packages(c(%(0)s))"
                    % ({'0': '"{0}"'.format('", "'.join(r_packages_needed))}))
            sys.exit(1)


        self.transit_message("Getting Data")
        (sites, data, filenamesInCombWig) = tnseq_tools.read_combined_wig(self.combined_wig)

        self.transit_message("Normalizing using: %s" % self.normalization)
        (data, factors) = norm_tools.normalize_data(data, self.normalization)

        condition_name = self.condition
        conditionsByFile, covariatesByFileList, interactionsByFileList, orderingMetadata = tnseq_tools.read_samples_metadata(self.metadata, self.covars, self.interactions, condition_name=condition_name)

        ## [Condition] in the order of files in combined wig
        conditions = self.wigs_to_conditions(
            conditionsByFile,
            filenamesInCombWig)
        ## [Covariate] in the order of files in combined wig
        covariates = self.wigs_to_covariates(
            covariatesByFileList,
            filenamesInCombWig)
        ## [Interaction] in the order of files in combined wig
        interactions = self.wigs_to_interactions(
            interactionsByFileList,
            filenamesInCombWig)
        data, conditions, covariates, interactions = self.filter_wigs_by_conditions(
                data,
                conditions,
                covariates = covariates,
                interactions = interactions,
                ignored_conditions = self.ignored_conditions,
                included_conditions = self.included_conditions)

        genes = tnseq_tools.read_genes(self.annotation_path)

        TASiteindexMap = {TA: i for i, TA in enumerate(sites)}
        RvSiteindexesMap = tnseq_tools.rv_siteindexes_map(genes, TASiteindexMap, nterm=self.NTerminus, cterm=self.CTerminus)
        statsByRv, statGroupNames = self.stats_by_rv(data, RvSiteindexesMap, genes, conditions, interactions)
        LogZPercByRep, NZMeanByRep = self.global_stats_for_rep(data)

        self.transit_message("Running ZINB")
        pvals, qvals, run_status = self.run_zinb(data, genes, NZMeanByRep, LogZPercByRep, RvSiteindexesMap, conditions, covariates, interactions)

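        # comparator for stat-group names (condition plus optional interaction, joined by
        # SEPARATOR): order by condition, following self.included_conditions when given and
        # otherwise the samples-metadata order, breaking ties by interaction order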
        def orderStats(x, y):
            ic1 = x.split(SEPARATOR)
            ic2 = y.split(SEPARATOR)
            c1, i1 = (ic1[0], ic1[1]) if len(ic1) > 1 else (ic1[0], None)
            c2, i2 = (ic2[0], ic2[1]) if len(ic2) > 1 else (ic2[0], None)

            if len(self.included_conditions) > 0:
                condDiff = (self.included_conditions.index(c1) - self.included_conditions.index(c2))
                ## Order by interaction, if stat belongs to same condition
                if condDiff == 0 and i1 is not None and i2 is not None:
                    return (orderingMetadata['interaction'].index(i1) - orderingMetadata['interaction'].index(i2))
                return condDiff

            ## Order by samples metadata, if include flag not provided.
            condDiff = (orderingMetadata['condition'].index(c1) - orderingMetadata['condition'].index(c2))
            if condDiff == 0 and i1 is not None and i2 is not None:
                return (orderingMetadata['interaction'].index(i1) - orderingMetadata['interaction'].index(i2))
            return condDiff

        orderedStatGroupNames = sorted(statGroupNames, key=functools.cmp_to_key(orderStats))
        headersStatGroupNames = [x.replace(SEPARATOR,'_') for x in orderedStatGroupNames]

        self.transit_message("Adding File: %s" % (self.output))
        file = open(self.output,"w")
        head = ("Rv Gene TAs".split() +
                list(map(lambda v: "Mean_" + v, headersStatGroupNames)) +
                list(map(lambda v: "NZmean_" + v, headersStatGroupNames)) +
                list(map(lambda v: "NZperc_" + v, headersStatGroupNames)) +
                "pval padj".split() + ["status"])

        file.write("#Console: python %s\n" % " ".join(sys.argv))
        file.write('\t'.join(head)+EOL)
        for gene in genes:
            Rv = gene["rv"]
            vals = ([Rv, gene["gene"], str(len(RvSiteindexesMap[Rv]))] +
                    ["%0.2f" % statsByRv[Rv]['mean'][group] for group in orderedStatGroupNames] +
                    ["%0.2f" % statsByRv[Rv]['nz_mean'][group] for group in orderedStatGroupNames] +
                    ["%0.2f" % statsByRv[Rv]['nz_perc'][group] for group in orderedStatGroupNames] +
                    ["%f" % x for x in [pvals[Rv], qvals[Rv]]]) + [run_status[Rv]]
            file.write('\t'.join(vals)+EOL)
        file.close()
        self.transit_message("Finished ZINB analysis")
        self.transit_message("Time: %0.1fs\n" % (time.time() - start_time))