def runSleuthAll(samples, base_dir, counts, tpm):
    ''' run sleuth for all samples to obtain counts and tpm tables

    Note: all samples in the design table must also have a directory
    with the same name in `base_dir` with kallisto results in a file
    called abundance.h5
    '''

    design = pd.DataFrame({
        "group": ([0, 1] * ((len(samples) + 1) // 2))[0:len(samples)],
        "include": [1] * len(samples),
        "pair": [0] * len(samples)})

    design.index = samples

    Design = Expression.ExperimentalDesign(design)

    exp = Expression.DEExperiment_Sleuth()

    res = exp.run(Design, base_dir, counts=counts, tpm=tpm,
                  model="~group", dummy_run=True)
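# A minimal usage sketch for runSleuthAll. The sample names and paths below
# are hypothetical; each sample is assumed to have a kallisto output
# directory of the same name under `base_dir` containing an abundance.h5:
#
#   runSleuthAll(samples=["WT-1", "WT-2", "KO-1", "KO-2"],
#                base_dir="kallisto.dir",
#                counts="counts.tsv",
#                tpm="tpm.tsv")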
def runSleuth(design, base_dir, model, contrasts, outfile, counts, tpm,
              fdr, lrt=False, reduced_model=None):
    ''' run sleuth.

    Note: all samples in the design table must also have a directory
    with the same name in `base_dir` with kallisto results in a file
    called abundance.h5
    '''

    outfile_prefix = P.snip(outfile, ".tsv")

    Design = Expression.ExperimentalDesign(design)

    exp = Expression.DEExperiment_Sleuth()

    res = exp.run(Design, base_dir, model, contrasts, outfile_prefix,
                  counts, tpm, fdr, lrt, reduced_model)

    res.getResults(fdr)

    for contrast in set(res.table['contrast']):
        res.plotMA(contrast, outfile_prefix)
        res.plotVolcano(contrast, outfile_prefix)

    res.table.to_csv(outfile, sep="\t", index=False)
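# A hypothetical invocation of runSleuth, assuming a design table with
# `group` and `pair` columns and kallisto results under `kallisto.dir`.
# The model string follows the sleuth formula convention:
#
#   runSleuth(design="design.tsv",
#             base_dir="kallisto.dir",
#             model="~group",
#             contrasts=["group"],
#             outfile="sleuth_results.tsv",
#             counts="counts.tsv",
#             tpm="tpm.tsv",
#             fdr=0.05)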
def runRMATS(gtffile, designfile, pvalue, strand, outdir, permute=0):
    '''Module to generate rMATS statement

    Module offers the option to permute group name labels and
    calculates the read length, which must be identical in all reads.

    Arguments
    ---------
    gtffile: string
        path to :term:`gtf` file
    designfile: string
        path to design file
    pvalue: string
        threshold for FDR testing
    strand: string
        strandedness option: can be 'fr-unstranded', 'fr-firststrand',
        or 'fr-secondstrand'
    outdir: string
        directory path for rMATS results
    permute : 1 or 0
        option to activate random shuffling of sample groups
    '''

    design = Expression.ExperimentalDesign(designfile)

    if permute == 1:
        design.table.group = random.choice(
            list(itertools.permutations(design.table.group)))

    group1 = ",".join(
        ["%s.bam" % x for x in design.getSamplesInGroup(design.groups[0])])
    with open(outdir + "/b1.txt", "w") as f:
        f.write(group1)

    group2 = ",".join(
        ["%s.bam" % x for x in design.getSamplesInGroup(design.groups[1])])
    with open(outdir + "/b2.txt", "w") as f:
        f.write(group2)

    readlength = BamTools.estimateTagSize(design.samples[0] + ".bam")

    statement = '''rMATS
    --b1 %(outdir)s/b1.txt
    --b2 %(outdir)s/b2.txt
    --gtf <(gunzip -c %(gtffile)s)
    --od %(outdir)s
    --readLength %(readlength)s
    --cstat %(pvalue)s
    --libType %(strand)s
    ''' % locals()

    # add option for paired-end reads
    if BamTools.isPaired(design.samples[0] + ".bam"):
        statement += '''-t paired'''

    statement += '''
    > %(outdir)s/%(designfile)s.log
    '''

    P.run()
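# Example call for runRMATS (file names are hypothetical). The statement
# redirects the rMATS log to `<outdir>/<designfile>.log` and expects one
# BAM per sample named `<sample>.bam` in the working directory:
#
#   runRMATS(gtffile="geneset.gtf.gz",
#            designfile="design.tsv",
#            pvalue="0.05",
#            strand="fr-unstranded",
#            outdir="rmats.dir")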
def rmats2sashimi(infile, designfile, FDR, outfile):
    '''Module to generate sashimi plots from rMATS output

    Module generates a statement to call rmats2sashimiplot and provides
    it with the correct arguments. Only results containing no NA and
    below the FDR threshold are drawn, to prevent unnecessary compute
    and memory use.

    Arguments
    ---------
    infile: string
        path to rMATS results file (can be one of five types)
    designfile: string
        path to design file
    FDR: string
        FDR threshold for drawing plots
    outfile: string
        directory path for sashimiplot output
    '''

    Design = Expression.ExperimentalDesign(designfile)
    if len(Design.groups) != 2:
        raise ValueError("Please specify exactly 2 groups per experiment.")

    g1 = Design.getSamplesInGroup(Design.groups[0])
    g2 = Design.getSamplesInGroup(Design.groups[1])

    if len(g1) != len(g2):
        # truncate both groups to the size of the smaller one
        size = min(len(g1), len(g2))
        g1 = g1[:size]
        g2 = g2[:size]
        E.info("The two groups compared were of unequal size. For "
               "visual display using sashimi they have been truncated "
               "to the same length")

    group1 = ",".join(["%s.bam" % x for x in g1])
    group2 = ",".join(["%s.bam" % x for x in g2])
    group1name = Design.groups[0]
    group2name = Design.groups[1]
    event = os.path.basename(os.path.normpath(outfile))

    statement = '''cat %(infile)s | grep -v NA |
    awk '$20 < %(FDR)s' > %(infile)s_sig.txt;
    checkpoint;
    rmats2sashimiplot
    --b1 %(group1)s
    --b2 %(group2)s
    -t %(event)s
    -e %(infile)s_sig.txt
    --l1 %(group1name)s
    --l2 %(group2name)s
    -o %(outfile)s > %(outfile)s/%(event)s.log
    ''' % locals()

    P.run()
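# Example call for rmats2sashimi (paths are hypothetical). `infile` is one
# of the five rMATS event result files; `outfile` doubles as the output
# directory, whose basename selects the event type passed to -t:
#
#   rmats2sashimi(infile="rmats.dir/SE.MATS.JC.txt",
#                 designfile="design.tsv",
#                 FDR="0.05",
#                 outfile="sashimi.dir/SE")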
def makeExpressionSummaryPlots(counts_inf, design_inf, logfile):
    ''' use the plotting methods for Counts object to make summary plots'''

    with IOTools.openFile(logfile, "w") as log:

        plot_prefix = P.snip(logfile, ".log")

        # need to manually read in data as index column is not the first
        # column
        counts = Counts.Counts(pd.read_table(counts_inf, sep="\t"))
        counts.table.set_index(["transcript_id"], inplace=True)

        design = Expression.ExperimentalDesign(design_inf)

        # make certain counts table only include samples in design
        counts.restrict(design)

        cor_outfile = plot_prefix + "_pairwise_correlations.png"
        pca_var_outfile = plot_prefix + "_pca_variance.png"
        pca1_outfile = plot_prefix + "_pc1_pc2.png"
        pca2_outfile = plot_prefix + "_pc3_pc4.png"
        heatmap_outfile = plot_prefix + "_heatmap.png"

        counts_log10 = counts.log(base=10, pseudocount=0.1, inplace=False)

        # order transcripts by mean expression and keep the top 500 for
        # the heatmap
        counts_highExp = counts_log10.clone()
        counts_highExp.table['order'] = counts_highExp.table.apply(
            np.mean, axis=1)
        counts_highExp.table.sort_values(
            ["order"], ascending=False, inplace=True)
        counts_highExp.table = counts_highExp.table.iloc[0:500, :]
        counts_highExp.table.drop("order", axis=1, inplace=True)

        log.write("plot correlations: %s\n" % cor_outfile)
        counts_log10.plotPairwiseCorrelations(cor_outfile, subset=1000)

        log.write("plot pc1,pc2: %s\n" % pca1_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca1_outfile,
                             x_axis="PC1", y_axis="PC2",
                             colour="group", shape="group")

        log.write("plot pc3,pc4: %s\n" % pca2_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca2_outfile,
                             x_axis="PC3", y_axis="PC4",
                             colour="group", shape="group")

        log.write("plot heatmap: %s\n" % heatmap_outfile)
        counts_highExp.heatmap(heatmap_outfile)
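# Example call for makeExpressionSummaryPlots (hypothetical paths). The log
# file name determines the prefix of all plot files, so
# "plots.dir/summary.log" yields e.g. "plots.dir/summary_heatmap.png":
#
#   makeExpressionSummaryPlots(counts_inf="tpm.tsv",
#                              design_inf="design.tsv",
#                              logfile="plots.dir/summary.log")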
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option(
        "--result-tsv-file", dest="input_filename_result",
        type="string",
        help="input file with results (for plotdetagstats) "
        "[default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("ttest", "sleuth", "edger", "deseq2", "mock"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode",
                      dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion", type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-R", "--output-R-code", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model",
                      dest="model",
                      type="string",
                      help=("model for GLM"))

    parser.add_option("--contrasts",
                      dest="contrasts",
                      action="append",
                      help=("contrasts for post-hoc testing written as comma "
                            "separated list `condition,replicate` etc"))

    parser.add_option("--deseq2-fit-type",
                      dest="deseq2_fit_type",
                      type="string",
                      help=("fit type used for observed dispersion mean "
                            "relationship in deseq2"))

    parser.add_option("--sleuth-counts-dir",
                      dest="sleuth_counts_dir",
                      type="string",
                      help=("directory containing counts for sleuth. Sleuth "
                            "expects counts files to be called abundance.h5"))

    parser.add_option("--outfile-sleuth-count",
                      dest="outfile_sleuth_count",
                      type="string",
                      help=("outfile for full count table generated by "
                            "sleuth"))

    parser.add_option("--outfile-sleuth-tpm",
                      dest="outfile_sleuth_tpm",
                      type="string",
                      help=("outfile for full tpm table generated by sleuth"))

    parser.add_option("--use-ihw",
                      dest="use_ihw",
                      action="store_true",
                      help=("use the independent hypothesis weighting method "
                            "to obtain weighted FDR"))

    parser.add_option("--sleuth-genewise",
                      dest="sleuth_genewise",
                      action="store_true",
                      help=("run genewise, rather than transcript level "
                            "testing"))

    parser.add_option("--gene-biomart",
                      dest="gene_biomart",
                      type="string",
                      help=("name of Ensembl gene biomart"))

    parser.set_defaults(
        input_filename_tags="-",
        input_filename_result=None,
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq2",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        edger_dispersion=0.4,
        ref_group=False,
        save_r_environment=None,
        filter_min_counts_per_row=None,
        filter_min_counts_per_sample=None,
        filter_percentile_rowsums=None,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
        model=None,
        contrasts=None,
        output_filename_pattern=None,
        deseq2_fit_type="parametric",
        sleuth_counts_dir=None,
        outfile_sleuth_count=None,
        outfile_sleuth_tpm=None,
        use_ihw=False,
        sleuth_genewise=False,
        gene_biomart=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    outfile_prefix = options.output_filename_pattern + "_" + options.method

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":

        assert options.sleuth_counts_dir, (
            "need to specify the location of the abundance.h5 counts files")

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                        sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(model=options.model)

        experiment = Expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrasts=options.contrasts,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart)

    else:
        # create Counts object
        if options.input_filename_tags == "-":
            counts = Counts.Counts(pd.io.parsers.read_csv(
                sys.stdin, sep="\t", index_col=0, comment="#"))
        else:
            counts = Counts.Counts(pd.io.parsers.read_csv(
                IOTools.openFile(options.input_filename_tags, "r"),
                sep="\t", index_col=0, comment="#"))

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                        sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove samples with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = Expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = Expression.DEExperiment_edgeR()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     dispersion=options.edger_dispersion,
                                     ref_group=options.ref_group,
                                     contrasts=options.contrasts,
                                     outfile_prefix=outfile_prefix)

        elif options.method == "deseq2":
            experiment = Expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrasts=options.contrasts,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group)

    results.getResults(fdr=options.fdr)

    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix)
        results.plotMA(contrast, outfile_prefix=outfile_prefix)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in results.Summary.keys():
        outf = IOTools.openFile("_".join(
            [outfile_prefix, test_group, "summary.tsv"]), "w")
        outf.write("category\tcounts\n%s\n" %
                   results.Summary[test_group].asTable())
        outf.close()

    E.Stop()
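# A sketch of a command line for this script (the script name and file
# names are hypothetical; counts are read from stdin and the results table
# is written to stdout):
#
#   python counts2table.py
#       --design-tsv-file=design.tsv
#       --method=deseq2
#       --model="~group"
#       --contrasts=group
#       --fdr=0.05
#       --output-filename-pattern=deseq2
#       < counts.tsv > results.tsv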
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tag-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("ttest", "sleuth", "edger", "deseq2", "mock",
                               "dexseq"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq2-dispersion-method",
                      dest="deseq2_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq2 [default=%default].")

    parser.add_option("--deseq2-fit-type", dest="deseq2_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq2 [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion", type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    # currently not implemented
    # parser.add_option("-R", "--output-R-code", dest="save_r_environment",
    #                   type="string",
    #                   help="save R environment to loc [default=%default]")

    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model",
                      dest="model",
                      type="string",
                      help=("model for GLM"))

    parser.add_option("--reduced-model",
                      dest="reduced_model",
                      type="string",
                      help=("reduced model for LRT"))

    parser.add_option("--contrast",
                      dest="contrast",
                      type="string",
                      help=("contrast for differential expression testing"))

    parser.add_option("--sleuth-counts-dir",
                      dest="sleuth_counts_dir",
                      type="string",
                      help=("directory containing expression estimates "
                            "from sleuth. Sleuth expects counts files "
                            "to be called abundance.h5"))

    parser.add_option("--dexseq-counts-dir",
                      dest="dexseq_counts_dir",
                      type="string",
                      help=("directory containing counts for dexseq. DEXSeq "
                            "expects counts files to be called .txt and "
                            "to be generated by the DEXSeq_counts.py script"))

    parser.add_option("--dexseq-flattened-file",
                      dest="dexseq_flattened_file",
                      type="string",
                      help=("flattened gtf file for dexseq. DEXSeq "
                            "expects this to be generated by the "
                            "DEXSeq_prepare_annotations.py script"))

    parser.add_option(
        "--outfile-sleuth-count",
        dest="outfile_sleuth_count",
        type="string",
        help=("outfile for full count table generated by sleuth"))

    parser.add_option("--outfile-sleuth-tpm",
                      dest="outfile_sleuth_tpm",
                      type="string",
                      help=("outfile for full tpm table generated by sleuth"))

    parser.add_option("--use-ihw",
                      dest="use_ihw",
                      action="store_true",
                      help=("use the independent hypothesis weighting method "
                            "to obtain weighted FDR"))

    parser.add_option(
        "--sleuth-genewise",
        dest="sleuth_genewise",
        action="store_true",
        help=("run genewise, rather than transcript level testing"))

    parser.add_option("--gene-biomart",
                      dest="gene_biomart",
                      type="string",
                      help=("name of Ensembl gene biomart"))

    parser.add_option("--de-test",
                      dest="DEtest",
                      type="choice",
                      choices=("wald", "lrt"),
                      help=("Differential expression test"))

    parser.add_option("--Rhistory",
                      dest="Rhistory",
                      type="string",
                      help=("Outfile for R history"))

    parser.add_option("--Rimage",
                      dest="Rimage",
                      type="string",
                      help=("Outfile for R image"))

    parser.set_defaults(input_filename_tags="-",
                        input_filename_design=None,
                        output_filename=sys.stdout,
                        method="deseq2",
                        fdr=0.1,
                        deseq2_dispersion_method="pooled",
                        deseq2_fit_type="parametric",
                        edger_dispersion=0.4,
                        ref_group=False,
                        filter_min_counts_per_row=None,
                        filter_min_counts_per_sample=None,
                        filter_percentile_rowsums=None,
                        spike_foldchange_max=4.0,
                        spike_expression_max=5.0,
                        spike_expression_bin_width=0.5,
                        spike_foldchange_bin_width=0.5,
                        spike_max_counts_per_bin=50,
                        model=None,
                        contrast=None,
                        output_filename_pattern=None,
                        sleuth_counts_dir=None,
                        dexseq_counts_dir=None,
                        dexseq_flattened_file=None,
                        outfile_sleuth_count=None,
                        outfile_sleuth_tpm=None,
                        use_ihw=False,
                        sleuth_genewise=False,
                        gene_biomart=None,
                        DEtest="wald",
                        reduced_model=None,
                        Rhistory=None,
                        Rimage=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    RH = None
    if options.Rhistory or options.Rimage:
        RH = R.R_with_History()

    outfile_prefix = options.output_filename_pattern

    # Expression.py currently expects a reference group for edgeR and
    # sleuth, regardless of which test is used
    if not options.ref_group and (options.method == "edger" or
                                  options.method == "sleuth"):
        raise ValueError(
            "Must provide a reference group ('--reference-group')")

    # create Design object
    design = Expression.ExperimentalDesign(
        pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                    sep="\t", index_col=0, comment="#"))

    if len(set(design.table[options.contrast])) > 2:

        if options.method == "deseq2" or options.method == "sleuth":
            if options.DEtest == "wald":
                raise ValueError(
                    "Factor must have exactly two levels for Wald Test. "
                    "If you have more than two levels in your factor, "
                    "consider LRT")
        else:
            E.info("There are more than 2 levels for the contrast "
                   "specified (%s:%s). The log2 fold changes in the "
                   "results table and MA plots will be for the first two "
                   "levels in the contrast. The p-value will be the "
                   "p-value for the overall significance of the contrast. "
                   "Hence, some genes will have a significant p-value but "
                   "0-fold change between the first two levels" % (
                       options.contrast,
                       set(design.table[options.contrast])))

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":

        assert options.sleuth_counts_dir, (
            "need to specify the location of the abundance.h5 counts files "
            "(--sleuth-counts-dir)")

        # validate design against counts and model
        design.validate(model=options.model)

        experiment = Expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart,
                                 DE_test=options.DEtest,
                                 ref_group=options.ref_group,
                                 reduced_model=options.reduced_model)

    # DEXSeq reads in data itself
    elif options.method == "dexseq":

        assert options.dexseq_counts_dir, (
            "need to specify the location of the .txt counts files")

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                        sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        # design.validate(model=options.model)

        experiment = Expression.DEExperiment_DEXSeq()
        results = experiment.run(design,
                                 base_dir=options.dexseq_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 ref_group=options.ref_group,
                                 outfile_prefix=outfile_prefix,
                                 flattenedfile=options.dexseq_flattened_file,
                                 fdr=options.fdr)

    else:
        # create Counts object
        if options.input_filename_tags == "-":
            counts = Counts.Counts(
                pd.io.parsers.read_csv(sys.stdin, sep="\t",
                                       index_col=0, comment="#"))
        else:
            counts = Counts.Counts(
                pd.io.parsers.read_csv(
                    IOTools.openFile(options.input_filename_tags, "r"),
                    sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove samples with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = Expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = Expression.DEExperiment_edgeR()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     ref_group=options.ref_group,
                                     fdr=options.fdr,
                                     dispersion=options.edger_dispersion)

        elif options.method == "deseq2":
            experiment = Expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group,
                                     DEtest=options.DEtest,
                                     R=RH)

    results.getResults(fdr=options.fdr)
    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotMA(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueHist(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueQQ(contrast, outfile_prefix=outfile_prefix, R=RH)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in list(results.Summary.keys()):
        outf = IOTools.openFile(
            "_".join([outfile_prefix, test_group, "summary.tsv"]), "w")
        outf.write("category\tcounts\n%s\n" %
                   results.Summary[test_group].asTable())
        outf.close()

    if options.Rhistory:
        RH.saveHistory(options.Rhistory)
    if options.Rimage:
        RH.saveImage(options.Rimage)

    E.Stop()
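# A sketch of a command line for this script (script and file names are
# hypothetical). For sleuth, expression estimates come from kallisto
# abundance.h5 files rather than from stdin:
#
#   python counts2table.py
#       --design-tsv-file=design.tsv
#       --method=sleuth
#       --sleuth-counts-dir=kallisto.dir
#       --model="~group"
#       --contrast=group
#       --reference-group=WT
#       --de-test=wald
#       --fdr=0.05
#       --output-filename-pattern=sleuth
#       > results.tsv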
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("filter", "spike", "normalize"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--spike-change-bin-min", dest="min_cbin",
                      type="float",
                      help="minimum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-max", dest="max_cbin",
                      type="float",
                      help="maximum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-width", dest="width_cbin",
                      type="float",
                      help="bin width for change bins [default=%default].")

    parser.add_option("--spike-initial-bin-min", dest="min_ibin",
                      type="float",
                      help="minimum bin for initial bins [default=%default].")

    parser.add_option("--spike-initial-bin-max", dest="max_ibin",
                      type="float",
                      help="maximum bin for initial bins [default=%default].")

    parser.add_option("--spike-initial-bin-width", dest="width_ibin",
                      type="float",
                      help="bin width for initial bins [default=%default].")

    parser.add_option("--spike-minimum", dest="min_spike",
                      type="int",
                      help="minimum number of spike-ins required within "
                      "each bin [default=%default].")

    parser.add_option("--spike-maximum", dest="max_spike",
                      type="int",
                      help="maximum number of spike-ins allowed within "
                      "each bin [default=%default].")

    parser.add_option("--spike-difference-method", dest="difference",
                      type="choice",
                      choices=("relative", "logfold", "abs_logfold"),
                      help="method to use for calculating difference "
                      "[default=%default].")

    parser.add_option("--spike-iterations", dest="iterations",
                      type="int",
                      help="number of iterations to generate spike-ins "
                      "[default=%default].")

    parser.add_option("--spike-cluster-maximum-distance",
                      dest="cluster_max_distance",
                      type="int",
                      help="maximum distance between adjacent loci in "
                      "cluster [default=%default].")

    parser.add_option("--spike-cluster-minimum-size",
                      dest="cluster_min_size",
                      type="int",
                      help="minimum number of loci required per cluster "
                      "[default=%default].")

    parser.add_option("--spike-type", dest="spike_type",
                      type="choice",
                      choices=("row", "cluster"),
                      help="spike in type [default=%default].")

    parser.add_option("--spike-subcluster-min-size", dest="min_sbin",
                      type="int",
                      help="minimum size of subcluster [default=%default].")

    parser.add_option("--spike-subcluster-max-size", dest="max_sbin",
                      type="int",
                      help="maximum size of subcluster [default=%default].")

    parser.add_option("--spike-subcluster-bin-width", dest="width_sbin",
                      type="int",
                      help="bin width for subcluster size "
                      "[default=%default].")

    parser.add_option("--spike-output-method", dest="output_method",
                      type="choice",
                      choices=("append", "seperate"),
                      help="defines whether the spike-ins should be appended "
                      "to the original table or output separately "
                      "[default=%default].")

    parser.add_option("--spike-shuffle-column-suffix",
                      dest="shuffle_suffix",
                      type="string",
                      help="the suffix of the columns which are to be "
                      "shuffled [default=%default].")

    parser.add_option("--spike-keep-column-suffix",
                      dest="keep_suffix",
                      type="string",
                      help="a list of suffixes for the columns which are to "
                      "be kept along with the shuffled columns "
                      "[default=%default].")

    parser.add_option("--normalization-method",
                      dest="normalization_method",
                      type="choice",
                      choices=("deseq-size-factors", "total-count",
                               "total-column", "total-row"),
                      help="normalization method to apply [%default]")

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.set_defaults(input_filename_tags="-",
                        method="filter",
                        filter_min_counts_per_row=None,
                        filter_min_counts_per_sample=None,
                        filter_percentile_rowsums=None,
                        output_method="seperate",
                        difference="logfold",
                        spike_type="row",
                        min_cbin=0,
                        max_cbin=100,
                        width_cbin=100,
                        min_ibin=0,
                        max_ibin=100,
                        width_ibin=100,
                        max_spike=100,
                        min_spike=None,
                        iterations=1,
                        cluster_max_distance=100,
                        cluster_min_size=10,
                        min_sbin=1,
                        max_sbin=1,
                        width_sbin=1,
                        shuffle_suffix=None,
                        keep_suffix=None,
                        normalization_method="deseq-size-factors")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # load
    if options.keep_suffix:
        # if using suffix, loadTagDataPandas will throw an error as it
        # looks for column names which exactly match the design
        # "tracks" need to write function in Counts.py to handle
        # counts table and design table + suffix
        counts = pd.read_csv(options.stdin, sep="\t", comment="#")
        inf = IOTools.openFile(options.input_filename_design)
        design = pd.read_csv(inf, sep="\t", index_col=0)
        inf.close()
        design = design[design["include"] != 0]

        if options.method in ("filter", "spike"):
            if options.input_filename_design is None:
                raise ValueError("method '%s' requires a design file" %
                                 options.method)
    else:
        # create Counts object
        # TS if spike type is cluster, need to keep "contig" and "position"
        # columns out of index
        if options.spike_type == "cluster":
            index = None
        else:
            index = 0

        if options.input_filename_tags == "-":
            counts = Counts.Counts(
                pd.io.parsers.read_csv(options.stdin, sep="\t",
                                       index_col=index, comment="#"))
        else:
            counts = Counts.Counts(
                pd.io.parsers.read_csv(
                    IOTools.openFile(options.input_filename_tags, "r"),
                    sep="\t", index_col=index, comment="#"))

        # TS normalization doesn't require a design table
        if not options.method == "normalize":

            assert options.input_filename_design and os.path.exists(
                options.input_filename_design)

            # create Design object
            design = Expression.ExperimentalDesign(
                pd.read_csv(
                    IOTools.openFile(options.input_filename_design, "r"),
                    sep="\t", index_col=0, comment="#"))

    if options.method == "filter":

        assert (options.filter_min_counts_per_sample is not None or
                options.filter_min_counts_per_row is not None or
                options.filter_percentile_rowsums is not None), \
            "no filtering parameters have been supplied"

        # filter
        # remove samples with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        nobservations, nsamples = counts.table.shape

        if nobservations == 0:
            E.warn("no observations remaining after filtering - no output")
            return

        if nsamples == 0:
            E.warn("no samples remain after filtering - no output")
            return

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "normalize":

        counts.normalise(method=options.normalization_method,
                         row_title="total")

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "spike":
        # check parameters are sensible and set parameters where they
        # are not explicitly set
        if not options.min_spike:
            E.info("setting minimum number of spikes per bin to equal "
                   "maximum number of spikes per bin (%s)" %
                   options.max_spike)
            options.min_spike = options.max_spike

        if options.spike_type == "cluster":

            assert options.max_sbin <= options.cluster_min_size, \
                ("max size of subcluster: %s is greater than min size of "
                 "cluster: %s" % (options.max_sbin,
                                  options.cluster_min_size))

            counts_columns = set(counts.table.columns.values.tolist())

            assert ("contig" in counts_columns and
                    "position" in counts_columns), \
                ("cluster analysis requires columns named 'contig' and "
                 "'position' in the dataframe")

            counts.sort(sort_columns=["contig", "position"],
                        reset_index=True)

        # restrict design table to first pair only
        design.firstPairOnly()

        # get dictionaries to map group members to column names
        # use different methods depending on whether suffixes are supplied
        if options.keep_suffix:
            g_to_keep_tracks, g_to_spike_tracks = design.mapGroupsSuffix(
                options.shuffle_suffix, options.keep_suffix)
        else:
            # if no suffixes supplied, spike and keep tracks are the same
            g_to_track = design.getGroups2Samples()
            g_to_spike_tracks, g_to_keep_tracks = (g_to_track, g_to_track)

        # set up numpy arrays for change and initial values
        change_bins = np.arange(options.min_cbin, options.max_cbin,
                                options.width_cbin)
        initial_bins = np.arange(options.min_ibin, options.max_ibin,
                                 options.width_ibin)

        E.info("Column boundaries are: %s" % str(change_bins))
        E.info("Row boundaries are: %s" % str(initial_bins))

        # shuffle rows/clusters
        if options.spike_type == "cluster":
            E.info("looking for clusters...")
            clusters_dict = Counts.findClusters(
                counts, options.cluster_max_distance,
                options.cluster_min_size, g_to_spike_tracks,
                design.groups)
            if len(clusters_dict) == 0:
                raise Exception("no clusters were found, check parameters")

            E.info("shuffling subcluster regions...")
            output_indices, counts = Counts.shuffleCluster(
                initial_bins, change_bins, g_to_spike_tracks,
                design.groups, options.difference,
                options.max_spike, options.iterations, clusters_dict,
                options.max_sbin, options.min_sbin, options.width_sbin)

        elif options.spike_type == "row":

            E.info("shuffling rows...")
            output_indices, bin_counts = counts.shuffleRows(
                options.min_cbin, options.max_cbin, options.width_cbin,
                options.min_ibin, options.max_ibin, options.width_ibin,
                g_to_spike_tracks, design.groups, options.difference,
                options.max_spike, options.iterations)

        filled_bins = Counts.thresholdBins(output_indices, bin_counts,
                                           options.min_spike)

        assert len(filled_bins) > 0, "No bins contained enough spike-ins"

        # write out
        counts.outputSpikes(
            filled_bins,
            g_to_keep_tracks, design.groups,
            output_method=options.output_method,
            spike_type=options.spike_type,
            min_cbin=options.min_cbin, width_cbin=options.width_cbin,
            max_cbin=options.max_cbin, min_ibin=options.min_ibin,
            width_ibin=options.width_ibin, max_ibin=options.max_ibin,
            min_sbin=options.min_sbin, width_sbin=options.width_sbin,
            max_sbin=options.max_sbin)

    E.Stop()
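# A sketch of a filtering invocation for this script (the script name and
# file names are hypothetical; the filtered table is written to stdout):
#
#   python counts2counts.py
#       --method=filter
#       --design-tsv-file=design.tsv
#       --filter-min-counts-per-row=10
#       --filter-percentile-rowsums=10
#       < counts.tsv > counts_filtered.tsv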