def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags", type="string", help="input file with tag counts [default=%default].") parser.add_option( "--result-tsv-file", dest="input_filename_result", type="string", help="input file with results (for plotdetagstats) " "[default=%default].") parser.add_option("-d", "--design-tsv-file", dest="input_filename_design", type="string", help="input file with experimental design " "[default=%default].") parser.add_option("-m", "--method", dest="method", type="choice", choices=("sleuth", "edger", "deseq2", "mock"), help="differential expression method to apply " "[default=%default].") parser.add_option("--deseq-dispersion-method", dest="deseq_dispersion_method", type="choice", choices=("pooled", "per-condition", "blind"), help="dispersion method for deseq [default=%default].") parser.add_option("--deseq-fit-type", dest="deseq_fit_type", type="choice", choices=("parametric", "local"), help="fit type for deseq [default=%default].") parser.add_option("--deseq-sharing-mode", dest="deseq_sharing_mode", type="choice", choices=("maximum", "fit-only", "gene-est-only"), help="deseq sharing mode [default=%default].") parser.add_option("--edger-dispersion", dest="edger_dispersion", type="float", help="dispersion value for edgeR if there are no " "replicates [default=%default].") parser.add_option("-f", "--fdr", dest="fdr", type="float", help="fdr to apply [default=%default].") parser.add_option("-R", "--output-R-code", dest="save_r_environment", type="string", help="save R environment [default=%default].") parser.add_option("-r", "--reference-group", dest="ref_group", type="string", help="Group to use as reference to compute " "fold changes against [default=$default]") parser.add_option("--filter-min-counts-per-row", dest="filter_min_counts_per_row", type="int", help="remove rows with less than this " "number of counts in total [default=%default].") parser.add_option("--filter-min-counts-per-sample", dest="filter_min_counts_per_sample", type="int", help="remove samples with a maximum count per sample of " "less than this number [default=%default].") parser.add_option("--filter-percentile-rowsums", dest="filter_percentile_rowsums", type="int", help="remove percent of rows with " "lowest total counts [default=%default].") parser.add_option("--model", dest="model", type="string", help=("model for GLM")) parser.add_option("--contrasts", dest="contrasts", action="append", help=("contrasts for post-hoc testing writen as comma " "seperated list `condition,replicate` etc")) parser.add_option("--deseq2-fit-type", dest="deseq2_fit_type", type="string", help=("fit type used for observed dispersion mean " "relationship in deseq2")) parser.add_option("--sleuth-counts-dir", dest="sleuth_counts_dir", type="string", help=("directory containing counts for sleuth. Sleuth " "expects counts files to be called abundance.h5")) parser.add_option("--outfile-sleuth-count", dest="outfile_sleuth_count", type="string", help=("outfile for full count table generated by sleuth")) parser.add_option("--outfile-sleuth-tpm", dest="outfile_sleuth_tpm", type="string", help=("outfile for full tpm table generated by sleuth")) parser.add_option("--use-ihw", dest="use_ihw", action="store_true", help=("use the independent hypothesis weighting method " "to obtain weighted FDR")) parser.add_option("--sleuth-genewise", dest="sleuth_genewise", action="store_true", help=("run genewise, rather than transcript level testing")) parser.add_option("--gene-biomart", dest="gene_biomart", type="string", help=("name of ensemble gene biomart")) parser.set_defaults( input_filename_tags="-", input_filename_result=None, input_filename_design=None, output_filename=sys.stdout, method="deseq2", fdr=0.1, deseq_dispersion_method="pooled", deseq_fit_type="parametric", deseq_sharing_mode="maximum", edger_dispersion=0.4, ref_group=False, save_r_environment=None, filter_min_counts_per_row=None, filter_min_counts_per_sample=None, filter_percentile_rowsums=None, spike_foldchange_max=4.0, spike_expression_max=5.0, spike_expression_bin_width=0.5, spike_foldchange_bin_width=0.5, spike_max_counts_per_bin=50, model=None, contrasts=None, output_filename_pattern=None, deseq2_fit_type="parametric", sleuth_counts_dir=None, outfile_sleuth_count=None, outfile_sleuth_tpm=None, use_ihw=False, sleuth_genewise=False, gene_biomart=None ) # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv, add_output_options=True) outfile_prefix = options.output_filename_pattern + "_" + options.method # Sleuth reads in data itself so we don't need to create a counts object if options.method == "sleuth": assert options.sleuth_counts_dir, ( "need to specify the location of the abundance.h5 counts files") # create Design object design = Expression.ExperimentalDesign( pd.read_csv(IOTools.openFile(options.input_filename_design, "r"), sep="\t", index_col=0, comment="#")) # validate design against counts and model design.validate(model=options.model) experiment = Expression.DEExperiment_Sleuth() results = experiment.run(design, base_dir=options.sleuth_counts_dir, model=options.model, contrasts=options.contrasts, outfile_prefix=outfile_prefix, counts=options.outfile_sleuth_count, tpm=options.outfile_sleuth_tpm, fdr=options.fdr, genewise=options.sleuth_genewise, gene_biomart=options.gene_biomart) else: # create Counts object if options.input_filename_tags == "-": counts = Counts.Counts(pd.io.parsers.read_csv( sys.stdin, sep="\t", index_col=0, comment="#")) else: counts = Counts.Counts(pd.io.parsers.read_csv( IOTools.openFile(options.input_filename_tags, "r"), sep="\t", index_col=0, comment="#")) # create Design object design = Expression.ExperimentalDesign( pd.read_csv(IOTools.openFile(options.input_filename_design, "r"), sep="\t", index_col=0, comment="#")) # validate design against counts and model design.validate(counts, options.model) # restrict counts to samples in design table counts.restrict(design) # remove sample with low counts if options.filter_min_counts_per_sample: counts.removeSamples( min_counts_per_sample=options.filter_min_counts_per_sample) # remove observations with low counts if options.filter_min_counts_per_row: counts.removeObservationsFreq( min_counts_per_row=options.filter_min_counts_per_row) # remove bottom percentile of observations if options.filter_percentile_rowsums: counts.removeObservationsPerc( percentile_rowsums=options.filter_percentile_rowsums) # check samples are the same in counts and design following counts # filtering and, if not, restrict design table and re-validate design.revalidate(counts, options.model) # set up experiment and run tests if options.method == "ttest": experiment = Expression.DEExperiment_TTest() results = experiment.run(counts, design) elif options.method == "edger": experiment = Expression.DEExperiment_edgeR() results = experiment.run(counts, design, model=options.model, disperion=options.edger_dispersion, ref_group=options.ref_group, contrasts=options.contrasts, outfile_prefix=outfile_prefix) elif options.method == "deseq2": experiment = Expression.DEExperiment_DESeq2() results = experiment.run(counts, design, model=options.model, contrasts=options.contrasts, outfile_prefix=outfile_prefix, fdr=options.fdr, fit_type=options.deseq2_fit_type, ref_group=options.ref_group) results.getResults(fdr=options.fdr) if options.use_ihw: results.calculateIHW(alpha=options.fdr) for contrast in set(results.table['contrast']): results.plotVolcano(contrast, outfile_prefix=outfile_prefix) results.plotMA(contrast, outfile_prefix=outfile_prefix) results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False) results.summariseDEResults() # write out summary tables for each comparison/contrast for test_group in results.Summary.keys(): outf = IOTools.openFile("_".join( [outfile_prefix, test_group, "summary.tsv"]), "w") outf.write("category\tcounts\n%s\n" % results.Summary[test_group].asTable()) outf.close() E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-t", "--tag-tsv-file", dest="input_filename_tags", type="string", help="input file with tag counts [default=%default].") parser.add_option("-d", "--design-tsv-file", dest="input_filename_design", type="string", help="input file with experimental design " "[default=%default].") parser.add_option("-m", "--method", dest="method", type="choice", choices=("ttest", "sleuth", "edger", "deseq2", "mock", "dexseq"), help="differential expression method to apply " "[default=%default].") parser.add_option("--deseq2-dispersion-method", dest="deseq2_dispersion_method", type="choice", choices=("pooled", "per-condition", "blind"), help="dispersion method for deseq2 [default=%default].") parser.add_option("--deseq2-fit-type", dest="deseq2_fit_type", type="choice", choices=("parametric", "local"), help="fit type for deseq2 [default=%default].") parser.add_option("--edger-dispersion", dest="edger_dispersion", type="float", help="dispersion value for edgeR if there are no " "replicates [default=%default].") parser.add_option("-f", "--fdr", dest="fdr", type="float", help="fdr to apply [default=%default].") # currently not implemented # parser.add_option("-R", "--output-R-code", dest="save_r_environment", # type="string", # help="save R environment to loc [default=%default]") parser.add_option("-r", "--reference-group", dest="ref_group", type="string", help="Group to use as reference to compute " "fold changes against [default=$default]") parser.add_option("--filter-min-counts-per-row", dest="filter_min_counts_per_row", type="int", help="remove rows with less than this " "number of counts in total [default=%default].") parser.add_option("--filter-min-counts-per-sample", dest="filter_min_counts_per_sample", type="int", help="remove samples with a maximum count per sample of " "less than this number [default=%default].") parser.add_option("--filter-percentile-rowsums", dest="filter_percentile_rowsums", type="int", help="remove percent of rows with " "lowest total counts [default=%default].") parser.add_option("--model", dest="model", type="string", help=("model for GLM")) parser.add_option("--reduced-model", dest="reduced_model", type="string", help=("reduced model for LRT")) parser.add_option("--contrast", dest="contrast", type="string", help=("contrast for differential expression testing")) parser.add_option("--sleuth-counts-dir", dest="sleuth_counts_dir", type="string", help=("directory containing expression estimates" "from sleuth. Sleuth expects counts" "files to be called abundance.h5")) parser.add_option("--dexseq-counts-dir", dest="dexseq_counts_dir", type="string", help=("directory containing counts for dexseq. DEXSeq " "expects counts files to be called .txt and" "to be generated by the DEXSeq_counts.py script")) parser.add_option("--dexseq-flattened-file", dest="dexseq_flattened_file", type="string", help=("directory containing flat gtf for dexseq. DEXSeq " "expects this to be generated by the" "DEXSeq_prepare_annotations.py script")) parser.add_option( "--outfile-sleuth-count", dest="outfile_sleuth_count", type="string", help=("outfile for full count table generated by sleuth")) parser.add_option("--outfile-sleuth-tpm", dest="outfile_sleuth_tpm", type="string", help=("outfile for full tpm table generated by sleuth")) parser.add_option("--use-ihw", dest="use_ihw", action="store_true", help=("use the independent hypothesis weighting method " "to obtain weighted FDR")) parser.add_option( "--sleuth-genewise", dest="sleuth_genewise", action="store_true", help=("run genewise, rather than transcript level testing")) parser.add_option("--gene-biomart", dest="gene_biomart", type="string", help=("name of ensemble gene biomart")) parser.add_option("--de-test", dest="DEtest", type="choice", choices=("wald", "lrt"), help=("Differential expression test")) parser.add_option("--Rhistory", dest="Rhistory", type="string", help=("Outfile for R history")) parser.add_option("--Rimage", dest="Rimage", type="string", help=("Outfile for R image")) parser.set_defaults(input_filename_tags="-", input_filename_design=None, output_filename=sys.stdout, method="deseq2", fdr=0.1, deseq2_dispersion_method="pooled", deseq2_fit_type="parametric", edger_dispersion=0.4, ref_group=False, filter_min_counts_per_row=None, filter_min_counts_per_sample=None, filter_percentile_rowsums=None, spike_foldchange_max=4.0, spike_expression_max=5.0, spike_expression_bin_width=0.5, spike_foldchange_bin_width=0.5, spike_max_counts_per_bin=50, model=None, contrast=None, output_filename_pattern=None, sleuth_counts_dir=None, dexseq_counts_dir=None, dexseq_flattened_file=None, outfile_sleuth_count=None, outfile_sleuth_tpm=None, use_ihw=False, sleuth_genewise=False, gene_biomart=None, DEtest="wald", reduced_model=None, Rhistory=None, Rimage=None) # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv, add_output_options=True) RH = None if options.Rhistory or options.Rimage: RH = R.R_with_History() outfile_prefix = options.output_filename_pattern # Expression.py currently expects a refernce group for edgeR and # sleuth, regardless of which test is used if not options.ref_group and (options.method is "edger" or options.method is "sleuth"): raise ValueError( "Must provide a reference group ('--reference-group')") # create Design object design = Expression.ExperimentalDesign( pd.read_csv(IOTools.openFile(options.input_filename_design, "r"), sep="\t", index_col=0, comment="#")) if len(set(design.table[options.contrast])) > 2: if options.method == "deseq2" or options.method == "sleuth": if options.DEtest == "wald": raise ValueError( "Factor must have exactly two levels for Wald Test. " "If you have more than two levels in your factor, " "consider LRT") else: E.info('''There are more than 2 levels for the contrast specified" "(%s:%s). The log2fold changes in the results table and MA plots will be for the first two levels in the contrast. The p-value will be the p-value for the overall significance of the contrast. Hence, some genes will have a signficant p-value but 0-fold change between the first two levels''' % (options.contrast, set(design[options.contrast]))) # Sleuth reads in data itself so we don't need to create a counts object if options.method == "sleuth": assert options.sleuth_counts_dir, ( "need to specify the location of the abundance.h5 counts files " " (--sleuth-counts-dir)") # validate design against counts and model design.validate(model=options.model) experiment = Expression.DEExperiment_Sleuth() results = experiment.run(design, base_dir=options.sleuth_counts_dir, model=options.model, contrast=options.contrast, outfile_prefix=outfile_prefix, counts=options.outfile_sleuth_count, tpm=options.outfile_sleuth_tpm, fdr=options.fdr, genewise=options.sleuth_genewise, gene_biomart=options.gene_biomart, DE_test=options.DEtest, ref_group=options.ref_group, reduced_model=options.reduced_model) # DEXSeq reads in data itself elif options.method == "dexseq": assert options.dexseq_counts_dir, ( "need to specify the location of the .txt counts files") # create Design object design = Expression.ExperimentalDesign( pd.read_csv(IOTools.openFile(options.input_filename_design, "r"), sep="\t", index_col=0, comment="#")) # validate design against counts and model # design.validate(model=options.model) experiment = Expression.DEExperiment_DEXSeq() results = experiment.run(design, base_dir=options.dexseq_counts_dir, model=options.model, contrast=options.contrast, ref_group=options.ref_group, outfile_prefix=outfile_prefix, flattenedfile=options.dexseq_flattened_file, fdr=options.fdr) else: # create Counts object if options.input_filename_tags == "-": counts = Counts.Counts( pd.io.parsers.read_csv(sys.stdin, sep="\t", index_col=0, comment="#")) else: counts = Counts.Counts( pd.io.parsers.read_csv(IOTools.openFile( options.input_filename_tags, "r"), sep="\t", index_col=0, comment="#")) # validate design against counts and model design.validate(counts, options.model) # restrict counts to samples in design table counts.restrict(design) # remove sample with low counts if options.filter_min_counts_per_sample: counts.removeSamples( min_counts_per_sample=options.filter_min_counts_per_sample) # remove observations with low counts if options.filter_min_counts_per_row: counts.removeObservationsFreq( min_counts_per_row=options.filter_min_counts_per_row) # remove bottom percentile of observations if options.filter_percentile_rowsums: counts.removeObservationsPerc( percentile_rowsums=options.filter_percentile_rowsums) # check samples are the same in counts and design following counts # filtering and, if not, restrict design table and re-validate design.revalidate(counts, options.model) # set up experiment and run tests if options.method == "ttest": experiment = Expression.DEExperiment_TTest() results = experiment.run(counts, design) elif options.method == "edger": experiment = Expression.DEExperiment_edgeR() results = experiment.run(counts, design, model=options.model, contrast=options.contrast, outfile_prefix=outfile_prefix, ref_group=options.ref_group, fdr=options.fdr, dispersion=options.edger_dispersion) elif options.method == "deseq2": experiment = Expression.DEExperiment_DESeq2() results = experiment.run(counts, design, model=options.model, contrast=options.contrast, outfile_prefix=outfile_prefix, fdr=options.fdr, fit_type=options.deseq2_fit_type, ref_group=options.ref_group, DEtest=options.DEtest, R=RH) results.getResults(fdr=options.fdr) if options.use_ihw: results.calculateIHW(alpha=options.fdr) for contrast in set(results.table['contrast']): results.plotVolcano(contrast, outfile_prefix=outfile_prefix, R=RH) results.plotMA(contrast, outfile_prefix=outfile_prefix, R=RH) results.plotPvalueHist(contrast, outfile_prefix=outfile_prefix, R=RH) results.plotPvalueQQ(contrast, outfile_prefix=outfile_prefix, R=RH) results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False) results.summariseDEResults() # write out summary tables for each comparison/contrast for test_group in list(results.Summary.keys()): outf = IOTools.openFile( "_".join([outfile_prefix, test_group, "summary.tsv"]), "w") outf.write("category\tcounts\n%s\n" % results.Summary[test_group].asTable()) outf.close() if options.Rhistory: RH.saveHistory(options.Rhistory) if options.Rimage: RH.saveImage(options.Rimage) E.Stop()