def setUp(self):
    self.counts = Counts.Counts(
        pandas.DataFrame({'sample1': [0, 1, 2],
                          'sample2': [2, 4, 3]}))
def makeExpressionSummaryPlots(counts_inf, design_inf, logfile):
    '''use the plotting methods of the Counts object to make summary plots'''

    with IOTools.openFile(logfile, "w") as log:

        plot_prefix = P.snip(logfile, ".log")

        # need to manually read in data as index column is not the first column
        counts = Counts.Counts(pd.read_table(counts_inf, sep="\t"))
        counts.table.set_index(["transcript_id"], inplace=True)

        design = Expression.ExperimentalDesign(design_inf)

        # make certain counts table only includes samples in design
        counts.restrict(design)

        cor_outfile = plot_prefix + "_pairwise_correlations.png"
        pca_var_outfile = plot_prefix + "_pca_variance.png"
        pca1_outfile = plot_prefix + "_pc1_pc2.png"
        pca2_outfile = plot_prefix + "_pc3_pc4.png"
        heatmap_outfile = plot_prefix + "_heatmap.png"

        counts_log10 = counts.log(base=10, pseudocount=0.1, inplace=False)

        # keep the 500 most highly expressed transcripts for the heatmap
        counts_highExp = counts_log10.clone()
        counts_highExp.table['order'] = counts_highExp.table.apply(
            np.mean, axis=1)
        counts_highExp.table.sort_values("order", ascending=False,
                                         inplace=True)
        counts_highExp.table = counts_highExp.table.iloc[0:500, :]
        counts_highExp.table.drop("order", axis=1, inplace=True)

        log.write("plot correlations: %s\n" % cor_outfile)
        counts_log10.plotPairwiseCorrelations(cor_outfile, subset=1000)

        log.write("plot pc1,pc2: %s\n" % pca1_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca1_outfile,
                             x_axis="PC1", y_axis="PC2",
                             colour="group", shape="group")

        log.write("plot pc3,pc4: %s\n" % pca2_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca2_outfile,
                             x_axis="PC3", y_axis="PC4",
                             colour="group", shape="group")

        log.write("plot heatmap: %s\n" % heatmap_outfile)
        counts_highExp.heatmap(heatmap_outfile)
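# A minimal usage sketch (hypothetical file names; assumes the CGAT-style
# Counts, Expression, IOTools and pipeline (P) modules used above are
# importable in this module):
#
#   makeExpressionSummaryPlots("counts.tsv.gz", "design.tsv",
#                              "expression_summary.log")
#
# With that logfile, the plots would be written as
# expression_summary_pairwise_correlations.png,
# expression_summary_pca_variance.png, expression_summary_pc1_pc2.png,
# expression_summary_pc3_pc4.png and expression_summary_heatmap.png.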
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option("--result-tsv-file", dest="input_filename_result",
                      type="string",
                      help="input file with results (for plotdetagstats) "
                      "[default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("sleuth", "edger", "deseq2", "mock"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method", type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode", dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option("--edger-dispersion", dest="edger_dispersion",
                      type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-R", "--output-R-code", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row", type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample", type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums", type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model", dest="model", type="string",
                      help="model for GLM")

    parser.add_option("--contrasts", dest="contrasts", action="append",
                      help="contrasts for post-hoc testing written as a "
                      "comma-separated list `condition,replicate` etc")

    parser.add_option("--deseq2-fit-type", dest="deseq2_fit_type",
                      type="string",
                      help="fit type used for observed dispersion mean "
                      "relationship in deseq2")

    parser.add_option("--sleuth-counts-dir", dest="sleuth_counts_dir",
                      type="string",
                      help="directory containing counts for sleuth. Sleuth "
                      "expects counts files to be called abundance.h5")

    parser.add_option("--outfile-sleuth-count", dest="outfile_sleuth_count",
                      type="string",
                      help="outfile for full count table generated by sleuth")

    parser.add_option("--outfile-sleuth-tpm", dest="outfile_sleuth_tpm",
                      type="string",
                      help="outfile for full tpm table generated by sleuth")

    parser.add_option("--use-ihw", dest="use_ihw", action="store_true",
                      help="use the independent hypothesis weighting method "
                      "to obtain weighted FDR")

    parser.add_option("--sleuth-genewise", dest="sleuth_genewise",
                      action="store_true",
                      help="run genewise, rather than transcript-level, "
                      "testing")

    parser.add_option("--gene-biomart", dest="gene_biomart", type="string",
                      help="name of Ensembl gene biomart")

    parser.set_defaults(
        input_filename_tags="-",
        input_filename_result=None,
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq2",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        edger_dispersion=0.4,
        ref_group=False,
        save_r_environment=None,
        filter_min_counts_per_row=None,
        filter_min_counts_per_sample=None,
        filter_percentile_rowsums=None,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
        model=None,
        contrasts=None,
        output_filename_pattern=None,
        deseq2_fit_type="parametric",
        sleuth_counts_dir=None,
        outfile_sleuth_count=None,
        outfile_sleuth_tpm=None,
        use_ihw=False,
        sleuth_genewise=False,
        gene_biomart=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    outfile_prefix = options.output_filename_pattern + "_" + options.method

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":
        assert options.sleuth_counts_dir, (
            "need to specify the location of the abundance.h5 counts files")

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                        sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(model=options.model)

        experiment = Expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrasts=options.contrasts,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart)

    else:
        # create Counts object
        if options.input_filename_tags == "-":
            counts = Counts.Counts(pd.io.parsers.read_csv(
                sys.stdin, sep="\t", index_col=0, comment="#"))
        else:
            counts = Counts.Counts(pd.io.parsers.read_csv(
                IOTools.openFile(options.input_filename_tags, "r"),
                sep="\t", index_col=0, comment="#"))

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                        sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove samples with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = Expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = Expression.DEExperiment_edgeR()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     dispersion=options.edger_dispersion,
                                     ref_group=options.ref_group,
                                     contrasts=options.contrasts,
                                     outfile_prefix=outfile_prefix)

        elif options.method == "deseq2":
            experiment = Expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrasts=options.contrasts,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group)

    results.getResults(fdr=options.fdr)

    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix)
        results.plotMA(contrast, outfile_prefix=outfile_prefix)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in results.Summary.keys():
        outf = IOTools.openFile("_".join(
            [outfile_prefix, test_group, "summary.tsv"]), "w")
        outf.write("category\tcounts\n%s\n" %
                   results.Summary[test_group].asTable())
        outf.close()

    E.Stop()
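# Example invocation (a sketch only; the script name, file names, model
# formula and contrast value are illustrative, and --output-filename-pattern
# is assumed to be the option added by E.Start(..., add_output_options=True)):
#
#   python differential_expression.py \
#       --tags-tsv-file=counts.tsv \
#       --design-tsv-file=design.tsv \
#       --method=deseq2 --model="~group" --contrasts=group \
#       --fdr=0.05 --output-filename-pattern=deseq2 \
#       > deseq2_results.tsv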
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tag-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("ttest", "sleuth", "edger", "deseq2", "mock",
                               "dexseq"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq2-dispersion-method",
                      dest="deseq2_dispersion_method", type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq2 [default=%default].")

    parser.add_option("--deseq2-fit-type", dest="deseq2_fit_type",
                      type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq2 [default=%default].")

    parser.add_option("--edger-dispersion", dest="edger_dispersion",
                      type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    # currently not implemented
    # parser.add_option("-R", "--output-R-code", dest="save_r_environment",
    #                   type="string",
    #                   help="save R environment to loc [default=%default]")

    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=%default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row", type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample", type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums", type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model", dest="model", type="string",
                      help="model for GLM")

    parser.add_option("--reduced-model", dest="reduced_model", type="string",
                      help="reduced model for LRT")

    parser.add_option("--contrast", dest="contrast", type="string",
                      help="contrast for differential expression testing")

    parser.add_option("--sleuth-counts-dir", dest="sleuth_counts_dir",
                      type="string",
                      help="directory containing expression estimates "
                      "from sleuth. Sleuth expects counts "
                      "files to be called abundance.h5")

    parser.add_option("--dexseq-counts-dir", dest="dexseq_counts_dir",
                      type="string",
                      help="directory containing counts for dexseq. DEXSeq "
                      "expects counts files to be called .txt and "
                      "to be generated by the DEXSeq_counts.py script")

    parser.add_option("--dexseq-flattened-file", dest="dexseq_flattened_file",
                      type="string",
                      help="flattened gtf for dexseq. DEXSeq "
                      "expects this to be generated by the "
                      "DEXSeq_prepare_annotations.py script")

    parser.add_option("--outfile-sleuth-count", dest="outfile_sleuth_count",
                      type="string",
                      help="outfile for full count table generated by sleuth")

    parser.add_option("--outfile-sleuth-tpm", dest="outfile_sleuth_tpm",
                      type="string",
                      help="outfile for full tpm table generated by sleuth")

    parser.add_option("--use-ihw", dest="use_ihw", action="store_true",
                      help="use the independent hypothesis weighting method "
                      "to obtain weighted FDR")

    parser.add_option("--sleuth-genewise", dest="sleuth_genewise",
                      action="store_true",
                      help="run genewise, rather than transcript-level, "
                      "testing")

    parser.add_option("--gene-biomart", dest="gene_biomart", type="string",
                      help="name of Ensembl gene biomart")

    parser.add_option("--de-test", dest="DEtest", type="choice",
                      choices=("wald", "lrt"),
                      help="Differential expression test")

    parser.add_option("--Rhistory", dest="Rhistory", type="string",
                      help="Outfile for R history")

    parser.add_option("--Rimage", dest="Rimage", type="string",
                      help="Outfile for R image")

    parser.set_defaults(
        input_filename_tags="-",
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq2",
        fdr=0.1,
        deseq2_dispersion_method="pooled",
        deseq2_fit_type="parametric",
        edger_dispersion=0.4,
        ref_group=False,
        filter_min_counts_per_row=None,
        filter_min_counts_per_sample=None,
        filter_percentile_rowsums=None,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
        model=None,
        contrast=None,
        output_filename_pattern=None,
        sleuth_counts_dir=None,
        dexseq_counts_dir=None,
        dexseq_flattened_file=None,
        outfile_sleuth_count=None,
        outfile_sleuth_tpm=None,
        use_ihw=False,
        sleuth_genewise=False,
        gene_biomart=None,
        DEtest="wald",
        reduced_model=None,
        Rhistory=None,
        Rimage=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    RH = None
    if options.Rhistory or options.Rimage:
        RH = R.R_with_History()

    outfile_prefix = options.output_filename_pattern

    # Expression.py currently expects a reference group for edgeR and
    # sleuth, regardless of which test is used
    if not options.ref_group and options.method in ("edger", "sleuth"):
        raise ValueError(
            "Must provide a reference group ('--reference-group')")

    # create Design object
    design = Expression.ExperimentalDesign(
        pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                    sep="\t", index_col=0, comment="#"))

    if len(set(design.table[options.contrast])) > 2:
        if options.method == "deseq2" or options.method == "sleuth":
            if options.DEtest == "wald":
                raise ValueError(
                    "Factor must have exactly two levels for Wald Test. "
                    "If you have more than two levels in your factor, "
                    "consider LRT")
        else:
            E.info('''There are more than 2 levels for the contrast
            specified (%s:%s). The log2 fold changes in the results table
            and MA plots will be for the first two levels in the
            contrast. The p-value will be the p-value for the overall
            significance of the contrast. Hence, some genes will have a
            significant p-value but 0-fold change between the first two
            levels''' % (options.contrast,
                         set(design.table[options.contrast])))

    # Sleuth reads in data itself so we don't need to create a counts object
    if options.method == "sleuth":
        assert options.sleuth_counts_dir, (
            "need to specify the location of the abundance.h5 counts files "
            "(--sleuth-counts-dir)")

        # validate design against counts and model
        design.validate(model=options.model)

        experiment = Expression.DEExperiment_Sleuth()
        results = experiment.run(design,
                                 base_dir=options.sleuth_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 outfile_prefix=outfile_prefix,
                                 counts=options.outfile_sleuth_count,
                                 tpm=options.outfile_sleuth_tpm,
                                 fdr=options.fdr,
                                 genewise=options.sleuth_genewise,
                                 gene_biomart=options.gene_biomart,
                                 DE_test=options.DEtest,
                                 ref_group=options.ref_group,
                                 reduced_model=options.reduced_model)

    # DEXSeq reads in data itself
    elif options.method == "dexseq":
        assert options.dexseq_counts_dir, (
            "need to specify the location of the .txt counts files")

        # create Design object
        design = Expression.ExperimentalDesign(
            pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                        sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        # design.validate(model=options.model)

        experiment = Expression.DEExperiment_DEXSeq()
        results = experiment.run(design,
                                 base_dir=options.dexseq_counts_dir,
                                 model=options.model,
                                 contrast=options.contrast,
                                 ref_group=options.ref_group,
                                 outfile_prefix=outfile_prefix,
                                 flattenedfile=options.dexseq_flattened_file,
                                 fdr=options.fdr)

    else:
        # create Counts object
        if options.input_filename_tags == "-":
            counts = Counts.Counts(pd.io.parsers.read_csv(
                sys.stdin, sep="\t", index_col=0, comment="#"))
        else:
            counts = Counts.Counts(pd.io.parsers.read_csv(
                IOTools.openFile(options.input_filename_tags, "r"),
                sep="\t", index_col=0, comment="#"))

        # validate design against counts and model
        design.validate(counts, options.model)

        # restrict counts to samples in design table
        counts.restrict(design)

        # remove samples with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        # check samples are the same in counts and design following counts
        # filtering and, if not, restrict design table and re-validate
        design.revalidate(counts, options.model)

        # set up experiment and run tests
        if options.method == "ttest":
            experiment = Expression.DEExperiment_TTest()
            results = experiment.run(counts, design)

        elif options.method == "edger":
            experiment = Expression.DEExperiment_edgeR()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     ref_group=options.ref_group,
                                     fdr=options.fdr,
                                     dispersion=options.edger_dispersion)

        elif options.method == "deseq2":
            experiment = Expression.DEExperiment_DESeq2()
            results = experiment.run(counts,
                                     design,
                                     model=options.model,
                                     contrast=options.contrast,
                                     outfile_prefix=outfile_prefix,
                                     fdr=options.fdr,
                                     fit_type=options.deseq2_fit_type,
                                     ref_group=options.ref_group,
                                     DEtest=options.DEtest,
                                     R=RH)

    results.getResults(fdr=options.fdr)
    if options.use_ihw:
        results.calculateIHW(alpha=options.fdr)

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotMA(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueHist(contrast, outfile_prefix=outfile_prefix, R=RH)
        results.plotPvalueQQ(contrast, outfile_prefix=outfile_prefix, R=RH)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    results.summariseDEResults()

    # write out summary tables for each comparison/contrast
    for test_group in list(results.Summary.keys()):
        outf = IOTools.openFile(
            "_".join([outfile_prefix, test_group, "summary.tsv"]), "w")
        outf.write("category\tcounts\n%s\n" %
                   results.Summary[test_group].asTable())
        outf.close()

    if options.Rhistory:
        RH.saveHistory(options.Rhistory)
    if options.Rimage:
        RH.saveImage(options.Rimage)

    E.Stop()
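# Example invocation of the LRT path (a sketch only; the script name, file
# names, model formulae and contrast are illustrative, and
# --output-filename-pattern is assumed to be added by
# E.Start(..., add_output_options=True)):
#
#   python counts2table.py \
#       --tag-tsv-file=counts.tsv --design-tsv-file=design.tsv \
#       --method=deseq2 --de-test=lrt \
#       --model="~group+condition" --reduced-model="~group" \
#       --contrast=condition --fdr=0.05 \
#       --output-filename-pattern=deseq2_lrt \
#       --Rhistory=deseq2_lrt.Rhistory --Rimage=deseq2_lrt.RData \
#       > deseq2_lrt_results.tsv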
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("filter", "spike", "normalize"),
                      help="method to apply [default=%default].")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row", type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample", type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums", type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--spike-change-bin-min", dest="min_cbin", type="float",
                      help="minimum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-max", dest="max_cbin", type="float",
                      help="maximum bin for change bins [default=%default].")

    parser.add_option("--spike-change-bin-width", dest="width_cbin",
                      type="float",
                      help="bin width for change bins [default=%default].")

    parser.add_option("--spike-initial-bin-min", dest="min_ibin", type="float",
                      help="minimum bin for initial bins [default=%default].")

    parser.add_option("--spike-initial-bin-max", dest="max_ibin", type="float",
                      help="maximum bin for initial bins [default=%default].")

    parser.add_option("--spike-initial-bin-width", dest="width_ibin",
                      type="float",
                      help="bin width for initial bins [default=%default].")

    parser.add_option("--spike-minimum", dest="min_spike", type="int",
                      help="minimum number of spike-ins required within each "
                      "bin [default=%default].")

    parser.add_option("--spike-maximum", dest="max_spike", type="int",
                      help="maximum number of spike-ins allowed within each "
                      "bin [default=%default].")

    parser.add_option("--spike-difference-method", dest="difference",
                      type="choice",
                      choices=("relative", "logfold", "abs_logfold"),
                      help="method to use for calculating difference "
                      "[default=%default].")

    parser.add_option("--spike-iterations", dest="iterations", type="int",
                      help="number of iterations to generate spike-ins "
                      "[default=%default].")

    parser.add_option("--spike-cluster-maximum-distance",
                      dest="cluster_max_distance", type="int",
                      help="maximum distance between adjacent loci in a "
                      "cluster [default=%default].")

    parser.add_option("--spike-cluster-minimum-size", dest="cluster_min_size",
                      type="int",
                      help="minimum number of loci required per cluster "
                      "[default=%default].")

    parser.add_option("--spike-type", dest="spike_type", type="choice",
                      choices=("row", "cluster"),
                      help="spike-in type [default=%default].")

    parser.add_option("--spike-subcluster-min-size", dest="min_sbin",
                      type="int",
                      help="minimum size of subcluster [default=%default].")

    parser.add_option("--spike-subcluster-max-size", dest="max_sbin",
                      type="int",
                      help="maximum size of subcluster [default=%default].")

    parser.add_option("--spike-subcluster-bin-width", dest="width_sbin",
                      type="int",
                      help="bin width for subcluster size [default=%default].")

    parser.add_option("--spike-output-method", dest="output_method",
                      type="choice",
                      choices=("append", "seperate"),
                      help="defines whether the spike-ins should be appended "
                      "to the original table or output separately "
                      "[default=%default].")

    parser.add_option("--spike-shuffle-column-suffix", dest="shuffle_suffix",
                      type="string",
                      help="the suffix of the columns which are to be "
                      "shuffled [default=%default].")

    parser.add_option("--spike-keep-column-suffix", dest="keep_suffix",
                      type="string",
                      help="a list of suffixes for the columns which are to "
                      "be kept along with the shuffled columns "
                      "[default=%default].")

    parser.add_option("--normalization-method", dest="normalization_method",
                      type="choice",
                      choices=("deseq-size-factors", "total-count",
                               "total-column", "total-row"),
                      help="normalization method to apply [%default]")

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.set_defaults(
        input_filename_tags="-",
        method="filter",
        filter_min_counts_per_row=None,
        filter_min_counts_per_sample=None,
        filter_percentile_rowsums=None,
        output_method="seperate",
        difference="logfold",
        spike_type="row",
        min_cbin=0,
        max_cbin=100,
        width_cbin=100,
        min_ibin=0,
        max_ibin=100,
        width_ibin=100,
        max_spike=100,
        min_spike=None,
        iterations=1,
        cluster_max_distance=100,
        cluster_min_size=10,
        min_sbin=1,
        max_sbin=1,
        width_sbin=1,
        shuffle_suffix=None,
        keep_suffix=None,
        normalization_method="deseq-size-factors")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # load
    if options.keep_suffix:
        # if using suffix, loadTagDataPandas will throw an error as it
        # looks for column names which exactly match the design "tracks".
        # Need to write a function in Counts.py to handle a counts table
        # and a design table + suffix.
        counts = pd.read_csv(options.stdin, sep="\t", comment="#")
        inf = IOTools.openFile(options.input_filename_design)
        design = pd.read_csv(inf, sep="\t", index_col=0)
        inf.close()
        design = design[design["include"] != 0]

        if options.method in ("filter", "spike"):
            if options.input_filename_design is None:
                raise ValueError("method '%s' requires a design file" %
                                 options.method)
    else:
        # create Counts object
        # TS if spike type is cluster, need to keep "contig" and "position"
        # columns out of the index
        if options.spike_type == "cluster":
            index = None
        else:
            index = 0

        if options.input_filename_tags == "-":
            counts = Counts.Counts(pd.io.parsers.read_csv(
                options.stdin, sep="\t", index_col=index, comment="#"))
        else:
            counts = Counts.Counts(pd.io.parsers.read_csv(
                IOTools.openFile(options.input_filename_tags, "r"),
                sep="\t", index_col=index, comment="#"))

        # TS normalization doesn't require a design table
        if not options.method == "normalize":

            assert options.input_filename_design and os.path.exists(
                options.input_filename_design)

            # create Design object
            design = Expression.ExperimentalDesign(
                pd.read_csv(
                    IOTools.openFile(options.input_filename_design, "r"),
                    sep="\t", index_col=0, comment="#"))

    if options.method == "filter":

        assert (options.filter_min_counts_per_sample is not None or
                options.filter_min_counts_per_row is not None or
                options.filter_percentile_rowsums is not None), \
            "no filtering parameters have been supplied"

        # filter
        # remove samples with low counts
        if options.filter_min_counts_per_sample:
            counts.removeSamples(
                min_counts_per_sample=options.filter_min_counts_per_sample)

        # remove observations with low counts
        if options.filter_min_counts_per_row:
            counts.removeObservationsFreq(
                min_counts_per_row=options.filter_min_counts_per_row)

        # remove bottom percentile of observations
        if options.filter_percentile_rowsums:
            counts.removeObservationsPerc(
                percentile_rowsums=options.filter_percentile_rowsums)

        nobservations, nsamples = counts.table.shape

        if nobservations == 0:
            E.warn("no observations remaining after filtering - no output")
            return

        if nsamples == 0:
            E.warn("no samples remain after filtering - no output")
            return

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "normalize":

        counts.normalise(method=options.normalization_method,
                         row_title="total")

        # write out
        counts.table.to_csv(options.stdout, sep="\t", header=True)

    elif options.method == "spike":
        # check parameters are sensible and set parameters where they
        # are not explicitly set
        if not options.min_spike:
            E.info("setting minimum number of spikes per bin to equal "
                   "maximum number of spikes per bin (%s)" %
                   options.max_spike)
            options.min_spike = options.max_spike

        if options.spike_type == "cluster":

            assert options.max_sbin <= options.cluster_min_size, \
                ("max size of subcluster: %s is greater than min size of "
                 "cluster: %s" % (options.max_sbin, options.cluster_min_size))

            counts_columns = set(counts.table.columns.values.tolist())

            assert ("contig" in counts_columns and
                    "position" in counts_columns), \
                ("cluster analysis requires columns named 'contig' and "
                 "'position' in the dataframe")

            counts.sort(sort_columns=["contig", "position"],
                        reset_index=True)

        # restrict design table to first pair only
        design.firstPairOnly()

        # get dictionaries to map group members to column names;
        # use different methods depending on whether suffixes are supplied
        if options.keep_suffix:
            g_to_keep_tracks, g_to_spike_tracks = design.mapGroupsSuffix(
                options.shuffle_suffix, options.keep_suffix)
        else:
            # if no suffixes supplied, spike and keep tracks are the same
            g_to_track = design.getGroups2Samples()
            g_to_spike_tracks, g_to_keep_tracks = (g_to_track, g_to_track)

        # set up numpy arrays for change and initial values
        change_bins = np.arange(options.min_cbin, options.max_cbin,
                                options.width_cbin)
        initial_bins = np.arange(options.min_ibin, options.max_ibin,
                                 options.width_ibin)

        E.info("Column boundaries are: %s" % str(change_bins))
        E.info("Row boundaries are: %s" % str(initial_bins))

        # shuffle rows/clusters
        if options.spike_type == "cluster":
            E.info("looking for clusters...")
            clusters_dict = Counts.findClusters(
                counts_sort, options.cluster_max_distance,
                options.cluster_min_size, g_to_spike_tracks, groups)
            if len(clusters_dict) == 0:
                raise Exception("no clusters were found, check parameters")

            E.info("shuffling subcluster regions...")
            output_indices, counts = Counts.shuffleCluster(
                initial_bins, change_bins, g_to_spike_tracks, groups,
                options.difference, options.max_spike, options.iterations,
                clusters_dict, options.max_sbin, options.min_sbin,
                options.width_sbin)

        elif options.spike_type == "row":

            E.info("shuffling rows...")
            output_indices, bin_counts = counts.shuffleRows(
                options.min_cbin, options.max_cbin, options.width_cbin,
                options.min_ibin, options.max_ibin, options.width_ibin,
                g_to_spike_tracks, design.groups, options.difference,
                options.max_spike, options.iterations)

        filled_bins = Counts.thresholdBins(output_indices, bin_counts,
                                           options.min_spike)

        assert len(filled_bins) > 0, "No bins contained enough spike-ins"

        # write out
        counts.outputSpikes(
            filled_bins,
            g_to_keep_tracks, design.groups,
            output_method=options.output_method,
            spike_type=options.spike_type,
            min_cbin=options.min_cbin,
            width_cbin=options.width_cbin,
            max_cbin=options.max_cbin,
            min_ibin=options.min_ibin,
            width_ibin=options.width_ibin,
            max_ibin=options.max_ibin,
            min_sbin=options.min_sbin,
            width_sbin=options.width_sbin,
            max_sbin=options.max_sbin)

    E.Stop()
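# Example invocations (sketches only; the script name, file names and
# thresholds are illustrative):
#
#   # drop low-count rows and low-count samples
#   python counts2counts.py --method=filter \
#       --design-tsv-file=design.tsv \
#       --filter-min-counts-per-row=10 \
#       --filter-min-counts-per-sample=1000 \
#       < counts.tsv > filtered_counts.tsv
#
#   # generate row-wise spike-ins
#   python counts2counts.py --method=spike --spike-type=row \
#       --design-tsv-file=design.tsv \
#       --spike-minimum=50 --spike-maximum=100 \
#       < counts.tsv > spiked_counts.tsv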