def test_repository(self):
    """Check that a StageRepository can be built from ``self.config``."""
    repository = StageRepository(self.config)
    # assertIsInstance reports the offending type when it fails, whereas
    # assertTrue(isinstance(...)) can only say "False is not true".
    self.assertIsInstance(repository, StageRepository)
def setUp(self):
    """Load the stage's YAML test config and create the output directory.

    Populates ``config_file``, ``config``, ``repository`` and ``out_dir``
    on the test instance, then creates ``out_dir`` with ``safe_makedir``.
    """
    yaml_name = "".join(["test_", STAGENAME, ".yaml"])
    self.config_file = os.path.join("test", STAGENAME, yaml_name)

    with open(self.config_file) as config_handle:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # fine for a trusted test fixture, but confirm before reusing this
        # pattern on external input.
        parsed = yaml.load(config_handle)
    self.config = parsed

    self.repository = StageRepository(self.config)

    results_root = self.config["dir"]["results"]
    self.out_dir = os.path.join(results_root, "test", STAGENAME)
    safe_makedir(self.out_dir)
def test_custom_plugins(self):
    """Check that stages are picked up from a custom plugin directory.

    A plugin file is obtained from ``self._make_test_class()``; the
    directory containing it is handed to ``StageRepository`` as
    ``dir.plugins``.  The ``"test_plugin"`` stage is then looked up,
    instantiated with ``self.config`` and called with ``"Test"``, which it
    is expected to return unchanged.
    """
    # Hold on to the returned object for the duration of the test; only its
    # ``.name`` is used here (presumably a temporary file -- confirm against
    # ``_make_test_class``).
    plugin_file = self._make_test_class()
    plugin_dir = os.path.dirname(plugin_file.name)

    repo = StageRepository({"dir": {"plugins": plugin_dir}})
    testplugin = repo["test_plugin"](self.config)

    # assertEqual shows both values on failure, unlike assertTrue(a == b).
    self.assertEqual(testplugin("Test"), "Test")
def main(config_file):
    """Run the RNA-seq alignment pipeline described by a YAML config file.

    The config must provide ``dir`` (a mapping of directories; ``data`` and
    ``results`` are read here) and ``run`` (the ordered list of stage names
    to execute).  If ``test_pipeline`` is truthy, every input file is first
    passed through ``make_test`` and results go to a ``test_pipeline``
    sub-directory of the results directory.

    Stages recognised in ``run``: ``fastqc``, ``cutadapt``, ``tophat``,
    ``disambiguate``, ``htseq-count``, ``rnaseq_metrics`` and ``rseqc``.
    Any other name is ignored.  ``htseq-count`` and ``rseqc`` consume the
    outputs of ``tophat`` and therefore have to be listed after it.

    The cluster is stopped with ``stop_cluster()`` when the function exits,
    whether the stages succeeded or raised.

    Args:
        config_file: path to the YAML configuration file.

    Raises:
        ValueError: if ``htseq-count`` or ``rseqc`` is scheduled before
            ``tophat`` has produced its outputs.
    """
    try:
        with open(config_file) as in_handle:
            # NOTE(review): yaml.load can construct arbitrary Python
            # objects; consider yaml.safe_load if config files may come
            # from an untrusted source.
            config = yaml.load(in_handle)

        # make the needed directories (explicit loop: map() is lazy on
        # Python 3 and would silently create nothing)
        for directory in config["dir"].values():
            safe_makedir(directory)

        # specific for project
        input_dir = config["dir"]["data"]
        logger.info("Loading files from %s", input_dir)
        input_files = list(locate("*.fq", input_dir))
        input_files += list(locate("*.fastq", input_dir))
        logger.info("Input files: %s", input_files)

        results_dir = config["dir"]["results"]
        safe_makedir(results_dir)

        # make the stage repository
        repository = StageRepository(config)
        logger.info("Stages found: %s", repository.plugins)

        if config.get("test_pipeline", False):
            logger.info("Running a test pipeline on a subset of the reads.")
            results_dir = os.path.join(results_dir, "test_pipeline")
            config["dir"]["results"] = results_dir
            safe_makedir(results_dir)
            curr_files = [make_test(in_file, config)
                          for in_file in input_files]
            logger.info("Converted %s to %s. ", input_files, curr_files)
        else:
            curr_files = input_files
            logger.info("Running RNASeq alignment pipeline on %s.",
                        curr_files)

        # Products of the tophat stage that later stages depend on.  They
        # stay None until tophat has run so that a mis-ordered ``run`` list
        # fails with a clear message instead of an UnboundLocalError.
        tophat_outputs = None
        bamfiles = None
        final_bamfiles = None

        for stage in config["run"]:
            if stage == "fastqc":
                logger.info("Running fastqc on %s.", curr_files)
                stage_runner = FastQC(config)
                view.map(stage_runner, curr_files)

            elif stage == "cutadapt":
                curr_files = combine_pairs(curr_files)
                logger.info("Running cutadapt on %s.", curr_files)
                stage_runner = Cutadapt(config)
                curr_files = view.map(stage_runner, curr_files)

            elif stage == "tophat":
                logger.info("Running Tophat on %s.", curr_files)
                tophat = Tophat(config)
                tophat_outputs = view.map(tophat, curr_files)
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs

            elif stage == "disambiguate":
                logger.info("Disambiguating %s.", curr_files)
                disambiguate = repository[stage](config)
                view.map(disambiguate, curr_files)

            elif stage == "htseq-count":
                if bamfiles is None:
                    raise ValueError(
                        'The "htseq-count" stage needs the BAM files made '
                        'by the "tophat" stage, which has not run yet.')
                logger.info("Running htseq-count on %s.", bamfiles)
                name_sorted = view.map(sam.bam_name_sort, bamfiles)
                curr_files = view.map(sam.bam2sam, name_sorted)
                # Transpose (file, config, stage) triples into three
                # parallel argument sequences for view.map.
                htseq_args = list(
                    zip(*product(curr_files, [config], [stage])))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_count.combine_counts(htseq_outputs)

            elif stage == "rnaseq_metrics":
                logger.info("Calculating RNASeq metrics on %s.", curr_files)
                coverage = RNASeqMetrics(config)
                view.map(coverage, curr_files)

            elif stage == "rseqc":
                if final_bamfiles is None:
                    raise ValueError(
                        'The "rseqc" stage needs the sorted BAM files made '
                        'by the "tophat" stage, which has not run yet.')
                logger.info("Running rseqc on %s.", curr_files)
                # Every rseqc step runs on the sorted BAM files paired
                # with the config; materialised so it can be reused.
                rseq_args = list(zip(*product(final_bamfiles, [config])))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_count_out = view.map(rseqc.RPKM_count, *rseq_args)
                # The fixed count files are written for later use;
                # annotating them with biomart is currently disabled.
                view.map(rseqc.fix_RPKM_count_file, RPKM_count_out)
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs
    finally:
        # end gracefully, even when a stage raised
        stop_cluster()
def main(config_file):
    """Run quantitation stages on existing SAM files from a YAML config.

    The config must provide ``dir`` (a mapping of directories; ``results``
    is read here), ``input_dir`` and ``run`` (the ordered list of stage
    names to execute).  Input files are the ``*.sam`` files found under
    ``<input_dir>/tophat_control`` and ``<input_dir>/tophat_exposed``,
    excluding any path containing ``accepted`` or ``innerdist_estimate``.

    Stages recognised in ``run``: ``htseq-count``, ``rnaseq_metrics`` and
    ``rseqc``.  Any other name is ignored.

    The cluster is stopped with ``stop_cluster()`` when the function exits,
    whether the stages succeeded or raised.

    Args:
        config_file: path to the YAML configuration file.
    """
    try:
        with open(config_file) as in_handle:
            # NOTE(review): yaml.load can construct arbitrary Python
            # objects; consider yaml.safe_load if config files may come
            # from an untrusted source.
            config = yaml.load(in_handle)

        # make the needed directories (explicit loop: map() is lazy on
        # Python 3 and would silently create nothing)
        for directory in config["dir"].values():
            safe_makedir(directory)

        # specific for project
        input_dir = config["input_dir"]
        logger.info("Loading files from %s", input_dir)
        input_files = list(
            locate("*.sam", os.path.join(input_dir, "tophat_control")))
        input_files += list(
            locate("*.sam", os.path.join(input_dir, "tophat_exposed")))
        # Unfiltered listing on stdout; the parenthesised form prints the
        # same text under both Python 2 and Python 3.
        print(input_files)
        input_files = [
            sam_file for sam_file in input_files
            if "accepted" not in sam_file
            and "innerdist_estimate" not in sam_file
        ]
        logger.info("Input files: %s", input_files)

        results_dir = config["dir"]["results"]
        safe_makedir(results_dir)

        # make the stage repository
        repository = StageRepository(config)
        logger.info("Stages found: %s", repository.plugins)

        curr_files = input_files
        logger.info("Running quantitation on %s.", curr_files)

        for stage in config["run"]:
            if stage == "htseq-count":
                logger.info("Running htseq-count on %s.", curr_files)
                # Transpose (file, config, stage) triples into three
                # parallel argument sequences for view.map.
                htseq_args = list(
                    zip(*product(curr_files, [config], [stage])))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_count.combine_counts(htseq_outputs)

            elif stage == "rnaseq_metrics":
                logger.info("Calculating RNASeq metrics on %s.", curr_files)
                curr_files = view.map(sam.bam2sam, curr_files)
                coverage = RNASeqMetrics(config)
                view.map(coverage, curr_files)

            elif stage == "rseqc":
                logger.info("Running rseqc on %s.", curr_files)
                # Materialised so the same arguments can be reused by
                # every rseqc step below.
                rseq_args = list(zip(*product(curr_files, [config])))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                # This step used to read ``final_bamfiles``, which is never
                # defined in this function; it now runs on the same
                # (file, config) arguments as the other rseqc steps.
                RPKM_count_out = view.map(rseqc.RPKM_count, *rseq_args)
                # The fixed count files are written for later use;
                # annotating them with biomart is currently disabled.
                view.map(rseqc.fix_RPKM_count_file, RPKM_count_out)
                view.map(rseqc.RPKM_saturation, *rseq_args)
    finally:
        # end gracefully, even when a stage raised
        stop_cluster()