Esempio n. 1
0
    def test_repository(self):
        """
        Check that a StageRepository can be built from the test
        configuration held in ``self.config``.

        """
        repository = StageRepository(self.config)
        # assertIsInstance reports the actual object and the expected type
        # when it fails, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(repository, StageRepository)
Esempio n. 2
0
 def setUp(self):
     """
     Load the YAML test configuration for STAGENAME and build the
     fixtures shared by the tests: the stage repository and the output
     directory (created if needed).

     """
     yaml_name = "test_" + STAGENAME + ".yaml"
     self.config_file = os.path.join("test", STAGENAME, yaml_name)
     with open(self.config_file) as handle:
         # NOTE(review): yaml.load can construct arbitrary Python objects;
         # presumably fine for a project-owned test file -- confirm.
         self.config = yaml.load(handle)
     self.repository = StageRepository(self.config)
     # test outputs live under <results>/test/<STAGENAME>
     results_root = self.config["dir"]["results"]
     self.out_dir = os.path.join(results_root, "test", STAGENAME)
     safe_makedir(self.out_dir)
Esempio n. 3
0
    def test_custom_plugins(self):
        """
        Check that stages are picked up from a custom plugin directory.

        A plugin file is obtained from ``self._make_test_class()`` and its
        directory is handed to a new StageRepository as the "plugins"
        directory.  The "test_plugin" stage looked up in that repository
        is instantiated with ``self.config`` and, when called with
        "Test", must return "Test".

        """
        plugin_file = self._make_test_class()
        plugin_dir = os.path.dirname(plugin_file.name)
        repo = StageRepository({"dir": {"plugins": plugin_dir}})
        testplugin = repo["test_plugin"](self.config)
        # assertEqual shows both values on failure instead of
        # "False is not true".
        self.assertEqual(testplugin("Test"), "Test")
Esempio n. 4
0
def main(config_file):
    """
    Run the RNA-seq pipeline described by a YAML configuration file.

    The configuration is read from ``config_file``.  Every directory
    listed under ``config["dir"]`` is created, the ``*.fq`` and
    ``*.fastq`` files found under ``config["dir"]["data"]`` are used as
    input, and the stages named in ``config["run"]`` are executed in the
    order given, dispatching work through the externally defined
    ``view``.  When ``config["test_pipeline"]`` is true, each input is
    first passed through ``make_test`` and results go to a
    "test_pipeline" sub-directory.  ``stop_cluster`` is called after the
    last stage.

    The "htseq-count" and "rseqc" stages consume the outputs of the
    "tophat" stage, so "tophat" has to appear before them in
    ``config["run"]``.

    Args:
        config_file: path of the YAML configuration file.

    Raises:
        ValueError: if "htseq-count" or "rseqc" is scheduled before the
            "tophat" stage has produced its outputs.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # if the configuration may come from an untrusted source, switch
        # to yaml.safe_load.
        config = yaml.load(in_handle)

    # make the needed directories (explicit loop: map() is lazy on
    # Python 3 and would silently create nothing)
    for directory in config["dir"].values():
        safe_makedir(directory)

    # specific for project
    input_dir = config["dir"]["data"]
    logger.info("Loading files from %s", input_dir)
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s", input_files)

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s", repository.plugins)

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        curr_files = [make_test(in_file, config) for in_file in input_files]
        logger.info("Converted %s to %s. ", input_files, curr_files)
    else:
        curr_files = input_files
        logger.info("Running RNASeq alignment pipeline on %s.", curr_files)

    # Outputs of the "tophat" stage.  They stay None until that stage has
    # run so that dependent stages can fail with a clear message instead
    # of an UnboundLocalError.
    tophat_outputs = None
    bamfiles = None
    final_bamfiles = None
    needs_tophat = ("Stage '%s' needs the outputs of the 'tophat' stage, "
                    "which has not run yet.")

    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s.", curr_files)
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        elif stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s.", curr_files)
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        elif stage == "tophat":
            logger.info("Running Tophat on %s.", curr_files)
            tophat = Tophat(config)
            tophat_outputs = view.map(tophat, curr_files)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        elif stage == "disambiguate":
            logger.info("Disambiguating %s.", curr_files)
            disambiguate = repository[stage](config)
            view.map(disambiguate, curr_files)

        elif stage == "htseq-count":
            if bamfiles is None:
                raise ValueError(needs_tophat % stage)
            logger.info("Running htseq-count on %s.", bamfiles)
            name_sorted = view.map(sam.bam_name_sort, bamfiles)
            curr_files = view.map(sam.bam2sam, name_sorted)
            # one argument column per parameter of run_with_config
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        elif stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s.", curr_files)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        elif stage == "rseqc":
            if final_bamfiles is None:
                raise ValueError(needs_tophat % stage)
            # the rseqc tools run on the sorted BAM files, so log those
            logger.info("Running rseqc on %s.", final_bamfiles)
            # materialised so the argument columns can be unpacked for
            # every call below (zip() is a one-shot iterator on Python 3)
            rseq_args = list(zip(*product(final_bamfiles, [config])))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            rpkm_count_out = view.map(rseqc.RPKM_count, *rseq_args)
            # result unused: the biomart annotation step that consumed the
            # fixed RPKM tables is currently disabled
            view.map(rseqc.fix_RPKM_count_file, rpkm_count_out)
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs

    # end gracefully
    stop_cluster()
Esempio n. 5
0
def main(config_file):
    """
    Run quantitation stages on existing SAM files.

    The configuration is read from ``config_file``.  Every directory
    listed under ``config["dir"]`` is created, and the ``*.sam`` files
    found under the "tophat_control" and "tophat_exposed" sub-directories
    of ``config["input_dir"]`` are used as input, skipping any path that
    contains "accepted" or "innerdist_estimate".  The stages named in
    ``config["run"]`` ("htseq-count", "rnaseq_metrics", "rseqc") are then
    executed in the order given, dispatching work through the externally
    defined ``view``.  ``stop_cluster`` is called after the last stage.

    Args:
        config_file: path of the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load can construct arbitrary Python objects;
        # if the configuration may come from an untrusted source, switch
        # to yaml.safe_load.
        config = yaml.load(in_handle)

    # make the needed directories (explicit loop: map() is lazy on
    # Python 3 and would silently create nothing)
    for directory in config["dir"].values():
        safe_makedir(directory)

    # specific for project
    input_dir = config["input_dir"]
    logger.info("Loading files from %s", input_dir)
    input_files = []
    for subdir in ("tophat_control", "tophat_exposed"):
        input_files += list(locate("*.sam", os.path.join(input_dir, subdir)))
    # single-argument call form: prints the same on Python 2 and is also
    # valid syntax on Python 3
    print(input_files)
    input_files = [sam_file for sam_file in input_files
                   if "accepted" not in sam_file
                   and "innerdist_estimate" not in sam_file]
    logger.info("Input files: %s", input_files)

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s", repository.plugins)

    curr_files = input_files
    logger.info("Running quantitation on %s.", curr_files)

    for stage in config["run"]:

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s.", curr_files)
            # one argument column per parameter of run_with_config
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        elif stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s.", curr_files)
            curr_files = view.map(sam.bam2sam, curr_files)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        elif stage == "rseqc":
            logger.info("Running rseqc on %s.", curr_files)
            # materialised so the argument columns can be unpacked for
            # every call below (zip() is a one-shot iterator on Python 3)
            rseq_args = list(zip(*product(curr_files, [config])))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            # RPKM_count previously took its inputs from `final_bamfiles`,
            # a name never assigned in this function; it now runs on the
            # same files as the other rseqc steps.
            # NOTE(review): confirm curr_files is the intended input here.
            rpkm_count_out = view.map(rseqc.RPKM_count, *rseq_args)
            # result unused: the biomart annotation step that consumed the
            # fixed RPKM tables is currently disabled
            view.map(rseqc.fix_RPKM_count_file, rpkm_count_out)
            view.map(rseqc.RPKM_saturation, *rseq_args)

    # end gracefully
    stop_cluster()