Example #1
0
def main(config_file):
    """Run the pipeline stages listed in a YAML configuration file.

    The configuration must provide ``dir`` (a mapping whose values are
    directories to create, including the ``data`` and ``results`` entries)
    and ``run`` (the ordered list of stage names).  An optional truthy
    ``test_pipeline`` entry redirects results to a ``test_pipeline``
    sub-directory and runs ``make_test`` on every input file first.

    Input files are every ``*.fq`` and ``*.fastq`` found by ``locate`` under
    ``config["dir"]["data"]``.  Stage names that match none of the branches
    below are ignored.  The ``htseq-count`` and ``rseqc`` stages read
    variables that only the ``tophat`` stage assigns, so ``tophat`` has to
    appear earlier in ``config["run"]``.

    Args:
        config_file: path of the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects, and PyYAML >= 6 requires the Loader
        # argument.  Consider yaml.safe_load if configs are plain data.
        config = yaml.load(in_handle)

    # make the needed directories; an explicit loop is used because map()
    # is lazy on Python 3 and would never call safe_makedir
    for directory in config["dir"].values():
        safe_makedir(directory)

    # specific for project
    input_dir = config["dir"]["data"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s" % (repository.plugins))

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        # a real list, so it can be logged here and re-used by every stage
        curr_files = [make_test(in_file, config) for in_file in input_files]
        logger.info("Converted %s to %s. " % (input_files, curr_files))
    else:
        curr_files = input_files
        logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))

    # NOTE(review): `view` is not defined in this function; presumably a
    # module-level parallel view -- confirm against the rest of the file.
    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "tophat":
            logger.info("Running Tophat on %s." % (curr_files))
            #tophat = repository["tophat"](config)
            tophat = Tophat(config)
            tophat_outputs = view.map(tophat, curr_files)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = repository[stage](config)
            view.map(disambiguate, curr_files)

        if stage == "htseq-count":
            # uses `bamfiles`, which is only assigned by the tophat stage
            logger.info("Running htseq-count on %s." % (bamfiles))
            name_sorted = view.map(sam.bam_name_sort, bamfiles)
            curr_files = view.map(sam.bam2sam, name_sorted)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            #coverage = repository[stage](config)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            # Materialise the argument columns: they are unpacked once per
            # rseqc tool below, which would exhaust a lazy zip() iterator
            # after the first call on Python 3.
            rseq_args = list(zip(*product(final_bamfiles, [config])))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_count_out = view.map(rseqc.RPKM_count, *rseq_args)
            # Biomart annotation of the fixed RPKM tables is disabled, so
            # the result of this step is not kept.
            view.map(rseqc.fix_RPKM_count_file, RPKM_count_out)
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs

    # end gracefully
    stop_cluster()
Example #2
0
def main(config_file):
    """Run the pipeline stages listed in a YAML configuration file.

    The configuration must provide ``dir`` (a mapping whose values are
    directories to create, including the ``data`` and ``results`` entries)
    and ``run`` (the ordered list of stage names).  An optional truthy
    ``test_pipeline`` entry redirects results to a ``test_pipeline``
    sub-directory and runs ``make_test`` on every input file first.

    Input files are every ``*.fq`` and ``*.fastq`` found by ``locate`` under
    ``config["dir"]["data"]``.  Stage names that match none of the branches
    below are ignored.  The ``htseq-count`` and ``rseqc`` stages read
    variables that only the ``tophat`` stage assigns, so ``tophat`` has to
    appear earlier in ``config["run"]``.

    Args:
        config_file: path of the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects, and PyYAML >= 6 requires the Loader
        # argument.  Consider yaml.safe_load if configs are plain data.
        config = yaml.load(in_handle)

    # make the needed directories; an explicit loop is used because map()
    # is lazy on Python 3 and would never call safe_makedir
    for directory in config["dir"].values():
        safe_makedir(directory)

    # specific for project
    input_dir = config["dir"]["data"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s" % (repository.plugins))

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        # a real list, so it can be logged here and re-used by every stage
        curr_files = [make_test(in_file, config) for in_file in input_files]
        logger.info("Converted %s to %s. " % (input_files, curr_files))
    else:
        curr_files = input_files
        logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))

    # NOTE(review): `view` is not defined in this function; presumably a
    # module-level parallel view -- confirm against the rest of the file.
    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "tophat":
            logger.info("Running Tophat on %s." % (curr_files))
            #tophat = repository["tophat"](config)
            tophat = Tophat(config)
            tophat_outputs = view.map(tophat, curr_files)
            sortsam = view.map(sam.coordinate_sort_sam, tophat_outputs,
                               [config] * len(tophat_outputs))
            bamfiles = view.map(sam.sam2bam, sortsam)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = repository[stage](config)
            view.map(disambiguate, curr_files)

        if stage == "htseq-count":
            # uses `bamfiles`, which is only assigned by the tophat stage
            logger.info("Running htseq-count on %s." % (bamfiles))
            name_sorted = view.map(sam.bam_name_sort, bamfiles)
            curr_files = view.map(sam.bam2sam, name_sorted)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            #coverage = repository[stage](config)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            # Materialise the argument columns: they are unpacked once per
            # rseqc tool below, which would exhaust a lazy zip() iterator
            # after the first call on Python 3.
            rseq_args = list(zip(*product(final_bamfiles, [config])))
            view.map(rseqc.bam_stat, *rseq_args)
            # gene body coverage runs on BAM files passed through
            # sam.downsample_bam with the value 40000000
            down_args = zip(*product(final_bamfiles, [40000000]))
            down_bam = view.map(sam.downsample_bam, *down_args)
            view.map(rseqc.genebody_coverage, down_bam,
                     [config] * len(down_bam))
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_count_out = view.map(rseqc.RPKM_count, *rseq_args)
            # Biomart annotation of the fixed RPKM tables is disabled, so
            # the result of this step is not kept.
            view.map(rseqc.fix_RPKM_count_file, RPKM_count_out)
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs

    # end gracefully
    stop_cluster()
Example #3
0
def main(config_file):
    """Run the Bowtie-based pipeline stages listed in a YAML config file.

    The configuration must provide ``dir`` (a mapping whose values are
    directories to create, including a ``results`` entry), ``input_dir``
    (searched with ``locate`` for ``*.fq`` and ``*.fastq`` files) and
    ``run`` (the ordered list of stage names).  An optional truthy
    ``test_pipeline`` entry redirects results to a ``test_pipeline``
    sub-directory and runs ``make_test`` on every input file first.

    Stage names that match none of the branches below are ignored.  The
    ``rseqc`` stage works on the sorted BAM files produced by the
    ``bowtie`` stage, so ``bowtie`` has to appear earlier in
    ``config["run"]``.

    Args:
        config_file: path of the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects, and PyYAML >= 6 requires the Loader
        # argument.  Consider yaml.safe_load if configs are plain data.
        config = yaml.load(in_handle)

    # make the needed directories; an explicit loop is used because map()
    # is lazy on Python 3 and would never call safe_makedir
    for directory in config["dir"].values():
        safe_makedir(directory)

    # specific for project
    input_dir = config["input_dir"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        # a real list, so it can be logged here and re-used by every stage
        curr_files = [make_test(in_file, config) for in_file in input_files]
        logger.info("Converted %s to %s. " % (input_files, curr_files))
    else:
        curr_files = input_files
        logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))

    # NOTE(review): `view` is not defined in this function; presumably a
    # module-level parallel view -- confirm against the rest of the file.
    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)
            logger.info("Output of cutadapt: %s." % (curr_files))

        if stage == "bowtie":
            logger.info("Running Bowtie on %s." % (curr_files))
            bowtie = Bowtie(config)
            bowtie_outputs = view.map(bowtie, curr_files)
            bamfiles = view.map(sam.sam2bam, bowtie_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            # the rseqc stage reads final_bamfiles; nothing in this function
            # assigned it before, so rseqc always failed with a NameError
            final_bamfiles = bamsort

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (curr_files))
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            # combine with the same module that produced the counts
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            # Materialise the argument columns: they are unpacked once per
            # rseqc tool below, which would exhaust a lazy zip() iterator
            # after the first call on Python 3.
            rseq_args = list(zip(*product(final_bamfiles, [config])))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_count_out = view.map(rseqc.RPKM_count, *rseq_args)
            # Biomart annotation of the fixed RPKM tables is disabled, so
            # the result of this step is not kept.
            view.map(rseqc.fix_RPKM_count_file, RPKM_count_out)
            view.map(rseqc.RPKM_saturation, *rseq_args)
            # curr_files is deliberately left untouched: this stage never
            # rebinds it, and the former reset to `tophat_outputs` referred
            # to a name this function never defines.

    # end gracefully
    stop_cluster()
Example #4
0
def main(config, view):

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for project
    input_dir = config["dir"]["data"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s" % (repository.plugins))

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        curr_files = map(make_test, input_files, [config] * len(input_files))
        logger.info("Converted %s to %s. " % (input_files, curr_files))
    else:
        curr_files = input_files
        logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "tophat":
            logger.info("Running Tophat on %s." % (curr_files))
            #tophat = repository["tophat"](config)
            tophat = Tophat(config)
            tophat_outputs = view.map(tophat, curr_files)
            sortsam = view.map(sam.coordinate_sort_sam, tophat_outputs,
                               [config] * len(tophat_outputs))
            bamfiles = view.map(sam.sam2bam, sortsam)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = repository[stage](config)
            view.map(disambiguate, curr_files)

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (bamfiles))
            name_sorted = view.map(sam.bam_name_sort, bamfiles)
            curr_files = view.map(sam.bam2sam, name_sorted)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            #coverage = repository[stage](config)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "hard_clip":
            logger.info("Trimming from the beginning of reads on %s." % (curr_files))
            hard_clipper = HardClipper(config)
            curr_files = view.map(hard_clipper, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            curr_files = view.map(sam.sam2bam, curr_files)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(sam.bamindex, curr_files)
            RPKM_count_out = view.map(rseqc.RPKM_count, *rseq_args)
            view.map(rseqc.fix_RPKM_count_file, RPKM_count_out)
            """
Example #5
0
def main(config_file):
    """Run the per-condition pipeline and the DESeq comparisons.

    The YAML configuration must provide ``dir`` (a mapping whose values are
    directories to create), ``input_dirs`` and ``run`` (the ordered list of
    stage names); individual stages read further entries such as ``ref``,
    ``stage`` and ``program``.

    Input files are grouped by condition with ``_group_input_by_condition``.
    For each condition ``config["dir"]["results"]`` is re-pointed at a
    per-condition sub-directory of the results directory and every stage in
    ``config["run"]`` is applied to that condition's files.  Stage names
    matching none of the branches are ignored.  The ``rseqc`` stage reads
    ``final_bamfiles`` and ``tophat_outputs``, which only the ``tophat``
    stage assigns.

    Afterwards the htseq-count outputs are grouped by cell type and, for
    every entry of the ``deseq`` stage's ``comparisons``, the counts are
    combined, handed to ``deseq.run`` and the first result is annotated
    with ``annotate.annotate_table_with_biomart``.

    Args:
        config_file: path of the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects, and PyYAML >= 6 requires the Loader
        # argument.  Consider yaml.safe_load if configs are plain data.
        config = yaml.load(in_handle)

    # make the needed directories; an explicit loop is used because map()
    # is lazy on Python 3 and would never call safe_makedir
    for directory in config["dir"].values():
        safe_makedir(directory)

    # specific for thesis pipeline
    # (not used below; the lookup still fails early if the key is missing)
    input_dirs = config["input_dirs"]

    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input_files: %s" % (input_files))
    logger.info("Condition groups %s" %(conditions))
    htseq_outdict = {}

    # NOTE(review): `view` is not defined in this function; presumably a
    # module-level parallel view -- confirm against the rest of the file.
    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        config["dir"]["results"] = condition_dir

        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = zip(*product(curr_files, [fastqc_config],
                                           [config]))
                view.map(fastqc.run, *fastqc_args)

            if stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                             [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
                curr_files = cutadapt_outputs
                logger.info("Fixing mate pair information.")
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("Forward: %s" % (first))
                logger.info("Reverse: %s" % (second))
                fixed = view.map(fastq.fix_mate_pairs_with_config,
                                 first, second, [config] * len(first))
                curr_files = list(flatten(fixed))

            if stage == "sickle":
                _emit_stage_message(stage, curr_files)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                fixed = view.map(sickle.run_with_config,
                                 first, second, [config] * len(first))
                curr_files = list(flatten(fixed))

            if stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("first %s" % (first))
                logger.info("second %s" % (second))

                tophat_outputs = view.map(tophat.run_with_config,
                                          first, second,
                                          [config["ref"]] * len(first),
                                          ["tophat"] * len(first),
                                          [config] * len(first))
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs

            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x),
                                            "metrics") for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)

            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                # Materialise the argument columns: they are unpacked once
                # per rseqc tool below, which would exhaust a lazy zip()
                # iterator after the first call on Python 3.
                rseq_args = list(zip(*product(curr_files, [config])))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                # RPKM counting works on the sorted BAM files instead
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                # Biomart annotation of the fixed RPKM tables is disabled,
                # so the result of this step is not kept.
                view.map(rseqc.fix_RPKM_count_file, RPKM_count_out)
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs

    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)
    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir, comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f,
                                                      None,
                                                      out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            # log the condition vector that is actually handed to deseq.run
            logger.info("Running deseq on %s with conditions %s "
                        "and writing to %s" % (combined_out,
                                               c,
                                               deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            # annotation runs locally on the first (only) deseq result
            annotate.annotate_table_with_biomart(deseq_out[0],
                                                 "id",
                                                 "ensembl_gene_id",
                                                 "human")

    # end gracefully
    stop_cluster()
Example #6
0
def main(config_file):
    """Run the per-condition pipeline and the DESeq comparisons.

    The YAML configuration must provide ``dir`` (a mapping whose values are
    directories to create), ``input_dirs`` and ``run`` (the ordered list of
    stage names); individual stages read further entries such as ``ref``,
    ``stage`` and ``program``.

    Input files are grouped by condition with ``_group_input_by_condition``.
    For each condition ``config["dir"]["results"]`` is re-pointed at a
    per-condition sub-directory of the results directory and every stage in
    ``config["run"]`` is applied to that condition's files.  Stage names
    matching none of the branches are ignored.  The ``rseqc`` stage reads
    ``final_bamfiles`` and ``tophat_outputs``, which only the ``tophat``
    stage assigns.

    Afterwards the htseq-count outputs are grouped by cell type and, for
    every entry of the ``deseq`` stage's ``comparisons``, the counts are
    combined, handed to ``deseq.run`` and the first result is annotated
    with ``annotate.annotate_table_with_biomart``.

    Args:
        config_file: path of the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects, and PyYAML >= 6 requires the Loader
        # argument.  Consider yaml.safe_load if configs are plain data.
        config = yaml.load(in_handle)

    # make the needed directories; an explicit loop is used because map()
    # is lazy on Python 3 and would never call safe_makedir
    for directory in config["dir"].values():
        safe_makedir(directory)

    # specific for thesis pipeline
    # (not used below; the lookup still fails early if the key is missing)
    input_dirs = config["input_dirs"]

    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input_files: %s" % (input_files))
    logger.info("Condition groups %s" % (conditions))
    htseq_outdict = {}

    # NOTE(review): `view` is not defined in this function; presumably a
    # module-level parallel view -- confirm against the rest of the file.
    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        config["dir"]["results"] = condition_dir

        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = zip(
                    *product(curr_files, [fastqc_config], [config]))
                view.map(fastqc.run, *fastqc_args)

            if stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(
                    *product(curr_files, [cutadapt_config], [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
                curr_files = cutadapt_outputs
                logger.info("Fixing mate pair information.")
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("Forward: %s" % (first))
                logger.info("Reverse: %s" % (second))
                fixed = view.map(fastq.fix_mate_pairs_with_config, first,
                                 second, [config] * len(first))
                curr_files = list(flatten(fixed))

            if stage == "sickle":
                _emit_stage_message(stage, curr_files)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                fixed = view.map(sickle.run_with_config, first, second,
                                 [config] * len(first))
                curr_files = list(flatten(fixed))

            if stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("first %s" % (first))
                logger.info("second %s" % (second))

                tophat_outputs = view.map(tophat.run_with_config, first,
                                          second, [config["ref"]] * len(first),
                                          ["tophat"] * len(first),
                                          [config] * len(first))
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs

            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [
                    replace_suffix(os.path.basename(x), "metrics")
                    for x in curr_files
                ]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files, [ref] * nrun,
                                     [ribo] * nrun, out_files)

            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                # Materialise the argument columns: they are unpacked once
                # per rseqc tool below, which would exhaust a lazy zip()
                # iterator after the first call on Python 3.
                rseq_args = list(zip(*product(curr_files, [config])))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                # RPKM counting works on the sorted BAM files instead
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                # Biomart annotation of the fixed RPKM tables is disabled,
                # so the result of this step is not kept.
                view.map(rseqc.fix_RPKM_count_file, RPKM_count_out)
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs

    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)
    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir, comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f, None, out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            # log the condition vector that is actually handed to deseq.run
            logger.info("Running deseq on %s with conditions %s "
                        "and writing to %s" %
                        (combined_out, c, deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            # annotation runs locally on the first (only) deseq result
            annotate.annotate_table_with_biomart(deseq_out[0], "id",
                                                 "ensembl_gene_id", "human")

    # end gracefully
    stop_cluster()