Esempio n. 1
0
def main(config_file):
    """Run the pipeline stages named in the config's ``run`` list, in order.

    The YAML file at ``config_file`` is loaded into ``config``.  Logging and
    the cluster are set up from it, input files are globbed from
    ``<config["dir"]["data"]>/<input_dir>/*_rep*`` for every entry of
    ``config["input_dirs"]``, and every path in ``config["dir"]`` is passed
    to ``safe_makedir``.

    Recognised stage names are ``fastqc``, ``cutadapt``, ``novoalign``,
    ``htseq-count``, ``coverage``, ``deseq`` and ``dss``; any other name is
    skipped without a message.  ``curr_files`` carries the working file list
    from one stage to the next; ``cutadapt`` and ``novoalign`` replace it
    with their own outputs.

    The ``deseq`` and ``dss`` stages read ``htseq_outputs`` and
    ``column_names``, which are only assigned by the ``htseq-count`` stage,
    so ``htseq-count`` has to appear earlier in ``config["run"]``.

    The cluster is stopped on the way out, including when a stage raises.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects and newer PyYAML releases require the
        # Loader argument; switch to yaml.safe_load once it is confirmed
        # that no config relies on custom YAML tags.
        config = yaml.load(in_handle)
    setup_logging(config)
    # Imported after setup_logging(config), keeping the original ordering --
    # presumably the logger is configured by that call; verify.
    from bipy.log import logger
    start_cluster(config)

    # Everything after start_cluster() runs under try/finally so the cluster
    # is shut down even if one of the stages fails.
    try:
        data_dir = config["dir"]["data"]
        from bipy.cluster import view
        input_files = [glob.glob(os.path.join(data_dir, x, "*_rep*"))
                       for x in config["input_dirs"]]
        input_files = list(flatten(input_files))
        logger.info("Input files to process: %s" % (input_files,))
        results_dir = config["dir"]["results"]

        # Explicit loop: map() is lazy on Python 3, so calling it only for
        # its side effect would not create any directory there.
        for dirname in config["dir"].values():
            safe_makedir(dirname)

        curr_files = input_files

        for stage in config["run"]:
            if stage == "fastqc":
                nfiles = len(curr_files)
                logger.info("Running %s on %s" % (stage, str(curr_files)))
                fastqc_config = _get_stage_config(config, stage)
                # fastqc does not change the working file list.
                fastqc_outputs = view.map(fastqc.run, curr_files,
                                          [fastqc_config] * nfiles,
                                          [config] * nfiles)

            elif stage == "cutadapt":
                nfiles = len(curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_outputs = view.map(cutadapt_tool.run,
                                            curr_files,
                                            [cutadapt_config] * nfiles,
                                            [config] * nfiles)
                curr_files = cutadapt_outputs

            elif stage == "novoalign":
                nfiles = len(curr_files)
                novoalign_config = _get_stage_config(config, stage)
                # The reference comes straight from the config; building it
                # with novoindex.run was disabled in the original code.
                db = config["genome"]["file"]
                novoalign_outputs = view.map(novoalign.run, curr_files,
                                             [db] * nfiles,
                                             [novoalign_config] * nfiles,
                                             [config] * nfiles)
                picard = BroadRunner(config["program"]["picard"])
                # zip(*product([picard], files)) yields two parallel
                # sequences: the runner repeated once per file, and the files.
                args = zip(*itertools.product([picard], novoalign_outputs))
                # convert to bam
                bamfiles = view.map(picardrun.picard_formatconverter,
                                    *args)
                args = zip(*itertools.product([picard], bamfiles))
                # sort bam
                sorted_bf = view.map(picardrun.picard_sort, *args)
                # index bam
                args = zip(*itertools.product([picard], sorted_bf))
                view.map(picardrun.picard_index, *args)
                # Later stages receive the aligner outputs, not the sorted
                # BAM files produced above.
                curr_files = novoalign_outputs

            elif stage == "htseq-count":
                logger.info("Running htseq-count on %s" % (curr_files,))
                # This variant does not launch htseq-count itself: the
                # current files are used directly as the count files.
                htseq_outputs = curr_files
                column_names = _get_short_names(input_files)
                logger.info("Column names: %s" % (column_names,))
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_file = os.path.join(out_dir, "combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)
                rpkm = htseq_count.calculate_rpkm(
                    combined_out, config["annotation"]["file"])
                rpkm_file = os.path.join(out_dir, "rpkm.txt")
                rpkm.to_csv(rpkm_file, sep="\t")

            elif stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s."
                            % (curr_files,))
                nrun = len(curr_files)
                ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                              config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                # One "<basename>.metrics"-style output per input, placed in
                # the stage directory.
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)

            elif stage == "deseq":
                # The condition of a sample is the part of its file name
                # before the first underscore.
                conditions = [os.path.basename(x).split("_")[0]
                              for x in input_files]
                deseq_config = _get_stage_config(config, stage)
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                for comparison in deseq_config["comparisons"]:
                    comparison_name = "_vs_".join(comparison)
                    out_dir = os.path.join(results_dir, stage,
                                           comparison_name)
                    safe_makedir(out_dir)
                    # positions of the samples whose condition takes part in
                    # this comparison
                    indexes = [x for x, y in enumerate(conditions)
                               if y in comparison]
                    # pick the matching count files and combine them
                    htseq_files = [htseq_outputs[index] for index in indexes]
                    htseq_columns = [column_names[index]
                                     for index in indexes]
                    logger.info(htseq_files)
                    logger.info(htseq_columns)
                    out_file = os.path.join(out_dir,
                                            comparison_name + ".counts.txt")
                    combined_out = htseq_count.combine_counts(htseq_files,
                                                              htseq_columns,
                                                              out_file)
                    deseq_conds = [conditions[index] for index in indexes]
                    deseq_prefix = os.path.join(out_dir, comparison_name)

                    deseq_out = view.map(deseq.run, [combined_out],
                                         [deseq_conds], [deseq_prefix])
                    logger.info("Annotating %s." % (deseq_out,))
                    annotated_file = view.map(
                        annotate.annotate_table_with_biomart,
                        deseq_out,
                        ["id"],
                        ["ensembl_gene_id"],
                        ["human"])

            elif stage == "dss":
                # Same sample selection as the deseq stage, but DSS is run
                # directly rather than through the cluster view.
                conditions = [os.path.basename(x).split("_")[0]
                              for x in input_files]
                dss_config = _get_stage_config(config, stage)
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                for comparison in dss_config["comparisons"]:
                    comparison_name = "_vs_".join(comparison)
                    out_dir = os.path.join(results_dir, stage,
                                           comparison_name)
                    safe_makedir(out_dir)
                    # positions of the samples whose condition takes part in
                    # this comparison
                    indexes = [x for x, y in enumerate(conditions)
                               if y in comparison]
                    # pick the matching count files and combine them
                    htseq_files = [htseq_outputs[index] for index in indexes]
                    htseq_columns = [column_names[index]
                                     for index in indexes]
                    out_file = os.path.join(out_dir,
                                            comparison_name + ".counts.txt")
                    combined_out = htseq_count.combine_counts(htseq_files,
                                                              htseq_columns,
                                                              out_file)
                    dss_conds = [conditions[index] for index in indexes]
                    dss_prefix = os.path.join(out_dir, comparison_name)
                    logger.info(
                        "Running DSS on %s with conditions %s and comparison %s."
                        % (combined_out, dss_conds, comparison))

                    dss_out = dss.run(combined_out, dss_conds, comparison,
                                      dss_prefix)
    finally:
        stop_cluster()
Esempio n. 2
0
def main(config_file):
    """Run the pipeline stages named in the config's ``run`` list, in order.

    The YAML file at ``config_file`` is loaded into ``config``.  Logging and
    the cluster are set up from it, the input files are the entries of
    ``config["input"]`` joined onto ``config["dir"]["data"]``, and every
    path in ``config["dir"]`` is passed to ``safe_makedir``.

    Recognised stage names are ``fastqc``, ``cutadapt``, ``novoalign``,
    ``htseq-count``, ``coverage`` and ``deseq``; any other name is skipped
    without a message.  ``curr_files`` carries the working file list from
    one stage to the next; ``cutadapt`` and ``novoalign`` replace it with
    their own outputs.

    The ``deseq`` stage reads ``htseq_outputs`` and ``column_names``, which
    are only assigned by the ``htseq-count`` stage, so ``htseq-count`` has
    to appear earlier in ``config["run"]``.

    The cluster is stopped on the way out, including when a stage raises.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects and newer PyYAML releases require the
        # Loader argument; switch to yaml.safe_load once it is confirmed
        # that no config relies on custom YAML tags.
        config = yaml.load(in_handle)
    setup_logging(config)
    # Imported after setup_logging(config), keeping the original ordering --
    # presumably the logger is configured by that call; verify.
    from bipy.log import logger
    start_cluster(config)

    # Everything after start_cluster() runs under try/finally so the cluster
    # is shut down even if one of the stages fails.
    try:
        from bipy.cluster import view

        input_files = [
            os.path.join(config["dir"]["data"], x) for x in config["input"]
        ]
        results_dir = config["dir"]["results"]

        # Explicit loop: map() is lazy on Python 3, so calling it only for
        # its side effect would not create any directory there.
        for dirname in config["dir"].values():
            safe_makedir(dirname)

        curr_files = input_files

        for stage in config["run"]:
            if stage == "fastqc":
                nfiles = len(curr_files)
                logger.info("Running %s on %s" % (stage, str(curr_files)))
                fastqc_config = _get_stage_config(config, stage)
                # fastqc does not change the working file list.
                fastqc_outputs = view.map(fastqc.run, curr_files,
                                          [fastqc_config] * nfiles,
                                          [config] * nfiles)

            elif stage == "cutadapt":
                nfiles = len(curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_outputs = view.map(cutadapt_tool.run, curr_files,
                                            [cutadapt_config] * nfiles,
                                            [config] * nfiles)
                curr_files = cutadapt_outputs

            elif stage == "novoalign":
                nfiles = len(curr_files)
                novoalign_config = _get_stage_config(config, stage)
                # The reference comes straight from the config; building it
                # with novoindex.run was disabled in the original code.
                db = config["genome"]["file"]
                novoalign_outputs = view.map(novoalign.run, curr_files,
                                             [db] * nfiles,
                                             [novoalign_config] * nfiles,
                                             [config] * nfiles)
                picard = BroadRunner(config["program"]["picard"])
                # zip(*product([picard], files)) yields two parallel
                # sequences: the runner repeated once per file, and the files.
                args = zip(*itertools.product([picard], novoalign_outputs))
                # convert to bam
                bamfiles = view.map(picardrun.picard_formatconverter, *args)
                args = zip(*itertools.product([picard], bamfiles))
                # sort bam
                sorted_bf = view.map(picardrun.picard_sort, *args)
                # index bam
                args = zip(*itertools.product([picard], sorted_bf))
                view.map(picardrun.picard_index, *args)
                # Later stages receive the aligner outputs, not the sorted
                # BAM files produced above.
                curr_files = novoalign_outputs

            elif stage == "htseq-count":
                nfiles = len(curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         curr_files,
                                         [config] * nfiles,
                                         [stage] * nfiles)
                column_names = _get_short_names(input_files)
                logger.info("Column names: %s" % (column_names,))
                # No safe_makedir here: the stage directory is presumably
                # created by htseq_count.run_with_config -- TODO confirm.
                out_file = os.path.join(config["dir"]["results"], stage,
                                        "combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)
                rpkm = htseq_count.calculate_rpkm(
                    combined_out, config["annotation"]["file"])
                rpkm_file = os.path.join(config["dir"]["results"], stage,
                                         "rpkm.txt")
                rpkm.to_csv(rpkm_file, sep="\t")

            elif stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s."
                            % (curr_files,))
                nrun = len(curr_files)
                ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                              config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                # One metrics output per input, placed in the stage directory.
                out_files = [
                    replace_suffix(os.path.basename(x), "metrics")
                    for x in curr_files
                ]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files,
                                     [ref] * nrun, [ribo] * nrun, out_files)

            elif stage == "deseq":
                # The condition of a sample is the part of its file name
                # before the first underscore.
                conditions = [
                    os.path.basename(x).split("_")[0] for x in input_files
                ]
                deseq_config = _get_stage_config(config, stage)
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                for comparison in deseq_config["comparisons"]:
                    comparison_name = "_vs_".join(comparison)
                    out_dir = os.path.join(results_dir, stage,
                                           comparison_name)
                    safe_makedir(out_dir)
                    # positions of the samples whose condition takes part in
                    # this comparison
                    indexes = [
                        x for x, y in enumerate(conditions)
                        if y in comparison
                    ]
                    # pick the matching count files and combine them
                    htseq_files = [htseq_outputs[index] for index in indexes]
                    htseq_columns = [column_names[index]
                                     for index in indexes]
                    logger.info(htseq_files)
                    logger.info(htseq_columns)
                    out_file = os.path.join(out_dir,
                                            comparison_name + ".counts.txt")
                    combined_out = htseq_count.combine_counts(
                        htseq_files, htseq_columns, out_file)
                    deseq_conds = [conditions[index] for index in indexes]
                    deseq_prefix = os.path.join(out_dir, comparison_name)

                    deseq_out = view.map(deseq.run, [combined_out],
                                         [deseq_conds], [deseq_prefix])
                    logger.info("Annotating %s." % (deseq_out,))
                    annotated_file = view.map(
                        annotate.annotate_table_with_biomart,
                        deseq_out, ["id"],
                        ["ensembl_gene_id"], ["human"])
    finally:
        stop_cluster()
Esempio n. 3
0
def main(config_file):
    """Run the configured stages once for every cell type in the config.

    The YAML file at ``config_file`` is loaded into ``config`` and every
    path in ``config["dir"]`` is passed to ``safe_makedir``.  For each entry
    of ``config["cell_types"]`` a ``<results>/<cell_type>`` directory is
    created, ``config["dir"]["results"]`` is repointed at it, and the input
    files are everything matching ``<config["dir"]["data"]>/<cell_type>/*``.
    The stages of ``config["run"]`` are then applied in order.

    Recognised stage names are ``fastqc``, ``cutadapt``, ``tophat``,
    ``rseqc``, ``htseq-count`` and ``coverage``; any other name is skipped
    without a message.  ``curr_files`` carries the working file list between
    stages.  The ``rseqc`` stage ends by setting ``curr_files`` to
    ``tophat_outputs``, so ``tophat`` has to run before it.

    ``view`` and ``logger`` are not bound inside this function; they have to
    be available at module scope.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects and newer PyYAML releases require the
        # Loader argument; switch to yaml.safe_load once it is confirmed
        # that no config relies on custom YAML tags.
        config = yaml.load(in_handle)

    # make the needed directories
    # (explicit loop: map() is lazy on Python 3, so calling it only for its
    # side effect would not create any directory there)
    for dirname in config["dir"].values():
        safe_makedir(dirname)

    # NOTE(review): stage_dict is never consulted below; kept so the names
    # it references are still resolved exactly as before.
    stage_dict = {"download_encode": _download_encode,
                  "fastqc": _run_fastqc}

    # NOTE(review): this value is replaced by the globbed input files before
    # it is ever read; kept so a missing "encode_file" key still fails here.
    curr_files = config["encode_file"]

    # Captured before the loop because config["dir"]["results"] is
    # overwritten with the per-cell-type directory on every iteration.
    results_dir = config["dir"].get("results", "results")

    for cell_type in config["cell_types"]:
        cell_type_dir = os.path.join(results_dir, cell_type)
        safe_makedir(cell_type_dir)
        config["dir"]["results"] = cell_type_dir
        in_files = glob.glob(os.path.join(config["dir"]["data"],
                                          cell_type, "*"))
        curr_files = in_files
        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                # zip(*product(files, [a], [b])) yields parallel argument
                # sequences: the files, then a and b repeated once per file.
                fastqc_args = zip(*product(curr_files, [fastqc_config],
                                           [config]))
                view.map(fastqc.run, *fastqc_args)

            elif stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                             [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
                curr_files = cutadapt_outputs

            elif stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                tophat_args = zip(*product(curr_files, [None],
                                           [config["ref"]],
                                           ["tophat"], [config]))
                tophat_outputs = view.map(tophat.run_with_config,
                                          *tophat_args)

                picard = BroadRunner(config["program"]["picard"])
                # Conversion, sorting and indexing go through the sam
                # helpers; the Picard-based converter was disabled in the
                # original code.
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                sorted_bf = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, sorted_bf)
                curr_files = sorted_bf

            elif stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                # Materialised as a list because it is unpacked into several
                # view.map calls; a bare zip() iterator on Python 3 would be
                # exhausted after the first of them.
                rseq_args = list(zip(*product(curr_files, [config])))
                view.map(rseqc.bam2bigwig, *rseq_args, block=False)
                view.map(rseqc.bam_stat, *rseq_args, block=False)
                view.map(rseqc.clipping_profile, *rseq_args, block=False)
                view.map(rseqc.genebody_coverage, *rseq_args, block=False)
                view.map(rseqc.junction_annotation, *rseq_args, block=False)
                view.map(rseqc.junction_saturation, *rseq_args, block=False)
                # Called without block=False because its result is read
                # immediately below to find the directories to merge.
                RPKM_count_files = view.map(rseqc.RPKM_count,
                                            *rseq_args)
                dirs_to_process = list(set(map(os.path.dirname,
                                               RPKM_count_files)))
                logger.info("Count files: %s" % (RPKM_count_files,))
                logger.info("dirnames to process: %s" % (dirs_to_process,))
                RPKM_merged = view.map(rseqc.merge_RPKM, dirs_to_process)

                view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
                # Later stages receive the tophat outputs rather than the
                # sorted BAM files used for the rseqc runs.
                curr_files = tophat_outputs

            elif stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                # The original input paths are used as the column names.
                column_names = in_files
                out_file = os.path.join(config["dir"]["results"], stage,
                                        cell_type + ".combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)
                rpkm = htseq_count.calculate_rpkm(
                    combined_out, config["annotation"]["file"])
                rpkm_file = os.path.join(config["dir"]["results"], stage,
                                         cell_type + ".rpkm.txt")
                rpkm.to_csv(rpkm_file, sep="\t")

            elif stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s."
                            % (curr_files,))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"],
                                       config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(out_dir)
                # One metrics output per input, placed in the stage directory.
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)

    # end gracefully, wait for jobs to finish, then exit
    view.wait()
    stop_cluster()