Ejemplo n.º 1
0
def main(config_file):
    """Run tagdust on every configured input file using the cluster.

    Loads the YAML configuration, sets up logging, starts the cluster and,
    for each stage listed in ``config["run"]`` whose ``program`` entry is
    ``"tagdust"``, maps ``tagdust.run`` over ``config["input"]``.  The
    cluster is stopped afterwards, also when a stage raises.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # safe_load only builds plain YAML types; a bare yaml.load() needs an
        # explicit Loader on current PyYAML and can construct arbitrary
        # Python objects.
        config = yaml.safe_load(in_handle)
    setup_logging(config)
    start_cluster(config)
    try:
        # the view is imported only once the cluster has been started
        from bipy.cluster import view

        input_files = config["input"]
        nfiles = len(input_files)
        for stage in config["run"]:
            stage_config = config["stage"][stage]
            if stage_config["program"] == "tagdust":
                # one copy of each config per input file, as view.map pairs
                # the argument sequences element by element
                view.map(tagdust.run, input_files,
                         [stage_config] * nfiles,
                         [config] * nfiles)
    finally:
        # always take the cluster down, even if a stage failed
        stop_cluster()
Ejemplo n.º 2
0
def main(config_file):
    """Run tagdust on every configured input file using the cluster.

    Loads the YAML configuration, sets up logging, starts the cluster and,
    for each stage listed in ``config["run"]`` whose ``program`` entry is
    ``"tagdust"``, maps ``tagdust.run`` over ``config["input"]``.  The
    cluster is stopped afterwards, also when a stage raises.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # safe_load only builds plain YAML types; a bare yaml.load() needs an
        # explicit Loader on current PyYAML and can construct arbitrary
        # Python objects.
        config = yaml.safe_load(in_handle)
    setup_logging(config)
    start_cluster(config)
    try:
        # the view is imported only once the cluster has been started
        from bipy.cluster import view

        input_files = config["input"]
        nfiles = len(input_files)
        for stage in config["run"]:
            stage_config = config["stage"][stage]
            if stage_config["program"] == "tagdust":
                # one copy of each config per input file, as view.map pairs
                # the argument sequences element by element
                view.map(tagdust.run, input_files,
                         [stage_config] * nfiles,
                         [config] * nfiles)
    finally:
        # always take the cluster down, even if a stage failed
        stop_cluster()
Ejemplo n.º 3
0
def main(config_file):
    """Merge BAM files by condition and run the configured peak callers.

    Loads the YAML configuration, sets up logging and starts the cluster.
    All ``*.bam`` files in ``config["dir"]["input_dir"]`` are merged per
    condition with ``_merge_condition``: once for the negative group and
    once for every entry of ``config["groups"]["test"]``.  The stages in
    ``config["run"]`` are then run on the merged test files.  The cluster is
    stopped afterwards, also when a stage raises.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # safe_load only builds plain YAML types; a bare yaml.load() needs an
        # explicit Loader on current PyYAML and can construct arbitrary
        # Python objects.
        config = yaml.safe_load(in_handle)
    setup_logging(config)
    # NOTE(review): logger is not used below; the import is kept in its
    # original position after setup_logging() -- confirm it can be dropped.
    from bipy.log import logger
    start_cluster(config)
    try:
        from bipy.cluster import view

        input_dir = config["dir"]["input_dir"]
        input_files = glob.glob(os.path.join(input_dir, "*.bam"))
        # example running with macs:
        # macs.run_with_config(input_file, config, control_file=None,
        #                      stage=None)

        # first combine all the negative controls into one file
        negative_control = _merge_condition(input_files,
                                            config["groups"]["negative"])
        merged = [_merge_condition(input_files, condition)
                  for condition in config["groups"]["test"]]
        # keep only the conditions for which the merge returned something
        curr_files = [x for x in merged if x]

        for stage in config["run"]:
            if stage == "macs":
                # run macs on every merged test file; the merged negative
                # control is passed in the control_file position
                nfiles = len(curr_files)
                out_files = view.map(macs.run_with_config, curr_files,
                                     [config] * nfiles,
                                     [negative_control] * nfiles,
                                     [stage] * nfiles)
                # just use the peak files going forward
                curr_files = [x[0] for x in out_files]

            if stage == "piranha":
                piranha_runner = piranha.PiranhaStage(config)
                view.map(piranha_runner, curr_files)
    finally:
        # always take the cluster down, even if a stage failed
        stop_cluster()
Ejemplo n.º 4
0
def main(config_file):
    """Merge BAM files by condition and run the configured peak callers.

    Loads the YAML configuration, sets up logging and starts the cluster.
    All ``*.bam`` files in ``config["dir"]["input_dir"]`` are merged per
    condition with ``_merge_condition``: once for the negative group and
    once for every entry of ``config["groups"]["test"]``.  The stages in
    ``config["run"]`` are then run on the merged test files.  The cluster is
    stopped afterwards, also when a stage raises.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # safe_load only builds plain YAML types; a bare yaml.load() needs an
        # explicit Loader on current PyYAML and can construct arbitrary
        # Python objects.
        config = yaml.safe_load(in_handle)
    setup_logging(config)
    # NOTE(review): logger is not used below; the import is kept in its
    # original position after setup_logging() -- confirm it can be dropped.
    from bipy.log import logger
    start_cluster(config)
    try:
        from bipy.cluster import view

        input_dir = config["dir"]["input_dir"]
        input_files = glob.glob(os.path.join(input_dir, "*.bam"))
        # example running with macs:
        # macs.run_with_config(input_file, config, control_file=None,
        #                      stage=None)

        # first combine all the negative controls into one file
        negative_control = _merge_condition(input_files,
                                            config["groups"]["negative"])
        merged = [_merge_condition(input_files, condition)
                  for condition in config["groups"]["test"]]
        # keep only the conditions for which the merge returned something
        curr_files = [x for x in merged if x]

        for stage in config["run"]:
            if stage == "macs":
                # run macs on every merged test file; the merged negative
                # control is passed in the control_file position
                nfiles = len(curr_files)
                out_files = view.map(macs.run_with_config, curr_files,
                                     [config] * nfiles,
                                     [negative_control] * nfiles,
                                     [stage] * nfiles)
                # just use the peak files going forward
                curr_files = [x[0] for x in out_files]

            if stage == "piranha":
                piranha_runner = piranha.PiranhaStage(config)
                view.map(piranha_runner, curr_files)
    finally:
        # always take the cluster down, even if a stage failed
        stop_cluster()
Ejemplo n.º 5
0
def test_cluster():
    """Check that mapping over the cluster view matches the builtin map.

    Starts the cluster described by ``CONFIG_FILE``, applies
    ``mappable_function`` to ``range(32)`` both serially and through
    ``view.map`` and asserts that the two result lists are equal.
    """
    with open(CONFIG_FILE) as in_handle:
        # safe_load only builds plain YAML types; a bare yaml.load() needs an
        # explicit Loader on current PyYAML.
        config = yaml.safe_load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    # NOTE(review): the cluster started here is never stopped by this test
    # -- confirm whether a teardown elsewhere takes it down.

    from bipy.cluster import view
    logger.info("Serial result")
    # materialise both sides: on Python 3 map() returns an iterator, which
    # never compares equal to a list
    serial_result = list(map(mappable_function, range(32)))
    logger.info("Parallel result")
    parallel_result = list(view.map(mappable_function, range(32)))
    assert serial_result == parallel_result
Ejemplo n.º 6
0
def test_cluster():
    """Check that mapping over the cluster view matches the builtin map.

    Starts the cluster described by ``CONFIG_FILE``, applies
    ``mappable_function`` to ``range(32)`` both serially and through
    ``view.map`` and asserts that the two result lists are equal.
    """
    with open(CONFIG_FILE) as in_handle:
        # safe_load only builds plain YAML types; a bare yaml.load() needs an
        # explicit Loader on current PyYAML.
        config = yaml.safe_load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    # NOTE(review): the cluster started here is never stopped by this test
    # -- confirm whether a teardown elsewhere takes it down.

    from bipy.cluster import view
    logger.info("Serial result")
    # materialise both sides: on Python 3 map() returns an iterator, which
    # never compares equal to a list
    serial_result = list(map(mappable_function, range(32)))
    logger.info("Parallel result")
    parallel_result = list(view.map(mappable_function, range(32)))
    assert serial_result == parallel_result
Ejemplo n.º 7
0
def main(config_file):
    """Run the pipeline stages listed in a YAML configuration file.

    Loads the configuration, sets up logging, starts the cluster, runs every
    stage named in ``config["run"]`` and stops the cluster afterwards, also
    when a stage raises.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # safe_load only builds plain YAML types; a bare yaml.load() needs an
        # explicit Loader on current PyYAML and can construct arbitrary
        # Python objects.
        config = yaml.safe_load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    try:
        # the view is imported only once the cluster has been started
        from bipy.cluster import view
        _run_stages(config, view, logger)
    finally:
        # always take the cluster down, even if a stage failed
        stop_cluster()


def _run_stages(config, view, logger):
    """Run every stage named in ``config["run"]`` on the running cluster.

    Args:
        config: the loaded configuration dictionary.
        view: cluster view whose ``map`` distributes the work.
        logger: logger used for progress messages.
    """
    input_files = [os.path.join(config["dir"]["data"], x)
                   for x in config["input"]]
    results_dir = config["dir"]["results"]

    # explicit loop: map() is lazy on Python 3 and would create nothing there
    for directory in config["dir"].values():
        safe_makedir(directory)

    curr_files = input_files

    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)

        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run, curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs

        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            #db = novoindex.run(config["ref"],
            #                   _get_stage_config(config, "novoindex"),
            #                   config)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            picard = BroadRunner(config["program"]["picard"])
            # product + zip(*) pairs the single picard runner with every
            # file, giving the two parallel argument sequences for view.map
            args = zip(*itertools.product([picard], novoalign_outputs))
            # convert to bam
            bamfiles = view.map(picardrun.picard_formatconverter, *args)
            args = zip(*itertools.product([picard], bamfiles))
            # sort bam
            sorted_bf = view.map(picardrun.picard_sort, *args)
            # index bam
            args = zip(*itertools.product([picard], sorted_bf))
            view.map(picardrun.picard_index, *args)
            # downstream stages continue from the novoalign outputs, not
            # from the sorted bam files
            curr_files = novoalign_outputs

        if stage == "htseq-count":
            nfiles = len(curr_files)
            # NOTE(review): htseq_config is not used below -- confirm it can
            # be removed.
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config, curr_files,
                                     [config] * nfiles, [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names, out_file)
            rpkm = htseq_count.calculate_rpkm(combined_out,
                                              config["annotation"]["file"])
            rpkm_file = os.path.join(config["dir"]["results"], stage,
                                     "rpkm.txt")
            rpkm.to_csv(rpkm_file, sep="\t")

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [
                replace_suffix(os.path.basename(x), "metrics")
                for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)

        if stage == "deseq":
            # the condition of a sample is the part of its file name before
            # the first underscore
            conditions = [
                os.path.basename(x).split("_")[0] for x in input_files
            ]
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            # NOTE(review): htseq_outputs and column_names are only defined
            # by the "htseq-count" stage, so that stage must run before this
            # one in config["run"].
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this
                # comparison
                indexes = [
                    x for x, y in enumerate(conditions) if y in comparison
                ]
                # find the htseq_files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                logger.info(htseq_files)
                logger.info(htseq_columns)
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(
                    htseq_files, htseq_columns, out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_prefix = os.path.join(out_dir, comparison_name)

                deseq_out = view.map(deseq.run, [combined_out], [deseq_conds],
                                     [deseq_prefix])
                logger.info("Annotating %s." % (deseq_out))
                annotated_file = view.map(annotate.annotate_table_with_biomart,
                                          deseq_out, ["id"],
                                          ["ensembl_gene_id"], ["human"])
Ejemplo n.º 8
0
def main(config_file):
    """Process the variant directories and load the result into gemini.

    Loads the YAML configuration, sets up logging and starts the cluster.
    The parallel steps (illumina fixer, combine, split by chromosome, VEP,
    sort) run on the cluster, which is then stopped -- also when one of
    those steps raises.  The sorted files are finally combined and passed to
    the ``geminiloader`` stage serially.

    Args:
        config_file: path to the YAML configuration file.
    """
    # load yaml config file
    with open(config_file) as in_handle:
        # safe_load only builds plain YAML types; a bare yaml.load() needs an
        # explicit Loader on current PyYAML and can construct arbitrary
        # Python objects.
        config = yaml.safe_load(in_handle)

    # setup logging
    setup_logging(config)
    from bipy.log import logger
    # start cluster
    start_cluster(config)
    try:
        from bipy.cluster import view
        curr_files = _process_on_cluster(config, view, logger)
    finally:
        # don't run the rest of this in parallel, so take the cluster down
        stop_cluster()

    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vep.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [
            genotype.combine_variant_files(curr_files, out_file,
                                           config["ref"]["fasta"], config)
        ]

    # load the files into gemini, not in parallel
    logger.info("Loading %s into gemini." % (curr_files))
    gemini_class = STAGE_LOOKUP.get("geminiloader")
    geminiloader = gemini_class(config)
    # explicit loop: map() is lazy on Python 3 and would load nothing there
    for vcf_file in curr_files:
        geminiloader(vcf_file)
    logger.info("Run complete.")


def _process_on_cluster(config, view, logger):
    """Run the parallel steps on the cluster and return the sorted files.

    Args:
        config: the loaded configuration dictionary.
        view: cluster view whose ``map`` distributes the work.
        logger: logger used for progress messages.

    Returns:
        The result of mapping ``sort_vcf`` over the VEP output files that
        exist.
    """
    found = sh.find(config["dir"]["data"], "-name", "Variations")
    var_dirs = [str(x).strip() for x in found]
    logger.info("Var_dirs: %s" % (var_dirs))
    # the parent directory of every "Variations" entry that was found
    in_dirs = [os.path.dirname(x) for x in var_dirs]
    logger.info("in_dirs: %s" % (in_dirs))
    # XXX for testing only load a few
    #curr_files = in_dirs[0:5]
    curr_files = in_dirs

    # run the illumina fixer
    logger.info("Running illumina fixer on %s." % (curr_files))
    illf_class = STAGE_LOOKUP.get("illumina_fixer")
    illf = illf_class(config)
    curr_files = view.map(illf, curr_files)

    # sort the vcf files; the imports live inside the function so that it
    # references no names from the enclosing scope when it is mapped
    def sort_vcf(in_file):
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh

        out_file = append_stem(in_file, "sorted")
        # skip the work when the sorted file is already there
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file

    # combine
    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [
            genotype.combine_variant_files(curr_files, out_file,
                                           config["ref"]["fasta"], config)
        ]

    # break the VCF files up by chromosome for speed
    logger.info("Breaking up %s by chromosome." % (curr_files))
    breakvcf_class = STAGE_LOOKUP.get("breakvcf")
    breakvcf = breakvcf_class(config)
    curr_files = view.map(breakvcf, curr_files)

    # run VEP on the separate files in parallel
    logger.info("Running VEP on %s." % (curr_files))
    vep_class = STAGE_LOOKUP.get("vep")
    vep = vep_class(config)
    curr_files = view.map(vep, list(flatten(curr_files)))

    # keep only outputs that exist; a real list so it logs and maps the same
    # way on Python 2 and 3
    curr_files = [x for x in curr_files if file_exists(x)]

    # sort the vcf files
    logger.info("Sorting %s." % (curr_files))
    return view.map(sort_vcf, curr_files)
Ejemplo n.º 9
0
def main(config_file):
    """Process the variant directories and load the result into gemini.

    Loads the YAML configuration, sets up logging and starts the cluster.
    The parallel steps (illumina fixer, combine, split by chromosome, VEP,
    sort) run on the cluster, which is then stopped -- also when one of
    those steps raises.  The sorted files are finally combined and passed to
    the ``geminiloader`` stage serially.

    Args:
        config_file: path to the YAML configuration file.
    """
    # load yaml config file
    with open(config_file) as in_handle:
        # safe_load only builds plain YAML types; a bare yaml.load() needs an
        # explicit Loader on current PyYAML and can construct arbitrary
        # Python objects.
        config = yaml.safe_load(in_handle)

    # setup logging
    setup_logging(config)
    from bipy.log import logger
    # start cluster
    start_cluster(config)
    try:
        from bipy.cluster import view
        curr_files = _process_on_cluster(config, view, logger)
    finally:
        # don't run the rest of this in parallel, so take the cluster down
        stop_cluster()

    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader",
                            "all_combined.vep.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]

    # load the files into gemini, not in parallel
    logger.info("Loading %s into gemini." % (curr_files))
    gemini_class = STAGE_LOOKUP.get("geminiloader")
    geminiloader = gemini_class(config)
    # explicit loop: map() is lazy on Python 3 and would load nothing there
    for vcf_file in curr_files:
        geminiloader(vcf_file)
    logger.info("Run complete.")


def _process_on_cluster(config, view, logger):
    """Run the parallel steps on the cluster and return the sorted files.

    Args:
        config: the loaded configuration dictionary.
        view: cluster view whose ``map`` distributes the work.
        logger: logger used for progress messages.

    Returns:
        The result of mapping ``sort_vcf`` over the VEP output files that
        exist.
    """
    found = sh.find(config["dir"]["data"], "-name", "Variations")
    var_dirs = [str(x).strip() for x in found]
    logger.info("Var_dirs: %s" % (var_dirs))
    # the parent directory of every "Variations" entry that was found
    in_dirs = [os.path.dirname(x) for x in var_dirs]
    logger.info("in_dirs: %s" % (in_dirs))
    # XXX for testing only load a few
    #curr_files = in_dirs[0:5]
    curr_files = in_dirs

    # run the illumina fixer
    logger.info("Running illumina fixer on %s." % (curr_files))
    illf_class = STAGE_LOOKUP.get("illumina_fixer")
    illf = illf_class(config)
    curr_files = view.map(illf, curr_files)

    # sort the vcf files; the imports live inside the function so that it
    # references no names from the enclosing scope when it is mapped
    def sort_vcf(in_file):
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh

        out_file = append_stem(in_file, "sorted")
        # skip the work when the sorted file is already there
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file

    # combine
    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader",
                            "all_combined.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]

    # break the VCF files up by chromosome for speed
    logger.info("Breaking up %s by chromosome." % (curr_files))
    breakvcf_class = STAGE_LOOKUP.get("breakvcf")
    breakvcf = breakvcf_class(config)
    curr_files = view.map(breakvcf, curr_files)

    # run VEP on the separate files in parallel
    logger.info("Running VEP on %s." % (curr_files))
    vep_class = STAGE_LOOKUP.get("vep")
    vep = vep_class(config)
    curr_files = view.map(vep, list(flatten(curr_files)))

    # keep only outputs that exist; a real list so it logs and maps the same
    # way on Python 2 and 3
    curr_files = [x for x in curr_files if file_exists(x)]

    # sort the vcf files
    logger.info("Sorting %s." % (curr_files))
    return view.map(sort_vcf, curr_files)
Ejemplo n.º 10
0
def main(config_file):
    """Run the pipeline stages listed in a YAML configuration file.

    Loads the configuration, sets up logging, starts the cluster, runs every
    stage named in ``config["run"]`` and stops the cluster afterwards, also
    when a stage raises.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # safe_load only builds plain YAML types; a bare yaml.load() needs an
        # explicit Loader on current PyYAML and can construct arbitrary
        # Python objects.
        config = yaml.safe_load(in_handle)
    setup_logging(config)
    start_cluster(config)
    try:
        _run_stages(config)
    finally:
        # always take the cluster down, even if a stage failed
        stop_cluster()


def _run_stages(config):
    """Run every stage named in ``config["run"]`` on the running cluster.

    Several stages read results left behind by earlier ones
    (``tagdust_outputs``, ``aligned_outputs``, ``sorted_bf``), so the order
    of ``config["run"]`` matters.

    Args:
        config: the loaded configuration dictionary.
    """
    # reduce is a builtin on Python 2 only; functools has it on both
    from functools import reduce
    # after the cluster is up, import the view to it
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    # explicit loop: map() is lazy on Python 3 and would create nothing there
    for directory in config["dir"].values():
        safe_makedir(directory)

    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        " from %s" % (str(curr_files)))
            nlen = len(curr_files)
            # length of the minimum read to keep; defaults to 20 when the
            # stage config does not set it
            min_length = str(config["stage"][stage].get("min_length", 20))

            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim") for
                         x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen,
                                 ["sanger"] * nlen,
                                 [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            # NOTE(review): this stage reads tagdust_outputs, which only the
            # "tagdust" stage defines.
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map()
            #
            # filtered_fastq = view.map(filter_seqio, tagdust_outputs,
            #                           [lf] * len(tagdust_outputs),
            #                           ["filt"] * len(tagdust_outputs),
            #                           ["fastq"] * len(tagdust_outputs))
            out_files = [append_stem(os.path.basename(input_file[0]),
                         "filt") for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]

            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]

            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")
            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                """ writes a tab-separated table of end-nucleotide
                fractions per file and returns its path """
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab file of the counts at the end
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function, kind="fastq"),
                                 {}) for x in curr_files]
                df = pd.DataFrame(counts,
                                  index=[_short_name(x) for x in curr_files])
                df = df.astype(float)
                # turn the counts of each row into fractions of the row
                # total and keep the total as an extra column
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")
                # return the path on this branch as well
                return count_file

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align,
                                    curr_files,
                                    [pair_file] * nlen,
                                    [ref_file] * nlen,
                                    [out_base] * nlen,
                                    [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles),
                                 bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and setting
            # them to read only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)

            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)
            curr_files = out_files

        if stage == "coverage":
            # NOTE(review): this stage reads sorted_bf, which only the
            # "novoalign" stage defines.
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s"
                        % (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for
                         x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir,
                                      os.path.basename(x)) for x in out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf),
                     out_files)

        if stage == "htseq-count":
            # NOTE(review): this stage reads aligned_outputs, which only the
            # "novoalign" stage defines.
            # size the repeated arguments by the sequence that is actually
            # mapped over, not by curr_files
            nfiles = len(aligned_outputs)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)
        if stage == "bedtools_intersect":
            # NOTE(review): this stage reads sorted_bf, which only the
            # "novoalign" stage defines.
            # a missing or empty "bed" entry means there is nothing to
            # intersect, instead of failing on iterating None
            bedfiles = config["stage"]["bedtools_intersect"].get("bed") or []
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                # <out_dir>/<bam name without suffix>_vs_<bed name>.bam
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [os.path.join(out_dir, os.path.basename(x)) for x in
                             out_files]
                out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                             for x in out_files]
                out_files = [".".join([x, "bam"]) for x in out_files]
                # explicit loops: map() is lazy on Python 3 and would run
                # nothing there
                for bam_file, isect_file in zip(sorted_bf, out_files):
                    bedtools.intersectbam2bed(bam_file, bedfile, False,
                                              isect_file)
                count_files = [replace_suffix(x, "stats") for x in
                               out_files]
                for bam_file, isect_file, count_file in zip(sorted_bf,
                                                            out_files,
                                                            count_files):
                    write_ratios(bam_file, isect_file, count_file)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)
Ejemplo n.º 11
0
def main(config_file):
    """Run the pipeline stages listed in a YAML configuration file.

    The configuration is read from ``config_file``; logging and the cluster
    are set up, the input files are globbed from every directory listed in
    ``config["input_dirs"]`` (files matching ``*_rep*`` under
    ``config["dir"]["data"]``), and then each entry of ``config["run"]`` is
    executed in order.  Recognised stages are ``fastqc``, ``cutadapt``,
    ``novoalign``, ``htseq-count``, ``coverage``, ``deseq`` and ``dss``;
    any other stage name is silently ignored.

    Stages communicate through local state: ``curr_files`` holds the files
    the next stage works on, and ``deseq``/``dss`` reuse ``htseq_outputs``
    and ``column_names`` produced by ``htseq-count``.  Listing ``deseq`` or
    ``dss`` in ``config["run"]`` without an earlier ``htseq-count`` entry
    therefore fails with a ``NameError``.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # safe_load instead of load: the config is plain data, and
        # yaml.load without an explicit Loader can construct arbitrary
        # objects (and is rejected outright by recent PyYAML releases).
        config = yaml.safe_load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)

    data_dir = config["dir"]["data"]
    from bipy.cluster import view
    input_files = [glob.glob(os.path.join(data_dir, x, "*_rep*")) for x in
                   config["input_dirs"]]
    input_files = list(flatten(input_files))
    logger.info("Input files to process: %s" % (input_files))
    results_dir = config["dir"]["results"]

    # explicit loop: a bare map() is lazy on Python 3 and would never
    # actually create the directories
    for dirname in config["dir"].values():
        safe_makedir(dirname)

    curr_files = input_files

    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            # quality reports only; curr_files is left untouched
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)

        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run,
                                        curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs

        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            #db = novoindex.run(config["ref"],
            #                   _get_stage_config(config, "novoindex"),
            #                   config)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            picard = BroadRunner(config["program"]["picard"])
            # product() pairs the single runner with every file; zip(*...)
            # turns those pairs into the parallel argument lists view.map
            # expects
            args = zip(*itertools.product([picard], novoalign_outputs))
            # convert to bam
            bamfiles = view.map(picardrun.picard_formatconverter,
                                *args)
            args = zip(*itertools.product([picard], bamfiles))
            # sort bam
            sorted_bf = view.map(picardrun.picard_sort, *args)
            # index bam
            args = zip(*itertools.product([picard], sorted_bf))
            view.map(picardrun.picard_index, *args)
            # NOTE(review): downstream stages receive the raw novoalign
            # outputs, not the sorted BAM files -- confirm this is intended
            curr_files = novoalign_outputs

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s" %(curr_files))
            # no counting job is launched here: the current files are
            # passed straight to combine_counts as the per-sample counts
            htseq_outputs = curr_files
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)
            rpkm = htseq_count.calculate_rpkm(combined_out,
                                              config["annotation"]["file"])
            rpkm_file = os.path.join(config["dir"]["results"], stage,
                                     "rpkm.txt")
            rpkm.to_csv(rpkm_file, sep="\t")

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            # one "<basename>.metrics" file per input, inside out_dir
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)

        if stage == "deseq":
            # the condition of a sample is the part of its file name
            # before the first underscore
            conditions = [os.path.basename(x).split("_")[0] for x in
                          input_files]
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this
                # comparison
                indexes = [x for x, y in enumerate(conditions) if
                           y in comparison]
                # find the htseq_files to combine and combine them
                # (htseq_outputs / column_names come from "htseq-count")
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                logger.info(htseq_files)
                logger.info(htseq_columns)
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_prefix = os.path.join(out_dir, comparison_name)

                # single-element lists: one job per comparison on the view
                deseq_out = view.map(deseq.run, [combined_out],
                                     [deseq_conds], [deseq_prefix])
                logger.info("Annotating %s." % (deseq_out))
                annotated_file = view.map(annotate.annotate_table_with_biomart,
                                          deseq_out,
                                          ["id"],
                                          ["ensembl_gene_id"],
                                          ["human"])

        if stage == "dss":
            # the condition of a sample is the part of its file name
            # before the first underscore
            conditions = [os.path.basename(x).split("_")[0] for x in
                          input_files]
            dss_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in dss_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this
                # comparison
                indexes = [x for x, y in enumerate(conditions) if
                           y in comparison]
                # find the htseq_files to combine and combine them
                # (htseq_outputs / column_names come from "htseq-count")
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                dss_conds = [conditions[index] for index in indexes]
                dss_prefix = os.path.join(out_dir, comparison_name)
                logger.info("Running DSS on %s with conditions %s and comparison %s." % (combined_out, dss_conds, comparison))

                # run locally rather than through the cluster view
                dss_out = dss.run(combined_out, dss_conds, comparison,
                                  dss_prefix)

    stop_cluster()
Ejemplo n.º 12
0
def main(config_file):
    """Run the pipeline stages listed in a YAML configuration file.

    The configuration is read from ``config_file``; logging and the cluster
    are set up, every directory in ``config["dir"]`` is created, and then
    each entry of ``config["run"]`` is executed in order, starting from the
    files in ``config["input"]``.  Recognised stages are ``fastqc``,
    ``trim``, ``tagdust``, ``filter_length``, ``count_ends``, ``tophat``,
    ``novoalign``, ``new_coverage``, ``coverage``, ``htseq-count``,
    ``bedtools_intersect`` and ``piranha``; any other stage name is
    silently ignored.

    Stages communicate through local state, so their order matters:

    * ``curr_files`` holds the files the next stage works on.
    * ``filter_length`` reads ``tagdust_outputs`` (set by ``tagdust``).
    * ``coverage`` and ``bedtools_intersect`` read ``sorted_bf`` and
      ``htseq-count`` reads ``aligned_outputs`` (both set by
      ``novoalign``).

    Running a stage before the one that produces its inputs fails with a
    ``NameError``.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # safe_load instead of load: the config is plain data, and
        # yaml.load without an explicit Loader can construct arbitrary
        # objects (and is rejected outright by recent PyYAML releases).
        config = yaml.safe_load(in_handle)
    setup_logging(config)
    # imported after setup_logging(), matching the other pipeline entry
    # points in this file
    from bipy.log import logger
    start_cluster(config)

    # after the cluster is up, import the view onto it
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories (explicit loop: a bare map() is lazy on
    # Python 3 and would never actually create them)
    for dirname in config["dir"].values():
        safe_makedir(dirname)

    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        " from %s" % (str(curr_files)))
            nlen = len(curr_files)
            # minimum read length to keep; configurable, defaults to 20
            min_length = str(config["stage"][stage].get("min_length", 20))

            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [
                append_stem(os.path.basename(x), "trim") for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(sickle.run, curr_files, ["se"] * nlen,
                                 ["sanger"] * nlen, [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            # NOTE: this rebinds input_files, which the htseq-count stage
            # later uses to derive its column names
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            # only the first element of each tagdust result is carried on
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate (both bounds are exclusive)
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work:
            # ipython can't accept closures as an argument to view.map(),
            # so the filtering below runs serially in this process
            out_files = [
                append_stem(os.path.basename(input_file[0]), "filt")
                for input_file in tagdust_outputs
            ]
            out_dir = os.path.join(config["dir"]["results"], "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]

            filtered_fastq = [
                filter_seqio(x[0], length_filter, y, "fastq")
                for x, y in zip(tagdust_outputs, out_files)
            ]

            curr_files = filtered_fastq

        if stage == "count_ends":
            # reduce is a builtin only on Python 2; functools.reduce is
            # available on both Python 2 and Python 3
            from functools import reduce

            logger.info("Compiling nucleotide counts at 3' and 5' ends.")

            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                """ writes a tab-separated table of per-file end counts
                (as fractions plus a "total" column) and returns its path;
                an existing table is left untouched """
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                # if the count_file already exists, skip
                if os.path.exists(count_file):
                    return count_file
                # one {key: count} dict per fastq file
                counts = [
                    reduce(count_ends,
                           apply_seqio(x, end_function, kind="fastq"), {})
                    for x in curr_files
                ]
                df = pd.DataFrame(counts,
                                  index=[_short_name(x) for x in curr_files])
                df = df.astype(float)
                total = df.sum(axis=1)
                # convert counts to per-file fractions, keep the raw total
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")
                # return the path on this branch too, so both branches agree
                return count_file

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            # no pair file is passed to tophat.align
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align, curr_files,
                                    [pair_file] * nlen, [ref_file] * nlen,
                                    [out_base] * nlen, [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles), bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and setting
            # them to read only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)

            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            # one "<basename>.metrics" file per input, inside out_dir
            out_files = [
                replace_suffix(os.path.basename(x), "metrics")
                for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s" %
                        (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [
                os.path.join(out_dir, os.path.basename(x)) for x in out_files
            ]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf), out_files)

        if stage == "htseq-count":
            # size the replicated arguments from the sequence actually
            # being mapped over, not from curr_files
            nfiles = len(aligned_outputs)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs, [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names, out_file)

        if stage == "bedtools_intersect":
            # a missing "bed" entry means there is nothing to intersect
            # (previously this crashed while iterating over None)
            bedfiles = config["stage"]["bedtools_intersect"].get("bed") or []
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase = os.path.splitext(bedfile)[0]
                # <out_dir>/<bam name without suffix>_vs_<bed name>.bam
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [
                    os.path.join(out_dir, os.path.basename(x))
                    for x in out_files
                ]
                out_files = [
                    "_vs_".join([x, os.path.basename(bedbase)])
                    for x in out_files
                ]
                out_files = [".".join([x, "bam"]) for x in out_files]
                # explicit loops: these calls are made for their side
                # effects, and a bare map() would never run on Python 3
                for bam_file, out_file in zip(sorted_bf, out_files):
                    bedtools.intersectbam2bed(bam_file, bedfile, False,
                                              out_file)
                count_files = [replace_suffix(x, "stats") for x in out_files]
                for bam_file, out_file, count_file in zip(sorted_bf,
                                                          out_files,
                                                          count_files):
                    write_ratios(bam_file, out_file, count_file)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)

    stop_cluster()