Example #1
def run(in_file, ref, blastn_config, config):
    logger.info("Preparing the reference file for %s." % (ref.get("name")))
    ref_file = prepare_ref_file(ref, config)
    logger.info("Preparing the blast database for %s." % (ref.get("name")))
    blast_db = prepare_blast_db(ref_file, "nucl")
    logger.info("Blasting %s against %s." % (in_file, ref.get("name")))

    results_dir = build_results_dir(blastn_config, config)
    utils.safe_makedir(results_dir)

    out_file = os.path.join(results_dir,
                            replace_suffix(os.path.basename(in_file),
                                           ref.get("name") + "hits.tsv"))
    tmp_out = out_file + ".tmp"

    blast_results = blast_search(in_file, blast_db, tmp_out)
    #logger.info("Filtering results for at least %f percent of the "
    #            "sequences covered." %(0.5*100))
    #filtered_results = filter_results_by_length(blast_results, 0.5)
    #logger.info("Filtered output file here: %s" %(filtered_results))
    with open(blast_results) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        with open(out_file, "w") as out_handle:
            writer = csv.writer(out_handle, delimiter="\t")
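            # HEADER_FIELDS (defined elsewhere in this module) is presumably
            # a space-separated string of BLAST tabular column names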
            writer.writerow(HEADER_FIELDS.split(" "))
            for line in reader:
                writer.writerow(line)

    return out_file
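All of these examples lean on bipy's replace_suffix helper. Its implementation is not shown on this page, but from the call sites it evidently swaps the tail of a filename for a new suffix. A minimal sketch of the assumed behavior (a stand-in, not the bipy source):

import os

def replace_suffix(path, suffix):
    # assumed behavior: replace_suffix("sample.sam", "bam") -> "sample.bam"
    # a leading dot on the suffix (as in ".counts" below) is tolerated
    base, _ = os.path.splitext(path)
    return base + "." + suffix.lstrip(".")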
Example #3
def _build_output_prefix(input_file, jellyfish_config, config):
    out_dir = build_results_dir(jellyfish_config, config)
    out_prefix = os.path.join(out_dir, replace_suffix(input_file,
                                                      "count"))
    #out_prefix = "_".join([jellyfish_config["name"],
    #                       remove_suffix(input_file)])
    return out_prefix
Example #4
def _sam_to_bam(in_file):
    import sh
    from bipy.utils import replace_suffix
    from bcbio.utils import file_exists
    bam_file = replace_suffix(in_file, "bam")
    if file_exists(bam_file):
        return bam_file
    sh.samtools.view("-Sb", in_file, "-o", bam_file)
    return bam_file
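The sh library used in these snippets resolves commands dynamically: sh.samtools finds the samtools binary on PATH, attribute access like sh.samtools.view adds "view" as the leading argument, and bake() pre-binds arguments for reuse. A small illustration (assumes samtools is installed; the file names are placeholders):

import sh

# equivalent to the shell command: samtools view -Sb in.sam -o out.bam
sh.samtools.view("-Sb", "in.sam", "-o", "out.bam")

# bake() pre-binds flags, as the file_transaction examples below do
sam_to_bam = sh.samtools.view.bake(S=True, b=True)
sam_to_bam("in.sam", o="out.bam")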
Example #5
def sam2bam(in_file, out_file=None):
    """ convert a SAM file to a BAM file """
    if out_file is None:
        out_file = replace_suffix(in_file, "bam")

    if file_exists(out_file):
        return out_file

    sh.samtools.view("-Sb", in_file, "-o", out_file)
    return out_file
Example #6
File: sam.py Project: roryk/bipy
def bam_name_sort(in_file, out_prefix=None):
    """ sort a bam file by read name """
    if out_prefix is None:
        out_prefix = replace_suffix(in_file, "name_sorted")
    out_file = out_prefix + ".bam"
    if file_exists(out_file):
        return out_file

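    # samtools sort (pre-1.0 CLI) takes an output *prefix* and appends
    # ".bam" itself, which is why out_prefix rather than out_file is passed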
    sh.samtools.sort("-n", in_file, out_prefix)
    return out_file
Example #7
def bam_name_sort(in_file, out_prefix=None):
    """ sort a bam file by read name """
    if out_prefix is None:
        out_prefix = replace_suffix(in_file, "name_sorted")
    out_file = out_prefix + ".bam"
    if file_exists(out_file):
        return out_file

    sh.samtools.sort("-n", in_file, out_prefix)
    return out_file
Example #8
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for thesis pipeline
    in_dir = config["dir"]["data"]
    id_file = config["id_file"]
    curr_files = input_files_from_dir(in_dir, id_file)
    logger.info("Running pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = fastqc.FastQC(config)
            view.map(stage_runner, curr_files, block=False)

        if stage == "cutadapt":
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = trim.Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "bowtie":
            logger.info("Running bowtie on %s." % (curr_files))
            bowtie = Bowtie(config)
            curr_files = view.map(bowtie, curr_files)
            mapped = view.map(sam.only_mapped, curr_files)
            unmapped = view.map(sam.only_unmapped, curr_files)
            curr_files = mapped
            bam_files = view.map(sam.sam2bam, mapped)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)


        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)

    stop_cluster()
Example #9
def bamsort(in_file, out_prefix=None):
    """ sort a BAM file """
    if out_prefix is None:
        out_prefix = replace_suffix(in_file, "sorted")

    out_file = out_prefix + ".bam"

    if file_exists(out_file):
        return out_file

    sh.samtools.sort(in_file, out_prefix)
    return out_file
Example #10
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for thesis pipeline
    in_dir = config["dir"]["data"]
    id_file = config["id_file"]
    curr_files = input_files_from_dir(in_dir, id_file)
    logger.info("Running pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = fastqc.FastQC(config)
            view.map(stage_runner, curr_files, block=False)

        if stage == "cutadapt":
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = trim.Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "bowtie":
            logger.info("Running bowtie on %s." % (curr_files))
            bowtie = Bowtie(config)
            curr_files = view.map(bowtie, curr_files)
            mapped = view.map(sam.only_mapped, curr_files)
            unmapped = view.map(sam.only_unmapped, curr_files)
            curr_files = mapped
            bam_files = view.map(sam.sam2bam, mapped)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [
                replace_suffix(os.path.basename(x), "metrics")
                for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)

    stop_cluster()
Example #11
File: sam.py Project: roryk/bipy
def bamsort(in_file, out_prefix=None):
    """ sort a BAM file """
    if out_prefix is None:
        out_prefix = replace_suffix(in_file, "sorted")

    out_file = out_prefix + ".bam"

    if file_exists(out_file):
        return out_file

    sh.samtools.sort(in_file, out_prefix)
    return out_file
Example #12
File: sam.py Project: roryk/bipy
def bam2sam(in_file, out_file=None):
    """ convert a BAM file to a SAM file """
    if is_sam(in_file):
        return in_file

    if out_file is None:
        out_file = replace_suffix(in_file, "sam")

    if file_exists(out_file):
        return out_file

    with file_transaction(out_file) as tmp_out_file:
        cmd = sh.samtools.view.bake(h=True, _out=tmp_out_file)
        cmd(in_file)

    return out_file
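The file_transaction context manager here comes from bcbio and is what makes these conversions safe to interrupt: the block writes to a temporary path, and the finished file is only moved into place if the block exits cleanly. A rough sketch of the assumed contract (not bcbio's actual implementation):

import contextlib
import shutil

@contextlib.contextmanager
def file_transaction(out_file):
    # assumed behavior: hand the caller a temp path, promote it on success
    tmp_file = out_file + ".tx"
    yield tmp_file
    shutil.move(tmp_file, out_file)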
Example #13
def sam2bam(in_file, out_file=None):
    """ convert a SAM file to a BAM file. if the file is already a
    BAM file, return the BAM file name """

    if is_bam(in_file):
        return in_file

    if out_file is None:
        out_file = replace_suffix(in_file, "bam")

    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        sort_sam = sh.samtools.view.bake(S=True, b=True, o=tmp_out_file)
        sort_sam(in_file)
    return out_file
Example #16
def count_overlaps(in_file, bed, out_file=None):
    """ calculates coverage across the features in the bedfile
    bed """

    if not which("coverageBed"):
        logger.error("Cannot find coverageBed. Make sure it is in your "
                     "path or install bedtools.")
        exit(-1)

    if not out_file:
        out_file = replace_suffix(in_file, ".counts")

    if os.path.exists(out_file):
        return out_file

    cmd = ["coverageBed", "-abam", in_file, "-b", bed]

    with open(out_file, "w") as out_handle:
        subprocess.check_call(cmd, stdout=out_handle)
    return out_file
Example #17
def run(in_file, bin_size=30, covariate=None, out_file=None):
    """
    takes a sorted BAM input file and runs Piranha on it
    with the specified bin size

    """
    if not out_file:
        out_file = replace_suffix(in_file, "piranha.bed")

    if file_exists(out_file):
        return out_file

    if covariate and file_exists(covariate):
        print "%s, %s, %s, %s" % (in_file, covariate, str(bin_size), out_file)
        Piranha("-s", in_file, covariate, b=bin_size, o=out_file)
    else:
        print "%s, %s, %s" % (in_file, str(bin_size), out_file)
        Piranha("-s", in_file, b=bin_size, o=out_file)

    return out_file
Example #18
def run(in_file, bin_size=30, covariate=None, out_file=None):
    """
    takes a sorted BAM input file and runs Piranha on it
    with the specified bin size

    """
    if not out_file:
        out_file = replace_suffix(in_file, "piranha.bed")

    if file_exists(out_file):
        return out_file

    if covariate and file_exists(covariate):
        print "%s, %s, %s, %s" % (in_file, covariate, str(bin_size), out_file)
        sh.piranha("-s", in_file, covariate, b=bin_size, o=out_file)
    else:
        print "%s, %s, %s" % (in_file, str(bin_size), out_file)
        sh.piranha("-s", in_file, b=bin_size, o=out_file)

    return out_file
Example #20
def _get_outfilename(input_file):
    out_file = replace_suffix(os.path.basename(input_file), "counts")
    return out_file
Example #21
def chr_out(chrom):
    out_file = os.path.join(break_dir, append_stem(in_file, chrom))
    out_file = replace_suffix(out_file, "vcf")
    return out_file
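append_stem is another bipy naming helper; judging from this call site and the "trim"/"filt" uses further down, it tags the filename ahead of its extension. A guess at the behavior (hypothetical, not the bipy source):

import os

def append_stem(path, tag):
    # assumed behavior: append_stem("calls.vcf", "chr1") -> "calls_chr1.vcf"
    base, ext = os.path.splitext(path)
    return "%s_%s%s" % (base, tag, ext)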
Example #22
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for thesis pipeline
    input_dirs = config["input_dirs"]

    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input_files: %s" % (input_files))
    logger.info("Condition groups %s" % (conditions))
    htseq_outdict = {}

    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        config["dir"]["results"] = condition_dir

        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = zip(
                    *product(curr_files, [fastqc_config], [config]))
                view.map(fastqc.run, *fastqc_args)

            if stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(
                    *product(curr_files, [cutadapt_config], [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
                curr_files = cutadapt_outputs
                logger.info("Fixing mate pair information.")
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("Forward: %s" % (first))
                logger.info("Reverse: %s" % (second))
                fixed = view.map(fastq.fix_mate_pairs_with_config, first,
                                 second, [config] * len(first))
                curr_files = list(flatten(fixed))

            if stage == "sickle":
                _emit_stage_message(stage, curr_files)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                fixed = view.map(sickle.run_with_config, first, second,
                                 [config] * len(first))
                curr_files = list(flatten(fixed))

            if stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("first %s" % (first))
                logger.info("second %s" % (second))

                #tophat_args = zip(*product(first, second, [config["ref"]],
                #                           ["tophat"], [config]))
                tophat_outputs = view.map(tophat.run_with_config, first,
                                          second, [config["ref"]] * len(first),
                                          ["tophat"] * len(first),
                                          [config] * len(first))
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs

            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [
                    replace_suffix(os.path.basename(x), "metrics")
                    for x in curr_files
                ]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files, [ref] * nrun,
                                     [ribo] * nrun, out_files)

            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                            RPKM_count_out)
                """
                                annotate_args = zip(*product(RPKM_count_fixed,
                                             ["gene_id"],
                                             ["ensembl_gene_id"],
                                             ["human"]))
                view.map(annotate.annotate_table_with_biomart,
                         *annotate_args)
                         """
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs

    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)
    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir, comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f, None, out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            logger.info("Running deseq on %s with conditions %s "
                        "and writing ot %s" %
                        (combined_out, conditions, deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            annotate.annotate_table_with_biomart(deseq_out[0], "id",
                                                 "ensembl_gene_id", "human")
            #annotated_file = view.map(annotate.annotate_table_with_biomart,
            #                          [deseq_out],
            #                          ["id"],
            #                          ["ensembl_gene_id"],
            #                          ["human"])

    # end gracefully
    stop_cluster()
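The zip(*product(...)) construct that appears throughout these pipelines is a way of fanning constant arguments out across a list of files so that everything can be passed to view.map positionally. A quick demonstration:

from itertools import product

files = ["a.fastq", "b.fastq"]
fastqc_args = zip(*product(files, ["fastqc_config"], ["config"]))
# fastqc_args == [("a.fastq", "b.fastq"),
#                 ("fastqc_config", "fastqc_config"),
#                 ("config", "config")]
# so view.map(fastqc.run, *fastqc_args) calls
# fastqc.run(f, fastqc_config, config) once per file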
Example #23
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for thesis pipeline
    input_dirs = config["input_dirs"]

    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input_files: %s" % (input_files))
    logger.info("Condition groups %s" %(conditions))
    htseq_outdict = {}

    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        config["dir"]["results"] = condition_dir

        for stage in config["run"]:
            if stage == "fastqc":
                logger.info("Running fastqc on %s." % (curr_files))
                stage_runner = FastQC(config)
                view.map(stage_runner, curr_files)

            if stage == "cutadapt":
                logger.info("Running cutadapt on %s." % (curr_files))
                stage_runner = Cutadapt(config)
                curr_files = view.map(stage_runner, curr_files)

            if stage == "tophat":
                logger.info("Running tophat on %s." % (curr_files))
                stage_runner = Tophat(config)
                tophat_outputs = view.map(stage_runner, curr_files)
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs

            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x),
                                            "metrics") for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)

            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                            RPKM_count_out)
                """
                                annotate_args = zip(*product(RPKM_count_fixed,
                                             ["gene_id"],
                                             ["ensembl_gene_id"],
                                             ["human"]))
                view.map(annotate.annotate_table_with_biomart,
                         *annotate_args)
                         """
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs

    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)
    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir, comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f,
                                                      None,
                                                      out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            logger.info("Running deseq on %s with conditions %s "
                        "and writing ot %s" % (combined_out,
                                               conditions,
                                               deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            annotate.annotate_table_with_biomart(deseq_out[0],
                                                 "id",
                                                 "ensembl_gene_id",
                                                 "human")

    # end gracefully
    stop_cluster()
Example #24
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)

    from bipy.cluster import view
    #    view.push({'logger': logger})

    input_files = [
        os.path.join(config["dir"]["data"], x) for x in config["input"]
    ]
    results_dir = config["dir"]["results"]

    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)

        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run, curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs

        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            #db = novoindex.run(config["ref"],
            #                   _get_stage_config(config, "novoindex"),
            #                   config)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            picard = BroadRunner(config["program"]["picard"])
            args = zip(*itertools.product([picard], novoalign_outputs))
            # convert to BAM
            bamfiles = view.map(picardrun.picard_formatconverter, *args)
            args = zip(*itertools.product([picard], bamfiles))
            # sort bam
            sorted_bf = view.map(picardrun.picard_sort, *args)
            # index bam
            args = zip(*itertools.product([picard], sorted_bf))
            view.map(picardrun.picard_index, *args)
            curr_files = novoalign_outputs

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config, curr_files,
                                     [config] * nfiles, [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names, out_file)
            rpkm = htseq_count.calculate_rpkm(combined_out,
                                              config["annotation"]["file"])
            rpkm_file = os.path.join(config["dir"]["results"], stage,
                                     "rpkm.txt")
            rpkm.to_csv(rpkm_file, sep="\t")

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [
                replace_suffix(os.path.basename(x), "metrics")
                for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)

        if stage == "deseq":
            conditions = [
                os.path.basename(x).split("_")[0] for x in input_files
            ]
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this comparison
                indexes = [
                    x for x, y in enumerate(conditions) if y in comparison
                ]
                # find the htseq_files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                logger.info(htseq_files)
                logger.info(htseq_columns)
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(
                    htseq_files, htseq_columns, out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_prefix = os.path.join(out_dir, comparison_name)

                deseq_out = view.map(deseq.run, [combined_out], [deseq_conds],
                                     [deseq_prefix])
                logger.info("Annotating %s." % (deseq_out))
                annotated_file = view.map(annotate.annotate_table_with_biomart,
                                          deseq_out, ["id"],
                                          ["ensembl_gene_id"], ["human"])

    stop_cluster()
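Each of these main() drivers is steered by a YAML config with the same rough shape. Pieced together from the keys this example reads (the values are placeholders, not from the original project):

import yaml

config = yaml.load("""\
dir:
  data: data
  results: results
input: [sample1.fastq, sample2.fastq]
run: [fastqc, cutadapt, novoalign, htseq-count, coverage, deseq]
genome:
  file: genome.fa
annotation:
  file: genes.gtf
program:
  picard: /path/to/picard
stage:
  coverage:
    ref: {name: hg19}
    ribo: ribosomal.interval_list
  deseq:
    comparisons: [[control, treated]]
""")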
Example #25
def main(config_file):
    """ this assumes that we are keeping the same order of the files
    throughout """
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    input_dict = config["input"]
    curr_files = _make_current_files(input_dict.keys())
    input_meta = input_dict.values()

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = zip(*product(curr_files, [fastqc_config], [config]))
            view.map(fastqc.run, *fastqc_args)

        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = zip(
                *product(curr_files, [cutadapt_config], [config]))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                       ["tophat"], [config]))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      None,
                                                      out_file=combined_out)

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"],
                                         ["ensembl_transcript_id"], ["mouse"]))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            view.map(rseqc.RPKM_saturation, *RPKM_args)

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [
                replace_suffix(os.path.basename(x), "metrics")
                for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)

        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for test in deseq_config["tests"]:
                indexes = [
                    _find_file_index_for_test(input_meta, condition)
                    for condition in test
                ]
                files = [htseq_outputs[x] for x in indexes]
                conditions = [input_meta[x]["condition"] for x in indexes]
                combined_out = os.path.join(
                    out_dir, "_".join(conditions) + "_combined.counts")
                logger.info("Combining %s to %s." % (files, combined_out))
                count_file = htseq_count.combine_counts(files,
                                                        None,
                                                        out_file=combined_out)
                out_file = os.path.join(out_dir,
                                        "_".join(conditions) + "_deseq.txt")
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" %
                            (count_file, conditions, out_file))
                view.map(deseq.run, [count_file], [conditions], [out_file])
                #deseq.run(count_file, conditions, out_file=out_file)

    # end gracefully
    stop_cluster()
Example #26
def _build_output_file(input_file, config):
    safe_makedir(config["dir"]["ref"])
    return os.path.join(config["dir"]["ref"],
                        os.path.basename(replace_suffix(input_file, "nix")))
Example #29
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for thesis pipeline
    in_dir = config["dir"]["data"]
    curr_files = input_files_from_dir(in_dir)

    for stage in config["run"]:
        if stage == "fastqc":
            stage_runner = fastqc.FastQCStage(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            stage_runner = trim.Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_outputs = view.map(tophat.run_with_config,
                                      curr_files, [None] * len(curr_files),
                                      [config["ref"]] * len(curr_files),
                                      ["tophat"] * len(curr_files),
                                      [config] * len(curr_files))
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            """
i                                annotate_args = zip(*product(RPKM_count_fixed,
                                             ["gene_id"],
                                             ["ensembl_gene_id"],
                                             ["human"]))
                view.map(annotate.annotate_table_with_biomart,
                         *annotate_args)
                         """
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs
Example #30
def main(config_file):

    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)

    # after the cluster is up, import the view
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        " from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))

            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [
                append_stem(os.path.basename(x), "trim") for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic number of 10, the length of the
            # minimum read to keep
            out_files = view.map(sickle.run, curr_files, ["se"] * nlen,
                                 ["sanger"] * nlen, [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map()
            """
            filtered_fastq = view.map(filter_seqio, tagdust_outputs,
                                      [lf] * len(tagdust_outputs),
                                      ["filt"] * len(tagdust_outputs),
                                      ["fastq"] * len(tagdust_outputs))"""
            out_files = [
                append_stem(os.path.basename(input_file[0]), "filt")
                for input_file in tagdust_outputs
            ]
            out_dir = os.path.join(config["dir"]["results"], "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]

            filtered_fastq = [
                filter_seqio(x[0], length_filter, y, "fastq")
                for x, y in zip(tagdust_outputs, out_files)
            ]

            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")

            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x
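            # e.g. reduce(count_ends, "AACG", {}) -> {'A': 2, 'C': 1, 'G': 1}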

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab file of the counts at the end
                # of the fastq files
                counts = [
                    reduce(count_ends,
                           apply_seqio(x, end_function, kind="fastq"), {})
                    for x in curr_files
                ]
                df = pd.DataFrame(counts, index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            config = config
            tophat_files = view.map(tophat.align, curr_files,
                                    [pair_file] * nlen, [ref_file] * nlen,
                                    [out_base] * nlen, [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles), bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and set
            # them to read only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)

            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [
                replace_suffix(os.path.basename(x), "metrics")
                for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s" %
                        (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [
                os.path.join(out_dir, os.path.basename(x)) for x in out_files
            ]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf), out_files)

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs, [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names, out_file)
        if stage == "bedtools_intersect":
            bedfiles = config["stage"]["bedtools_intersect"].get("bed", None)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [
                    os.path.join(out_dir, os.path.basename(x))
                    for x in out_files
                ]
                out_files = [
                    "_vs_".join([x, os.path.basename(bedbase)])
                    for x in out_files
                ]
                out_files = [".".join([x, "bam"]) for x in out_files]
                test_out = map(bedtools.intersectbam2bed, sorted_bf,
                               [bedfile] * len(sorted_bf),
                               [False] * len(sorted_bf), out_files)
                count_files = [replace_suffix(x, "stats") for x in out_files]
                map(write_ratios, sorted_bf, out_files, count_files)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)

    stop_cluster()
Example #31
File: rseqc.py Project: roryk/bipy
def out_file(self, in_file):
    results_dir = self.config["dir"].get("results", "results")
    out_dir = os.path.join(results_dir, self.stage)
    safe_makedir(out_dir)
    out_file = replace_suffix(os.path.basename(in_file), "metrics")
    return os.path.join(out_dir, out_file)
Example #32
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    stage_dict = {"download_encode": _download_encode,
                  "fastqc": _run_fastqc}

    curr_files = config["encode_file"]

    results_dir = config["dir"].get("results", "results")

    for cell_type in config["cell_types"]:
        cell_type_dir = os.path.join(results_dir, cell_type)
        safe_makedir(cell_type_dir)
        config["dir"]["results"] = cell_type_dir
        in_files = glob.glob(os.path.join(config["dir"]["data"],
                                          cell_type, "*"))
        curr_files = in_files
        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = zip(*product(curr_files, [fastqc_config],
                                           [config]))
                view.map(fastqc.run, *fastqc_args)

            if stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                             [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
                curr_files = cutadapt_outputs

            if stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                           ["tophat"], [config]))
                tophat_outputs = view.map(tophat.run_with_config, *tophat_args)

                picard = BroadRunner(config["program"]["picard"])
                # convert to bam
                #args = zip(*product([picard], tophat_outputs))
                #bamfiles = view.map(picardrun.picard_formatconverter,
                #                    *args)
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                sorted_bf = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, sorted_bf)
                curr_files = sorted_bf

            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam2bigwig, *rseq_args, block=False)
                view.map(rseqc.bam_stat, *rseq_args, block=False)
                view.map(rseqc.clipping_profile, *rseq_args, block=False)
                view.map(rseqc.genebody_coverage, *rseq_args, block=False)
                view.map(rseqc.junction_annotation, *rseq_args, block=False)
                view.map(rseqc.junction_saturation, *rseq_args, block=False)
                RPKM_count_files = view.map(rseqc.RPKM_count,
                                            *rseq_args)
                dirs_to_process = list(set(map(os.path.dirname,
                                               RPKM_count_files)))
                logger.info("Count files: %s" % (RPKM_count_files))
                logger.info("dirnames to process: %s" % (dirs_to_process))
                RPKM_merged = view.map(rseqc.merge_RPKM, dirs_to_process)

                view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
                curr_files = tophat_outputs

            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                column_names = in_files
                out_file = os.path.join(config["dir"]["results"], stage,
                                        cell_type + ".combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)
                rpkm = htseq_count.calculate_rpkm(combined_out,
                                                  config["annotation"]["file"])
                rpkm_file = os.path.join(config["dir"]["results"], stage,
                                         cell_type + ".rpkm.txt")
                rpkm.to_csv(rpkm_file, sep="\t")

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x),
                                            "metrics") for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)

    # end gracefully, wait for jobs to finish, then exit
    view.wait()
    stop_cluster()
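The zip(*product(...)) idiom above transposes a cross product into the parallel argument lists that view.map expects; a standalone illustration with invented file names and config:

from itertools import product

curr_files = ["a.fq", "b.fq"]
config = {"cores": 1}
args = zip(*product(curr_files, [config]))
# args == [("a.fq", "b.fq"), ({"cores": 1}, {"cores": 1})]
# so view.map(fastqc.run, *args) calls fastqc.run("a.fq", config) and
# fastqc.run("b.fq", config) in parallel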
Example #33
0
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    dirs = config["in_dir"]
    conditions = config["conditions"]
    glob_string = config["glob_string"]

    files = list(flatten([glob.glob(os.path.join(x, glob_string)) for x in dirs]))
    out_dir = config["dir"]["results"]
    safe_makedir(out_dir)

    curr_files = []
    for condition in conditions:
        condition_files = [x for x in files if condition in x]
        out_file = os.path.join(out_dir, condition + "_v2_v3.bam")
        print "Combining %s into %s." % (condition_files, out_file)
        sh.samtools.merge(list(flatten([out_file, condition_files])))
        # bsub_call = list(flatten(["-q", "hsph", "-o", "out" + condition,
        #                           "-e", "err" + condition, "samtools",
        #                           "merge", out_file, condition_files]))
        # sh.bsub(bsub_call)
        sorted_prefix = remove_suffix(out_file) + ".sorted"
        sorted_file = sorted_prefix + ".bam"
        sh.samtools.sort(out_file, sorted_prefix)
        sh.samtools.index(sorted_file)
        mapped_file = append_stem(sorted_file, "mapped")
        sh.samtools.view(sorted_file, F=4, b=True, o=mapped_file)
        sh.samtools.index(mapped_file)

        # find the reads that don't intersect with the rrna
        in_file = mapped_file
        out_file = os.path.join(out_dir, condition + "_noribo" + "_v2_v3.bam")
        ribo = config["ribo"]
        print "Filtering %s for rRNA in %s into %s." % (in_file, ribo, out_file)
        sh.bedtools.intersect("-abam", in_file, "-v", "-b", ribo, _out=out_file)
        filtered_file = out_file

        print "Calculating RNASeq metrics on %s." % (out_file)
        in_file = out_file
        ref = blastn.prepare_ref_file(config["stage"]["new_coverage"]["ref"],
                                      config)
        ribo = config["stage"]["new_coverage"]["ribo"]
        picard = BroadRunner(config["program"]["picard"])
        out_dir = os.path.join(config["dir"]["results"], "new_coverage")
        safe_makedir(out_dir)
        out_file = replace_suffix(os.path.basename(in_file), "metrics")
        out_file = os.path.join(out_dir, out_file)
        metrics_file = picardrun.picard_rnaseq_metrics(picard, in_file, ref,
                                                       ribo, out_file)

        jelly_dir = os.path.join(config["dir"]["results"], "jellyfish")
        safe_makedir(jelly_dir)
        # convert the filtered file to fastq for jellyfish counting
        fastq_file = os.path.join(jelly_dir,
                                  os.path.basename(replace_suffix(filtered_file,
                                                                  "fastq")))
        sh.bam2fastx(filtered_file, fastq=True, _out=fastq_file)
        for mer in config["stage"]["jellyfish"]["mer_lengths"]:
            base, _ = os.path.splitext(os.path.basename(fastq_file))
            out_prefix = base + "_%dmer" % (mer)
            out_file = os.path.join(jelly_dir, out_prefix)
            if not file_exists(out_file):
                sh.jellyfish.count(fastq_file,
                                   config["stage"]["jellyfish"]["options"],
                                   m=mer, o=out_file)
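Example #33 drives samtools, bedtools and jellyfish through the sh module, which maps attribute access to commands, turns keyword arguments into flags, and redirects stdout via _out. A hedged illustration of the convention (the option order sh actually emits may differ):

import sh

# sh.samtools.view(sorted_file, F=4, b=True, o=mapped_file)
# runs roughly: samtools view -F 4 -b -o <mapped_file> <sorted_file>
# sh.bedtools.intersect("-abam", in_file, "-v", "-b", ribo, _out=out_file)
# runs: bedtools intersect -abam <in_file> -v -b <ribo>
# with stdout written to <out_file>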
Example #34
0
def main(config_file):
    """ this assumes that we are keeping the same order of the files
    throughout """
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    input_dir = config["input_dir"]
    results_dir = config["dir"].get("results", "results")
    input_files = glob.glob(os.path.join(input_dir, "*.fq"))
    curr_files = _make_current_files(input_files)
    conditions = [os.path.basename(x).split("_")[0] for x in input_files]

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = zip(*product(curr_files, [fastqc_config],
                                       [config]))
            fastqc_out = view.map(fastqc.run, *fastqc_args)
            logger.info("fastqc outfiles: %s" % (fastqc_out))

        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                         [config]))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                       ["tophat"], [config]))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            # convert to bam, sort and index
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            sorted_bf = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, sorted_bf)
            curr_files = sorted_bf

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam2bigwig, *rseq_args, block=False)
            view.map(rseqc.bam_stat, *rseq_args, block=False)
            view.map(rseqc.clipping_profile, *rseq_args, block=False)
            view.map(rseqc.genebody_coverage, *rseq_args, block=False)
            view.map(rseqc.junction_annotation, *rseq_args, block=False)
            view.map(rseqc.junction_saturation, *rseq_args, block=False)
            view.map(rseqc.RPKM_count, *rseq_args, block=False)
            view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
            curr_files = tophat_outputs

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)

        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)

        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage,
                                       comparison_name)
                safe_makedir(out_dir)
                indexes = [x for x, y in enumerate(conditions)
                           if y in comparison]
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [conditions[index] for index in indexes]
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_out = os.path.join(out_dir,
                                         comparison_name + ".deseq.txt")
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (combined_out,
                                                   conditions,
                                                   deseq_out))
                view.map(deseq.run, [combined_out], [deseq_conds], [deseq_out])
                annotated_file = view.map(annotate.annotate_table_with_biomart,
                                          [deseq_out],
                                          ["id"],
                                          ["ensembl_gene_id"],
                                          ["zebrafish"])


    # end gracefully
    stop_cluster()
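The deseq stage selects the count files for each comparison by matching conditions to file indexes; a toy walk-through with invented conditions:

conditions = ["wt", "mut", "het", "wt"]
comparison = ["wt", "mut"]
indexes = [x for x, y in enumerate(conditions) if y in comparison]
# indexes == [0, 1, 3]; htseq_files and htseq_columns are then taken at
# these indexes before combine_counts and deseq.run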
Example #35
0
def main(config_file):
    """ this assumes that we are keeping the same order of the files
    throughout """
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    input_dict = config["input"]
    curr_files = _make_current_files(input_dict.keys())
    input_meta = input_dict.values()

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = zip(*product(curr_files, [fastqc_config],
                                       [config]))
            view.map(fastqc.run, *fastqc_args)

        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                         [config]))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                       ["tophat"], [config]))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs


        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            annotate_args = zip(*product(RPKM_count_fixed,
                                         ["gene_id"],
                                         ["ensembl_transcript_id"],
                                         ["mouse"]))
            view.map(annotate.annotate_table_with_biomart,
                     *annotate_args)
            view.map(rseqc.RPKM_saturation, *RPKM_args)

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)

        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for test in deseq_config["tests"]:
                indexes = [_find_file_index_for_test(input_meta, condition)
                           for condition in test]
                files = [htseq_outputs[x] for x in indexes]
                conditions = [input_meta[x]["condition"] for x in indexes]
                combined_out = os.path.join(out_dir,
                                            "_".join(conditions) +
                                            "_combined.counts")
                logger.info("Combining %s to %s." % (files, combined_out))
                count_file = htseq_count.combine_counts(files, None,
                                                        out_file=combined_out)
                out_file = os.path.join(out_dir, "_".join(conditions) +
                                        "_deseq.txt")
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (count_file,
                                                   conditions,
                                                   out_file))
                view.map(deseq.run, [count_file], [conditions], [out_file])
                #deseq.run(count_file, conditions, out_file=out_file)

    # end gracefully
    stop_cluster()
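Example #35 reads its inputs from a file-to-metadata mapping in the config; a plausible shape after yaml.load, inferred from the .keys()/.values() calls and the "condition" lookups (sample names and paths invented):

input_dict = {
    "data/sample1.fq": {"condition": "control"},
    "data/sample2.fq": {"condition": "treated"},
}
curr_files = _make_current_files(input_dict.keys())  # the fastq paths
input_meta = input_dict.values()  # one metadata dict per file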
Example #36
0
 def setUp(self):
     with open(CONFIG_FILE) as in_handle:
         self.config = yaml.load(in_handle)
     self.input_files = self.config["input"]
     self.db = os.path.basename(replace_suffix(self.config["ref"], "nix"))
     self.db = os.path.join(self.config["dir"]["ref"], self.db)
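A hedged note on this setUp: the "nix" suffix is novoalign's index format, so the tests expect a prebuilt index alongside the reference:

# e.g. config["ref"] == "data/genome.fa" and config["dir"]["ref"] ==
# "ref" (both illustrative) give self.db == "ref/genome.nix"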
Example #37
0
def main(config_file):
    """ this assumes that we are keeping the same order of the files
    throughout """
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    input_dir = config["input_dir"]
    results_dir = config["dir"].get("results", "results")
    input_files = glob.glob(os.path.join(input_dir, "*.fq"))
    curr_files = _make_current_files(input_files)
    conditions = [os.path.basename(x).split("_")[0] for x in input_files]

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = zip(*product(curr_files, [fastqc_config], [config]))
            fastqc_out = view.map(fastqc.run, *fastqc_args)
            logger.info("fastqc outfiles: %s" % (fastqc_out))

        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = zip(
                *product(curr_files, [cutadapt_config], [config]))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                       ["tophat"], [config]))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            # convert to bam, sort and index
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            sorted_bf = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, sorted_bf)
            curr_files = sorted_bf

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam2bigwig, *rseq_args, block=False)
            view.map(rseqc.bam_stat, *rseq_args, block=False)
            view.map(rseqc.clipping_profile, *rseq_args, block=False)
            view.map(rseqc.genebody_coverage, *rseq_args, block=False)
            view.map(rseqc.junction_annotation, *rseq_args, block=False)
            view.map(rseqc.junction_saturation, *rseq_args, block=False)
            view.map(rseqc.RPKM_count, *rseq_args, block=False)
            view.map(rseqc.RPKM_saturation, *rseq_args, block=False)
            curr_files = tophat_outputs

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [
                replace_suffix(os.path.basename(x), "metrics")
                for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)

        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      None,
                                                      out_file=combined_out)

        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                indexes = [
                    x for x, y in enumerate(conditions) if y in comparison
                ]
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [conditions[index] for index in indexes]
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(
                    htseq_files, htseq_columns, out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_out = os.path.join(out_dir,
                                         comparison_name + ".deseq.txt")
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" %
                            (combined_out, conditions, deseq_out))
                view.map(deseq.run, [combined_out], [deseq_conds], [deseq_out])
                annotated_file = view.map(annotate.annotate_table_with_biomart,
                                          [deseq_out], ["id"],
                                          ["ensembl_gene_id"], ["zebrafish"])

    # end gracefully
    stop_cluster()
Example #38
0
 def chr_out(chrom):
     out_file = os.path.join(break_dir, append_stem(in_file, chrom))
     out_file = replace_suffix(out_file, "vcf")
     return out_file
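The path helpers from bipy.utils recur in nearly every example above; hedged sketches of their apparent behavior, inferred from their call sites rather than the library source:

import os

def replace_suffix(path, suffix):
    # "sample.bam", "metrics" -> "sample.metrics"
    return os.path.splitext(path)[0] + "." + suffix

def remove_suffix(path):
    # "sample.bam" -> "sample"
    return os.path.splitext(path)[0]

def append_stem(path, word):
    # "sample.bam", "mapped" -> "sample_mapped.bam"
    # (the exact separator is an assumption)
    base, ext = os.path.splitext(path)
    return base + "_" + word + ext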
Example #39
0
def _build_output_prefix(input_file, jellyfish_config, config):
    out_dir = build_results_dir(jellyfish_config, config)
    out_prefix = os.path.join(out_dir, replace_suffix(input_file, "count"))
    #out_prefix = "_".join([jellyfish_config["name"],
    #                       remove_suffix(input_file)])
    return out_prefix
Example #40
0
 def setUp(self):
     with open(CONFIG_FILE) as in_handle:
         self.config = yaml.load(in_handle)
     self.input_files = self.config["input"]
     self.db = os.path.basename(replace_suffix(self.config["ref"], "nix"))
     self.db = os.path.join(self.config["dir"]["ref"], self.db)
Example #41
0
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)

    data_dir = config["dir"]["data"]
    from bipy.cluster import view
    input_files = [glob.glob(os.path.join(data_dir, x, "*_rep*")) for x in
                   config["input_dirs"]]
    input_files = list(flatten(input_files))
    logger.info("Input files to process: %s" % (input_files))
    results_dir = config["dir"]["results"]

    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)

        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run,
                                        curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs

        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            #db = novoindex.run(config["ref"],
            #                   _get_stage_config(config, "novoindex"),
            #                   config)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            picard = BroadRunner(config["program"]["picard"])
            args = zip(*itertools.product([picard], novoalign_outputs))
            # convert to bam
            bamfiles = view.map(picardrun.picard_formatconverter,
                                *args)
            args = zip(*itertools.product([picard], bamfiles))
            # sort bam
            sorted_bf = view.map(picardrun.picard_sort, *args)
            # index bam
            args = zip(*itertools.product([picard], sorted_bf))
            view.map(picardrun.picard_index, *args)
            curr_files = novoalign_outputs

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s" %(curr_files))
            htseq_outputs = curr_files
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)
            rpkm = htseq_count.calculate_rpkm(combined_out,
                                              config["annotation"]["file"])
            rpkm_file = os.path.join(config["dir"]["results"], stage,
                                     "rpkm.txt")
            rpkm.to_csv(rpkm_file, sep="\t")

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)

        if stage == "deseq":
            conditions = [os.path.basename(x).split("_")[0] for x in
                          input_files]
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this comparison
                indexes = [x for x, y in enumerate(conditions) if
                           y in comparison]
                # find the htseq_files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                logger.info(htseq_files)
                logger.info(htseq_columns)
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_prefix = os.path.join(out_dir, comparison_name)

                deseq_out = view.map(deseq.run, [combined_out],
                                     [deseq_conds], [deseq_prefix])
                logger.info("Annotating %s." % (deseq_out))
                annotated_file = view.map(annotate.annotate_table_with_biomart,
                                          deseq_out,
                                          ["id"],
                                          ["ensembl_gene_id"],
                                          ["human"])

        if stage == "dss":
            conditions = [os.path.basename(x).split("_")[0] for x in
                          input_files]
            dss_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in dss_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this comparison
                indexes = [x for x, y in enumerate(conditions) if
                           y in comparison]
                # find the htseq_files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                dss_conds = [conditions[index] for index in indexes]
                dss_prefix = os.path.join(out_dir, comparison_name)
                logger.info("Running DSS on %s with conditions %s and comparison %s." % (combined_out, dss_conds, comparison))

                dss_out = dss.run(combined_out, dss_conds, comparison,
                                  dss_prefix)

    stop_cluster()
Example #42
0
File: rseqc.py Project: roryk/bipy
def _gtf2bed(gtf):
    bed = replace_suffix(gtf, "bed")
    if not file_exists(bed):
        sh.gtf2bigbed(gtf, _out=bed)
    return bed
Example #43
0
def _build_output_file(input_file, novoalign_config, config):
    outdir = build_results_dir(novoalign_config, config)
    safe_makedir(outdir)
    return os.path.join(outdir,
                        os.path.basename(replace_suffix(input_file, "sam")))
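A usage sketch, assuming the novoalign results directory resolves to "results/novoalign" (illustrative):

# _build_output_file("data/sample.fq", novoalign_config, config)
# creates the directory if needed and returns
# "results/novoalign/sample.sam"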
Example #44
0
def main(config_file):

    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)

    # after the cluster is up, import the view
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        " from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))

            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim") for
                         x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic number 10, the minimum read length to keep
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen,
                                 ["sanger"] * nlen,
                                 [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map()
            """
            filtered_fastq = view.map(filter_seqio, tagdust_outputs,
                                      [lf] * len(tagdust_outputs),
                                      ["filt"] * len(tagdust_outputs),
                                      ["fastq"] * len(tagdust_outputs))"""
            out_files = [append_stem(os.path.basename(input_file[0]), "filt")
                         for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]

            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]

            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")
            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab file of the counts at the end
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function, kind="fastq"),
                                 {}) for x in curr_files]
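                # each reduce folds count_ends over one base per read,
                # yielding a {base: count} dict per fastq file, e.g.
                # {"A": 10, "C": 4, "G": 7, "T": 9} (counts illustrative)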
                df = pd.DataFrame(counts,
                                  index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align,
                                    curr_files,
                                    [pair_file] * nlen,
                                    [ref_file] * nlen,
                                    [out_base] * nlen,
                                    [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles),
                                 bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them into the data dir and set them
            # to read-only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)

            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s"
                        % (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for
                         x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir,
                                      os.path.basename(x)) for x in out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf),
                     out_files)

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)
        if stage == "bedtools_intersect":
            bedfiles = config["stage"]["bedtools_intersect"].get("bed", None)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [os.path.join(out_dir, os.path.basename(x)) for x in
                             out_files]
                out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                             for x in out_files]
                out_files = [".".join([x, "bam"]) for x in out_files]
                test_out = map(bedtools.intersectbam2bed, sorted_bf,
                               [bedfile] * len(sorted_bf),
                               [False] * len(sorted_bf),
                               out_files)
                count_files = [replace_suffix(x, "stats") for x in
                               out_files]
                map(write_ratios, sorted_bf, out_files, count_files)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)

    stop_cluster()
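The bedtools_intersect block composes output names as <sample>_vs_<bed>.bam; a worked path with invented names:

# sorted bam:  results/novoalign/sample.sorted.bam
# bed file:    beds/targets.bed
# out_file:    results/bedtools_intersect/sample.sorted_vs_targets.bam
# count_file:  results/bedtools_intersect/sample.sorted_vs_targets.stats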
Example #45
0
def _build_output_file(input_file, novoalign_config, config):
    outdir = build_results_dir(novoalign_config, config)
    safe_makedir(outdir)
    return os.path.join(outdir,
                        os.path.basename(replace_suffix(input_file, "sam")))