Example #1
File: macs.py Project: roryk/bipy
def run(input_file, options, control_file=None, out_dir=None):
    out_files = (remove_suffix(input_file) + "_peaks.bed",
                 remove_suffix(input_file) + "_summits.bed")
    cmd = _build_command(input_file, options, control_file, out_dir)
    subprocess.check_call(cmd)
    if out_dir:
        for f in out_files:
            sh.mv(f, os.path.join(out_dir, os.path.basename(f)))
        out_files = [
            os.path.join(out_dir, os.path.basename(x)) for x in out_files
        ]

    return out_files
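
Every example on this page calls bipy's remove_suffix helper, whose source is not shown here. A minimal sketch of its presumed behavior, assuming it strips only the final extension (hypothetical, not bipy's actual implementation):

import os

def remove_suffix(path):
    # hypothetical sketch: drop only the last extension,
    # e.g. "sample.fastq" -> "sample", "a/b.sorted.bam" -> "a/b.sorted"
    return os.path.splitext(path)[0]
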
Example #2
File: rseqc.py Project: roryk/bipy
    def generate_report(self, name, figures=None):
        template = Template(self._template)
        clean_name = safe_latex(name)
        #clean_figures = self.clean_figures(figures)
        #section = template.render(name=clean_name, figures=clean_figures)
        clean_figures = [(remove_suffix(figure[0]), figure[1], figure[2])
                         for figure in figures]
        section = template.render(name=clean_name, figures=clean_figures)
        return section
Example #3
def _run_command(in_files, cmd, suffix="", out_file=None):
    if not out_file:
        out_file = "".join(
            [remove_suffix(os.path.basename(x)) for x in in_files]) + suffix
        out_file = os.path.join(os.path.dirname(in_files[0]), out_file)

    if file_exists(out_file):
        return out_file

    with open(out_file, "w") as out_handle:
        subprocess.check_call(cmd, stdout=out_handle)

    return out_file
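
When out_file is omitted, the output name is derived from the inputs: the suffix-stripped basenames are concatenated and the result is placed next to the first input. For in_files = ["data/a.fastq", "data/b.fastq"] and suffix=".fastq", that works out to data/ab.fastq, and the file_exists check makes repeated calls idempotent.
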
Example #4
    def generate_pdf(self, sections=None, out_file=None):
        out_tmpl = Template(self._base_template)
        if not out_file:
            latex_file = "latex.tex"
            out_file = "latex.pdf"
        else:
            latex_file = remove_suffix(out_file) + ".tex"

        with open(latex_file, "w") as latex_handle:
            latex_handle.write(out_tmpl.render(sections=sections))
        sh.pdflatex(latex_file)

        return out_file
Example #5
File: macs.py Project: roryk/bipy
def _build_command(input_file, options, control_file=None, out_dir=None):
    name = remove_suffix(os.path.basename(input_file))
    #if out_dir:
    #    name = os.path.join(out_dir, name)

    options = ["=".join(map(str, x)) for x in options]

    cmd = [
        "macs14", "--treatment=" + input_file,
        flatten(options), "--name=" + name
    ]
    if control_file:
        cmd += ["--control=" + control_file]

    return map(str, flatten(cmd))
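
To make the construction concrete, a hypothetical call with input_file = "sample.bam" and options = [("--format", "BAM"), ("--gsize", "hs")] would, assuming flatten recursively unnests the embedded options list, return:

["macs14", "--treatment=sample.bam", "--format=BAM", "--gsize=hs",
 "--name=sample"]
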
Example #6
def _merge_condition(in_files, condition):
    """
    merge all of the bam files from a condition together
    as recommended in the MACS manual
    """
    condition_files = [filename for filename in in_files if
                       condition in filename]
    if not condition_files:
        return None
    # use the directory of the first matching file; indexing [1] here
    # would raise an IndexError when only one file matches the condition
    condition_filename = os.path.join(os.path.dirname(condition_files[0]),
                                      condition + "_merged.bam")
    sorted_prefix = remove_suffix(condition_filename) + ".sorted"
    sorted_filename = sorted_prefix + ".bam"
    if file_exists(sorted_filename):
        return sorted_filename

    sh.samtools("merge", condition_filename, condition_files)
    sh.samtools("sort", condition_filename, sorted_prefix)
    sh.samtools("index", sorted_filename)
    return sorted_filename
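
For readers without the sh package, a minimal subprocess equivalent of the three samtools calls above (this assumes the pre-1.0 samtools syntax the originals use, where sort takes an output prefix rather than -o):

import subprocess

# merge every per-condition BAM into a single file
subprocess.check_call(["samtools", "merge", condition_filename] + condition_files)
# sort the merged BAM; old-style samtools sort takes an output prefix
subprocess.check_call(["samtools", "sort", condition_filename, sorted_prefix])
# index the sorted BAM for random access downstream
subprocess.check_call(["samtools", "index", sorted_filename])
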
Example #7
File: tophat.py Project: roryk/bipy
def _bcbio_tophat_wrapper(fastq_file, pair_file, ref_file,
                          stage_name, config):
    bcbio_config = {}
    stage_config = config["stage"][stage_name]
    cores = stage_config.get("cores", 1)
    # use the listed quality format, if there isn't one, try to figure
    # out what format it is
    quality_format = stage_config.get("quality_format", None)
    if quality_format is None:
        fastq_format = fastqc.detect_fastq_format(fastq_file)
        quality_format = FASTQ_FORMAT_TO_BCBIO[fastq_format]

    max_errors = stage_config.get("max_errors", None)
    options = stage_config.get("options", {})
    tophat_loc = config["program"].get("tophat", "tophat")
    bowtie_loc = config["program"].get("bowtie", "bowtie")
    out_base = remove_suffix(os.path.basename(fastq_file))
    align_dir = os.path.join(config["dir"]["results"], stage_name)

    bcbio_config["resources"] = {"tophat": {"cores": cores,
                                            "options": options}}
    bcbio_config["algorithm"] = {}
    bcbio_config["program"] = {}
    bcbio_config["algorithm"]["quality_format"] = quality_format
    bcbio_config["algorithm"]["max_errors"] = max_errors
    bcbio_config["gtf"] = config.get("gtf", None)
    if bcbio_config["gtf"]:
        if not file_exists(bcbio_config["gtf"]):
            raise ValueError("GTF file does not exist. Please check that "
                             "the value of gtf is set correctly in the "
                             "configuration file.")
    bcbio_config["program"]["tophat"] = tophat_loc
    bcbio_config["program"]["bowtie"] = bowtie_loc
    bcbio_config["program"]["picard"] = config["program"]["picard"]
    bcbio_config["program"]["gatk"] = {"dir": ""}

    out_file = tophat.align(fastq_file, pair_file, ref_file, out_base,
                            align_dir, bcbio_config)
    return out_file
Example #8
def _bcbio_tophat_wrapper(fastq_file, pair_file, ref_file,
                          stage_name, config):
    bcbio_config = {}
    stage_config = config["stage"][stage_name]
    cores = config["cluster"].get("cores", None)
    # use the listed quality format, if there isn't one, try to figure
    # out what format it is
    quality_format = stage_config.get("quality_format", None)
    if quality_format is None:
        fastq_format = fastqc.detect_fastq_format(fastq_file)
        quality_format = FASTQ_FORMAT_TO_BCBIO[fastq_format]

    max_errors = stage_config.get("max_errors", None)
    tophat_loc = config["program"].get("tophat", "tophat")
    bowtie_loc = config["program"].get("bowtie", "bowtie")
    out_base = remove_suffix(os.path.basename(fastq_file))
    align_dir = os.path.join(config["dir"]["results"], stage_name)

    bcbio_config["resources"] = {"tophat": {"cores": cores}}
    bcbio_config["algorithm"] = {}
    bcbio_config["program"] = {}
    bcbio_config["algorithm"]["quality_format"] = quality_format
    bcbio_config["algorithm"]["max_errors"] = max_errors
    bcbio_config["gtf"] = config.get("gtf", None)
    bcbio_config["program"]["tophat"] = tophat_loc
    bcbio_config["program"]["bowtie"] = bowtie_loc

    out_file = tophat.align(fastq_file, pair_file, ref_file, out_base,
                            align_dir, bcbio_config)
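    # presumed intent: discard the file tophat.align reported and expose
    # tophat's accepted_hits.sam under a stable <out_base>.sam name via a
    # relative symlink in the same directory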
    os.remove(out_file)

    out_dir = os.path.dirname(out_file)
    out_file_fixed = os.path.join(out_dir, out_base + ".sam")
    os.symlink("accepted_hits.sam", out_file_fixed)

    return out_file_fixed
Example #9
def _get_short_names(input_files):
    return [remove_suffix(os.path.basename(x)) for x in input_files]
Example #10
def main(config_file):

    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)

    # after the cluster is up, import the view so jobs can be mapped onto it
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        " from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))

            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [
                append_stem(os.path.basename(x), "trim") for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic default for the minimum read length to keep
            out_files = view.map(sickle.run, curr_files, ["se"] * nlen,
                                 ["sanger"] * nlen, [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map()
            """
            filtered_fastq = view.map(filter_seqio, tagdust_outputs,
                                      [lf] * len(tagdust_outputs),
                                      ["filt"] * len(tagdust_outputs),
                                      ["fastq"] * len(tagdust_outputs))"""
            out_files = [
                append_stem(os.path.basename(input_file[0]), "filt")
                for input_file in tagdust_outputs
            ]
            out_dir = os.path.join(config["dir"]["results"], "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]

            filtered_fastq = [
                filter_seqio(x[0], length_filter, y, "fastq")
                for x, y in zip(tagdust_outputs, out_files)
            ]

            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")

            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab file of the counts at the end
                # of the fastq files
                counts = [
                    reduce(count_ends,
                           apply_seqio(x, end_function, kind="fastq"), {})
                    for x in curr_files
                ]
                df = pd.DataFrame(counts, index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align, curr_files,
                                    [pair_file] * nlen, [ref_file] * nlen,
                                    [out_base] * nlen, [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles), bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and set
            # them to read only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)

            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [
                replace_suffix(os.path.basename(x), "metrics")
                for x in curr_files
            ]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s" %
                        (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [
                os.path.join(out_dir, os.path.basename(x)) for x in out_files
            ]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf), out_files)

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs, [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names, out_file)
        if stage == "bedtools_intersect":
            bedfiles = config["stage"]["bedtools_intersect"].get("bed", None)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [
                    os.path.join(out_dir, os.path.basename(x))
                    for x in out_files
                ]
                out_files = [
                    "_vs_".join([x, os.path.basename(bedbase)])
                    for x in out_files
                ]
                out_files = [".".join([x, "bam"]) for x in out_files]
                test_out = map(bedtools.intersectbam2bed, sorted_bf,
                               [bedfile] * len(sorted_bf),
                               [False] * len(sorted_bf), out_files)
                count_files = [replace_suffix(x, "stats") for x in out_files]
                map(write_ratios, sorted_bf, out_files, count_files)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)

    stop_cluster()
Example #11
def main(config_file):
    if config_file:
        with open(config_file) as in_handle:
            config = yaml.load(in_handle)

    dirs = config["in_dir"]
    conditions = config["conditions"]
    glob_string = config["glob_string"]

    files = list(flatten([glob.glob(os.path.join(x, glob_string))
                          for x in dirs]))
    out_dir = config["dir"]["results"]
    safe_makedir(out_dir)

    curr_files = []
    for condition in conditions:
        condition_files = [x for x in files if condition in x]
        out_file = os.path.join(out_dir, condition + "_v2_v3.bam")
        print "Combining %s into %s." % (condition_files, out_file)
        sh.samtools.merge(list(flatten([out_file, condition_files])))
        #bsub_call = list(flatten(["-q", "hsph", "-o", "out" + condition,
        #                          "-e", "err" + condition, "samtools",
        #                          "merge", out_file, condition_files]))
        #sh.bsub(bsub_call)
        sorted_prefix = remove_suffix(out_file) + ".sorted"
        sorted_file = sorted_prefix + ".bam"
        sh.samtools.sort(out_file, sorted_prefix)
        sh.samtools.index(sorted_file)
        mapped_file = append_stem(sorted_file, "mapped")
        sh.samtools.view(sorted_file, F=4, b=True, o=mapped_file)
        sh.samtools.index(mapped_file)

        # find the reads that don't intersect with the rrna
        in_file = mapped_file
        out_file = os.path.join(out_dir, condition + "_noribo" + "_v2_v3.bam")
        ribo = config["ribo"]
        print "Filtering %s for rRNA in %s into %s." % (in_file, ribo, out_file)
        sh.bedtools.intersect("-abam", in_file, "-v", "-b", ribo, _out=out_file)
        filtered_file = out_file

        print "Calculating RNASeq metrics on %s." % (out_file)
        in_file = out_file
        ref = blastn.prepare_ref_file(config["stage"]["new_coverage"]["ref"],
                                      config)
        ribo = config["stage"]["new_coverage"]["ribo"]
        picard = BroadRunner(config["program"]["picard"])
        out_dir = os.path.join(config["dir"]["results"], "new_coverage")
        safe_makedir(out_dir)
        out_file = replace_suffix(os.path.basename(in_file), "metrics")
        out_file = os.path.join(out_dir, out_file)
        metrics_file = picardrun.picard_rnaseq_metrics(picard, in_file, ref,
                                                       ribo, out_file)

        jelly_dir = os.path.join(config["dir"]["results"], "jellyfish")
        safe_makedir(jelly_dir)
        # convert the filtered file to fastq for jellyfish counting
        fastq_file = os.path.join(jelly_dir,
                                  os.path.basename(replace_suffix(filtered_file,
                                                                  "fastq")))
        sh.bam2fastx(filtered_file, fastq=True, _out=fastq_file)
        for mer in config["stage"]["jellyfish"]["mer_lengths"]:
            base, _ = os.path.splitext(os.path.basename(fastq_file))
            out_prefix = base + "_%dmer" % (mer)
            out_file = os.path.join(jelly_dir, out_prefix)
            if not file_exists(out_file):
                sh.jellyfish.count(fastq_file,
                                   config["stage"]["jellyfish"]["options"],
                                   m=mer, o=out_file)
Example #12
def _make_outfile(input_file, config):
    outdir = _make_outdir(config)
    #outfile = "".join([os.path.basename(input_file), "_fastqc.zip"])
    outfile = "".join([remove_suffix(os.path.basename(input_file)),
                       "_fastqc.zip"])
    return os.path.join(outdir, outfile)
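
Assuming _make_outdir returns the configured FastQC results directory, a call such as _make_outfile("data/sample_1.fastq", config) resolves to <outdir>/sample_1_fastqc.zip, which matches the archive name FastQC itself writes for that input.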