import glob
import os
import subprocess

import pandas as pd
import sh
import yaml

# The other names used below (remove_suffix, append_stem, replace_suffix,
# flatten, file_exists, safe_makedir, safe_latex, Template, logger, and the
# stage modules such as fastqc, tophat, novoalign, picardrun, blastn, and
# bedtools) are assumed to come from the surrounding pipeline packages;
# their exact import paths are not shown in this excerpt.


def run(input_file, options, control_file=None, out_dir=None):
    out_files = (remove_suffix(input_file) + "_peaks.bed",
                 remove_suffix(input_file) + "_summits.bed")
    cmd = _build_command(input_file, options, control_file, out_dir)
    subprocess.check_call(cmd)
    if out_dir:
        for f in out_files:
            sh.mv(f, os.path.join(out_dir, os.path.basename(f)))
        out_files = [os.path.join(out_dir, os.path.basename(x))
                     for x in out_files]
    return out_files
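# Example invocation: a minimal sketch with hypothetical file names.
# `options` is assumed to be a list of (flag, value) pairs, based on how
# _build_command (below) joins each pair with "=":
#
#   peaks, summits = run("treat.bam",
#                        [("--format", "BAM"), ("--gsize", "hs")],
#                        control_file="control.bam",
#                        out_dir="results/macs")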
def generate_report(self, name, figures=None):
    template = Template(self._template)
    clean_name = safe_latex(name)
    #clean_figures = self.clean_figures(figures)
    #section = template.render(name=clean_name, figures=clean_figures)
    clean_figures = [(remove_suffix(figure[0]), figure[1], figure[2])
                     for figure in figures]
    section = template.render(name=clean_name, figures=clean_figures)
    return section
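# Example usage: a sketch assuming `report` is an instance of the class
# these methods belong to. The (path, caption, width) layout of the figure
# tuples is an assumption based on the unpacking in clean_figures above:
#
#   figures = [("results/fastqc/sample1_quality.pdf",
#               "Per-base quality for sample1", 0.75)]
#   section = report.generate_report("sample1 QC", figures=figures)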
def _run_command(in_files, cmd, suffix="", out_file=None):
    if not out_file:
        out_file = "".join([remove_suffix(os.path.basename(x))
                            for x in in_files]) + suffix
        out_file = os.path.join(os.path.dirname(in_files[0]), out_file)
    if file_exists(out_file):
        return out_file
    with open(out_file, "w") as out_handle:
        subprocess.check_call(cmd, stdout=out_handle)
    return out_file
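# Example: when out_file is not given, the output name is derived from the
# concatenated input basenames plus the suffix (names are hypothetical):
#
#   cmd = ["some_tool", "--to-stdout"]  # any command that writes to stdout
#   _run_command(["data/a.bam", "data/b.bam"], cmd, suffix="_merged.sam")
#   # -> captures the command's stdout in "data/ab_merged.sam"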
def _build_command(input_file, options, control_file=None, out_dir=None):
    name = remove_suffix(os.path.basename(input_file))
    #if out_dir:
    #    name = os.path.join(out_dir, name)
    options = ["=".join(map(str, x)) for x in options]
    cmd = ["macs14", "--treatment=" + input_file, flatten(options),
           "--name=" + name]
    if control_file:
        cmd += ["--control=" + control_file]
    return map(str, flatten(cmd))
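# Example: each (flag, value) option pair is collapsed to "flag=value" and
# spliced into the final argv (file names are hypothetical; remove_suffix
# is assumed to strip the file extension):
#
#   _build_command("treat.bam", [("--format", "BAM"), ("--gsize", "hs")],
#                  control_file="control.bam")
#   # -> ["macs14", "--treatment=treat.bam", "--format=BAM", "--gsize=hs",
#   #     "--name=treat", "--control=control.bam"]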
def generate_pdf(self, sections=None, out_file=None):
    out_tmpl = Template(self._base_template)
    if not out_file:
        latex_file = "latex.tex"
        out_file = "latex.pdf"
    else:
        latex_file = remove_suffix(out_file) + ".tex"
    with open(latex_file, "w") as latex_handle:
        latex_handle.write(out_tmpl.render(sections=sections))
    sh.pdflatex(latex_file)
    return out_file
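# Example usage: a sketch assuming `report` is an instance of the class
# these methods belong to and that pdflatex is on the PATH. Note that
# pdflatex writes its output to the current working directory:
#
#   sections = [report.generate_report("sample1 QC", figures=figures)]
#   pdf_file = report.generate_pdf(sections=sections, out_file="qc.pdf")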
def _merge_condition(in_files, condition):
    """
    merge all of the bam files from a condition together
    as recommended in the MACS manual
    """
    condition_files = [filename for filename in in_files
                       if condition in filename]
    if not condition_files:
        return None
    # put the merged file in the directory of the first matching file
    # (indexing the first element also works when only one file matches)
    condition_filename = os.path.join(os.path.dirname(condition_files[0]),
                                      condition + "_merged.bam")
    sorted_prefix = remove_suffix(condition_filename) + ".sorted"
    sorted_filename = sorted_prefix + ".bam"
    if file_exists(sorted_filename):
        return sorted_filename
    sh.samtools("merge", condition_filename, condition_files)
    sh.samtools("sort", condition_filename, sorted_prefix)
    sh.samtools("index", sorted_filename)
    return sorted_filename
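# Example (hypothetical file names): every BAM whose name contains the
# condition string is merged, coordinate-sorted, and indexed:
#
#   bams = ["data/control_rep1.bam", "data/control_rep2.bam",
#           "data/treated_rep1.bam"]
#   _merge_condition(bams, "control")
#   # -> "data/control_merged.sorted.bam" (with a matching .bai index)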
def _bcbio_tophat_wrapper(fastq_file, pair_file, ref_file, stage_name,
                          config):
    bcbio_config = {}
    stage_config = config["stage"][stage_name]
    cores = stage_config.get("cores", 1)
    # use the listed quality format; if there isn't one, try to figure
    # out what format it is
    quality_format = stage_config.get("quality_format", None)
    if quality_format is None:
        fastq_format = fastqc.detect_fastq_format(fastq_file)
        quality_format = FASTQ_FORMAT_TO_BCBIO[fastq_format]
    max_errors = stage_config.get("max_errors", None)
    options = stage_config.get("options", {})
    tophat_loc = config["program"].get("tophat", "tophat")
    bowtie_loc = config["program"].get("bowtie", "bowtie")
    out_base = remove_suffix(os.path.basename(fastq_file))
    align_dir = os.path.join(config["dir"]["results"], stage_name)
    bcbio_config["resources"] = {"tophat": {"cores": cores,
                                            "options": options}}
    bcbio_config["algorithm"] = {}
    bcbio_config["program"] = {}
    bcbio_config["algorithm"]["quality_format"] = quality_format
    bcbio_config["algorithm"]["max_errors"] = max_errors
    bcbio_config["gtf"] = config.get("gtf", None)
    if bcbio_config["gtf"]:
        if not file_exists(bcbio_config["gtf"]):
            raise ValueError("GTF file does not exist. Please check to make "
                             "sure that the value of gtf is set correctly "
                             "in the configuration file.")
    bcbio_config["program"]["tophat"] = tophat_loc
    bcbio_config["program"]["bowtie"] = bowtie_loc
    bcbio_config["program"]["picard"] = config["program"]["picard"]
    bcbio_config["program"]["gatk"] = {"dir": ""}
    out_file = tophat.align(fastq_file, pair_file, ref_file, out_base,
                            align_dir, bcbio_config)
    return out_file
def _bcbio_tophat_wrapper(fastq_file, pair_file, ref_file, stage_name,
                          config):
    bcbio_config = {}
    stage_config = config["stage"][stage_name]
    cores = config["cluster"].get("cores", None)
    # use the listed quality format; if there isn't one, try to figure
    # out what format it is
    quality_format = stage_config.get("quality_format", None)
    if quality_format is None:
        fastq_format = fastqc.detect_fastq_format(fastq_file)
        quality_format = FASTQ_FORMAT_TO_BCBIO[fastq_format]
    max_errors = stage_config.get("max_errors", None)
    tophat_loc = config["program"].get("tophat", "tophat")
    bowtie_loc = config["program"].get("bowtie", "bowtie")
    out_base = remove_suffix(os.path.basename(fastq_file))
    align_dir = os.path.join(config["dir"]["results"], stage_name)
    bcbio_config["resources"] = {"tophat": {"cores": cores}}
    bcbio_config["algorithm"] = {}
    bcbio_config["program"] = {}
    bcbio_config["algorithm"]["quality_format"] = quality_format
    bcbio_config["algorithm"]["max_errors"] = max_errors
    bcbio_config["gtf"] = config.get("gtf", None)
    bcbio_config["program"]["tophat"] = tophat_loc
    bcbio_config["program"]["bowtie"] = bowtie_loc
    out_file = tophat.align(fastq_file, pair_file, ref_file, out_base,
                            align_dir, bcbio_config)
    # replace the file tophat.align returns with a sample-named symlink
    # that points at tophat's accepted_hits.sam in the same directory
    os.remove(out_file)
    out_dir = os.path.dirname(out_file)
    out_file_fixed = os.path.join(out_dir, out_base + ".sam")
    os.symlink("accepted_hits.sam", out_file_fixed)
    return out_file_fixed
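# Example invocation of either wrapper variant (paths are hypothetical;
# the config layout is assumed from the lookups above):
#
#   out = _bcbio_tophat_wrapper("data/sample1.fastq", None,
#                               "/refs/hg19/bowtie_index/hg19",
#                               "tophat", config)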
def _get_short_names(input_files):
    return [remove_suffix(os.path.basename(x)) for x in input_files]
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    # after the cluster is up, import the view
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    map(safe_makedir, config["dir"].values())
    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends from %s"
                        % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))
            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen,
                                 ["sanger"] * nlen,
                                 [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work:
            # ipython can't accept closures as an argument to view.map()
            """
            filtered_fastq = view.map(filter_seqio, tagdust_outputs,
                                      [lf] * len(tagdust_outputs),
                                      ["filt"] * len(tagdust_outputs),
                                      ["fastq"] * len(tagdust_outputs))"""
            out_files = [append_stem(os.path.basename(input_file[0]), "filt")
                         for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]
            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]
            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")

            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab file of the counts at the end
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function, kind="fastq"),
                                 {}) for x in curr_files]
                df = pd.DataFrame(counts, index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align,
                                    curr_files,
                                    [pair_file] * nlen,
                                    [ref_file] * nlen,
                                    [out_base] * nlen,
                                    [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles),
                                 bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and set
            # them to read only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)
            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s"
                        % (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir, os.path.basename(x))
                         for x in out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf), out_files)

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)

        if stage == "bedtools_intersect":
            bedfiles = config["stage"]["bedtools_intersect"].get("bed", None)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [os.path.join(out_dir, os.path.basename(x))
                             for x in out_files]
                out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                             for x in out_files]
                out_files = [".".join([x, "bam"]) for x in out_files]
                test_out = map(bedtools.intersectbam2bed, sorted_bf,
                               [bedfile] * len(sorted_bf),
                               [False] * len(sorted_bf), out_files)
                count_files = [replace_suffix(x, "stats")
                               for x in out_files]
                map(write_ratios, sorted_bf, out_files, count_files)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)

    stop_cluster()
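# A minimal sketch of the YAML configuration main() expects, inferred from
# the config lookups above; all paths, stages, and values are hypothetical:
#
#   input: [data/sample1.fastq, data/sample2.fastq]
#   dir:
#     results: results
#     data: data
#   run: [fastqc, trim, novoalign, new_coverage]
#   genome:
#     file: /refs/hg19.fa
#   program:
#     picard: /usr/local/share/java/picard
#   stage:
#     fastqc: {}
#     trim:
#       min_length: 20
#     novoalign: {}
#     new_coverage:
#       ref: /refs/refFlat.txt
#       ribo: /refs/rrna.bed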
def main(config_file):
    if config_file:
        with open(config_file) as in_handle:
            config = yaml.load(in_handle)
    dirs = config["in_dir"]
    conditions = config["conditions"]
    glob_string = config["glob_string"]
    files = list(flatten([glob.glob(os.path.join(x, glob_string))
                          for x in dirs]))
    out_dir = config["dir"]["results"]
    safe_makedir(out_dir)
    for condition in conditions:
        condition_files = [x for x in files if condition in x]
        out_file = os.path.join(out_dir, condition + "_v2_v3.bam")
        print "Combining %s into %s." % (condition_files, out_file)
        sh.samtools.merge(list(flatten([out_file, condition_files])))
        #bsub_call = list(flatten(["-q", "hsph", "-o", "out" + condition,
        #                          "-e", "err" + condition, "samtools",
        #                          "merge", out_file, condition_files]))
        #sh.bsub(bsub_call)
        sorted_prefix = remove_suffix(out_file) + ".sorted"
        sorted_file = sorted_prefix + ".bam"
        sh.samtools.sort(out_file, sorted_prefix)
        sh.samtools.index(sorted_file)
        mapped_file = append_stem(sorted_file, "mapped")
        sh.samtools.view(sorted_file, F=4, b=True, o=mapped_file)
        sh.samtools.index(mapped_file)

        # find the reads that don't intersect with the rrna
        in_file = mapped_file
        out_file = os.path.join(out_dir, condition + "_noribo" + "_v2_v3.bam")
        ribo = config["ribo"]
        print "Filtering %s for rRNA in %s into %s." % (in_file, ribo,
                                                        out_file)
        sh.bedtools.intersect("-abam", in_file, "-v", "-b", ribo,
                              _out=out_file)
        filtered_file = out_file

        print "Calculating RNASeq metrics on %s." % (out_file)
        in_file = out_file
        ref = blastn.prepare_ref_file(config["stage"]["new_coverage"]["ref"],
                                      config)
        ribo = config["stage"]["new_coverage"]["ribo"]
        picard = BroadRunner(config["program"]["picard"])
        # use a separate variable so out_dir is not clobbered for the
        # remaining conditions in the loop
        metrics_dir = os.path.join(config["dir"]["results"], "new_coverage")
        safe_makedir(metrics_dir)
        out_file = replace_suffix(os.path.basename(in_file), "metrics")
        out_file = os.path.join(metrics_dir, out_file)
        metrics_file = picardrun.picard_rnaseq_metrics(picard, in_file, ref,
                                                       ribo, out_file)

        jelly_dir = os.path.join(config["dir"]["results"], "jellyfish")
        safe_makedir(jelly_dir)
        # convert the filtered file to fastq for jellyfish counting
        fastq_file = os.path.join(
            jelly_dir,
            os.path.basename(replace_suffix(filtered_file, "fastq")))
        sh.bam2fastx(filtered_file, fastq=True, _out=fastq_file)
        for mer in config["stage"]["jellyfish"]["mer_lengths"]:
            base, _ = os.path.splitext(os.path.basename(fastq_file))
            out_prefix = base + "_%dmer" % (mer)
            out_file = os.path.join(jelly_dir, out_prefix)
            if not file_exists(out_file):
                sh.jellyfish.count(fastq_file,
                                   config["stage"]["jellyfish"]["options"],
                                   m=mer, o=out_file)
def _make_outfile(input_file, config):
    outdir = _make_outdir(config)
    #outfile = "".join([os.path.basename(input_file), "_fastqc.zip"])
    outfile = "".join([remove_suffix(os.path.basename(input_file)),
                       "_fastqc.zip"])
    return os.path.join(outdir, outfile)
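# Example (hypothetical input): the zip name mirrors FastQC's own naming
# convention for its output archives:
#
#   _make_outfile("data/sample1.fastq", config)
#   # -> os.path.join(_make_outdir(config), "sample1_fastqc.zip")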