def fix_mate_pairs(fq1, fq2, f_suffix="/1", r_suffix="/2"):
    """
    takes two FASTQ files (fq1 and fq2) of paired end sequencing data
    and filters out reads without a mate pair.
    """
    fq1_out = append_stem(fq1, "fixed")
    fq2_out = append_stem(fq2, "fixed")
    fq1_single = append_stem(fq1, "singles")
    fq2_single = append_stem(fq2, "singles")
    if all(map(file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    f_dict = SeqIO.index(fq1, "fastq",
                         key_function=get_read_name_function(f_suffix))
    r_dict = SeqIO.index(fq2, "fastq",
                         key_function=get_read_name_function(r_suffix))

    with open(fq1_out, 'w') as fq1_out_handle, \
         open(fq2_out, 'w') as fq2_out_handle, \
         open(fq1_single, 'w') as fq1_single_handle, \
         open(fq2_single, 'w') as fq2_single_handle:
        for key in f_dict:
            if key in r_dict:
                fq1_out_handle.write(f_dict.get_raw(key))
                fq2_out_handle.write(r_dict.get_raw(key))
            else:
                fq1_single_handle.write(f_dict.get_raw(key))
        for key in r_dict:
            if key not in f_dict:
                fq2_single_handle.write(r_dict.get_raw(key))
    return [fq1_out, fq2_out]

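# A minimal usage sketch (the file names are placeholders, not from the
# source): assumes the reads in the two files carry "/1" and "/2" name
# suffixes, which get_read_name_function presumably strips so that mates
# share the same index key.
def _example_fix_mate_pairs():
    return fix_mate_pairs("sample_1.fastq", "sample_2.fastq",
                          f_suffix="/1", r_suffix="/2")
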
def filter_reads_by_length(fq1, fq2, min_length=30):
    """
    removes reads shorter than min_length from a pair of fastq files
    """
    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    # just pick the first one if it can be multiple types
    quality_type = QUALITY_TYPE[DetectFastqFormat.run(fq1)[0]]
    fq1_out = append_stem(fq1, "fixed")
    fq2_out = append_stem(fq2, "fixed")
    fq1_single = append_stem(fq1, "singles")
    fq2_single = append_stem(fq2, "singles")
    if all(map(file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_type)
    fq2_in = SeqIO.parse(fq2, quality_type)

    with open(fq1_out, 'w') as fq1_out_handle, \
         open(fq2_out, 'w') as fq2_out_handle, \
         open(fq1_single, 'w') as fq1_single_handle, \
         open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            if len(fq1_record.seq) >= min_length and len(fq2_record.seq) >= min_length:
                fq1_out_handle.write(fq1_record.format(quality_type))
                fq2_out_handle.write(fq2_record.format(quality_type))
            else:
                if len(fq1_record.seq) > min_length:
                    fq1_single_handle.write(fq1_record.format(quality_type))
                if len(fq2_record.seq) > min_length:
                    fq2_single_handle.write(fq2_record.format(quality_type))
    return [fq1_out, fq2_out]

def _load_gemini(self, in_file):
    log_id = os.path.join(self.log, "gemini" + "_" + str(uuid.uuid4()) + ".log")
    sh.gemini.load(self.db, v=in_file, t=self.type,
                   _out=append_stem(log_id, "out"),
                   _err=append_stem(log_id, "err"))

def _get_handles(self, in_file):
    assigned_name = append_stem(in_file, "unique")
    ambiguous_name = append_stem(in_file, "ambiguous")
    in_handle = pysam.Samfile(in_file, "rb")
    assigned = pysam.Samfile(assigned_name, "wb", template=in_handle)
    ambiguous = pysam.Samfile(ambiguous_name, "wb", template=in_handle)
    return (in_handle, assigned, ambiguous)

def run_as_pe(first, second, config):
    first_out = append_stem(first, "sickle")
    second_out = append_stem(second, "sickle")
    single_out = append_stem(first, "single")
    quality_type = _get_quality_type(first)
    length_cutoff = _get_length_cutoff(config)
    quality_cutoff = _get_quality_cutoff(config)
    if all(map(os.path.exists, [first_out, second_out, single_out])):
        return (first_out, second_out)
    sh.sickle("pe", f=first, r=second, l=length_cutoff, q=quality_cutoff,
              t=quality_type, o=first_out, p=second_out, s=single_out)
    return (first_out, second_out)

def __call__(self, pair):
    unique_files = [append_stem(x, "unique") for x in pair]
    ambig_files = [append_stem(x, "ambiguous") for x in pair]
    if all(map(os.path.exists, unique_files + ambig_files)):
        return [unique_files, ambig_files]
    handles_0 = self._get_handles(pair[0])
    handles_1 = self._get_handles(pair[1])
    self._process_reads(handles_0, handles_1, None, None)
    for handle in handles_0:
        handle.close()
    for handle in handles_1:
        handle.close()
    return [unique_files, ambig_files]

def downsample_bam(bam_file, target_reads, out_file=None):
    if out_file is None:
        out_file = append_stem(bam_file, "downsampled")
    percentage_to_sample = _get_percentage_to_sample(bam_file, target_reads)
    sh.samtools.view("-h", "-b", "-s", percentage_to_sample, "-o", out_file,
                     bam_file)
    return out_file

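# _get_percentage_to_sample is not shown in this excerpt; a rough sketch of
# one way it could work, assuming the BAM file is indexed and that
# `samtools view -s` is simply given the fraction of reads to keep
# (target_reads divided by the total mapped reads).
def _get_percentage_to_sample_sketch(bam_file, target_reads):
    in_handle = pysam.Samfile(bam_file, "rb")
    total = in_handle.mapped  # mapped read count taken from the BAM index
    in_handle.close()
    return min(1.0, float(target_reads) / total)
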
def filter_results_by_length(filename, cutoff):
    """
    filters the tsv results, keeping only hits where the aligned region
    covers more than cutoff percent of the query sequence and more than
    cutoff percent of the subject sequence. This might be a little too
    restrictive though
    """
    def query_match(linedict):
        length = abs(float(linedict["qstart"]) - float(linedict["qend"]))
        return length / float(linedict["qlen"]) > (cutoff / float(100))

    def subject_match(linedict):
        length = abs(float(linedict["sstart"]) - float(linedict["send"]))
        return length / float(linedict["slen"]) > (cutoff / float(100))

    out_fname = append_stem(filename, str(cutoff) + "_filt")
    # skip if it already exists
    if os.path.exists(out_fname):
        return out_fname

    with open(filename) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        with open(out_fname, "w") as out_handle:
            writer = csv.writer(out_handle, delimiter="\t")
            writer.writerow(HEADER_FIELDS.split(" "))
            for line in reader:
                linedict = dict(zip(HEADER_FIELDS.split(" "), line))
                if query_match(linedict) and subject_match(linedict):
                    writer.writerow(line)
    return out_fname

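# A worked example of the coverage test above, with made-up numbers: a hit
# with qstart=1 and qend=76 on a 100-base query spans 75/100 = 75% of it, so
# it passes a cutoff of 70 but fails a cutoff of 80 (the subject coverage
# must pass the same test independently).
def _example_coverage_fraction():
    qstart, qend, qlen = 1.0, 76.0, 100.0
    return abs(qstart - qend) / qlen  # 0.75, compared against cutoff / 100.0
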
def fix_RPKM_count_file(in_file, out_file=None):
    """
    splits the RPKM_count file id column into two separate columns;
    one with the id and the other with the feature
    """
    if not out_file:
        out_file = append_stem(in_file, "fixed")
    if file_exists(out_file):
        return out_file
    with open(in_file) as in_handle:
        rpkm = pd.io.parsers.read_table(in_handle)
    rpkm["gene_id"] = rpkm["accession"].apply(lambda x: x.rsplit("_", 2)[0])
    rpkm["feature"] = rpkm["accession"].apply(lambda x: x.rsplit("_", 2)[1])
    # remove the '#' character since it denotes a comment
    rpkm = rpkm.rename(columns={"#chrom": "chrom"})
    with file_transaction(out_file) as tmp_out_file:
        rpkm.to_csv(tmp_out_file, sep="\t", index=False)
    return out_file

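# A tiny illustration of the accession split above, using a made-up id (not
# from the source): "ESR1_exons_3" yields gene_id "ESR1" and feature "exons".
def _example_split_accession():
    accession = "ESR1_exons_3"
    return accession.rsplit("_", 2)[0], accession.rsplit("_", 2)[1]
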
def test_length_filter(self):
    paired = self.config["input_paired"]
    out_files = filter_reads_by_length(paired[0], paired[1], min_length=20)
    correct_files = map(self._find_length_filter_correct, out_files)
    self.assertTrue(all(map(filecmp.cmp, correct_files, out_files)))
    map(os.remove, out_files)
    map(os.remove, [append_stem(x, "singles") for x in paired])

def hard_clip(in_file, bases=8, right_side=True, quality_format="sanger",
              out_file=None):
    """
    hard clip a fastq file by removing N bases from each read
    bases is the number of bases to clip
    right_side is True to trim from the right side, False to trim from the left
    example: hard_clip(fastq_file, bases=4, right_side=False)
    """
    if right_side:
        logger.info("Hard clipping %d bases from the right side of "
                    "reads in %s." % (bases, in_file))
    else:
        logger.info("Hard clipping %d bases from the left side of "
                    "reads in %s." % (bases, in_file))

    quality_type = QUALITY_TYPE_HARD_TRIM[quality_format]
    if not out_file:
        out_file = append_stem(in_file, "clip")
    if file_exists(out_file):
        return out_file

    in_iterator = SeqIO.parse(in_file, quality_type)
    out_iterator = (_trim_read(record, bases, right_side)
                    for record in in_iterator)
    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            SeqIO.write(out_iterator, out_handle, quality_type)
    return out_file

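# _trim_read is not defined in this excerpt; a minimal sketch of the behavior
# the generator above assumes, using Biopython SeqRecord slicing (slicing a
# record keeps the per-base qualities in sync with the sequence).
def _trim_read_sketch(record, bases, right_side):
    if right_side:
        return record[:-bases]
    return record[bases:]
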
def run(in_file, stage_config, config):
    arguments = [stage_config["program"]]
    arguments += _parse(stage_config)
    results_dir = config["dir"].get("results", None)
    if results_dir:
        out_dir = os.path.join(results_dir, "cutadapt")
        safe_makedir(out_dir)
        out_file = os.path.join(out_dir,
                                os.path.basename(append_stem(in_file, "trimmed")))
    else:
        out_file = append_stem(in_file, "trimmed")
    if file_exists(out_file):
        return out_file
    arguments.extend(["--output", out_file, in_file])
    subprocess.check_call(arguments)
    return out_file

def only_unmapped(in_file, out_file=None):
    if out_file is None:
        out_file = append_stem(in_file, "unmapped")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        # -f 4 keeps only reads with the unmapped flag set; write to the
        # transactional temporary file so the transaction can move it in place
        sh.samtools.view(in_file, h=True, S=True, f=4, o=tmp_out_file)
    return out_file

def run_with_config(first, second=None, config=None):
    if second:
        out_files = run_as_pe(first, second, config)
        return out_files
    else:
        out_file = run_as_se(first, config)
        return out_file

def _run_vep(self, in_file):
    out_file = append_stem(in_file, "vep")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        sh.perl(self.vep, "-i", in_file, "-o", tmp_out_file,
                species=self.species, _convert_underscore=False,
                **self.options)
    return out_file

def out_file(self, in_file):
    """
    returns the expected output file name from the in_file
    example: "control_1.fastq" -> "control_1.groom.fastq"
    """
    results_dir = self.config["dirs"].get("results", "results")
    stage_dir = os.path.join(results_dir, self.stage)
    out_file = append_stem(os.path.basename(in_file), "groom")
    return os.path.join(stage_dir, out_file)

def run(in_file, end="se", qual="sanger", l="20", out_file=None):
    if not out_file:
        out_file = append_stem(in_file, "trimmed")
    if os.path.exists(out_file):
        return out_file
    # -t takes the quality encoding (e.g. "sanger") and -l the length cutoff;
    # no numeric -q cutoff is passed, so sickle uses its default quality
    # threshold
    cmd = ["sickle", end, "-f", in_file, "-o", out_file,
           "-t", qual, "-l", l]
    subprocess.check_call(cmd)
    return out_file

def sort_vcf(in_file):
    from bipy.utils import append_stem
    from bcbio.distributed.transaction import file_transaction
    from bcbio.utils import file_exists
    import sh

    out_file = append_stem(in_file, "sorted")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        sh.vcf_sort(in_file, _out=tmp_out_file)
    return out_file

def run(input_file, jellyfish_config, config):
    # run the jellyfish counting, this produces a set of files identified
    # by out_prefix
    out_prefix = _build_output_prefix(input_file, jellyfish_config, config)
    cmd = _build_command(input_file, out_prefix, config)
    subprocess.check_call(cmd)
    # find all of the output files, combine them into one merged file and
    # return that
    out_file = append_stem(out_prefix, "combined")
    merge_cmd = _build_merge_command(out_prefix, out_file)
    subprocess.check_call(merge_cmd)
    return out_file

def _run_trim(curr_files, config):
    logger.info("Trimming poor quality ends from %s" % (str(curr_files)))
    nfiles = len(curr_files)
    min_length = str(config["stage"]["trim"].get("min_length", 20))
    pair = str(config["stage"]["trim"].get("pair", "se"))
    platform = str(config["stage"]["trim"].get("platform", "sanger"))
    out_dir = os.path.join(config["dir"]["results"], "trimmed")
    safe_makedir(out_dir)
    out_files = [append_stem(os.path.basename(x), "trim") for x in curr_files]
    out_files = [os.path.join(out_dir, x) for x in out_files]
    out_files = view.map(sickle.run, curr_files,
                         [pair] * nfiles,
                         [platform] * nfiles,
                         [min_length] * nfiles,
                         out_files)
    return out_files

def make_test(in_file, config, lines=1000000):
    """
    take a small subset of the input files for testing. only makes sense
    for text files where lines gives an appropriate number of records,
    for example, FASTQ files should be a multiple of 4.
    """
    results_dir = config["dir"]["results"]
    out_dir = os.path.join(results_dir, "test", "data")
    safe_makedir(out_dir)
    out_file = os.path.join(out_dir,
                            append_stem(os.path.basename(in_file), "test"))
    with open(in_file) as in_handle, open(out_file, "w") as out_handle:
        for line in islice(in_handle, lines):
            out_handle.write(line)
    return out_file

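# A minimal usage sketch (the path and config dict are placeholders): keep the
# first 250,000 FASTQ records by taking 1,000,000 lines, since each FASTQ
# record is exactly 4 lines.
def _example_make_test(config):
    return make_test("sample_1.fastq", config, lines=250000 * 4)
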
def annotate_table_with_biomart(in_file, join_column, filter_type, organism,
                                out_file=None):
    """
    join_column is the column to perform the biomaRt lookups on,
    filter_type describes the type of the join_column (see the getBM
    documentation in R for details), and organism is the english name of
    the organism
    example: annotate_table_with_biomart(in_file, "id", "ensembl_gene_id",
                                         "human")
    """
    if organism not in ORG_TO_ENSEMBL:
        logger.error("organism not supported")
        exit(1)
    logger.info("Annotating %s." % (organism))
    if not out_file:
        out_file = append_stem(in_file, "annotated")
    if os.path.exists(out_file):
        return out_file
    # use biomaRt to annotate the data file
    r = robjects.r
    r.assign('join_column', join_column)
    r.assign('in_file', in_file)
    r.assign('out_file', out_file)
    r.assign('ensembl_gene', ORG_TO_ENSEMBL[organism]["gene_ensembl"])
    r.assign('gene_symbol', ORG_TO_ENSEMBL[organism]["gene_symbol"])
    r.assign('filter_type', filter_type)
    r('''
    library(biomaRt)
    ensembl = useMart("ensembl", dataset = ensembl_gene)
    d = read.table(in_file, header=TRUE)
    a = getBM(attributes=c(filter_type, gene_symbol, "description"),
              filters=c(filter_type), values=d[,join_column], mart=ensembl)
    m = merge(d, a, by.x=join_column, by.y=filter_type)
    write.table(m, out_file, quote=FALSE, row.names=FALSE, sep="\t")
    ''')
    return out_file

def filter_single_reads_by_length(in_file, min_length=30):
    """
    removes reads from a fastq file which are below a min_length in bases
    """
    logger.info("Removing reads in %s that are less than %d bases."
                % (in_file, min_length))
    quality_type = QUALITY_TYPE[DetectFastqFormat.run(in_file)[0]]
    out_file = append_stem(in_file, "fixed")
    if file_exists(out_file):
        return out_file
    in_iterator = SeqIO.parse(in_file, quality_type)
    out_iterator = (record for record in in_iterator
                    if len(record.seq) > min_length)
    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            SeqIO.write(out_iterator, out_handle, quality_type)
    return out_file

def chr_out(chrom):
    # break_dir and in_file are expected to come from the enclosing scope
    out_file = os.path.join(break_dir, append_stem(in_file, chrom))
    out_file = replace_suffix(out_file, "vcf")
    return out_file

def _build_output_file(input_file, suffix, config):
    base = os.path.basename(input_file)
    return os.path.join(config["dir"]["results"], "tagdust",
                        append_stem(base, suffix))

def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)

    # after the cluster is up, import the view to i
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    map(safe_makedir, config["dir"].values())
    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        " from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))
            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic number of 10 the length of the
            # minimum read to keep
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen,
                                 ["sanger"] * nlen,
                                 [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map()
            """
            filtered_fastq = view.map(filter_seqio, tagdust_outputs,
                                      [lf] * len(tagdust_outputs),
                                      ["filt"] * len(tagdust_outputs),
                                      ["fastq"] * len(tagdust_outputs))"""
            out_files = [append_stem(os.path.basename(input_file[0]), "filt")
                         for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]
            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]
            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")

            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab file of the counts at the end
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function, kind="fastq"),
                                 {}) for x in curr_files]
                df = pd.DataFrame(counts,
                                  index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align, curr_files,
                                    [pair_file] * nlen,
                                    [ref_file] * nlen,
                                    [out_base] * nlen,
                                    [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles),
                                 bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and setting
            # them to read only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)
            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s"
                        % (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir, os.path.basename(x))
                         for x in out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf), out_files)

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)

        if stage == "bedtools_intersect":
            bedfiles = config["stage"]["bedtools_intersect"].get("bed", None)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [os.path.join(out_dir, os.path.basename(x))
                             for x in out_files]
                out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                             for x in out_files]
                out_files = [".".join([x, "bam"]) for x in out_files]
                test_out = map(bedtools.intersectbam2bed, sorted_bf,
                               [bedfile] * len(sorted_bf),
                               [False] * len(sorted_bf), out_files)
                count_files = [replace_suffix(x, "stats") for x in out_files]
                map(write_ratios, sorted_bf, out_files, count_files)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)

    stop_cluster()

def coordinate_sort_sam(in_file, config, out_file=None):
    if out_file is None:
        out_file = append_stem(in_file, "sorted")
    picard = BroadRunner(config["program"]["picard"], None, {"algorithm": {}})
    picardrun.picard_sort(picard, in_file, sort_order="coordinate",
                          out_file=out_file)
    return out_file

def sortsam(in_file, out_file=None):
    if out_file is None:
        out_file = append_stem(in_file, "sorted")
    with file_transaction(out_file) as tmp_out_file:
        sort = sh.sort.bake(s=True, k="1,1", _out=tmp_out_file)
        sort(in_file)
    return out_file

def run_as_se(first, config):
    first_out = append_stem(first, "sickle")
    pass

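# run_as_se is only a stub above; a minimal sketch of how it could be filled
# in by mirroring run_as_pe, assuming the same _get_* helpers and sickle's
# single-end flags (-f/-t/-o/-q/-l). This is an illustration, not the
# project's implementation.
def _run_as_se_sketch(first, config):
    first_out = append_stem(first, "sickle")
    if os.path.exists(first_out):
        return first_out
    sh.sickle("se", f=first, l=_get_length_cutoff(config),
              q=_get_quality_cutoff(config), t=_get_quality_type(first),
              o=first_out)
    return first_out
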
def out_file(self, in_file):
    results_dir = self.config["dir"].get("results", "results")
    out_dir = os.path.join(results_dir, self.stage)
    out_base = append_stem(os.path.basename(in_file), "clip")
    return os.path.join(out_dir, out_base)