def _get_adapters(self, chemistry):
    adapters = [ADAPTERS.get(x, []) for x in chemistry]
    adapters += self.user_adapters
    adapters = list(flatten(adapters))
    adapters += self._rc_adapters(adapters)
    adapter_args = [["-a", adapter] for adapter in adapters]
    return list(flatten(adapter_args))
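# All of these builders lean on a flatten() helper (imported from bipy.utils
# in the original project). A minimal sketch of the assumed behavior, written
# for Python 2 to match the surrounding code: recursively walk nested
# iterables, yielding leaves, while treating strings as atoms.
def flatten(items):
    for item in items:
        # strings are iterable too, but should be yielded whole
        if hasattr(item, "__iter__") and not isinstance(item, basestring):
            for sub in flatten(item):
                yield sub
        else:
            yield item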
def _build_command(input_file, fastqc_config, config):
    program = fastqc_config["program"]
    options = map(str, list(flatten(fastqc_config["options"])))
    outdir = _make_outdir(config)
    options += ["--outdir", outdir, "--kmers", "6"]
    cmd = list(flatten([program, options, input_file]))
    return cmd
def _build_command(input_file, options, control_file=None, out_dir=None):
    name = remove_suffix(os.path.basename(input_file))
    #if out_dir:
    #    name = os.path.join(out_dir, name)
    options = ["=".join(map(str, x)) for x in options]
    cmd = ["macs14", "--treatment=" + input_file, flatten(options),
           "--name=" + name]
    if control_file:
        cmd += ["--control=" + control_file]
    return map(str, flatten(cmd))
def _build_command(input_file, ref, novoalign_config):
    cmd = [which("novoalign"), flatten_options(novoalign_config),
           "-o", "SAM", "-d", ref, "-f", input_file]
    return list(map(str, flatten(cmd)))
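# A hedged usage sketch. flatten_options() is assumed to expand the stage
# config into a flat list of flags (here ["-r", "Random"]); the config shape
# and file names are illustrative, not from the source.
novoalign_config = {"options": {"-r": "Random"}}
cmd = _build_command("sample.fastq", "genome.ndx", novoalign_config)
# e.g. ["/usr/local/bin/novoalign", "-r", "Random", "-o", "SAM",
#       "-d", "genome.ndx", "-f", "sample.fastq"]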
def main(config, view):
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for project
    human_input = find_sam_files(config["input_dir_human"])
    mouse_input = find_sam_files(config["input_dir_mouse"])
    if len(human_input) != len(mouse_input):
        logger.error("The number of human SAM files does not match the "
                     "number of mouse SAM files, aborting.")
        sys.exit(1)
    input_files = zip(human_input, mouse_input)
    curr_files = input_files
    logger.info("Running disambiguation pipeline on %s." % (curr_files))
    for stage in config["run"]:
        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = Disambiguate(config)
            out_files = list(flatten(view.map(disambiguate, curr_files)))
            bam_files = view.map(sam.sam2bam, out_files)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)
def _disambiguate_out(self, in_tuple):
    """ returns the set of output filenames that will be made from running
    disambiguate on the tuple of input files """
    return list(flatten(map(self._organism_files, in_tuple, self.organisms)))
def input_files_from_dir(in_dir, id_file):
    with open(os.path.join(in_dir, id_file)) as in_handle:
        ids = yaml.load(in_handle)
    sample_names = [x for x in ids]
    samples = [glob.glob(in_dir + "/*_%s.R1.fastq" % (x))
               for x in sample_names]
    return list(flatten(samples))
def _build_command(input_file, tagdust_config, config):
    cl = [tagdust_config["program"], flatten_options(tagdust_config)]
    if "clean" in tagdust_config["keep"]:
        cl += ["-o", _build_output_file(input_file, "clean", config)]
    if "dirty" in tagdust_config["keep"]:
        cl += ["-a", _build_output_file(input_file, "dirty", config)]
    cl += [tagdust_config["contaminants"], input_file]
    return list(map(str, flatten(cl)))
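# A hedged sketch of the config shape the tagdust builder reads. The keys
# (program, keep, contaminants) come straight from the code above; the
# values and the resulting command are illustrative.
tagdust_config = {
    "program": "tagdust",
    "keep": ["clean"],          # "dirty" would also emit the -a output
    "contaminants": "contaminants.fa",
}
# _build_command("sample.fastq", tagdust_config, config) then yields roughly:
# ["tagdust", "-o", "results/sample.clean.fastq",
#  "contaminants.fa", "sample.fastq"]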
def _find_input_files(config):
    """ find all of the fastq files by identifier """
    input_dirs = config["input_dirs"]
    identifier = config["sample_parse"]["identifier"]
    input_files = [glob.glob(os.path.join(config["dir"]["data"], input_dir,
                                          identifier))
                   for input_dir in input_dirs]
    return list(flatten(input_files))
def _parse(config):
    # handle the adapters, defaulting to illumina and a poly-A trimmer
    # if none are provided
    adapters = []
    adapters += flatten(map(_get_adapter, config.get("adapters", [])))
    # add built-in platform adapters if available
    platform = config.get("platform", None)
    if platform:
        adapters += flatten(map(_get_platform_adapters,
                                [p for p in platform if p in ADAPTERS]))
    # default to illumina and poly-A
    if not adapters:
        adapters += flatten(map(_get_platform_adapters,
                                [p for p in ["illumina", "polya"]]))
    arguments = []
    arguments += adapters
    # grab everything else
    arguments += config.get("options", [])
    return map(str, list(flatten(arguments)))
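# A hedged sketch of how _parse() might be driven. The keys mirror the ones
# read above; the adapter sequence in the output is illustrative, and the
# option pairs pass through untouched apart from flattening and str().
config = {
    "platform": ["illumina"],
    "options": [["-q", 20], ["--minimum-length", 25]],
}
args = _parse(config)
# e.g. ["-a", "AGATCGGAAGAGC", "-q", "20", "--minimum-length", "25"]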
def multi_intersect(in_files, options=None, out_file=None):
    """ reports the intersection of multiple bed files """
    if options is None:
        options = []
    cmd = ["multiIntersectBed", options, "-i", in_files]
    cmd = flatten(cmd)
    cmd = map(str, cmd)
    out_file = _run_command(in_files, cmd, suffix=".intersect.bed",
                            out_file=out_file)
    return out_file
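# Hedged usage sketch: multiIntersectBed expects -i followed by two or more
# sorted BED files. -header is a real multiIntersectBed flag; the file names
# are illustrative.
replicate_peaks = ["rep1.bed", "rep2.bed", "rep3.bed"]
common_regions = multi_intersect(replicate_peaks, options=["-header"])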
def _build_command(in_file, stage_config, config):
    cmd = ["java", "-jar", stage_config["program"]]
    out_dir = os.path.join(config["dir"]["results"],
                           stage_config.get("name", "rna_seqc"),
                           get_stem(in_file))
    safe_makedir(out_dir)
    cmd += ["-o", out_dir]
    sample = "|".join([get_stem(in_file), in_file, "rna_seqc"])
    cmd += ["-s", sample]
    cmd += ["-r", config["ref_fasta"]]
    cmd += ["-t", config["gtf"]]
    cmd += [stage_config.get("options", [])]
    return list(flatten(cmd))
def run(input_file, gtf_file, options=None, out_file=None):
    if options is None:
        options = []
    if out_file is None:
        out_file = _get_outfilename(input_file)
    safe_makedir(os.path.dirname(out_file))
    if file_exists(out_file):
        return out_file
    cmd = map(str, flatten(["htseq-count", options, input_file, gtf_file]))
    with open(out_file, "w") as out_handle:
        subprocess.check_call(cmd, stdout=out_handle)
    return out_file
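# Hedged usage sketch: count reads per feature from an alignment against a
# GTF annotation. --stranded=no is a real htseq-count flag; the file names
# are illustrative. Since the function returns early when out_file already
# exists, reruns are cheap.
counts_file = run("sample.sam", "annotation.gtf", options=["--stranded=no"])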
def __call__(self, in_file):
    raise NotImplementedError("Waiting to hear back from maintainer to "
                              "handle multiple adapters before finishing.")
    adapters = list(flatten(map(self.get_adapters, self.chemistry)))
    # if it is a list, assume these are pairs
    if isinstance(in_file, list):
        out_files = map(self._in2out, in_file)
        if all(map(file_exists, out_files)):
            return out_files
        self.trim_galore(in_file, self.options, adapters, paired=True)
        return out_files
    # if it is only one file, just run it
    else:
        out_file = self._in2out(in_file)
        if file_exists(out_file):
            return out_file
        self.trim_galore(in_file, self.options, adapters)
        return out_file
def _build_command(input_file, novoindex_config, output_file):
    options = novoindex_config["options"].items()
    cmd = map(str, flatten(["novoindex", options, output_file, input_file]))
    return cmd
def get_adapters(self, chemistry):
    return list(flatten([["-a", x] for x in ADAPTERS.get(chemistry, [])]))
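# Hedged sketch of the expected output, assuming a hypothetical ADAPTERS
# table: each adapter sequence gets its own -a flag, matching cutadapt's
# convention of repeating -a once per adapter.
ADAPTERS = {"polya": ["AAAAAAAAAA"], "illumina": ["AGATCGGAAGAGC"]}
# self.get_adapters("illumina") -> ["-a", "AGATCGGAAGAGC"]
# self.get_adapters("unknown")  -> []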
def main(config_file):
    if config_file:
        with open(config_file) as in_handle:
            config = yaml.load(in_handle)
    dirs = config["in_dir"]
    conditions = config["conditions"]
    glob_string = config["glob_string"]
    files = list(flatten([glob.glob(os.path.join(x, glob_string))
                          for x in dirs]))
    out_dir = config["dir"]["results"]
    safe_makedir(out_dir)
    curr_files = []
    for condition in conditions:
        condition_files = [x for x in files if condition in x]
        out_file = os.path.join(out_dir, condition + "_v2_v3.bam")
        print "Combining %s into %s." % (condition_files, out_file)
        sh.samtools.merge(list(flatten([out_file, condition_files])))
        #bsub_call = list(flatten(["-q", "hsph", "-o", "out" + condition,
        #                          "-e", "err" + condition, "samtools",
        #                          "merge", out_file, condition_files]))
        #sh.bsub(bsub_call)
        sorted_prefix = remove_suffix(out_file) + ".sorted"
        sorted_file = sorted_prefix + ".bam"
        sh.samtools.sort(out_file, sorted_prefix)
        sh.samtools.index(sorted_file)
        mapped_file = append_stem(sorted_file, "mapped")
        sh.samtools.view(sorted_file, F=4, b=True, o=mapped_file)
        sh.samtools.index(mapped_file)
        # find the reads that don't intersect with the rRNA
        in_file = mapped_file
        out_file = os.path.join(out_dir, condition + "_noribo" + "_v2_v3.bam")
        ribo = config["ribo"]
        print "Filtering %s for rRNA in %s into %s." % (in_file, ribo,
                                                        out_file)
        sh.bedtools.intersect("-abam", in_file, "-v", "-b", ribo,
                              _out=out_file)
        filtered_file = out_file
        print "Calculating RNASeq metrics on %s." % (out_file)
        in_file = out_file
        ref = blastn.prepare_ref_file(config["stage"]["new_coverage"]["ref"],
                                      config)
        ribo = config["stage"]["new_coverage"]["ribo"]
        picard = BroadRunner(config["program"]["picard"])
        # use a separate variable so out_dir keeps pointing at the merge
        # directory on the next loop iteration
        coverage_dir = os.path.join(config["dir"]["results"], "new_coverage")
        safe_makedir(coverage_dir)
        out_file = replace_suffix(os.path.basename(in_file), "metrics")
        out_file = os.path.join(coverage_dir, out_file)
        metrics_file = picardrun.picard_rnaseq_metrics(picard, in_file, ref,
                                                       ribo, out_file)
        jelly_dir = os.path.join(config["dir"]["results"], "jellyfish")
        safe_makedir(jelly_dir)
        # convert the filtered file to fastq for jellyfish counting
        fastq_file = os.path.join(jelly_dir,
                                  os.path.basename(replace_suffix(
                                      filtered_file, "fastq")))
        sh.bam2fastx(filtered_file, fastq=True, _out=fastq_file)
        for mer in config["stage"]["jellyfish"]["mer_lengths"]:
            base, _ = os.path.splitext(os.path.basename(fastq_file))
            out_prefix = base + "_%dmer" % (mer)
            out_file = os.path.join(jelly_dir, out_prefix)
            if not file_exists(out_file):
                sh.jellyfish.count(fastq_file,
                                   config["stage"]["jellyfish"]["options"],
                                   m=mer, o=out_file)
def main(config_file):
    # load yaml config file
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # setup logging
    setup_logging(config)
    from bipy.log import logger
    # start cluster
    start_cluster(config)
    from bipy.cluster import view

    found = sh.find(config["dir"]["data"], "-name", "Variations")
    var_dirs = [str(x).strip() for x in found]
    logger.info("Var_dirs: %s" % (var_dirs))
    in_dirs = map(os.path.dirname, var_dirs)
    logger.info("in_dirs: %s" % (in_dirs))
    # XXX for testing only load 3
    #curr_files = in_dirs[0:5]
    curr_files = in_dirs
    # run the illumina fixer
    logger.info("Running illumina fixer on %s." % (curr_files))
    illf_class = STAGE_LOOKUP.get("illumina_fixer")
    illf = illf_class(config)
    curr_files = view.map(illf, curr_files)

    # sort the vcf files
    def sort_vcf(in_file):
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh
        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file

    # combine
    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]
    # break the VCF files up by chromosome for speed
    logger.info("Breaking up %s by chromosome." % (curr_files))
    breakvcf_class = STAGE_LOOKUP.get("breakvcf")
    breakvcf = breakvcf_class(config)
    curr_files = view.map(breakvcf, curr_files)

    # run VEP on the separate files in parallel
    logger.info("Running VEP on %s." % (curr_files))
    vep_class = STAGE_LOOKUP.get("vep")
    vep = vep_class(config)
    curr_files = view.map(vep, list(flatten(curr_files)))
    curr_files = filter(file_exists, curr_files)

    # load the files into gemini, not in parallel
    # sort the vcf files
    logger.info("Sorting %s." % (curr_files))
    curr_files = view.map(sort_vcf, curr_files)
    # don't run the rest of this in parallel, so take the cluster down
    stop_cluster()

    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vep.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]
    logger.info("Loading %s into gemini." % (curr_files))
    gemini_class = STAGE_LOOKUP.get("geminiloader")
    geminiloader = gemini_class(config)
    curr_files = map(geminiloader, curr_files)
    logger.info("Run complete.")
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    # make the needed directories
    map(safe_makedir, config["dir"].values())
    # specific for thesis pipeline
    input_dirs = config["input_dirs"]
    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input files: %s" % (input_files))
    logger.info("Condition groups: %s" % (conditions))
    htseq_outdict = {}

    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        config["dir"]["results"] = condition_dir

        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = zip(*product(curr_files, [fastqc_config],
                                           [config]))
                view.map(fastqc.run, *fastqc_args)

            if stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                             [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
                curr_files = cutadapt_outputs
                logger.info("Fixing mate pair information.")
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("Forward: %s" % (first))
                logger.info("Reverse: %s" % (second))
                fixed = view.map(fastq.fix_mate_pairs_with_config,
                                 first, second, [config] * len(first))
                curr_files = list(flatten(fixed))

            if stage == "sickle":
                _emit_stage_message(stage, curr_files)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                fixed = view.map(sickle.run_with_config, first, second,
                                 [config] * len(first))
                curr_files = list(flatten(fixed))

            if stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("first %s" % (first))
                logger.info("second %s" % (second))
                #tophat_args = zip(*product(first, second, [config["ref"]],
                #                           ["tophat"], [config]))
                tophat_outputs = view.map(tophat.run_with_config,
                                          first, second,
                                          [config["ref"]] * len(first),
                                          ["tophat"] * len(first),
                                          [config] * len(first))
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs

            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s."
                            % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files,
                                     [ref] * nrun, [ribo] * nrun, out_files)

            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                            RPKM_count_out)
                """
                annotate_args = zip(*product(RPKM_count_fixed,
                                             ["gene_id"],
                                             ["ensembl_gene_id"],
                                             ["human"]))
                view.map(annotate.annotate_table_with_biomart,
                         *annotate_args)
                """
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs

    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)
    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir,
                                    comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f, None, out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            logger.info("Running deseq on %s with conditions %s "
                        "and writing to %s" % (combined_out, conditions,
                                               deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            annotate.annotate_table_with_biomart(deseq_out[0], "id",
                                                 "ensembl_gene_id", "human")
            #annotated_file = view.map(annotate.annotate_table_with_biomart,
            #                          [deseq_out],
            #                          ["id"],
            #                          ["ensembl_gene_id"],
            #                          ["human"])
    # end gracefully
    stop_cluster()
def _build_command(input_file, out_prefix, jellyfish_config):
    cmd = ["jellyfish", jellyfish_config["task"], jellyfish_config["options"]]
    cmd += ["-o", out_prefix, input_file]
    return list(flatten(cmd))
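# Hedged usage sketch with real jellyfish count flags (-m mer length,
# -s hash size); the task and file names are illustrative. Note the builder
# does not stringify arguments, so option values are given as strings here.
jellyfish_config = {"task": "count", "options": ["-m", "21", "-s", "100M"]}
cmd = _build_command("reads.fastq", "reads_21mer", jellyfish_config)
# -> ["jellyfish", "count", "-m", "21", "-s", "100M",
#     "-o", "reads_21mer", "reads.fastq"]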
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    data_dir = config["dir"]["data"]
    from bipy.cluster import view
    input_files = [glob.glob(os.path.join(data_dir, x, "*_rep*"))
                   for x in config["input_dirs"]]
    input_files = list(flatten(input_files))
    logger.info("Input files to process: %s" % (input_files))
    results_dir = config["dir"]["results"]
    map(safe_makedir, config["dir"].values())
    curr_files = input_files

    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)

        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run, curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs

        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            #db = novoindex.run(config["ref"],
            #                   _get_stage_config(config, "novoindex"),
            #                   config)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            picard = BroadRunner(config["program"]["picard"])
            args = zip(*itertools.product([picard], novoalign_outputs))
            # convert to bam
            bamfiles = view.map(picardrun.picard_formatconverter, *args)
            args = zip(*itertools.product([picard], bamfiles))
            # sort bam
            sorted_bf = view.map(picardrun.picard_sort, *args)
            # index bam
            args = zip(*itertools.product([picard], sorted_bf))
            view.map(picardrun.picard_index, *args)
            curr_files = novoalign_outputs

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s" % (curr_files))
            htseq_outputs = curr_files
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names, out_file)
            rpkm = htseq_count.calculate_rpkm(combined_out,
                                              config["annotation"]["file"])
            rpkm_file = os.path.join(config["dir"]["results"], stage,
                                     "rpkm.txt")
            rpkm.to_csv(rpkm_file, sep="\t")

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)

        if stage == "deseq":
            conditions = [os.path.basename(x).split("_")[0]
                          for x in input_files]
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this comparison
                indexes = [x for x, y in enumerate(conditions)
                           if y in comparison]
                # find the htseq files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                logger.info(htseq_files)
                logger.info(htseq_columns)
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_prefix = os.path.join(out_dir, comparison_name)
                deseq_out = view.map(deseq.run, [combined_out],
                                     [deseq_conds], [deseq_prefix])
                logger.info("Annotating %s." % (deseq_out))
                annotated_file = view.map(annotate.annotate_table_with_biomart,
                                          deseq_out, ["id"],
                                          ["ensembl_gene_id"], ["human"])

        if stage == "dss":
            conditions = [os.path.basename(x).split("_")[0]
                          for x in input_files]
            dss_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in dss_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this comparison
                indexes = [x for x, y in enumerate(conditions)
                           if y in comparison]
                # find the htseq files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                dss_conds = [conditions[index] for index in indexes]
                dss_prefix = os.path.join(out_dir, comparison_name)
                logger.info("Running DSS on %s with conditions %s and "
                            "comparison %s." % (combined_out, dss_conds,
                                                comparison))
                dss_out = dss.run(combined_out, dss_conds, comparison,
                                  dss_prefix)
    stop_cluster()