def __init__(self, in_files, config):
    """Set up logging, start the cluster and queue the stages to run.

    Args:
        in_files: input files for the run; stored unchanged as
            ``self.in_files`` and wrapped in a one-element list as the
            initial ``self.curr_files``.
        config: configuration mapping; ``config["run"]`` is passed to
            ``self.setup_stages`` to build the stage queue.
    """
    self.in_files = in_files
    self.config = config
    setup_logging(self.config)
    start_cluster(self.config)
    # The view is imported only after start_cluster() has been called.
    from bipy.cluster import view
    self.view = view
    self.curr_files = [in_files]
    self.to_run = Queue()
    # Enqueue with an explicit loop: ``map`` used purely for its side
    # effects is lazy on Python 3 and would leave the queue empty.
    for stage in self.setup_stages(config["run"]):
        self.to_run.put(stage)
def test_cluster():
    """Check that mapping over the cluster view matches a serial map.

    Loads the configuration from ``CONFIG_FILE``, starts the cluster and
    applies ``mappable_function`` to ``range(32)`` both serially and through
    ``view.map``; the two results must be equal.
    """
    with open(CONFIG_FILE) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects; consider yaml.safe_load if the config
        # file is not fully trusted.
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    # The view is imported only after start_cluster() has been called.
    from bipy.cluster import view

    logger.info("Serial result")
    # Materialise the serial result: on Python 3 ``map`` returns a lazy
    # iterator, which never compares equal to a sequence of results.
    serial_result = list(map(mappable_function, range(32)))
    logger.info("Parallel result")
    parallel_result = view.map(mappable_function, range(32))
    assert serial_result == parallel_result
def test_cluster():
    """Verify that ``view.map`` gives the same answer as a serial run.

    The configuration is read from ``CONFIG_FILE``; ``mappable_function`` is
    applied to the integers 0..31 once in-process and once through the
    cluster view, and the two results are compared.
    """
    with open(CONFIG_FILE) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects; consider yaml.safe_load if the config
        # file is not fully trusted.
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    # The view is imported only after start_cluster() has been called.
    from bipy.cluster import view

    inputs = range(32)
    logger.info("Serial result")
    # Build a real list: a bare ``map`` is a lazy iterator on Python 3 and
    # would make the comparison below fail unconditionally.
    serial_result = [mappable_function(value) for value in inputs]
    logger.info("Parallel result")
    parallel_result = view.map(mappable_function, inputs)
    assert serial_result == parallel_result
def main(config_file):
    """Run tagdust over the configured input files on the cluster.

    For every stage named in ``config["run"]`` whose stage configuration has
    ``program == "tagdust"``, ``tagdust.run`` is mapped over
    ``config["input"]`` through the cluster view.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects; consider yaml.safe_load if the config
        # file is not fully trusted.
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    try:
        # The view is imported only after start_cluster() has been called.
        from bipy.cluster import view
        input_files = config["input"]
        nfiles = len(input_files)
        for stage in config["run"]:
            stage_config = config["stage"][stage]
            if stage_config["program"] == "tagdust":
                view.map(tagdust.run, input_files,
                         [stage_config] * nfiles,
                         [config] * nfiles)
    finally:
        # Always take the cluster down, even when a stage raises.
        stop_cluster()
def main(config_file):
    """Group the input BAM files by condition and run the configured stages.

    Reads the YAML configuration, starts the cluster, collects the ``*.bam``
    files under ``config["dir"]["input_dir"]`` and passes them through
    ``_merge_condition`` once for the negative group and once per test
    group.  Each stage in ``config["run"]`` is then run in order; only
    "macs" and "piranha" are recognised, other names are ignored.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects; consider yaml.safe_load if the config
        # file is not fully trusted.
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    try:
        # The view is imported only after start_cluster() has been called.
        from bipy.cluster import view
        input_dir = config["dir"]["input_dir"]
        input_files = glob.glob(os.path.join(input_dir, "*.bam"))
        # Example of running macs directly:
        #   macs.run_with_config(input_file, config, control_file=None,
        #                        stage=None)

        # first combine all the negative controls into one file
        negative_control = _merge_condition(input_files,
                                            config["groups"]["negative"])
        test_files = [_merge_condition(input_files, condition)
                      for condition in config["groups"]["test"]]
        # keep only the conditions for which a (truthy) result came back
        curr_files = [x for x in test_files if x]

        for stage in config["run"]:
            if stage == "macs":
                # every test file is run against the same negative control
                nfiles = len(curr_files)
                out_files = view.map(macs.run_with_config, curr_files,
                                     [config] * nfiles,
                                     [negative_control] * nfiles,
                                     [stage] * nfiles)
                # just use the peak files going forward
                curr_files = [x[0] for x in out_files]
            if stage == "piranha":
                piranha_runner = piranha.PiranhaStage(config)
                view.map(piranha_runner, curr_files)
    finally:
        # Always take the cluster down, even when a stage raises.
        stop_cluster()
def main(config_file):
    """Run the "macs" and "piranha" stages on condition-grouped BAM files.

    The ``*.bam`` files found in ``config["dir"]["input_dir"]`` are handed to
    ``_merge_condition`` for the negative group and for each test group;
    the stages listed in ``config["run"]`` are then executed in order
    through the cluster view.  Stage names other than "macs" and "piranha"
    are ignored.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects; consider yaml.safe_load if the config
        # file is not fully trusted.
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    try:
        # The view is imported only after start_cluster() has been called.
        from bipy.cluster import view
        bam_pattern = os.path.join(config["dir"]["input_dir"], "*.bam")
        input_files = glob.glob(bam_pattern)
        groups = config["groups"]
        # Example of running macs directly:
        #   macs.run_with_config(input_file, config, control_file=None,
        #                        stage=None)

        # first combine all the negative controls into one file
        negative_control = _merge_condition(input_files, groups["negative"])
        merged = [_merge_condition(input_files, condition)
                  for condition in groups["test"]]
        # discard falsy results so later stages only see real files
        curr_files = [path for path in merged if path]

        for stage in config["run"]:
            if stage == "macs":
                # the shared negative control is passed with every file
                nfiles = len(curr_files)
                macs_outputs = view.map(macs.run_with_config, curr_files,
                                        [config] * nfiles,
                                        [negative_control] * nfiles,
                                        [stage] * nfiles)
                # just use the peak files going forward
                curr_files = [result[0] for result in macs_outputs]
            elif stage == "piranha":
                piranha_runner = piranha.PiranhaStage(config)
                view.map(piranha_runner, curr_files)
    finally:
        # Always take the cluster down, even when a stage raises.
        stop_cluster()
def main(config_file):
    """Run the configured RNA-seq stages on the cluster.

    Loads the YAML configuration at ``config_file``, sets up logging, starts
    the cluster and walks ``config["run"]`` in order.  Recognised stage
    names are "fastqc", "cutadapt", "novoalign", "htseq-count", "coverage"
    and "deseq"; any other name is ignored.  "cutadapt" and "novoalign"
    replace the working file list used by the stages after them.

    The "deseq" stage reuses ``htseq_outputs`` and ``column_names`` set by
    "htseq-count", so "htseq-count" must appear earlier in ``config["run"]``.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects; consider yaml.safe_load if the config
        # file is not fully trusted.
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    try:
        # The view is imported only after start_cluster() has been called.
        from bipy.cluster import view
        # view.push({'logger': logger})
        input_files = [os.path.join(config["dir"]["data"], x)
                       for x in config["input"]]
        results_dir = config["dir"]["results"]
        # Create every configured directory.  A plain loop is used because
        # ``map`` for side effects does nothing on Python 3 (lazy iterator).
        for directory in config["dir"].values():
            safe_makedir(directory)

        curr_files = input_files

        for stage in config["run"]:
            if stage == "fastqc":
                nfiles = len(curr_files)
                logger.info("Running %s on %s" % (stage, str(curr_files)))
                fastqc_config = _get_stage_config(config, stage)
                fastqc_outputs = view.map(fastqc.run, curr_files,
                                          [fastqc_config] * nfiles,
                                          [config] * nfiles)

            if stage == "cutadapt":
                nfiles = len(curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_outputs = view.map(cutadapt_tool.run,
                                            curr_files,
                                            [cutadapt_config] * nfiles,
                                            [config] * nfiles)
                curr_files = cutadapt_outputs

            if stage == "novoalign":
                nfiles = len(curr_files)
                novoalign_config = _get_stage_config(config, stage)
                #db = novoindex.run(config["ref"],
                #                   _get_stage_config(config, "novoindex"),
                #                   config)
                db = config["genome"]["file"]
                novoalign_outputs = view.map(novoalign.run, curr_files,
                                             [db] * nfiles,
                                             [novoalign_config] * nfiles,
                                             [config] * nfiles)
                picard = BroadRunner(config["program"]["picard"])
                # product + zip(*) yields two parallel sequences: the
                # runner repeated once per file, and the files themselves.
                args = zip(*itertools.product([picard], novoalign_outputs))
                # convert to bam
                bamfiles = view.map(picardrun.picard_formatconverter, *args)
                args = zip(*itertools.product([picard], bamfiles))
                # sort bam
                sorted_bf = view.map(picardrun.picard_sort, *args)
                # index bam
                args = zip(*itertools.product([picard], sorted_bf))
                view.map(picardrun.picard_index, *args)
                # later stages continue from the novoalign outputs, not
                # from the sorted BAM files
                curr_files = novoalign_outputs

            if stage == "htseq-count":
                nfiles = len(curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         curr_files,
                                         [config] * nfiles,
                                         [stage] * nfiles)
                column_names = _get_short_names(input_files)
                logger.info("Column names: %s" % (column_names))
                out_file = os.path.join(config["dir"]["results"], stage,
                                        "combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)
                rpkm = htseq_count.calculate_rpkm(
                    combined_out, config["annotation"]["file"])
                rpkm_file = os.path.join(config["dir"]["results"], stage,
                                         "rpkm.txt")
                rpkm.to_csv(rpkm_file, sep="\t")

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                              config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)

            if stage == "deseq":
                # the condition is the part of the file name before the
                # first underscore
                conditions = [os.path.basename(x).split("_")[0]
                              for x in input_files]
                deseq_config = _get_stage_config(config, stage)
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                for comparison in deseq_config["comparisons"]:
                    comparison_name = "_vs_".join(comparison)
                    out_dir = os.path.join(results_dir, stage,
                                           comparison_name)
                    safe_makedir(out_dir)
                    # get the indexes of the conditions that match this
                    # comparison
                    indexes = [x for x, y in enumerate(conditions)
                               if y in comparison]
                    # find the htseq_files to combine and combine them
                    htseq_files = [htseq_outputs[index] for index in indexes]
                    htseq_columns = [column_names[index]
                                     for index in indexes]
                    logger.info(htseq_files)
                    logger.info(htseq_columns)
                    out_file = os.path.join(out_dir,
                                            comparison_name + ".counts.txt")
                    combined_out = htseq_count.combine_counts(htseq_files,
                                                              htseq_columns,
                                                              out_file)
                    deseq_conds = [conditions[index] for index in indexes]
                    deseq_prefix = os.path.join(out_dir, comparison_name)

                    deseq_out = view.map(deseq.run, [combined_out],
                                         [deseq_conds], [deseq_prefix])
                    logger.info("Annotating %s." % (deseq_out))
                    annotated_file = view.map(
                        annotate.annotate_table_with_biomart,
                        deseq_out,
                        ["id"],
                        ["ensembl_gene_id"],
                        ["human"])
    finally:
        # Always take the cluster down, even when a stage raises.
        stop_cluster()
def _args_valid_for_local(args):
    """Return True when neither a scheduler nor a queue was requested.

    Args:
        args: parsed arguments exposing ``scheduler`` and ``queue``.
    """
    return not args.scheduler and not args.queue


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='generic launcher')
    parser.add_argument('--profile', required=True,
                        help="IPython profile name to use")
    parser.add_argument('--cores', required=True,
                        help="Number of IPython engines to start.")
    parser.add_argument('--queue', help="Name of queue to use.")
    parser.add_argument('--scheduler', default="",
                        help="Name of scheduler to use (LSF or SGE)")
    args = parser.parse_args()
    cluster_config = {"cluster":
                      {"profile": args.profile,
                       "cores": int(args.cores),
                       "queue": args.queue,
                       "scheduler": args.scheduler}}
    setup_logging(cluster_config)

    if _args_valid_for_scheduler(args) or _args_valid_for_local(args):
        start_cluster(cluster_config)
        # NOTE(review): ``view`` is not used below; presumably main() reads
        # it as a module-level name -- confirm before removing.
        from bipy.cluster import view
        try:
            main()
        finally:
            # Always take the cluster down, even when main() raises.
            stop_cluster()
    else:
        # ``parser.usage`` is a plain attribute (None unless supplied), so
        # calling it raises TypeError; print_usage() is the argparse API
        # for emitting the usage line.
        parser.print_usage()
        sys.exit(1)
def main(config_file):
    """Process variant directories on the cluster and load them into gemini.

    Steps, in order (names taken from the log messages below):

    1. find every "Variations" entry under ``config["dir"]["data"]`` and use
       its parent directory as input;
    2. run the "illumina_fixer" stage on each input in parallel;
    3. combine the results into ``all_combined.vcf`` (skipped if that file
       already exists);
    4. run the "breakvcf" and "vep" stages in parallel, keep the outputs
       that exist, and sort them with ``vcf_sort``;
    5. stop the cluster, combine again into ``all_combined.vep.vcf`` and run
       the "geminiloader" stage serially.

    Args:
        config_file: path to the YAML configuration file.
    """
    # load yaml config file
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects; consider yaml.safe_load if the config
        # file is not fully trusted.
        config = yaml.load(in_handle)

    # setup logging
    setup_logging(config)
    from bipy.log import logger

    def sort_vcf(in_file):
        """Sort ``in_file`` with vcf_sort, reusing an existing sorted file."""
        # Imports live inside the function, which is the callable handed
        # to view.map below.
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh

        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file

    results_root = config["dir"].get("results", "results")

    # start cluster
    start_cluster(config)
    try:
        # The view is imported only after start_cluster() has been called.
        from bipy.cluster import view

        found = sh.find(config["dir"]["data"], "-name", "Variations")
        var_dirs = [str(x).strip() for x in found]
        logger.info("Var_dirs: %s" % (var_dirs))
        # Real lists (not lazy map/filter objects) are used throughout so
        # that the values log readably and can be consumed more than once.
        in_dirs = [os.path.dirname(x) for x in var_dirs]
        logger.info("in_dirs: %s" % (in_dirs))
        # XXX for testing only load 3
        #curr_files = in_dirs[0:5]
        curr_files = in_dirs

        # run the illumina fixer
        logger.info("Running illumina fixer on %s." % (curr_files))
        illf_class = STAGE_LOOKUP.get("illumina_fixer")
        illf = illf_class(config)
        curr_files = view.map(illf, curr_files)

        # combine
        out_file = os.path.join(results_root, "geminiloader",
                                "all_combined.vcf")
        logger.info("Combining files %s into %s." % (curr_files, out_file))
        if file_exists(out_file):
            curr_files = [out_file]
        else:
            curr_files = [genotype.combine_variant_files(
                curr_files, out_file, config["ref"]["fasta"], config)]

        # break the VCF files up by chromosome for speed
        logger.info("Breaking up %s by chromosome." % (curr_files))
        breakvcf_class = STAGE_LOOKUP.get("breakvcf")
        breakvcf = breakvcf_class(config)
        curr_files = view.map(breakvcf, curr_files)

        # run VEP on the separate files in parallel
        logger.info("Running VEP on %s." % (curr_files))
        vep_class = STAGE_LOOKUP.get("vep")
        vep = vep_class(config)
        curr_files = view.map(vep, list(flatten(curr_files)))
        curr_files = [x for x in curr_files if file_exists(x)]

        # sort the vcf files
        logger.info("Sorting %s." % (curr_files))
        curr_files = view.map(sort_vcf, curr_files)
    finally:
        # don't run the rest of this in parallel, so take the cluster down
        # (also on error, so it is never left running)
        stop_cluster()

    out_file = os.path.join(results_root, "geminiloader",
                            "all_combined.vep.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(
            curr_files, out_file, config["ref"]["fasta"], config)]

    # load the files into gemini, not in parallel
    logger.info("Loading %s into gemini." % (curr_files))
    gemini_class = STAGE_LOOKUP.get("geminiloader")
    geminiloader = gemini_class(config)
    # A list comprehension forces the loader to run; a bare ``map`` would
    # be lazy on Python 3 and never call it.
    curr_files = [geminiloader(x) for x in curr_files]
    logger.info("Run complete.")
view.map(rseqc.junction_annotation, *rseq_args) view.map(rseqc.junction_saturation, *rseq_args) RPKM_args = zip(*product(final_bamfiles, [config])) RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args) RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file, RPKM_count_out) """ annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"], ["ensembl_gene_id"], ["human"])) view.map(annotate.annotate_table_with_biomart, *annotate_args) """ view.map(rseqc.RPKM_saturation, *rseq_args) curr_files = tophat_outputs # end gracefully stop_cluster() if __name__ == "__main__": # read in the config file and perform initial setup main_config_file = sys.argv[1] with open(main_config_file) as config_in_handle: startup_config = yaml.load(config_in_handle) setup_logging(startup_config) start_cluster(startup_config) from bipy.cluster import view main(main_config_file)
def main(config_file):
    """Run the configured RNA-seq stages, including DESeq and DSS.

    Loads the YAML configuration at ``config_file``, sets up logging, starts
    the cluster and walks ``config["run"]`` in order.  Input files are every
    ``*_rep*`` match inside each directory listed in
    ``config["input_dirs"]`` (relative to ``config["dir"]["data"]``).
    Recognised stage names are "fastqc", "cutadapt", "novoalign",
    "htseq-count", "coverage", "deseq" and "dss"; other names are ignored.

    "deseq" and "dss" reuse ``htseq_outputs`` and ``column_names`` set by
    "htseq-count", so "htseq-count" must appear earlier in ``config["run"]``.

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects; consider yaml.safe_load if the config
        # file is not fully trusted.
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    try:
        data_dir = config["dir"]["data"]
        # The view is imported only after start_cluster() has been called.
        from bipy.cluster import view
        input_files = [glob.glob(os.path.join(data_dir, x, "*_rep*"))
                       for x in config["input_dirs"]]
        input_files = list(flatten(input_files))
        logger.info("Input files to process: %s" % (input_files))
        results_dir = config["dir"]["results"]

        # Create every configured directory.  A plain loop is used because
        # ``map`` for side effects does nothing on Python 3 (lazy iterator).
        for directory in config["dir"].values():
            safe_makedir(directory)

        curr_files = input_files

        for stage in config["run"]:
            if stage == "fastqc":
                nfiles = len(curr_files)
                logger.info("Running %s on %s" % (stage, str(curr_files)))
                fastqc_config = _get_stage_config(config, stage)
                fastqc_outputs = view.map(fastqc.run, curr_files,
                                          [fastqc_config] * nfiles,
                                          [config] * nfiles)

            if stage == "cutadapt":
                nfiles = len(curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_outputs = view.map(cutadapt_tool.run,
                                            curr_files,
                                            [cutadapt_config] * nfiles,
                                            [config] * nfiles)
                curr_files = cutadapt_outputs

            if stage == "novoalign":
                nfiles = len(curr_files)
                novoalign_config = _get_stage_config(config, stage)
                #db = novoindex.run(config["ref"],
                #                   _get_stage_config(config, "novoindex"),
                #                   config)
                db = config["genome"]["file"]
                novoalign_outputs = view.map(novoalign.run, curr_files,
                                             [db] * nfiles,
                                             [novoalign_config] * nfiles,
                                             [config] * nfiles)
                picard = BroadRunner(config["program"]["picard"])
                # product + zip(*) yields two parallel sequences: the
                # runner repeated once per file, and the files themselves.
                args = zip(*itertools.product([picard], novoalign_outputs))
                # convert to bam
                bamfiles = view.map(picardrun.picard_formatconverter, *args)
                args = zip(*itertools.product([picard], bamfiles))
                # sort bam
                sorted_bf = view.map(picardrun.picard_sort, *args)
                # index bam
                args = zip(*itertools.product([picard], sorted_bf))
                view.map(picardrun.picard_index, *args)
                # later stages continue from the novoalign outputs, not
                # from the sorted BAM files
                curr_files = novoalign_outputs

            if stage == "htseq-count":
                # NOTE(review): nothing is counted here -- the current files
                # are taken as already-computed htseq-count outputs.
                logger.info("Running htseq-count on %s" %(curr_files))
                htseq_outputs = curr_files
                column_names = _get_short_names(input_files)
                logger.info("Column names: %s" % (column_names))
                out_file = os.path.join(config["dir"]["results"], stage,
                                        "combined.counts")
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)
                rpkm = htseq_count.calculate_rpkm(
                    combined_out, config["annotation"]["file"])
                rpkm_file = os.path.join(config["dir"]["results"], stage,
                                         "rpkm.txt")
                rpkm.to_csv(rpkm_file, sep="\t")

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                              config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)

            if stage == "deseq":
                # the condition is the part of the file name before the
                # first underscore
                conditions = [os.path.basename(x).split("_")[0]
                              for x in input_files]
                deseq_config = _get_stage_config(config, stage)
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                for comparison in deseq_config["comparisons"]:
                    comparison_name = "_vs_".join(comparison)
                    out_dir = os.path.join(results_dir, stage,
                                           comparison_name)
                    safe_makedir(out_dir)
                    # get the indexes of the conditions that match this
                    # comparison
                    indexes = [x for x, y in enumerate(conditions)
                               if y in comparison]
                    # find the htseq_files to combine and combine them
                    htseq_files = [htseq_outputs[index] for index in indexes]
                    htseq_columns = [column_names[index]
                                     for index in indexes]
                    logger.info(htseq_files)
                    logger.info(htseq_columns)
                    out_file = os.path.join(out_dir,
                                            comparison_name + ".counts.txt")
                    combined_out = htseq_count.combine_counts(htseq_files,
                                                              htseq_columns,
                                                              out_file)
                    deseq_conds = [conditions[index] for index in indexes]
                    deseq_prefix = os.path.join(out_dir, comparison_name)

                    deseq_out = view.map(deseq.run, [combined_out],
                                         [deseq_conds], [deseq_prefix])
                    logger.info("Annotating %s." % (deseq_out))
                    annotated_file = view.map(
                        annotate.annotate_table_with_biomart,
                        deseq_out,
                        ["id"],
                        ["ensembl_gene_id"],
                        ["human"])

            if stage == "dss":
                # the condition is the part of the file name before the
                # first underscore
                conditions = [os.path.basename(x).split("_")[0]
                              for x in input_files]
                dss_config = _get_stage_config(config, stage)
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                for comparison in dss_config["comparisons"]:
                    comparison_name = "_vs_".join(comparison)
                    out_dir = os.path.join(results_dir, stage,
                                           comparison_name)
                    safe_makedir(out_dir)
                    # get the indexes of the conditions that match this
                    # comparison
                    indexes = [x for x, y in enumerate(conditions)
                               if y in comparison]
                    # find the htseq_files to combine and combine them
                    htseq_files = [htseq_outputs[index] for index in indexes]
                    htseq_columns = [column_names[index]
                                     for index in indexes]
                    out_file = os.path.join(out_dir,
                                            comparison_name + ".counts.txt")
                    combined_out = htseq_count.combine_counts(htseq_files,
                                                              htseq_columns,
                                                              out_file)
                    dss_conds = [conditions[index] for index in indexes]
                    dss_prefix = os.path.join(out_dir, comparison_name)
                    logger.info("Running DSS on %s with conditions %s and comparison %s." % (combined_out, dss_conds, comparison))
                    # DSS runs in-process here, not through the view
                    dss_out = dss.run(combined_out, dss_conds, comparison,
                                      dss_prefix)
    finally:
        # Always take the cluster down, even when a stage raises.
        stop_cluster()
# Generic launcher: parse the cluster options, start the cluster, run main()
# and stop the cluster again.
parser = argparse.ArgumentParser(description='generic launcher')
parser.add_argument('--profile', required=True,
                    help="IPython profile name to use")
parser.add_argument('--cores', required=True,
                    help="Number of IPython engines to start.")
parser.add_argument('--queue', help="Name of queue to use.")
parser.add_argument('--scheduler', default="",
                    help="Name of scheduler to use (LSF or SGE)")
args = parser.parse_args()
cluster_config = {
    "cluster": {
        "profile": args.profile,
        "cores": int(args.cores),
        "queue": args.queue,
        "scheduler": args.scheduler,
    }
}
setup_logging(cluster_config)

if _args_valid_for_scheduler(args) or _args_valid_for_local(args):
    start_cluster(cluster_config)
    # NOTE(review): ``view`` is not used below; presumably main() reads it
    # as a module-level name -- confirm before removing.
    from bipy.cluster import view
    try:
        main()
    finally:
        # Always take the cluster down, even when main() raises.
        stop_cluster()
else:
    # ``parser.usage`` is a plain attribute (None unless supplied), so
    # calling it raises TypeError; print_usage() is the argparse API for
    # emitting the usage line.
    parser.print_usage()
    sys.exit(1)
def main(config_file):
    """Fix, combine, annotate and sort variant files, then load into gemini.

    The parallel part runs through the cluster view: the "illumina_fixer"
    stage on every directory containing a "Variations" entry, a combine into
    ``all_combined.vcf`` (reused if present), the "breakvcf" and "vep"
    stages, and a ``vcf_sort`` of each surviving output.  The cluster is
    then stopped and the remaining work -- combining into
    ``all_combined.vep.vcf`` and the "geminiloader" stage -- runs serially.

    Args:
        config_file: path to the YAML configuration file.
    """
    # load yaml config file
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects; consider yaml.safe_load if the config
        # file is not fully trusted.
        config = yaml.load(in_handle)

    # setup logging
    setup_logging(config)
    from bipy.log import logger

    def sort_vcf(in_file):
        """Return the sorted copy of ``in_file``, creating it if missing."""
        # Imports live inside the function, which is the callable handed
        # to view.map below.
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh

        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file

    gemini_dir = os.path.join(config["dir"].get("results", "results"),
                              "geminiloader")

    # start cluster
    start_cluster(config)
    try:
        # The view is imported only after start_cluster() has been called.
        from bipy.cluster import view

        found = sh.find(config["dir"]["data"], "-name", "Variations")
        var_dirs = [str(x).strip() for x in found]
        logger.info("Var_dirs: %s" % (var_dirs))
        # Lists rather than lazy map/filter objects, so the values log
        # readably and can be iterated more than once.
        in_dirs = [os.path.dirname(path) for path in var_dirs]
        logger.info("in_dirs: %s" % (in_dirs))
        # XXX for testing only load 3
        #curr_files = in_dirs[0:5]
        curr_files = in_dirs

        # run the illumina fixer
        logger.info("Running illumina fixer on %s." % (curr_files))
        illf = STAGE_LOOKUP.get("illumina_fixer")(config)
        curr_files = view.map(illf, curr_files)

        # combine
        out_file = os.path.join(gemini_dir, "all_combined.vcf")
        logger.info("Combining files %s into %s." % (curr_files, out_file))
        if file_exists(out_file):
            curr_files = [out_file]
        else:
            combined = genotype.combine_variant_files(
                curr_files, out_file, config["ref"]["fasta"], config)
            curr_files = [combined]

        # break the VCF files up by chromosome for speed
        logger.info("Breaking up %s by chromosome." % (curr_files))
        breakvcf = STAGE_LOOKUP.get("breakvcf")(config)
        curr_files = view.map(breakvcf, curr_files)

        # run VEP on the separate files in parallel
        logger.info("Running VEP on %s." % (curr_files))
        vep = STAGE_LOOKUP.get("vep")(config)
        curr_files = view.map(vep, list(flatten(curr_files)))
        curr_files = [path for path in curr_files if file_exists(path)]

        # sort the vcf files
        logger.info("Sorting %s." % (curr_files))
        curr_files = view.map(sort_vcf, curr_files)
    finally:
        # don't run the rest of this in parallel, so take the cluster down
        # (also on error, so it is never left running)
        stop_cluster()

    out_file = os.path.join(gemini_dir, "all_combined.vep.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        combined = genotype.combine_variant_files(
            curr_files, out_file, config["ref"]["fasta"], config)
        curr_files = [combined]

    # load the files into gemini, not in parallel
    logger.info("Loading %s into gemini." % (curr_files))
    geminiloader = STAGE_LOOKUP.get("geminiloader")(config)
    # Force evaluation with a list comprehension: a bare ``map`` is lazy on
    # Python 3 and would never invoke the loader.
    curr_files = [geminiloader(path) for path in curr_files]
    logger.info("Run complete.")
def main(config_file):
    """Run the configured miRNA-style pipeline stages on the cluster.

    Loads the YAML configuration at ``config_file``, sets up logging, starts
    the cluster and walks ``config["run"]`` in order.  Recognised stage
    names are "fastqc", "trim", "tagdust", "filter_length", "count_ends",
    "tophat", "novoalign", "new_coverage", "coverage", "htseq-count",
    "bedtools_intersect" and "piranha"; other names are ignored.

    Several stages read variables assigned by earlier stages
    (``tagdust_outputs``, ``aligned_outputs``, ``sorted_bf``), so the order
    of ``config["run"]`` matters: "filter_length" needs "tagdust";
    "coverage", "htseq-count" and "bedtools_intersect" need "novoalign".

    Args:
        config_file: path to the YAML configuration file.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary Python objects; consider yaml.safe_load if the config
        # file is not fully trusted.
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    # after the cluster is up, import the view
    from bipy.cluster import view
    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    # NOTE(review): relies on ``map`` being eager (Python 2); on Python 3
    # this line would create nothing.
    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    ## qc steps
    # NOTE(review): ``logger`` is not imported in this function; presumably
    # it is a module-level name -- confirm.
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        " from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))

            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim") for
                         x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic number of 10 the length of the
            # minimum read to keep
            # (the minimum length actually passed is ``min_length`` above,
            # which defaults to 20)
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen,
                                 ["sanger"] * nlen,
                                 [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            # note: this rebinds ``input_files``, which "htseq-count" later
            # uses to derive column names
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            # only the first element of each tagdust result is carried on
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate (both bounds are exclusive)
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map()
            # (the string literal below is disabled code kept as-is)
            """ filtered_fastq = view.map(filter_seqio, tagdust_outputs, [lf] * len(tagdust_outputs), ["filt"] * len(tagdust_outputs), ["fastq"] * len(tagdust_outputs))"""
            out_files = [append_stem(os.path.basename(input_file[0]),
                                     "filt") for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # runs serially in this process, see the note above
            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]

            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")

            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """Increment the count for key ``y`` in dict ``x``.

                Used as the reducer for ``reduce``; mutates and returns
                ``x``.
                """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                # last character of the record's sequence
                return str(x.seq[-1])

            def get_5prime_end(x):
                # first character of the record's sequence
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab-separated file of the counts at the end
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function, kind="fastq"),
                                 {}) for x in curr_files]
                df = pd.DataFrame(counts,
                                  index=map(_short_name, curr_files))
                df = df.astype(float)
                # convert counts to per-row fractions and keep the row
                # total as an extra column
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")
                # NOTE(review): returns None on this path but the file path
                # on the early-exit path above; callers ignore the result.

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            # NOTE(review): self-assignment, has no effect
            config = config
            tophat_files = view.map(tophat.align,
                                    curr_files,
                                    [pair_file] * nlen,
                                    [ref_file] * nlen,
                                    [out_base] * nlen,
                                    [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles),
                                 bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and setting
            # them to read only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)

            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)
            curr_files = out_files

        if stage == "coverage":
            # works on ``sorted_bf`` from the "novoalign" stage, not on
            # ``curr_files``
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s"
                        % (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed") for
                         x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir,
                                      os.path.basename(x)) for x in out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf),
                     out_files)

        if stage == "htseq-count":
            # NOTE(review): ``nfiles`` is taken from ``curr_files`` while the
            # map runs over ``aligned_outputs``; the lengths presumably
            # match -- confirm.
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)

        if stage == "bedtools_intersect":
            # NOTE(review): if no "bed" entry is configured, ``bedfiles`` is
            # None and the loop below raises TypeError.
            bedfiles = config["stage"]["bedtools_intersect"].get("bed", None)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                # output name: <out_dir>/<bam stem>_vs_<bed stem>.bam
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [os.path.join(out_dir, os.path.basename(x)) for x in
                             out_files]
                out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                             for x in out_files]
                out_files = [".".join([x, "bam"]) for x in out_files]
                # NOTE(review): both ``map`` calls below are the builtin (run
                # in this process) and rely on eager evaluation (Python 2).
                test_out = map(bedtools.intersectbam2bed, sorted_bf,
                               [bedfile] * len(sorted_bf),
                               [False] * len(sorted_bf),
                               out_files)
                count_files = [replace_suffix(x, "stats") for x in
                               out_files]
                map(write_ratios, sorted_bf, out_files, count_files)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)

    # NOTE(review): not reached if a stage raises, which would leave the
    # cluster running.
    stop_cluster()
def main(config_file):
    """Run the pipeline stages listed in a YAML configuration file.

    The configuration is loaded from ``config_file``, logging is set up,
    the cluster is started, and every stage named in ``config["run"]`` is
    executed in order.  ``curr_files`` carries the output of one stage to
    the next; it starts out as ``config["input"]``.  The cluster is always
    stopped on the way out, even when a stage raises.

    Configuration keys read directly by this function: ``input``, ``run``,
    ``dir`` (every value is created as a directory; ``dir.results`` is the
    output root), ``stage.<name>`` for per-stage settings, ``genome.file``,
    ``program.picard`` and ``annotation``.

    Stage ordering matters because some stages reuse variables bound by
    earlier ones:

    * ``filter_length`` reads ``tagdust_outputs`` (bound by ``tagdust``).
    * ``coverage`` and ``bedtools_intersect`` read ``sorted_bf`` and
      ``htseq-count`` reads ``aligned_outputs`` (both bound by
      ``novoalign``).

    Listing a dependent stage without its prerequisite fails with an
    unbound-variable error.

    Args:
        config_file: path to the YAML configuration file.
    """
    # NOTE(review): `logger` is not bound inside this function; it has to be
    # available in module scope -- confirm.
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit safe Loader can build
        # arbitrary Python objects; prefer yaml.safe_load if configuration
        # files can ever come from an untrusted source.
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    try:
        # the view can only be imported once the cluster is up; view.map
        # is what dispatches the per-file work of each stage
        from bipy.cluster import view
        input_files = config["input"]
        results_dir = config["dir"]["results"]

        # make the needed directories (explicit loop: a bare map() used
        # for its side effects does nothing where map() is lazy)
        for needed_dir in config["dir"].values():
            safe_makedir(needed_dir)

        curr_files = input_files

        for stage in config["run"]:
            if stage == "fastqc":
                # run the basic fastqc
                logger.info("Running %s on %s" % (stage, str(curr_files)))
                fastqc_config = config["stage"][stage]
                fastqc_outputs = view.map(fastqc.run, curr_files,
                                          [fastqc_config] * len(curr_files),
                                          [config] * len(curr_files))
                # this does nothing for now, not implemented yet
                summary_file = _combine_fastqc(fastqc_outputs)

            if stage == "trim":
                logger.info("Trimming poor quality ends "
                            " from %s" % (str(curr_files)))
                nlen = len(curr_files)
                # minimum read length to keep; taken from the stage
                # configuration and defaulting to 20
                min_length = str(config["stage"][stage].get("min_length", 20))
                # trim low quality ends of reads
                out_dir = os.path.join(results_dir, "trimmed")
                safe_makedir(out_dir)
                out_files = [append_stem(os.path.basename(x), "trim")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(sickle.run, curr_files,
                                     ["se"] * nlen,
                                     ["sanger"] * nlen,
                                     [min_length] * nlen,
                                     out_files)
                curr_files = out_files

            if stage == "tagdust":
                # NOTE: rebinding input_files here also changes the files
                # used for the htseq-count column names later on
                input_files = curr_files
                # remove tags matching the other miRNA tested
                logger.info("Running %s on %s." % (stage, input_files))
                tagdust_config = config["stage"][stage]
                tagdust_outputs = view.map(tagdust.run, input_files,
                                           [tagdust_config] * len(input_files),
                                           [config] * len(input_files))
                # only the first element of each tagdust result is carried
                # forward
                curr_files = [x[0] for x in tagdust_outputs]

            if stage == "filter_length":
                # filter out reads below or above a certain length
                filter_config = config["stage"][stage]
                min_length = filter_config.get("min_length", 0)
                max_length = filter_config.get("max_length", MAX_READ_LENGTH)

                # length predicate (both bounds are exclusive)
                def length_filter(x):
                    return min_length < len(x.seq) < max_length

                # The filtering runs serially: parallelizing it does not
                # seem to work because ipython can't accept closures as an
                # argument to view.map().
                out_files = [append_stem(os.path.basename(input_file[0]),
                                         "filt")
                             for input_file in tagdust_outputs]
                out_dir = os.path.join(config["dir"]["results"],
                                       "length_filtered")
                safe_makedir(out_dir)
                out_files = [os.path.join(out_dir, x) for x in out_files]
                filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                                  for x, y in zip(tagdust_outputs, out_files)]
                curr_files = filtered_fastq

            if stage == "count_ends":
                # reduce is a builtin only on Python 2; functools.reduce is
                # available on both Python 2.6+ and Python 3
                from functools import reduce

                logger.info("Compiling nucleotide counts at 3' and 5' ends.")

                # count the nucleotide at the end of each read
                def count_ends(x, y):
                    """
                    keeps a running count of an arbitrary set of keys
                    during the reduce step
                    """
                    x[y] = x.get(y, 0) + 1
                    return x

                def get_3prime_end(x):
                    return str(x.seq[-1])

                def get_5prime_end(x):
                    return str(x.seq[0])

                def output_counts(end_function, count_file):
                    outdir = os.path.join(config["dir"]["results"], stage)
                    safe_makedir(outdir)
                    count_file = os.path.join(outdir, count_file)
                    # if the count_file already exists, skip
                    if os.path.exists(count_file):
                        return count_file
                    # outputs a tab file of the counts at the end
                    # of the fastq files
                    counts = [reduce(count_ends,
                                     apply_seqio(x, end_function,
                                                 kind="fastq"),
                                     {})
                              for x in curr_files]
                    # a real list, so the index is valid even where map()
                    # would return a lazy iterator
                    row_names = [_short_name(x) for x in curr_files]
                    df = pd.DataFrame(counts, index=row_names)
                    df = df.astype(float)
                    # turn the counts of each row into fractions and keep
                    # the row total in its own column
                    total = df.sum(axis=1)
                    df = df.div(total, axis=0)
                    df["total"] = total
                    df.to_csv(count_file, sep="\t")

                output_counts(get_3prime_end, "3prime_counts.tsv")
                output_counts(get_5prime_end, "5prime_counts.tsv")

            if stage == "tophat":
                tophat_config = config["stage"][stage]
                logger.info("Running tophat on %s" % (str(curr_files)))
                nlen = len(curr_files)
                pair_file = None
                ref_file = tophat_config["annotation"]
                out_base = os.path.join(results_dir, "mirna")
                align_dir = os.path.join(results_dir, "tophat")
                tophat_files = view.map(tophat.align,
                                        curr_files,
                                        [pair_file] * nlen,
                                        [ref_file] * nlen,
                                        [out_base] * nlen,
                                        [align_dir] * nlen,
                                        [config] * nlen)
                curr_files = tophat_files

            if stage == "novoalign":
                logger.info("Running novoalign on %s" % (str(curr_files)))
                # align
                ref = config["genome"]["file"]
                novoalign_config = config["stage"][stage]
                aligned_outputs = view.map(novoalign.run, curr_files,
                                           [ref] * len(curr_files),
                                           [novoalign_config] * len(curr_files),
                                           [config] * len(curr_files))
                # convert sam to bam, sort and index
                picard = BroadRunner(config["program"]["picard"], None, {})
                bamfiles = view.map(picardrun.picard_formatconverter,
                                    [picard] * len(aligned_outputs),
                                    aligned_outputs)
                sorted_bf = view.map(picardrun.picard_sort,
                                     [picard] * len(bamfiles),
                                     bamfiles)
                view.map(picardrun.picard_index,
                         [picard] * len(sorted_bf),
                         sorted_bf)
                # These files are the new starting point for the downstream
                # analyses.  Copying them into config["dir"]["data"], making
                # them read only and re-indexing the copies was planned here
                # but is not enabled.
                curr_files = sorted_bf

            if stage == "new_coverage":
                # one-element tuple so the message also formats correctly
                # should curr_files ever be a tuple
                logger.info("Calculating RNASeq metrics on %s."
                            % (curr_files,))
                nrun = len(curr_files)
                ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                              config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"], None, {})
                out_dir = os.path.join(results_dir, "new_coverage")
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)
                curr_files = out_files

            if stage == "coverage":
                gtf = blastn.prepare_ref_file(config["annotation"], config)
                logger.info("Calculating coverage of features in %s for %s"
                            % (gtf, str(sorted_bf)))
                out_files = [replace_suffix(x, "counts.bed")
                             for x in sorted_bf]
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                logger.info(out_files)
                out_files = [os.path.join(out_dir, os.path.basename(x))
                             for x in out_files]
                logger.info(out_files)
                view.map(bedtools.count_overlaps, sorted_bf,
                         [gtf] * len(sorted_bf),
                         out_files)

            if stage == "htseq-count":
                # size the replicated arguments from the sequence that is
                # actually mapped over
                nfiles = len(aligned_outputs)
                # NOTE(review): the result is currently unused; the call is
                # kept in case the lookup validates the stage config --
                # confirm.
                htseq_config = _get_stage_config(config, stage)
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         aligned_outputs,
                                         [config] * nfiles,
                                         [stage] * nfiles)
                column_names = _get_short_names(input_files)
                logger.info("Column names: %s" % (column_names,))
                out_file = os.path.join(config["dir"]["results"], stage,
                                        "combined.counts")
                combined_out = htseq_count.combine_counts(htseq_outputs,
                                                          column_names,
                                                          out_file)

            if stage == "bedtools_intersect":
                # a missing (or empty) "bed" entry means there is nothing
                # to intersect against
                bedfiles = config["stage"]["bedtools_intersect"].get("bed")
                bedfiles = bedfiles or []
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                for bedfile in bedfiles:
                    bedbase = os.path.splitext(bedfile)[0]
                    # <out_dir>/<bam stem>_vs_<bed stem>.bam
                    out_files = [remove_suffix(x) for x in sorted_bf]
                    out_files = [os.path.join(out_dir, os.path.basename(x))
                                 for x in out_files]
                    out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                                 for x in out_files]
                    out_files = [".".join([x, "bam"]) for x in out_files]
                    # run serially; explicit loops so that the calls are
                    # made even where map() is lazy
                    for bam_file, intersect_file in zip(sorted_bf, out_files):
                        bedtools.intersectbam2bed(bam_file, bedfile, False,
                                                  intersect_file)
                    count_files = [replace_suffix(x, "stats")
                                   for x in out_files]
                    for bam_file, intersect_file, count_file in zip(
                            sorted_bf, out_files, count_files):
                        write_ratios(bam_file, intersect_file, count_file)

            if stage == "piranha":
                piranha_runner = piranha.PiranhaStage(config)
                out_files = view.map(piranha_runner, curr_files)
    finally:
        # always release the cluster, including when a stage fails
        stop_cluster()