def run_main(config, config_file, fc_dir, run_info_yaml):
    """Drive analysis of one flowcell: per-lane processing, then per-sample processing and metrics."""
    work_dir = os.getcwd()
    fc_name, fc_date = get_flowcell_info(fc_dir)
    if run_info_yaml and os.path.exists(run_info_yaml):
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        log.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)
    fastq_dir = get_fastq_dir(fc_dir)
    run_items = _add_multiplex_across_lanes(run_info["details"], fastq_dir, fc_name)
    align_dir = os.path.join(work_dir, "alignments")
    # process each flowcell lane
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_lane,
                       ((i, fastq_dir, fc_name, fc_date, align_dir, config, config_file)
                        for i in run_items)):
            pass
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        align_dir, fastq_dir, work_dir, fc_name, fc_date, run_items)
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_sample,
                       ((name, sample_fastq[name], sample_info[name], bam_files,
                         work_dir, config, config_file)
                        for name, bam_files in sample_files)):
            pass
    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
def main(config_file):
    """Read configuration, create output directories and process each fastq in parallel."""
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    create_dirs(config)
    with cpmap(config["algorithm"]["cores"]) as cur_map:
        for _ in cur_map(process_fastq,
                         ((fastq, ref_index, config, config_file)
                          for fastq, ref_index in fastq_to_process(config))):
            pass
def _run_parallel(fn_name, items, dirs, config):
    """Process a supplied function: single, multi-processor or distributed.
    """
    parallel = config["algorithm"]["num_cores"]
    if str(parallel).lower() == "messaging":
        runner = messaging.runner(dirs, config)
        return runner(fn_name, items)
    else:
        out = []
        fn = globals()[fn_name]
        with utils.cpmap(int(parallel)) as cpmap:
            for data in cpmap(fn, items):
                if data:
                    out.extend(data)
        return out
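# Every variant in this file drives workers through utils.cpmap. A minimal
# sketch of what such a helper could look like, assuming it is a context
# manager around multiprocessing.Pool; the project's actual utils.cpmap may
# differ, and the name cpmap_sketch is this reconstruction's own.
import contextlib
import multiprocessing

@contextlib.contextmanager
def cpmap_sketch(cores=1):
    """Yield a map-style callable that runs jobs on `cores` processes.

    With a single core we hand back the builtin map so jobs stay easy to
    debug; otherwise we hand out Pool.imap_unordered and make sure the
    pool is torn down when the with-block exits.
    """
    if int(cores) == 1:
        yield map
    else:
        pool = multiprocessing.Pool(int(cores))
        try:
            yield pool.imap_unordered
        finally:
            pool.terminate()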
def main(config_file):
    """Index the reference, demultiplex each input by barcode and align the pieces in parallel."""
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    ref_index = novoalign.refindex(config["ref"], kmer_size=13, step_size=1)
    create_dirs(config)
    for cur in config["input"]:
        in_fastq = cur["fastq"]
        if cur.get("old_style_barcodes", False):
            in_fastq = convert_illumina_oldstyle(in_fastq)
        bc_files = demultiplex(in_fastq, cur["barcodes"], config["dir"]["tmp"], config)
        with cpmap(config["algorithm"]["cores"]) as cur_map:
            for _ in cur_map(process_fastq,
                             ((bc_file, ref_index, cur, config, config_file)
                              for bc_file in bc_files)):
                pass
def run_parallel(fn_name, items):
    # config, module, dirs and config_file are bound in the enclosing scope
    parallel = config["algorithm"]["num_cores"]
    if str(parallel).lower() == "messaging":
        task_module = "{base}.tasks".format(base=module)
        runner_fn = runner(task_module, dirs, config, config_file)
        return runner_fn(fn_name, items)
    else:
        out = []
        fn = getattr(__import__("{base}.multitasks".format(base=module),
                                fromlist=["multitasks"]),
                     fn_name)
        with utils.cpmap(int(parallel)) as cpmap:
            for data in cpmap(fn, items):
                if data:
                    out.extend(data)
        return out
def run_parallel(fn_name, items, metadata=None):
    parallel = config["algorithm"]["num_cores"]
    if str(parallel).lower() == "messaging":
        task_module = "{base}.tasks".format(base=module)
        runner_fn = runner(task_module, dirs, config, config_file)
        return runner_fn(fn_name, items)
    else:
        out = []
        fn = getattr(__import__("{base}.multitasks".format(base=module),
                                fromlist=["multitasks"]),
                     fn_name)
        cores = cores_including_resources(int(parallel), metadata, config)
        with utils.cpmap(cores) as cpmap:
            for data in cpmap(fn, filter(lambda x: x is not None, items)):
                if data:
                    out.extend(data)
        return out
def run_parallel(fn_name, items, metadata=None):
    if parallel["type"].startswith("messaging"):
        task_module = "{base}.tasks".format(base=parallel["module"])
        runner_fn = runner(task_module, dirs, config, config_file)
        return runner_fn(fn_name, items)
    elif parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"], config)
    else:
        out = []
        fn = getattr(__import__("{base}.multitasks".format(base=parallel["module"]),
                                fromlist=["multitasks"]),
                     fn_name)
        cores = cores_including_resources(int(parallel["cores"]), metadata, config)
        with utils.cpmap(cores) as cpmap:
            for data in cpmap(fn, filter(lambda x: x is not None, items)):
                if data:
                    out.extend(data)
        return out
def process_ref(in_file, ref, config):
    """Search input sequences against a reference database (BLAST or BLAT), writing a TSV of hits."""
    db_info = {"blast": (prepare_blast_db, blast_search),
               "blat": (prepare_blat_db, blat_search)}
    prepare_db, do_search = db_info[ref.get("aligner", "blast")]
    ref_file = prepare_ref_file(ref, config)
    blast_db = prepare_db(ref_file, "nucl")
    out_file = "%s-%s.tsv" % (os.path.splitext(in_file)[0], ref["name"])
    if not os.path.exists(out_file):
        with open(in_file) as in_handle:
            with open(out_file, "w") as out_handle:
                writer = csv.writer(out_handle)
                writer.writerow(["query", "length", "hit", "hitlength", "hitidentities"])
                with utils.cpmap(config["algorithm"]["cores"]) as cpmap:
                    results = cpmap(do_search,
                                    ((rec, blast_db, config["dir"]["work"])
                                     for rec in SeqIO.parse(in_handle, "fasta")))
                    for info in results:
                        writer.writerow(info)
    return out_file
def run_parallel(fn_name, items, metadata=None):
    # parallel, dirs, config and config_file are bound in the enclosing scope
    items = [x for x in items if x is not None]
    items = diagnostics.track_parallel(items, fn_name)
    if parallel["type"].startswith("messaging"):
        task_module = "{base}.tasks".format(base=parallel["module"])
        runner_fn = runner(task_module, dirs, config, config_file)
        return runner_fn(fn_name, items)
    elif parallel["type"] == "ipython":
        return ipython.runner(parallel, fn_name, items, dirs["work"], config)
    else:
        logger.info("multiprocessing: %s" % fn_name)
        out = []
        fn = getattr(__import__("{base}.multitasks".format(base=parallel["module"]),
                                fromlist=["multitasks"]),
                     fn_name)
        num_jobs, cores_per_job = ipython.find_cores_per_job(fn, parallel, items, config)
        items = [ipython.add_cores_to_config(x, cores_per_job) for x in items]
        num_jobs = cores_including_resources(num_jobs, metadata, config)
        with utils.cpmap(num_jobs) as cpmap:
            for data in cpmap(fn, items):
                if data:
                    out.extend(data)
        return out
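# The later run_parallel variants take fn_name and items only, resolving
# parallel, dirs, config and config_file from an enclosing scope. A
# hypothetical factory showing how such a closure could be wired up; the
# name make_run_parallel and the simplified globals() lookup are assumptions
# for illustration, not the project's actual API. It reuses cpmap_sketch
# from the sketch above.
def make_run_parallel(parallel, dirs, config, config_file):
    """Bind the shared state once and hand back a run_parallel callable."""
    def run_parallel(fn_name, items, metadata=None):
        # dirs, config and config_file would feed the messaging/ipython
        # branches; this sketch covers only the multiprocessing path
        items = [x for x in items if x is not None]
        fn = globals()[fn_name]  # stand-in for the __import__ lookup above
        out = []
        with cpmap_sketch(int(parallel["cores"])) as pmap:
            for data in pmap(fn, items):
                if data:
                    out.extend(data)
        return out
    return run_parallel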