def main(config_file, fc_dir, analysis_dir, run_info_yaml=None): with open(config_file) as in_handle: config = yaml.load(in_handle) fc_name, fc_date = get_flowcell_info(fc_dir) galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_api_key"]) # run_info will override some galaxy details, if present if run_info_yaml: with open(run_info_yaml) as in_handle: run_details = yaml.load(in_handle) run_info = dict(details=run_details, run_id="") else: run_info = galaxy_api.run_details(fc_name, fc_date) base_folder_name = "%s_%s" % (fc_date, fc_name) run_details = lims_run_details(run_info, fc_name, base_folder_name) for (library_name, access_role, dbkey, lane, bc_id, name, desc, local_name) in run_details: library_id = get_galaxy_library(library_name, galaxy_api) if library_name else None upload_files = list(select_upload_files(local_name, bc_id, fc_dir, analysis_dir, config)) if len(upload_files) > 0: print lane, bc_id, name, desc, library_name print "Creating storage directory" if library_id: folder, cur_galaxy_files = get_galaxy_folder(library_id, base_folder_name, name, desc, galaxy_api) else: cur_galaxy_files = [] store_dir = move_to_storage(lane, bc_id, base_folder_name, upload_files, cur_galaxy_files, config) if store_dir and library_id: print "Uploading directory of files to Galaxy" print galaxy_api.upload_directory(library_id, folder["id"], store_dir, dbkey, access_role) if galaxy_api: add_run_summary_metrics(analysis_dir, galaxy_api)
def run_main(config, config_file, fc_dir, run_info_yaml):
    """Run the flowcell analysis: per-lane processing, per-sample merging, metrics.

    Run details come from a local YAML samplesheet when present,
    otherwise from the Galaxy API.
    """
    work_dir = os.getcwd()
    fc_name, fc_date = get_flowcell_info(fc_dir)
    if run_info_yaml and os.path.exists(run_info_yaml):
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        with open(run_info_yaml) as in_handle:
            # safe_load avoids executing arbitrary YAML tags in the samplesheet
            run_details = yaml.safe_load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        log.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)
    fastq_dir = get_fastq_dir(fc_dir)
    run_items = _add_multiplex_across_lanes(run_info["details"], fastq_dir, fc_name)
    align_dir = os.path.join(work_dir, "alignments")
    # process each flowcell lane
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_lane,
                       ((i, fastq_dir, fc_name, fc_date, align_dir, config,
                         config_file) for i in run_items)):
            pass
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        align_dir, fastq_dir, work_dir, fc_name, fc_date, run_items)
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_sample,
                       ((name, sample_fastq[name], sample_info[name], bam_files,
                         work_dir, config, config_file)
                        for name, bam_files in sample_files)):
            pass
    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
def main(config_file, fc_dir, analysis_dir): with open(config_file) as in_handle: config = yaml.load(in_handle) galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_api_key"]) fc_name, fc_date = get_flowcell_info(fc_dir) folder_name = "%s_%s" % (fc_date, fc_name) run_info = lims_run_details(galaxy_api, fc_name, folder_name) for (dl_folder, access_role, dbkey, lane, bc_id, name, desc) in run_info: print folder_name, lane, bc_id, name, desc, dl_folder library_id = get_galaxy_library(dl_folder, galaxy_api) folder, cur_galaxy_files = get_galaxy_folder(library_id, folder_name, name, desc, galaxy_api) print "Creating storage directory" base_select = "%s_%s" % (lane, folder_name) store_dir = move_to_storage( lane, bc_id, folder_name, select_upload_files(base_select, bc_id, fc_dir, analysis_dir), cur_galaxy_files, config, ) if store_dir: print "Uploading directory of files to Galaxy" print galaxy_api.upload_directory(library_id, folder["id"], store_dir, dbkey, access_role) add_run_summary_metrics(analysis_dir, galaxy_api)
def main(config_file, fc_dir):
    """Process each lane of a flowcell, in parallel when multiple cores are configured."""
    work_dir = os.getcwd()
    with open(config_file) as in_handle:
        # safe_load avoids executing arbitrary YAML tags in the config file
        config = yaml.safe_load(in_handle)
    galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    lane_args = ((i, fastq_dir, fc_name, fc_date, config, config_file)
                 for i in run_info["details"])
    if config["algorithm"]["num_cores"] > 1:
        pool = Pool(config["algorithm"]["num_cores"])
        try:
            pool.map(_process_wrapper, lane_args)
        except:
            # clean up worker processes before propagating any failure
            pool.terminate()
            raise
    else:
        map(_process_wrapper, lane_args)
    write_metrics(run_info, work_dir, fc_dir, fastq_dir)
def get_runinfo(galaxy_url, galaxy_apikey, run_folder, storedir):
    """Retrieve flattened run information for a processed directory from Galaxy nglims API.
    """
    galaxy_api = GalaxyApiAccess(galaxy_url, galaxy_apikey)
    fc_name, fc_date = flowcell.parse_dirname(run_folder)
    galaxy_info = galaxy_api.run_details(fc_name, fc_date)
    # Pass API-level error payloads straight back to the caller.
    if "error" in galaxy_info:
        return galaxy_info
    # Sanity check: the Galaxy run name must correspond to the local flowcell
    # directory (date prefix or flowcell name suffix).
    if not galaxy_info["run_name"].startswith(fc_date) and not galaxy_info["run_name"].endswith(fc_name):
        raise ValueError("Galaxy NGLIMS information %s does not match flowcell %s %s" %
                         (galaxy_info["run_name"], fc_date, fc_name))
    ldetails = _flatten_lane_details(galaxy_info)
    out = []
    for item in ldetails:
        # Do uploads for all non-controls
        # NOTE(review): with `or`, upload details are attached unless BOTH
        # description AND project_name are "control" — confirm this is the
        # intended definition of a control rather than `and`.
        if item["description"] != "control" or item["project_name"] != "control":
            item["upload"] = {"method": "galaxy",
                              "run_id": galaxy_info["run_id"],
                              "fc_name": fc_name,
                              "fc_date": fc_date,
                              "dir": storedir,
                              "galaxy_url": galaxy_url,
                              "galaxy_api_key": galaxy_apikey}
            # Move researcher/library metadata under the upload section.
            for k in ["lab_association", "private_libs", "researcher",
                      "researcher_id", "sample_id", "galaxy_library",
                      "galaxy_role"]:
                item["upload"][k] = item.pop(k, "")
        out.append(item)
    return out
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    have_yaml = run_info_yaml and os.path.exists(run_info_yaml)
    if have_yaml:
        logger.info("Using input YAML configuration: %s" % run_info_yaml)
        run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    else:
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = get_flowcell_info(dirs["flowcell"])
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        galaxy_info = galaxy_api.run_details(fc_name, fc_date)
        run_details = []
        # attach Galaxy upload bookkeeping to each lane item
        for lane_item in galaxy_info["details"]:
            lane_item["upload"] = {"method": "galaxy",
                                   "run_id": galaxy_info["run_id"],
                                   "fc_name": fc_name,
                                   "fc_date": fc_date}
            run_details.append(lane_item)
    organized = []
    for item in run_details:
        item["config"] = config_utils.update_w_custom(config, item)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        organized.append(_add_reference_resources(item))
    return organized
def main(config_file, month, year):
    """Write a CSV sequencing billing report for a 15th-to-15th month window."""
    with open(config_file) as in_handle:
        # safe_load avoids executing arbitrary YAML tags in the config file
        config = yaml.safe_load(in_handle)
    galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_api_key"])
    # window runs from the 15th of the previous month to the 14th of this one
    smonth, syear = (month - 1, year) if month > 1 else (12, year - 1)
    start_date = datetime(syear, smonth, 15, 0, 0, 0)
    # last day calculation useful if definition of month is
    # from first to last day instead of 15th-15th
    #(_, last_day) = calendar.monthrange(year, month)
    end_date = datetime(year, month, 14, 23, 59, 59)
    out_file = "%s_%s" % (start_date.strftime("%b"),
                          end_date.strftime("%b-%Y-sequencing.csv"))
    with open(out_file, "w") as out_handle:
        writer = csv.writer(out_handle)
        writer.writerow(["Date", "Product", "Payment", "Researcher", "Lab",
                         "Email", "Project", "Sample", "Description", "Genome",
                         "Flowcell", "Lane", "Notes"])
        for s in galaxy_api.sqn_report(start_date.isoformat(),
                                       end_date.isoformat()):
            f_parts = s["sqn_run"]["run_folder"].split("_")
            # keep the first and last underscore-separated parts of the folder
            flowcell = "_".join([f_parts[0], f_parts[-1]])
            writer.writerow([s["sqn_run"]["date"], s["sqn_type"],
                             s["project"]["payment_(fund_number)"],
                             s["project"]["researcher"],
                             s["project"]["lab_association"],
                             s["project"]["email"],
                             s["project"]["project_name"],
                             s["name"], s["description"], s["genome_build"],
                             flowcell, s["sqn_run"]["lane"],
                             s["sqn_run"]["results_notes"]])
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    have_yaml = run_info_yaml and os.path.exists(run_info_yaml)
    if have_yaml:
        logger.info("Using input YAML configuration: %s" % run_info_yaml)
        run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    else:
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = flowcell.parse_dirname(dirs["flowcell"])
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        galaxy_info = galaxy_api.run_details(fc_name, fc_date)
        run_details = []
        # attach Galaxy upload bookkeeping to each lane item
        for lane_item in galaxy_info["details"]:
            lane_item["upload"] = {"method": "galaxy",
                                   "run_id": galaxy_info["run_id"],
                                   "fc_name": fc_name,
                                   "fc_date": fc_date}
            run_details.append(lane_item)
    organized = []
    for item in run_details:
        # add algorithm details to configuration, avoid double specification
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        organized.append(add_reference_resources(item))
    return organized
def get_run_info(fc_dir, config, run_info_yaml):
    """Retrieve run information from a passed YAML file or the Galaxy API.
    """
    have_samplesheet = run_info_yaml and os.path.exists(run_info_yaml)
    if have_samplesheet:
        logger.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        fc_name, fc_date, run_info = _run_info_from_yaml(fc_dir, run_info_yaml)
        return fc_name, fc_date, _organize_runs_by_lane(run_info)
    logger.info("Fetching run details from Galaxy instance")
    fc_name, fc_date = get_flowcell_info(fc_dir)
    galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    run_info = galaxy_api.run_details(fc_name, fc_date)
    return fc_name, fc_date, _organize_runs_by_lane(run_info)
def _get_run_info(fc_name, fc_date, config, run_info_yaml):
    """Retrieve run information from a passed YAML file or the Galaxy API.

    The YAML path returns dict(details=..., run_id=""); the Galaxy path
    returns whatever the API's run_details call provides.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        with open(run_info_yaml) as in_handle:
            # safe_load avoids executing arbitrary YAML tags in the samplesheet
            run_details = yaml.safe_load(in_handle)
        return dict(details=run_details, run_id="")
    else:
        log.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        return galaxy_api.run_details(fc_name, fc_date)
def _get_run_info(fc_name, fc_date, config, run_info_yaml):
    """Retrieve run information from a passed YAML file or the Galaxy API.

    The YAML path returns dict(details=..., run_id=""); the Galaxy path
    returns whatever the API's run_details call provides.
    """
    if run_info_yaml and os.path.exists(run_info_yaml):
        logger.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        with open(run_info_yaml) as in_handle:
            # safe_load avoids executing arbitrary YAML tags in the samplesheet
            run_details = yaml.safe_load(in_handle)
        return dict(details=run_details, run_id="")
    else:
        logger.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        return galaxy_api.run_details(fc_name, fc_date)
def main(config_file, fc_dir, analysis_dir, run_info_yaml=None): config = load_config(config_file) galaxy_api = (GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key']) if config.has_key("galaxy_api_key") else None) fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml) base_folder_name = "%s_%s" % (fc_date, fc_name) run_details = lims_run_details(run_info, base_folder_name) for (library_name, access_role, dbkey, lane, bc_id, name, desc, local_name, fname_out) in run_details: library_id = (get_galaxy_library(library_name, galaxy_api) if library_name else None) upload_files = list( select_upload_files(local_name, bc_id, fc_dir, analysis_dir, config, fname_out)) if len(upload_files) > 0: print lane, bc_id, name, desc, library_name print "Creating storage directory" if library_id: folder, cur_galaxy_files = get_galaxy_folder( library_id, base_folder_name, name, desc, galaxy_api) else: cur_galaxy_files = [] store_dir = move_to_storage(lane, bc_id, base_folder_name, upload_files, cur_galaxy_files, config, config_file, fname_out) if store_dir and library_id: print "Uploading directory of files to Galaxy" print galaxy_api.upload_directory(library_id, folder['id'], store_dir, dbkey, access_role) if galaxy_api and not run_info_yaml: add_run_summary_metrics(analysis_dir, galaxy_api)
def main(config_file, fc_dir):
    """Process every lane of a flowcell, parallelizing when multiple cores are configured."""
    work_dir = os.getcwd()
    config = load_config(config_file)
    galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_api_key"])
    fc_name, fc_date = flowcell.parse_dirname(fc_dir)
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = flowcell.get_fastq_dir(fc_dir)
    num_cores = config["algorithm"]["num_cores"]
    lane_args = ((lane_info, fastq_dir, fc_name, fc_date, config, config_file)
                 for lane_info in run_info["details"])
    if num_cores > 1:
        pool = Pool(num_cores)
        try:
            pool.map(_process_wrapper, lane_args)
        except:
            # clean up worker processes before propagating the failure
            pool.terminate()
            raise
    else:
        map(_process_wrapper, lane_args)
def main(config_file, fc_dir):
    """Process every lane of a flowcell, parallelizing when multiple cores are configured."""
    work_dir = os.getcwd()
    config = load_config(config_file)
    galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    num_cores = config["algorithm"]["num_cores"]
    lane_args = ((lane_info, fastq_dir, fc_name, fc_date, config, config_file)
                 for lane_info in run_info["details"])
    if num_cores > 1:
        pool = Pool(num_cores)
        try:
            pool.map(_process_wrapper, lane_args)
        except:
            # clean up worker processes before propagating the failure
            pool.terminate()
            raise
    else:
        map(_process_wrapper, lane_args)
def main(config_file, fc_dir):
    """Process every lane of a flowcell, parallelizing when multiple cores are configured."""
    work_dir = os.getcwd()
    with open(config_file) as in_handle:
        # safe_load avoids executing arbitrary YAML tags in the config file
        config = yaml.safe_load(in_handle)
    galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    lane_args = ((i, fastq_dir, fc_name, fc_date, config, config_file)
                 for i in run_info["details"])
    if config["algorithm"]["num_cores"] > 1:
        pool = Pool(config["algorithm"]["num_cores"])
        try:
            pool.map(_process_wrapper, lane_args)
        except:
            # clean up worker processes before propagating the failure
            pool.terminate()
            raise
    else:
        map(_process_wrapper, lane_args)
def add_to_galaxy_datalibs(prepped_files, config):
    """Add the organized files to synchronized Galaxy data libraries.

    3 actions needed:
    - create data library for each top level item
    - create folders and sub-folders to subsequent levels
    - add links to data in final folders
    """
    galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_apikey"])
    for lib_key, lib_files in prepped_files.iteritems():
        # library name combines the primary key with any non-empty qualifiers
        qualifiers = [part for part in lib_key[1:] if part]
        dl_name = "SCDE: %s -- %s" % (lib_key[0], ", ".join(qualifiers))
        _add_data_library(galaxy_api, dl_name, lib_files)
def main(config_file, fc_dir, analysis_dir, run_info_yaml=None): with open(config_file) as in_handle: config = yaml.load(in_handle) fc_name, fc_date = get_flowcell_info(fc_dir) if run_info_yaml: with open(run_info_yaml) as in_handle: run_details = yaml.load(in_handle) run_info = dict(details=run_details, run_id="") galaxy_api = None else: galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key']) run_info = galaxy_api.run_details(fc_name, fc_date) base_folder_name = "%s_%s" % (fc_date, fc_name) run_details = lims_run_details(run_info, fc_name, base_folder_name) for (library_name, access_role, dbkey, lane, bc_id, name, desc, local_name) in run_details: library_id = (get_galaxy_library(library_name, galaxy_api) if library_name else None) upload_files = list( select_upload_files(local_name, bc_id, fc_dir, analysis_dir, config)) if len(upload_files) > 0: print lane, bc_id, name, desc, library_name print "Creating storage directory" if library_id: folder, cur_galaxy_files = get_galaxy_folder( library_id, base_folder_name, name, desc, galaxy_api) else: cur_galaxy_files = [] store_dir = move_to_storage(lane, bc_id, base_folder_name, upload_files, cur_galaxy_files, config) if store_dir and library_id: print "Uploading directory of files to Galaxy" print galaxy_api.upload_directory(library_id, folder['id'], store_dir, dbkey, access_role) if galaxy_api: add_run_summary_metrics(analysis_dir, galaxy_api)
def main(config_file, fc_dir, run_info_yaml=None):
    """Run lane and sample processing for a flowcell, then write summary metrics.

    Run details come from a local YAML samplesheet when provided, otherwise
    from the Galaxy API.
    """
    work_dir = os.getcwd()
    with open(config_file) as in_handle:
        # safe_load avoids executing arbitrary YAML tags in the config file
        config = yaml.safe_load(in_handle)
    # BUG FIX: fc_name/fc_date must be determined before the Galaxy branch
    # below; previously galaxy_api.run_details(fc_name) referenced fc_name
    # before assignment, raising NameError on the non-YAML path.
    fc_name, fc_date = get_flowcell_info(fc_dir)
    if run_info_yaml:
        with open(run_info_yaml) as in_handle:
            run_details = yaml.safe_load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name)
    run_items = _add_multiplex_to_control(run_info["details"])
    fastq_dir = get_fastq_dir(fc_dir)
    align_dir = os.path.join(work_dir, "alignments")
    # process each flowcell lane
    pool = (Pool(config["algorithm"]["num_cores"])
            if config["algorithm"]["num_cores"] > 1 else None)
    map_fn = pool.map if pool else map
    try:
        map_fn(_process_lane_wrapper,
               ((i, fastq_dir, fc_name, fc_date, align_dir, config, config_file)
                for i in run_items))
    except:
        if pool:
            pool.terminate()
        raise
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        align_dir, fastq_dir, work_dir, fc_name, fc_date, run_items)
    try:
        map_fn(_process_sample_wrapper,
               ((name, sample_fastq[name], sample_info[name], bam_files,
                 work_dir, config, config_file)
                for name, bam_files in sample_files))
    except:
        if pool:
            pool.terminate()
        raise
    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
def main(config_file, month, year):
    """Write a CSV sequencing report covering the 15th-to-15th monthly window."""
    with open(config_file) as in_handle:
        config = yaml.safe_load(in_handle)
    galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_apikey"])
    # window runs from the 15th of the previous month to the 14th of this one
    if month > 1:
        start_month, start_year = month - 1, year
    else:
        start_month, start_year = 12, year - 1
    start_date = datetime(start_year, start_month, 15, 0, 0, 0)
    # last day calculation useful if definition of month is
    # from first to last day instead of 15th-15th
    #(_, last_day) = calendar.monthrange(year, month)
    end_date = datetime(year, month, 14, 23, 59, 59)
    out_file = "%s_%s" % (start_date.strftime("%b"),
                          end_date.strftime("%b-%Y-sequencing.csv"))
    header = ["Date", "Product", "Payment", "Researcher", "Lab", "Email",
              "Project", "Sample", "Description", "Genome", "Flowcell",
              "Lane", "Received", "Notes"]
    with open(out_file, "w") as out_handle:
        writer = csv.writer(out_handle)
        writer.writerow(header)
        report = galaxy_api.sqn_report(start_date.isoformat(),
                                       end_date.isoformat())
        for entry in report:
            folder_parts = entry["sqn_run"]["run_folder"].split("_")
            flowcell = "_".join([folder_parts[0], folder_parts[-1]])
            project = entry["project"]
            writer.writerow([entry["sqn_run"]["date"],
                             entry["sqn_type"],
                             project["payment_(fund_number)"],
                             project["researcher"],
                             project["lab_association"],
                             project["email"],
                             project["project_name"],
                             entry["name"],
                             entry["description"],
                             entry["genome_build"],
                             flowcell,
                             entry["sqn_run"]["lane"],
                             _received_date(entry["events"]),
                             entry["sqn_run"]["results_notes"]])
def run_main(config, config_file, fc_dir, run_info_yaml):
    """Run the flowcell analysis: per-lane processing, per-sample merging, metrics.

    Run details come from a local YAML samplesheet when present,
    otherwise from the Galaxy API.
    """
    work_dir = os.getcwd()
    fc_name, fc_date = get_flowcell_info(fc_dir)
    if run_info_yaml and os.path.exists(run_info_yaml):
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        with open(run_info_yaml) as in_handle:
            # safe_load avoids executing arbitrary YAML tags in the samplesheet
            run_details = yaml.safe_load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        log.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)
    fastq_dir = get_fastq_dir(fc_dir)
    run_items = _add_multiplex_across_lanes(run_info["details"], fastq_dir, fc_name)
    align_dir = os.path.join(work_dir, "alignments")
    # process each flowcell lane
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_lane,
                       ((i, fastq_dir, fc_name, fc_date, align_dir, config,
                         config_file) for i in run_items)):
            pass
    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        align_dir, fastq_dir, work_dir, fc_name, fc_date, run_items)
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_sample,
                       ((name, sample_fastq[name], sample_info[name], bam_files,
                         work_dir, config, config_file)
                        for name, bam_files in sample_files)):
            pass
    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
def get_runinfo(galaxy_url, galaxy_apikey, run_folder):
    """Retrieve run information for a processed directory from Galaxy nglims API.
    """
    api = GalaxyApiAccess(galaxy_url, galaxy_apikey)
    flowcell_name, flowcell_date = flowcell.parse_dirname(run_folder)
    return api.run_details(flowcell_name, flowcell_date)