def main(run_name, gdocs_spreadsheet, encoded_credentials_file, run_info_yaml, analysis_dir, archive_dir, gdocs_worksheet, gdocs_projects_folder, append, split_on_project):
    """Collect the barcode statistics of a run and write them to Google Docs.

    The run configuration is read from run_info_yaml (defaulting to
    "run_info.yaml" inside archive_dir) and the credentials from
    encoded_credentials_file. If either file is missing a warning is logged
    and the function returns without writing anything. A project summary is
    written as well when gdocs_projects_folder is set.
    """
    log.info("Processing run: %s" % run_name)

    # Without an explicit configuration file, look for run_info.yaml in the archive dir
    if not run_info_yaml:
        run_info_yaml = os.path.join(archive_dir, "run_info.yaml")
        log.info("No configuration file supplied, assuming it is '%s'" % run_info_yaml)

    config_found = os.path.exists(run_info_yaml)
    if not config_found:
        log.warn("Could not find required run_info.yaml configuration file at '%s'" % run_info_yaml)
        return

    # NOTE(review): yaml.load is called without an explicit Loader; the
    # configuration file is presumably trusted -- confirm
    with open(run_info_yaml) as config_handle:
        lane_details = yaml.load(config_handle)
    run_info = {'details': lane_details}

    # Read the Google Docs credentials
    credentials_found = os.path.exists(encoded_credentials_file)
    if not credentials_found:
        log.warn("The Google Docs credentials file could not be found. No demultiplex data was written")
        return
    with open(encoded_credentials_file) as credentials_handle:
        gdocs_credentials = credentials_handle.read().strip()

    fc_name, fc_date = get_flowcell_info(run_name)

    # Barcode statistics for the flowcell
    bc_metrics = get_bc_stats(fc_date, fc_name, analysis_dir, run_info)

    # Run report
    write_run_report_to_gdocs(fc_date, fc_name, bc_metrics, gdocs_spreadsheet,
                              gdocs_credentials, gdocs_worksheet, append,
                              split_on_project)

    # Per-project summary report, only when a destination folder was given
    if gdocs_projects_folder:
        write_project_report_to_gdocs(fc_date, fc_name, bc_metrics,
                                      gdocs_credentials, gdocs_projects_folder)
def _make_bc_metrics(self, runname, analysisdir):
    """Generate lane folders and mock barcode metrics for the lanes in self.run_info.

    For every lane a "<lane>_<date>_<flowcell>_barcode" directory is created
    below analysisdir when it does not exist, and random read counts for the
    lane's barcodes are appended to its "_bc.metrics" file.
    """
    fc_name, fc_date = get_flowcell_info(runname)
    dir_suffix = "_%s_%s_barcode" % (fc_date, fc_name)

    for lane_info in self.run_info:
        lane_id = str(lane_info['lane'])
        lane_dir = os.path.join(analysisdir, "%s%s" % (lane_id, dir_suffix))
        # Make sure the lane directory is there
        if not os.path.exists(lane_dir):
            os.makedirs(lane_dir)

        # Opened in append mode so an existing metrics file is extended
        metrics_name = "%s_%s_%s_bc.metrics" % (lane_id, fc_date, fc_name)
        with open(os.path.join(lane_dir, metrics_name), "a") as out_handle:
            writer = UnicodeWriter(out_handle, dialect='excel-tab')
            barcodes = lane_info.get("multiplex", [])

            # One random read count per barcode
            for barcode in barcodes:
                writer.writerow([str(barcode['barcode_id']), random.randint(1, 10000000)])

            # Finish with 'unmatched' counts, or with a 'trim' entry when
            # the lane has no multiplex data
            if len(barcodes):
                last_row = ['unmatched', random.randint(1, 10000000)]
            else:
                last_row = ['trim', random.randint(1, 100000000)]
            writer.writerow(last_row)
def organize(dirs, config, run_info_yaml):
    """Build the per-item run structure from a YAML file or the Galaxy API.

    When run_info_yaml names an existing file it is parsed with
    _run_info_from_yaml; otherwise the details are requested from Galaxy and
    each item is tagged with Galaxy upload information. Every item then gets
    its "config", "dirs" and (if missing) "name" entries before being passed
    through add_reference_resources. Returns the list of prepared items.
    """
    have_yaml = run_info_yaml and os.path.exists(run_info_yaml)
    if have_yaml:
        logger.info("Using input YAML configuration: %s" % run_info_yaml)
        run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    else:
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = get_flowcell_info(dirs["flowcell"])
        api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_details = []
        galaxy_info = api.run_details(fc_name, fc_date)
        for entry in galaxy_info["details"]:
            # Each entry receives its own upload dictionary
            entry["upload"] = dict(method="galaxy",
                                   run_id=galaxy_info["run_id"],
                                   fc_name=fc_name,
                                   fc_date=fc_date)
            run_details.append(entry)

    prepared = []
    for entry in run_details:
        entry["config"] = config_utils.update_w_custom(config, entry)
        entry["dirs"] = dirs
        if "name" not in entry:
            entry["name"] = ["", entry["description"]]
        prepared.append(add_reference_resources(entry))
    return prepared
def _generate_fastq(fc_dir, config, compress_fastq):
    """Convert the qseq files of a flowcell to fastq and return the fastq directory.

    The conversion runs solexa_qseq_to_fastq.py from the directory that
    contains the default fastq directory. When the configuration defines
    "postprocess_dir" the output goes to
    <postprocess_dir>/<flowcell basename>/fastq instead.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_label = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    # The qseq files are globbed from the parent of the default fastq dir
    basecall_dir = os.path.split(fastq_dir)[0]

    postprocess_dir = config.get("postprocess_dir", "")
    if postprocess_dir:
        fastq_dir = os.path.join(postprocess_dir, os.path.basename(fc_dir), "fastq")

    if fastq_dir == fc_dir:
        return fastq_dir

    # The check for an already existing fastq_dir is deliberately left out here
    with utils.chdir(basecall_dir):
        lane_ids = set(qseq.split("_")[1] for qseq in glob.glob("*qseq.txt"))
        cl = ["solexa_qseq_to_fastq.py", run_label, ",".join(sorted(lane_ids))]
        if postprocess_dir:
            cl.extend(["-o", fastq_dir])
        if compress_fastq:
            cl.append("--gzip")
        logger2.debug("Converting qseq to fastq on all lanes.")
        subprocess.check_call(cl)
    return fastq_dir
def _make_bc_metrics(self, runname, analysisdir):
    """Create lane folders and mock ".bc_metrics" files for the lanes in self.run_info.

    A "<lane>_<date>_<flowcell>_barcode" directory is created per lane under
    analysisdir when needed, and random read counts for each barcode of the
    lane are appended to the lane's metrics file.
    """
    fc_name, fc_date = get_flowcell_info(runname)
    dir_suffix = "_%s_%s_barcode" % (fc_date, fc_name)

    for lane_info in self.run_info:
        lane_id = str(lane_info['lane'])
        lane_dir = os.path.join(analysisdir, "%s%s" % (lane_id, dir_suffix))
        # Create the lane directory on first use
        if not os.path.exists(lane_dir):
            os.makedirs(lane_dir)

        # Append mode: an existing metrics file is extended, not replaced
        metrics_path = os.path.join(
            lane_dir, "%s_%s_%s.bc_metrics" % (lane_id, fc_date, fc_name))
        with open(metrics_path, "a") as out_handle:
            writer = UnicodeWriter(out_handle, dialect='excel-tab')
            barcodes = lane_info.get("multiplex", [])

            # Random read count for every barcode
            for barcode in barcodes:
                barcode_id = str(barcode['barcode_id'])
                writer.writerow([barcode_id, random.randint(1, 10000000)])

            # Close with 'unmatched' counts, or a 'trim' entry for lanes
            # without multiplex data
            if len(barcodes):
                closing_row = ['unmatched', random.randint(1, 10000000)]
            else:
                closing_row = ['trim', random.randint(1, 100000000)]
            writer.writerow(closing_row)
def _run_info_from_yaml(fc_dir, run_info_yaml):
    """Load run information from a YAML file.

    Returns a (fc_name, fc_date, run_info) tuple. The flowcell name and date
    come from fc_dir when get_flowcell_info can parse it, are overridden by
    "fc_name"/"fc_date" keys of a dictionary-style YAML file, and otherwise
    fall back to _unique_flowcell_info(). Items lacking "lane" or
    "description" get generated values.
    """
    with open(run_info_yaml) as yaml_handle:
        loaded = yaml.load(yaml_handle)

    fc_name = None
    try:
        fc_name, fc_date = get_flowcell_info(fc_dir)
    except ValueError:
        # The directory name carries no flowcell information
        pass

    if isinstance(loaded, dict):
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]

    if fc_name is None:
        fc_name, fc_date = _unique_flowcell_info()

    details = []
    for index, entry in enumerate(loaded):
        if "lane" not in entry:
            entry["lane"] = _generate_lane(entry["files"], index)
        if "description" not in entry:
            entry["description"] = str(entry["lane"])
        details.append(entry)

    return fc_name, fc_date, {"details": details, "run_id": ""}
def main(config_file, fc_dir, analysis_dir): with open(config_file) as in_handle: config = yaml.load(in_handle) galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_api_key"]) fc_name, fc_date = get_flowcell_info(fc_dir) folder_name = "%s_%s" % (fc_date, fc_name) run_info = lims_run_details(galaxy_api, fc_name, folder_name) for (dl_folder, access_role, dbkey, lane, bc_id, name, desc) in run_info: print folder_name, lane, bc_id, name, desc, dl_folder library_id = get_galaxy_library(dl_folder, galaxy_api) folder, cur_galaxy_files = get_galaxy_folder(library_id, folder_name, name, desc, galaxy_api) print "Creating storage directory" base_select = "%s_%s" % (lane, folder_name) store_dir = move_to_storage( lane, bc_id, folder_name, select_upload_files(base_select, bc_id, fc_dir, analysis_dir), cur_galaxy_files, config, ) if store_dir: print "Uploading directory of files to Galaxy" print galaxy_api.upload_directory(library_id, folder["id"], store_dir, dbkey, access_role) add_run_summary_metrics(analysis_dir, galaxy_api)
def main(config_file, fc_dir, analysis_dir, run_info_yaml=None): with open(config_file) as in_handle: config = yaml.load(in_handle) fc_name, fc_date = get_flowcell_info(fc_dir) galaxy_api = GalaxyApiAccess(config["galaxy_url"], config["galaxy_api_key"]) # run_info will override some galaxy details, if present if run_info_yaml: with open(run_info_yaml) as in_handle: run_details = yaml.load(in_handle) run_info = dict(details=run_details, run_id="") else: run_info = galaxy_api.run_details(fc_name, fc_date) base_folder_name = "%s_%s" % (fc_date, fc_name) run_details = lims_run_details(run_info, fc_name, base_folder_name) for (library_name, access_role, dbkey, lane, bc_id, name, desc, local_name) in run_details: library_id = get_galaxy_library(library_name, galaxy_api) if library_name else None upload_files = list(select_upload_files(local_name, bc_id, fc_dir, analysis_dir, config)) if len(upload_files) > 0: print lane, bc_id, name, desc, library_name print "Creating storage directory" if library_id: folder, cur_galaxy_files = get_galaxy_folder(library_id, base_folder_name, name, desc, galaxy_api) else: cur_galaxy_files = [] store_dir = move_to_storage(lane, bc_id, base_folder_name, upload_files, cur_galaxy_files, config) if store_dir and library_id: print "Uploading directory of files to Galaxy" print galaxy_api.upload_directory(library_id, folder["id"], store_dir, dbkey, access_role) if galaxy_api: add_run_summary_metrics(analysis_dir, galaxy_api)
def _casava_report_to_metrics(run_info_file, casava_report, dirs):
    """Write bcbb-style metric files from the given CASAVA demultiplex reports.

    All reports in casava_report are parsed and merged per lane and index
    sequence; a lane/sequence pair occurring twice triggers an assertion.
    One "<lane>_<date>_<flowcell>.bc_metrics" file is written to
    dirs["work"] for every item in run_info_file. Returns the list of metric
    file names (without the directory).
    """
    metrics = defaultdict(dict)
    for report in casava_report:
        parsed = dmx._parse_demultiplex_stats_htm(report)
        for lane, lane_data in parsed.items():
            for sequence, metric in lane_data.items():
                # Refuse to overwrite a metric that was parsed earlier
                already_seen = lane in metrics and sequence in metrics[lane]
                assert not already_seen, \
                    "Conflicting demultiplex metrics found for lane {} and index {}. " \
                    "This means that there are multiple demultiplex results for the same sample. " \
                    "Please review and rectify before proceeding!".format(lane, sequence)
                metrics[lane][sequence] = metric

    with open(run_info_file) as info_handle:
        info = yaml.load(info_handle)

    fc_name, fc_date = fc.get_flowcell_info(dirs["flowcell"])
    metric_files = []
    for lane_item in info:
        lane = lane_item["lane"]
        metrics_file = "{}_{}_{}.bc_metrics".format(lane, fc_date, fc_name)
        multiplex = lane_item.get("multiplex", [])
        # Every multiplex entry is annotated with the lane it belongs to
        for plex in multiplex:
            plex["lane"] = lane_item["lane"]
        out_path = os.path.join(dirs["work"], metrics_file)
        dmx._write_demultiplex_metrics(multiplex, metrics, out_path)
        metric_files.append(metrics_file)
    return metric_files
def _generate_fastq(fc_dir, config, compress_fastq):
    """Run the qseq to fastq conversion for a flowcell and return the fastq directory.

    solexa_qseq_to_fastq.py is invoked for all lanes found among the
    "*qseq.txt" files next to the default fastq directory. A configured
    "postprocess_dir" redirects the output to
    <postprocess_dir>/<flowcell basename>/fastq; compress_fastq adds --gzip.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir, _ = os.path.split(fastq_dir)

    postprocess_dir = config.get("postprocess_dir", "")
    if postprocess_dir:
        flowcell_base = os.path.basename(fc_dir)
        fastq_dir = os.path.join(postprocess_dir, flowcell_base, "fastq")

    # No test for an existing fastq_dir: the conversion is always run
    if fastq_dir != fc_dir:
        with utils.chdir(basecall_dir):
            found = [name.split("_")[1] for name in glob.glob("*qseq.txt")]
            lanes = sorted(set(found))
            cl = ["solexa_qseq_to_fastq.py", short_name, ",".join(lanes)]
            if postprocess_dir:
                cl = cl + ["-o", fastq_dir]
            if compress_fastq:
                cl = cl + ["--gzip"]
            logger2.debug("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)
    return fastq_dir
def _run_info_from_yaml(fc_dir, run_info_yaml):
    """Read run information from a passed YAML file.

    Returns a (fc_name, fc_date, run_info) tuple where run_info is a
    dictionary with the item list under "details" and an empty "run_id".
    The flowcell name and date are taken from fc_dir when get_flowcell_info
    can parse it, overridden by "fc_name"/"fc_date" keys of a dictionary
    style YAML file, and generated by _unique_flowcell_info() as a last
    resort. Items without "lane" or "description" get generated values.
    """
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name = None
    try:
        fc_name, fc_date = get_flowcell_info(fc_dir)
    except ValueError:
        # fc_dir does not encode flowcell information; fall through
        pass
    if isinstance(loaded, dict):
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]
    if fc_name is None:
        fc_name, fc_date = _unique_flowcell_info()
    run_details = []
    for i, item in enumerate(loaded):
        if "lane" not in item:
            item["lane"] = _generate_lane(item["files"], i)
        if "description" not in item:
            item["description"] = str(item["lane"])
        run_details.append(item)
    # WARNING! Lanes are deliberately not checked for uniqueness, to allow
    # multiple projects per lane
    run_info = dict(details=run_details, run_id="")
    return fc_name, fc_date, run_info
def _casava_report_to_metrics(run_info_file, casava_report, dirs):
    """Turn CASAVA demultiplex reports into bcbb-style metric files.

    The reports in casava_report are parsed and combined by lane and index
    sequence, asserting that no lane/sequence pair appears more than once.
    For each item of the run_info_file configuration a
    "<lane>_<date>_<flowcell>.bc_metrics" file is written into dirs["work"].
    The returned list holds the metric file names without their directory.
    """
    collected = defaultdict(dict)
    for report in casava_report:
        for lane, per_sequence in dmx._parse_demultiplex_stats_htm(report).items():
            for sequence, metric in per_sequence.items():
                # A metric parsed earlier must never be overwritten
                duplicate = lane in collected and sequence in collected[lane]
                assert not duplicate, \
                    "Conflicting demultiplex metrics found for lane {} and index {}. " \
                    "This means that there are multiple demultiplex results for the same sample. " \
                    "Please review and rectify before proceeding!".format(lane, sequence)
                collected[lane][sequence] = metric

    with open(run_info_file) as config_handle:
        run_items = yaml.load(config_handle)

    fc_name, fc_date = fc.get_flowcell_info(dirs["flowcell"])
    written = []
    for run_item in run_items:
        file_name = "{}_{}_{}.bc_metrics".format(run_item["lane"], fc_date, fc_name)
        multiplex = run_item.get("multiplex", [])
        # Tag each multiplex entry with its lane before writing
        for plex in multiplex:
            plex["lane"] = run_item["lane"]
        dmx._write_demultiplex_metrics(
            multiplex, collected, os.path.join(dirs["work"], file_name))
        written.append(file_name)
    return written
def run_main(config, config_file, fc_dir, run_info_yaml):
    """Drive the analysis of a flowcell: lanes, alignments, samples, metrics.

    Work happens in the current directory; alignments go to its
    "alignments" subdirectory. The processing steps are dispatched by name
    through _run_parallel.
    """
    work_dir = os.getcwd()
    align_dir = os.path.join(work_dir, "alignments")

    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = _get_run_info(fc_name, fc_date, config, run_info_yaml)

    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir), config, config_file)
    # From here on the configuration file is addressed inside config_dir
    config_file = os.path.join(config_dir, os.path.basename(config_file))

    dirs = dict(fastq=fastq_dir, galaxy=galaxy_dir, align=align_dir,
                work=work_dir, flowcell=fc_dir, config=config_dir)
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)

    # Lane level processing followed by alignment
    lane_args = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    lane_items = _run_parallel("process_lane", lane_args, dirs, config)
    _run_parallel("process_alignment", lane_items, dirs, config)

    # Sample level processing; a sample may span several lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        dirs, fc_name, fc_date, run_items)
    sample_args = ((name, sample_fastq[name], sample_info[name], bams, dirs,
                    config, config_file)
                   for name, bams in sample_files)
    _run_parallel("process_sample", sample_args, dirs, config)

    write_metrics(run_info, fc_name, fc_date, dirs)
def run_has_samplesheet(fc_dir, config, require_single=True):
    """Look for a samplesheet whose flowcell id matches the run.

    All "*.csv" files in the existing directories listed under
    config["samplesheet_directories"] are inspected with _get_flowcell_id.
    Returns the path of the sheet whose flowcell id best matches the
    flowcell name of fc_dir, or None when no id is close enough.
    """
    fc_name, _ = get_flowcell_info(fc_dir)

    sheet_by_fcid = {}
    for ss_dir in config.get("samplesheet_directories", []):
        if not os.path.exists(ss_dir):
            continue
        with utils.chdir(ss_dir):
            for sheet in glob.glob("*.csv"):
                for fcid in _get_flowcell_id(sheet, require_single):
                    if fcid:
                        sheet_by_fcid[fcid] = os.path.join(ss_dir, sheet)

    # Fuzzy matching tolerates typing mistakes in the SampleSheet. At most
    # one candidate is requested, with a similarity cutoff of 0.85.
    candidates = difflib.get_close_matches(fc_name, sheet_by_fcid.keys(), 1, 0.85)
    if len(candidates) == 0:
        return None
    best = candidates[0]
    if best not in sheet_by_fcid:
        return None
    return sheet_by_fcid[best]
def main(config_file, fc_dir):
    """Process every lane of a flowcell, in parallel when several cores are configured.

    Run details are fetched from Galaxy by flowcell name and each detail
    entry is handed to _process_wrapper. Metrics are written when all
    entries are done. Fastq generation is not performed here.
    """
    work_dir = os.getcwd()
    with open(config_file) as config_handle:
        config = yaml.load(config_handle)
    api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = api.run_details(fc_name)
    fastq_dir = get_fastq_dir(fc_dir)

    if config["algorithm"]["num_cores"] > 1:
        pool = Pool(config["algorithm"]["num_cores"])
        jobs = ((lane, fastq_dir, fc_name, fc_date, config, config_file)
                for lane in run_info["details"])
        try:
            pool.map(_process_wrapper, jobs)
        except:
            # Stop the workers before passing the error on
            pool.terminate()
            raise
    else:
        for lane in run_info["details"]:
            _process_wrapper((lane, fastq_dir, fc_name, fc_date, config, config_file))

    write_metrics(run_info, work_dir, fc_dir, fastq_dir)
def organize(dirs, config, run_info_yaml):
    """Assemble the run items used by the downstream processing steps.

    Items come from run_info_yaml when that file exists and from the Galaxy
    API otherwise (in which case Galaxy upload details are attached). Each
    item is completed with "config", "dirs" and a default "name", then run
    through _add_reference_resources. Returns the resulting list.
    """
    if not (run_info_yaml and os.path.exists(run_info_yaml)):
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = get_flowcell_info(dirs["flowcell"])
        api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_details = []
        galaxy_info = api.run_details(fc_name, fc_date)
        for detail in galaxy_info["details"]:
            # A separate upload dictionary is built for every item
            upload = {"method": "galaxy"}
            upload["run_id"] = galaxy_info["run_id"]
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
            detail["upload"] = upload
            run_details.append(detail)
    else:
        logger.info("Using input YAML configuration: %s" % run_info_yaml)
        run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)

    out = []
    for detail in run_details:
        detail["config"] = config_utils.update_w_custom(config, detail)
        detail["dirs"] = dirs
        if "name" not in detail:
            detail["name"] = ["", detail["description"]]
        detail = _add_reference_resources(detail)
        out.append(detail)
    return out
def _run_info_from_yaml(fc_dir, run_info_yaml):
    """Parse a YAML run description into (fc_name, fc_date, run_info).

    Flowcell name and date are derived from fc_dir when it is set and
    parseable, replaced by "fc_name"/"fc_date" from a dictionary-style YAML
    file, or generated with _unique_flowcell_info(). The top-level keys of
    a dictionary-style file (minus "details") act as global configuration,
    from which "description_filenames" is copied onto every item.
    """
    with open(run_info_yaml) as yaml_handle:
        loaded = yaml.load(yaml_handle)

    fc_name = None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            # No flowcell information in the directory name
            pass

    global_config = {}
    if isinstance(loaded, dict):
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        # Everything except the item list is kept as global configuration
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        loaded = loaded["details"]

    if fc_name is None:
        fc_name, fc_date = _unique_flowcell_info()

    details = []
    for index, entry in enumerate(loaded):
        if "lane" not in entry:
            entry["lane"] = _generate_lane(entry["files"], index)
        if "description" not in entry:
            entry["description"] = str(entry["lane"])
        entry["description_filenames"] = global_config.get("description_filenames", False)
        details.append(entry)

    return fc_name, fc_date, {"details": details, "run_id": ""}
def get_flowcell(fc_dir, run_info_yaml, config={}):
    """Build a Flowcell from a flowcell directory and its run_info YAML file.

    Only the last path component of fc_dir is used to derive the flowcell
    name and date. The config argument is not used by this function.
    """
    # Strip the leading path; a trailing separator is removed by normpath
    dir_name = os.path.basename(os.path.normpath(fc_dir))
    fc_name, fc_date = get_flowcell_info(dir_name)

    with open(run_info_yaml, "r") as yaml_handle:
        run_info = yaml.load(yaml_handle)

    return Flowcell(fc_name, fc_date, run_info, fc_dir)
def run_main(config, config_file, fc_dir, run_info_yaml):
    """Process the lanes and samples of a flowcell and write the run metrics.

    Run details are read from run_info_yaml when that file exists and are
    fetched from the Galaxy API otherwise. Lanes and samples are processed
    through utils.cpmap with the configured number of cores.
    """
    work_dir = os.getcwd()
    fc_name, fc_date = get_flowcell_info(fc_dir)

    if run_info_yaml and os.path.exists(run_info_yaml):
        log.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        with open(run_info_yaml) as yaml_handle:
            yaml_details = yaml.load(yaml_handle)
        run_info = {"details": yaml_details, "run_id": ""}
    else:
        log.info("Fetching run details from Galaxy instance")
        api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = api.run_details(fc_name, fc_date)

    fastq_dir = get_fastq_dir(fc_dir)
    run_items = _add_multiplex_across_lanes(run_info["details"], fastq_dir, fc_name)
    align_dir = os.path.join(work_dir, "alignments")

    # Lane level processing; the loop only drains the result iterator
    lane_jobs = ((item, fastq_dir, fc_name, fc_date, align_dir, config, config_file)
                 for item in run_items)
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        for _ in cpmap(process_lane, lane_jobs):
            pass

    # Sample level processing; a sample may span several lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        align_dir, fastq_dir, work_dir, fc_name, fc_date, run_items)
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        sample_jobs = ((sample, sample_fastq[sample], sample_info[sample], bams,
                        work_dir, config, config_file)
                       for sample, bams in sample_files)
        for _ in cpmap(process_sample, sample_jobs):
            pass

    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
def get_flowcell(fc_dir, run_info_yaml, config={}):
    """Create a Flowcell object for fc_dir using the run_info YAML file.

    The flowcell name and date are parsed from the directory name alone,
    without its parent path. The config argument is accepted but unused.
    """
    with_no_trailing_sep = os.path.normpath(fc_dir)
    fc_name, fc_date = get_flowcell_info(os.path.basename(with_no_trailing_sep))

    with open(run_info_yaml, "r") as info_handle:
        run_info = yaml.load(info_handle)

    flowcell = Flowcell(fc_name, fc_date, run_info, fc_dir)
    return flowcell
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Load the sample items described in a YAML run configuration.

    Returns the list of items after normalising files, lane and description,
    attaching upload information, resolving the algorithm section and adding
    read group names. Raises ValueError for an item without "description"
    unless it consists of a single BAM file. The finished list is validated
    with _check_sample_config.
    """
    with open(run_info_yaml) as yaml_handle:
        loaded = yaml.load(yaml_handle)

    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            # The directory name holds no flowcell information
            pass

    global_config, global_vars = {}, {}
    if isinstance(loaded, dict):
        # Top-level keys other than "details" form the global configuration
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]

    skip_abs_path = ["variantcaller", "realign", "recalibrate", "phasing", "svcaller"]
    run_details = []
    for index, entry in enumerate(loaded):
        entry = _normalize_files(entry, fc_dir)

        if "lane" not in entry:
            entry["lane"] = str(index + 1)
        entry["lane"] = _clean_characters(str(entry["lane"]))

        if "description" not in entry:
            files = entry.get("files", [])
            if len(files) == 1 and files[0].endswith(".bam"):
                entry["description"] = get_sample_name(files[0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (index + 1))
        entry["description"] = _clean_characters(str(entry["description"]))

        upload = global_config.get("upload", {})
        # A plain string is taken as the local upload directory
        if isinstance(upload, basestring):
            upload = {"dir": upload}
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        # NOTE(review): a dictionary found under the global "upload" key is
        # the same object for every item -- confirm this sharing is intended
        entry["upload"] = upload

        entry["algorithm"] = _replace_global_vars(entry["algorithm"], global_vars)
        entry["algorithm"] = genome.abs_file_paths(entry["algorithm"],
                                                   ignore_keys=skip_abs_path)
        entry["rgnames"] = prep_rg_names(entry, config, fc_name, fc_date)
        entry["test_run"] = global_config.get("test_run", False)
        run_details.append(entry)

    _check_sample_config(run_details, run_info_yaml)
    return run_details
def _find_casava_report(fc_dir):
    """Find the CASAVA Demultiplex_Stats.htm reports below a flowcell directory.

    Returns the (possibly empty) list of paths matching
    <fc_dir>/Unaligned*/Basecall_Stats_*<name>/Demultiplex_Stats.htm, where
    <name> is the flowcell name without its first character.
    """
    fc_name, _ = fc.get_flowcell_info(fc_dir)
    stats_dir_pattern = "Basecall_Stats_*{}".format(fc_name[1:])
    report_pattern = os.path.join(
        fc_dir, "Unaligned*", stats_dir_pattern, "Demultiplex_Stats.htm")
    return glob.glob(report_pattern)
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read the run items from a YAML configuration file.

    Each item gets normalised files, a cleaned lane and description, upload
    details, a resolved algorithm section, read group names and the global
    "test_run" flag. A ValueError is raised when an item has no
    "description" and is not a single BAM file. Returns the item list after
    it passed _check_sample_config.
    """
    with open(run_info_yaml) as config_handle:
        loaded = yaml.load(config_handle)

    fc_name = fc_date = None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            # fc_dir carries no parseable flowcell information
            pass

    global_config = {}
    global_vars = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        has_flowcell_keys = "fc_name" in loaded and "fc_date" in loaded
        if has_flowcell_keys:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        loaded = loaded["details"]

    run_details = []
    for position, sample in enumerate(loaded):
        sample = _normalize_files(sample, fc_dir)

        # Lanes default to the 1-based position of the item
        if "lane" not in sample:
            sample["lane"] = str(position + 1)
        sample["lane"] = _clean_characters(str(sample["lane"]))

        if "description" not in sample:
            only_one_bam = (len(sample.get("files", [])) == 1
                            and sample["files"][0].endswith(".bam"))
            if not only_one_bam:
                raise ValueError("No `description` sample name provided for input #%s" % (position + 1))
            sample["description"] = get_sample_name(sample["files"][0])
        sample["description"] = _clean_characters(str(sample["description"]))

        upload = global_config.get("upload", {})
        # A bare string names a local upload directory
        if isinstance(upload, basestring):
            upload = {"dir": upload}
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        sample["upload"] = upload

        sample["algorithm"] = _replace_global_vars(sample["algorithm"], global_vars)
        sample["algorithm"] = genome.abs_file_paths(
            sample["algorithm"],
            ignore_keys=["variantcaller", "realign", "recalibrate", "phasing", "svcaller"])
        sample["rgnames"] = prep_rg_names(sample, config, fc_name, fc_date)
        sample["test_run"] = global_config.get("test_run", False)
        run_details.append(sample)

    _check_sample_config(run_details, run_info_yaml)
    return run_details
def _find_casava_report(fc_dir):
    """Return the CASAVA demultiplex reports found under an illumina flowcell directory.

    The flowcell name, minus its first character, is used to match the
    "Basecall_Stats_*" folder inside any "Unaligned*" directory. The result
    is the list produced by glob and may be empty.
    """
    fc_name, _ = fc.get_flowcell_info(fc_dir)
    path_parts = [fc_dir,
                  "Unaligned*",
                  "Basecall_Stats_*{}".format(fc_name[1:]),
                  "Demultiplex_Stats.htm"]
    return glob.glob(os.path.join(*path_parts))
def _find_samplesheet(fc_dir):
    """Return the samplesheet in the root of a flowcell directory, or None.

    Candidates are tried in order: "<flowcell name>.csv", the same name
    without its first character, and "SampleSheet.csv".
    """
    fc_name, _ = fc.get_flowcell_info(fc_dir)
    stems = (fc_name, fc_name[1:], "SampleSheet")
    candidates = [os.path.join(fc_dir, "{}.csv".format(stem)) for stem in stems]
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None
def get_run_info(fc_dir, config, run_info_yaml):
    """Return (fc_name, fc_date, run_info) from a YAML file or the Galaxy API.

    The YAML file is preferred when it is given and exists. In both cases
    the run information is passed through _organize_runs_by_lane before it
    is returned.
    """
    yaml_available = run_info_yaml and os.path.exists(run_info_yaml)
    if not yaml_available:
        logger.info("Fetching run details from Galaxy instance")
        fc_name, fc_date = get_flowcell_info(fc_dir)
        api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key'])
        run_info = api.run_details(fc_name, fc_date)
    else:
        logger.info("Found YAML samplesheet, using %s instead of Galaxy API" % run_info_yaml)
        fc_name, fc_date, run_info = _run_info_from_yaml(fc_dir, run_info_yaml)

    by_lane = _organize_runs_by_lane(run_info)
    return fc_name, fc_date, by_lane
def test_create_bc_report(self):
    """Build a demultiplex report for every test run and upload it to gdocs
    """
    # Load the post-processing configuration from the test data directory
    self.config = load_config(os.path.join(self.data_dir, "post_process.yaml"))

    # One report per run name
    for runname in self.runname:
        print("\nProcessing %s" % runname)
        fc_name, fc_date = get_flowcell_info(runname)
        run_analysis_dir = os.path.join(self.workdir, runname)
        run_info = {'details': self.run_info}
        create_bc_report_on_gdocs(fc_date, fc_name, run_analysis_dir, run_info, self.config)
def test_create_bc_report(self):
    """Build a demultiplex report for every test run, upload it to gdocs
    and check that the upload reports success
    """
    # Load the post-processing configuration from the test data directory
    self.config = load_config(os.path.join(self.data_dir, "post_process.yaml"))

    # One report per run name
    for runname in self.runname:
        print("\nProcessing %s" % runname)
        fc_name, fc_date = get_flowcell_info(runname)
        run_analysis_dir = os.path.join(self.workdir, runname)
        # The analysis directory serves as both work and flowcell directory
        dirs = {"work": run_analysis_dir, "flowcell": run_analysis_dir}
        created = create_report_on_gdocs(fc_date, fc_name, self.run_info_file,
                                         dirs, self.config)
        assert created, "Report creation failed"
def get_flowcell_id(run_info, fc_dir, check_bc=True, glob_ext="_fastq.txt"):
    """Derive the flowcell name and date from a file found under fc_dir.

    A glob pattern is built from the entries of run_info, the first
    matching file name is handed to get_flowcell_info and the resulting
    (name, date) pair is returned. Raises StandardError when that fails for
    any reason, including when no file matches.

    NOTE(review): the statement nesting below was reconstructed from source
    that had lost its indentation -- confirm against the original file.
    """
    # Only the pattern from the last processed entry survives the loops
    # NOTE(review): glob_str stays unbound when run_info is empty; the
    # resulting NameError is raised outside the try block below
    for lane in run_info:
        # presumably each lane is a sequence of barcode dictionaries; verify against caller
        for bc in lane:
            if check_bc:
                glob_str = "%s_*_barcode/*%s" % (bc['lane'], glob_ext)
            else:
                # NOTE(review): this formats the whole lane object, not a lane id
                glob_str = "%s_*%s" % (lane, glob_ext)
                # NOTE(review): a bare `next` only references the builtin
                # and has no effect; it does not skip to the next iteration
                next
    files = glob.glob(os.path.join(fc_dir, glob_str))
    try:
        (name, date) = get_flowcell_info(os.path.basename(files[0]))
    except:
        # Any failure, e.g. an empty file list, is reported the same way
        raise StandardError("No flowcell information found in " + str(fc_dir))
    return name, date
def _find_demultiplex_stats_htm(base_name, config):
    """Return the path to the CASAVA Demultiplex_Stats.htm report, or None.

    The report is looked up in "Basecall_Stats_<flowcell name>" below
    config["analysis"]["base_dir"]. If that directory does not exist, the
    first character of the flowcell name (which may correspond to the
    flowcell position) is stripped and the lookup is retried.

    This is a best-effort lookup: None is returned when the report does not
    exist or when any step fails (for instance an unparseable base_name or
    a missing configuration key).
    """
    try:
        fc_name, _ = get_flowcell_info(base_name)
        base_dir = config["analysis"]["base_dir"]
        basecall_stats_dir = os.path.join(base_dir, "Basecall_Stats_%s" % fc_name)
        # Fall back on the flowcell name without its first character
        if not os.path.exists(basecall_stats_dir):
            basecall_stats_dir = os.path.join(base_dir, "Basecall_Stats_%s" % fc_name[1:])
        casava_stats = os.path.join(basecall_stats_dir, "Demultiplex_Stats.htm")
        # Explicit check instead of an assert, which is removed under -O
        if not os.path.exists(casava_stats):
            return None
        return casava_stats
    except Exception:
        # Lookup errors mean "no report"; KeyboardInterrupt and SystemExit
        # are no longer swallowed
        return None
def get_flowcell_id(run_info, fc_dir, check_bc=True, glob_ext="_fastq.txt"):
    """Return the flowcell (name, date) parsed from a file name under fc_dir.

    The lane of the last entry in run_info selects the files: inside the
    lane's "*_barcode" directory when check_bc is set, directly in fc_dir
    otherwise. Raises StandardError when no flowcell information can be
    obtained, including when no file matches.
    """
    lane = None
    # After the loop, lane holds the value of the last entry (None if empty)
    for entry in run_info:
        lane = entry.get("lane", "")

    pattern = "%s_*_barcode/*%s" if check_bc else "%s_*%s"
    matches = glob.glob(os.path.join(fc_dir, pattern % (lane, glob_ext)))
    try:
        first_file = os.path.basename(matches[0])
        name, date = get_flowcell_info(first_file)
    except:
        raise StandardError("No flowcell information found in " + str(fc_dir))
    return name, date
def _generate_fastq(fc_dir):
    """Create the fastq files of a flowcell if needed and return their directory.

    The conversion with solexa_qseq_to_fastq.py only runs when the fastq
    directory differs from fc_dir and does not exist yet.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    fastq_dir = get_fastq_dir(fc_dir)

    if fastq_dir == fc_dir or os.path.exists(fastq_dir):
        return fastq_dir

    run_label = "%s_%s" % (fc_date, fc_name)
    # The qseq files are globbed from the parent of the fastq directory
    with utils.chdir(os.path.split(fastq_dir)[0]):
        lane_ids = set(qseq.split("_")[1] for qseq in glob.glob("*qseq.txt"))
        subprocess.check_call(
            ["solexa_qseq_to_fastq.py", run_label, ",".join(sorted(lane_ids))])
    return fastq_dir
def _generate_fastq(fc_dir, config):
    """Create the fastq files of a flowcell when missing; return the fastq directory.

    Nothing is converted when the fastq directory equals fc_dir or already
    exists. The config argument is not used by this function.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_label = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]

    needs_conversion = fastq_dir != fc_dir and not os.path.exists(fastq_dir)
    if not needs_conversion:
        return fastq_dir

    log.info("Generating fastq files for %s" % fc_dir)
    with utils.chdir(basecall_dir):
        # Lane ids are the second "_"-separated field of the qseq file names
        lane_ids = set(qseq.split("_")[1] for qseq in glob.glob("*qseq.txt"))
        cl = ["solexa_qseq_to_fastq.py", run_label, ",".join(sorted(lane_ids))]
        log.info("Converting qseq to fastq on all lanes.")
        subprocess.check_call(cl)
        log.info("Qseq to fastq conversion completed.")
    return fastq_dir
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Load the run items from a YAML configuration file.

    Files, lane and description of every item are normalised, upload
    details are attached, the algorithm section is resolved with
    genome.abs_file_paths and read group names are added. Raises ValueError
    for an item that has no "description" and is not a single BAM file.
    Returns the item list after _check_sample_config accepted it.
    """
    with open(run_info_yaml) as yaml_handle:
        loaded = yaml.load(yaml_handle)

    fc_name, fc_date = None, None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            # The directory name holds no flowcell information
            pass

    global_config = {}
    if isinstance(loaded, dict):
        # Top-level keys other than "details" form the global configuration
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]

    run_details = []
    for index, entry in enumerate(loaded):
        entry = _normalize_files(entry, fc_dir)

        if "lane" not in entry:
            entry["lane"] = str(index + 1)
        entry["lane"] = _clean_characters(str(entry["lane"]))

        if "description" not in entry:
            files = entry.get("files", [])
            if len(files) == 1 and files[0].endswith(".bam"):
                entry["description"] = get_sample_name(files[0])
            else:
                raise ValueError("No `description` sample name provided for input #%s" % (index + 1))
        entry["description"] = _clean_characters(str(entry["description"]))

        upload = global_config.get("upload", {})
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        entry["upload"] = upload

        entry["algorithm"] = genome.abs_file_paths(entry["algorithm"],
                                                   ignore_keys=["variantcaller"])
        entry["rgnames"] = prep_rg_names(entry, config, fc_name, fc_date)
        run_details.append(entry)

    _check_sample_config(run_details, run_info_yaml)
    return run_details
def _make_qc_metrics(self, runname, analysisdir):
    """Write a mock RunInfo.xml and per-read QC XML files for a run.

    RunInfo.xml is written into analysisdir; "read1.xml" to "read3.xml" are
    written to Data/reports/Summary below it, from the first three entries
    of read_qc.
    """
    etree = xml.etree.ElementTree

    def _write_xml(xml_text, path):
        # Parse the text and write it as UTF-8 with an XML declaration
        etree.ElementTree(etree.fromstring(xml_text)).write(path, "utf-8", True)

    fc_name, fc_date = get_flowcell_info(runname)
    run_info_xml = "<RunInfo><Run Id=\"%s\" Number=\"%s\"><Flowcell>%s</Flowcell><Instrument>SN0000</Instrument><Date>%s</Date><Reads><Read Number=\"1\" NumCycles=\"101\" IsIndexedRead=\"N\" /><Read Number=\"2\" NumCycles=\"7\" IsIndexedRead=\"Y\" /><Read Number=\"3\" NumCycles=\"101\" IsIndexedRead=\"N\" /></Reads><FlowcellLayout LaneCount=\"8\" SurfaceCount=\"2\" SwathCount=\"3\" TileCount=\"8\" /><AlignToPhiX><Lane>1</Lane><Lane>2</Lane><Lane>3</Lane><Lane>4</Lane><Lane>5</Lane><Lane>6</Lane><Lane>7</Lane><Lane>8</Lane></AlignToPhiX></Run></RunInfo>" % (runname, 1, fc_name, fc_date)
    _write_xml(run_info_xml, os.path.join(analysisdir, "RunInfo.xml"))

    summary_dir = os.path.join(analysisdir, "Data", "reports", "Summary")
    # Make sure the summary directory is there
    if not os.path.exists(summary_dir):
        os.makedirs(summary_dir)

    for read_number in (1, 2, 3):
        target = os.path.join(summary_dir, "read%s.xml" % read_number)
        _write_xml(read_qc[read_number - 1], target)
def _generate_fastq(fc_dir, config):
    """Convert the flowcell's qseq files to fastq when that is still needed.

    Nothing is converted when the fastq directory is the flowcell directory
    itself or already exists.  Returns the fastq directory in every case.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.dirname(fastq_dir)

    if fastq_dir == fc_dir or os.path.exists(fastq_dir):
        return fastq_dir

    log.info("Generating fastq files for %s" % fc_dir)
    with utils.chdir(basecall_dir):
        # The lane is the second underscore-separated field of a qseq name.
        lane_ids = set(name.split("_")[1] for name in glob.glob("*qseq.txt"))
        command = ["solexa_qseq_to_fastq.py", short_fc_name,
                   ",".join(sorted(lane_ids))]
        log.info("Converting qseq to fastq on all lanes.")
        subprocess.check_call(command)
        log.info("Qseq to fastq conversion completed.")
    return fastq_dir
def _find_demultiplex_stats_htm(base_name, config):
    """Locate the CASAVA ``Demultiplex_Stats.htm`` report for a run.

    The report is looked up in ``Basecall_Stats_<flowcell>`` below
    ``config["analysis"]["base_dir"]``.  When that directory is missing the
    first character of the flowcell name is dropped (it may correspond to the
    flowcell position) and the lookup is retried.

    Returns the path to the report, or None when it cannot be found or the
    flowcell name / configuration cannot be resolved.
    """
    try:
        fc_name, _ = get_flowcell_info(base_name)
        base_dir = config["analysis"]["base_dir"]
        basecall_stats_dir = os.path.join(
            base_dir, "Basecall_Stats_%s" % fc_name)
        # If directory doesn't exist, try stripping first character from name
        # (which may correspond to flowcell position)
        if not os.path.exists(basecall_stats_dir):
            basecall_stats_dir = os.path.join(
                base_dir, "Basecall_Stats_%s" % fc_name[1:])
        casava_stats = os.path.join(basecall_stats_dir,
                                    "Demultiplex_Stats.htm")
    except Exception:
        # Best effort: any lookup problem means "no report".  Unlike a bare
        # except this lets KeyboardInterrupt/SystemExit propagate.
        return None

    # Explicit check instead of `assert`, which is removed under `python -O`
    # and would then let a non-existent path be returned.
    if not os.path.exists(casava_stats):
        return None
    return casava_stats
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Read the sample entries of a YAML run description and normalize them.

    Accepts either a bare list of entries or a mapping with the list under
    ``details`` (remaining top-level keys become global configuration).
    Each entry is completed with lane, description, upload, algorithm and
    read-group information; the list is checked with
    ``_check_sample_config`` and returned.

    Raises ValueError when no ``description`` is given and the entry does not
    consist of exactly one BAM file to derive it from.
    """
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects -- only feed it trusted run_info files.
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)

    fc_name = fc_date = None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            # No flowcell information encoded in the directory name.
            pass

    global_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        # Explicit flowcell details in the YAML override the parsed ones.
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]

    run_details = []
    for offset, entry in enumerate(loaded):
        input_number = offset + 1
        entry = _normalize_files(entry, fc_dir)

        # Fall back to the 1-based input number as lane identifier.
        raw_lane = entry["lane"] if "lane" in entry else str(input_number)
        entry["lane"] = _clean_characters(str(raw_lane))

        if "description" not in entry:
            inputs = entry.get("files", [])
            if len(inputs) != 1 or not inputs[0].endswith(".bam"):
                raise ValueError(
                    "No `description` sample name provided for input #%s"
                    % input_number)
            entry["description"] = get_sample_name(inputs[0])
        entry["description"] = _clean_characters(str(entry["description"]))

        upload = global_config.get("upload", {})
        if fc_name and fc_date:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        upload["run_id"] = ""
        entry["upload"] = upload

        entry["algorithm"] = genome.abs_file_paths(
            entry["algorithm"], ignore_keys=["variantcaller"])
        entry["rgnames"] = prep_rg_names(entry, config, fc_name, fc_date)
        run_details.append(entry)

    _check_sample_config(run_details, run_info_yaml)
    return run_details
def main(config_file, fc_dir):
    """Process every lane of a flowcell using run details from Galaxy.

    Lanes are handed to ``_process_wrapper`` either through a worker pool
    (when ``algorithm.num_cores`` is greater than one) or sequentially.
    """
    work_dir = os.getcwd()
    config = load_config(config_file)
    galaxy_api = GalaxyApiAccess(config['galaxy_url'],
                                 config['galaxy_api_key'])
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = galaxy_api.run_details(fc_name)
    fastq_dir = get_fastq_dir(fc_dir)

    def lane_jobs():
        # One argument tuple per lane described by Galaxy.
        for lane_info in run_info["details"]:
            yield (lane_info, fastq_dir, fc_name, fc_date, config,
                   config_file)

    num_cores = config["algorithm"]["num_cores"]
    if num_cores > 1:
        pool = Pool(num_cores)
        try:
            pool.map(_process_wrapper, lane_jobs())
        except:
            # Stop the workers before propagating whatever went wrong.
            pool.terminate()
            raise
    else:
        # Relies on the eager builtin map of Python 2.
        map(_process_wrapper, lane_jobs())
def _run_info_from_yaml(fc_dir, run_info_yaml):
    """Load run information from a YAML file.

    Returns a ``(fc_name, fc_date, run_info)`` tuple where ``run_info`` is a
    dict with the per-sample ``details`` list and an empty ``run_id``.  The
    flowcell name/date come from the YAML when it defines both, otherwise
    from ``fc_dir``, and as a last resort from ``_unique_flowcell_info``.

    Raises ValueError when an entry has neither ``lane``, ``description``
    nor ``files`` to derive a lane from.
    """
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects -- only feed it trusted run_info files.
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)

    fc_name = None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            pass

    global_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]

    if fc_name is None:
        fc_name, fc_date = _unique_flowcell_info()

    # Both settings are global, so look them up once for all entries.
    description_filenames = global_config.get("description_filenames", False)
    upload = global_config.get("upload")

    run_details = []
    for index, sample in enumerate(loaded):
        if "lane" not in sample:
            if "description" in sample:
                sample["lane"] = sample["description"]
            elif "files" in sample:
                sample["lane"] = _generate_lane(sample["files"], index)
            else:
                raise ValueError(
                    "Unable to generate lane info for input %s" % sample)
        if "description" not in sample:
            sample["description"] = str(sample["lane"])
        sample["description_filenames"] = description_filenames
        if upload:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        sample["upload"] = upload
        run_details.append(sample)

    return fc_name, fc_date, dict(details=run_details, run_id="")
def _run_info_from_yaml(fc_dir, run_info_yaml, config):
    """Load run information from a YAML file.

    Returns a ``(fc_name, fc_date, run_info)`` tuple where ``run_info`` is a
    dict with the per-sample ``details`` list and an empty ``run_id``.  The
    flowcell name/date come from the YAML when it defines both, otherwise
    from ``fc_dir``, and as a last resort from ``_unique_flowcell_info``.
    The details are checked with ``_check_sample_config`` before returning.
    """
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects -- only feed it trusted run_info files.
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)

    fc_name = None
    if fc_dir:
        try:
            fc_name, fc_date = get_flowcell_info(fc_dir)
        except ValueError:
            pass

    global_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded and "fc_date" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        loaded = loaded["details"]

    if fc_name is None:
        fc_name, fc_date = _unique_flowcell_info()

    # Both settings are global, so look them up once for all entries.
    description_filenames = global_config.get("description_filenames", False)
    upload = global_config.get("upload")

    run_details = []
    for index, sample in enumerate(loaded):
        # Lanes default to the 1-based position of the entry.
        lane = sample["lane"] if "lane" in sample else str(index + 1)
        sample["lane"] = _clean_characters(str(lane))

        # Descriptions default to the (cleaned) lane.
        if "description" not in sample:
            sample["description"] = str(sample["lane"])
        sample["description"] = _clean_characters(str(sample["description"]))

        sample["description_filenames"] = description_filenames
        if upload:
            upload["fc_name"] = fc_name
            upload["fc_date"] = fc_date
        sample["upload"] = upload
        sample["rgnames"] = prep_rg_names(sample, config, fc_name, fc_date)
        run_details.append(sample)

    _check_sample_config(run_details, run_info_yaml)
    return fc_name, fc_date, dict(details=run_details, run_id="")
def _casava_report_to_metrics(run_info_file, casava_report, dirs):
    """Turn a CASAVA demultiplex report into bcbb-style metric files.

    One ``<lane>_<date>_<flowcell>.bc_metrics`` file is written into
    ``dirs["work"]`` for every entry of the run info configuration.
    Returns the list of metric file names (without the work directory).
    """
    metrics = dmx._parse_demultiplex_stats_htm(casava_report)
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects -- only feed it trusted run_info files.
    with open(run_info_file) as fh:
        info = yaml.load(fh)
    fc_name, fc_date = fc.get_flowcell_info(dirs["flowcell"])

    metric_files = []
    for lane_entry in info:
        lane = lane_entry["lane"]
        file_name = "{}_{}_{}.bc_metrics".format(lane, fc_date, fc_name)
        barcodes = lane_entry.get("multiplex", [])
        # Each barcode entry needs to know which lane it belongs to.
        for barcode in barcodes:
            barcode["lane"] = lane
        target = os.path.join(dirs["work"], file_name)
        dmx._write_demultiplex_metrics(barcodes, metrics, target)
        metric_files.append(file_name)
    return metric_files
def main(config_file, fc_dir, run_info_yaml=None):
    """Run lane- and sample-level processing for one flowcell.

    Run details are read from ``run_info_yaml`` when given, otherwise they
    are fetched from Galaxy using the flowcell name.  Lanes and then samples
    are processed through a worker pool when ``algorithm.num_cores`` is
    greater than one, otherwise sequentially, and finally run metrics are
    written.
    """
    work_dir = os.getcwd()
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects -- only feed it trusted configuration files.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # The flowcell name must be known before the Galaxy lookup below; it used
    # to be assigned only afterwards, which raised UnboundLocalError whenever
    # no run_info_yaml was supplied.
    fc_name, fc_date = get_flowcell_info(fc_dir)

    if run_info_yaml:
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        galaxy_api = GalaxyApiAccess(config['galaxy_url'],
                                     config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name)

    run_items = _add_multiplex_to_control(run_info["details"])
    fastq_dir = get_fastq_dir(fc_dir)
    align_dir = os.path.join(work_dir, "alignments")

    # process each flowcell lane
    pool = (Pool(config["algorithm"]["num_cores"])
            if config["algorithm"]["num_cores"] > 1 else None)
    map_fn = pool.map if pool else map
    try:
        map_fn(_process_lane_wrapper,
               ((i, fastq_dir, fc_name, fc_date, align_dir, config,
                 config_file)
                for i in run_items))
    except:
        # Stop the workers before propagating whatever went wrong.
        if pool:
            pool.terminate()
        raise

    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        align_dir, fastq_dir, work_dir, fc_name, fc_date, run_items)
    try:
        map_fn(_process_sample_wrapper,
               ((name, sample_fastq[name], sample_info[name], bam_files,
                 work_dir, config, config_file)
                for name, bam_files in sample_files))
    except:
        if pool:
            pool.terminate()
        raise

    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
def main(config_file, fc_dir, analysis_dir, run_info_yaml=None): with open(config_file) as in_handle: config = yaml.load(in_handle) fc_name, fc_date = get_flowcell_info(fc_dir) if run_info_yaml: with open(run_info_yaml) as in_handle: run_details = yaml.load(in_handle) run_info = dict(details=run_details, run_id="") galaxy_api = None else: galaxy_api = GalaxyApiAccess(config['galaxy_url'], config['galaxy_api_key']) run_info = galaxy_api.run_details(fc_name, fc_date) base_folder_name = "%s_%s" % (fc_date, fc_name) run_details = lims_run_details(run_info, fc_name, base_folder_name) for (library_name, access_role, dbkey, lane, bc_id, name, desc, local_name) in run_details: library_id = (get_galaxy_library(library_name, galaxy_api) if library_name else None) upload_files = list( select_upload_files(local_name, bc_id, fc_dir, analysis_dir, config)) if len(upload_files) > 0: print lane, bc_id, name, desc, library_name print "Creating storage directory" if library_id: folder, cur_galaxy_files = get_galaxy_folder( library_id, base_folder_name, name, desc, galaxy_api) else: cur_galaxy_files = [] store_dir = move_to_storage(lane, bc_id, base_folder_name, upload_files, cur_galaxy_files, config) if store_dir and library_id: print "Uploading directory of files to Galaxy" print galaxy_api.upload_directory(library_id, folder['id'], store_dir, dbkey, access_role) if galaxy_api: add_run_summary_metrics(analysis_dir, galaxy_api)
def run_has_samplesheet(fc_dir, config, require_single=True):
    """Look for a SampleSheet csv file that belongs to the given run.

    All ``*.csv`` files in the existing ``samplesheet_directories`` of the
    configuration are indexed by the flowcell ids they contain.  Returns the
    path of the sheet matching the run's flowcell name (with or without its
    first character), or None when there is no such sheet.
    """
    fc_name, _ = get_flowcell_info(fc_dir)

    sheets_by_fcid = {}
    for sheet_dir in config.get("samplesheet_directories", []):
        if not os.path.exists(sheet_dir):
            continue
        with utils.chdir(sheet_dir):
            for sheet in glob.glob("*.csv"):
                sheet_path = os.path.join(sheet_dir, sheet)
                for flowcell_id in _get_flowcell_id(sheet, require_single):
                    if flowcell_id:
                        sheets_by_fcid[flowcell_id] = sheet_path

    # The pipeline leaves the flowcell position in the name, so account for
    # that by also trying the name without its first character.
    for candidate in (fc_name, fc_name[1:]):
        if candidate in sheets_by_fcid:
            return sheets_by_fcid[candidate]
    return None
def run_has_samplesheet(fc_dir, config, require_single=True):
    """Look for a SampleSheet csv file that belongs to the given run.

    All ``*.csv`` files in the existing ``samplesheet_directories`` of the
    configuration are indexed by the flowcell ids they contain.  The run's
    flowcell name is then matched fuzzily against those ids.  Returns the
    path of the best matching sheet, or None when nothing is close enough.
    """
    fc_name, _ = get_flowcell_info(fc_dir)

    sheets_by_fcid = {}
    for sheet_dir in config.get("samplesheet_directories", []):
        if not os.path.exists(sheet_dir):
            continue
        with utils.chdir(sheet_dir):
            for sheet in glob.glob("*.csv"):
                sheet_path = os.path.join(sheet_dir, sheet)
                for flowcell_id in _get_flowcell_id(sheet, require_single):
                    if flowcell_id:
                        sheets_by_fcid[flowcell_id] = sheet_path

    # difflib handles human errors while entering data on the SampleSheet.
    # Only one best candidate is returned (if any). 0.85 cutoff allows for
    # maximum of 2 mismatches in fcid
    candidates = difflib.get_close_matches(fc_name, sheets_by_fcid.keys(),
                                           1, 0.85)
    if not candidates:
        return None
    best = candidates[0]
    if best in sheets_by_fcid:
        return sheets_by_fcid[best]
    return None
def _casava_report_to_metrics(run_info_file, casava_report, dirs):
    """Write bcbb-style metric files from a CASAVA demultiplex report.

    For every entry of the run info configuration a file named
    ``<lane>_<date>_<flowcell>.bc_metrics`` is written to ``dirs["work"]``.
    Returns the names of those files (without the work directory).
    """
    parsed_report = dmx._parse_demultiplex_stats_htm(casava_report)
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects -- only feed it trusted run_info files.
    with open(run_info_file) as fh:
        info = yaml.load(fh)
    fc_name, fc_date = fc.get_flowcell_info(dirs["flowcell"])

    written = []
    for entry in info:
        lane_id = entry["lane"]
        name = "{}_{}_{}.bc_metrics".format(lane_id, fc_date, fc_name)
        plexes = entry.get("multiplex", [])
        # Tag every barcode entry with the lane it was sequenced on.
        for plex in plexes:
            plex["lane"] = lane_id
        destination = os.path.join(dirs["work"], name)
        dmx._write_demultiplex_metrics(plexes, parsed_report, destination)
        written.append(name)
    return written
def run_main(config, config_file, fc_dir, run_info_yaml):
    """Process all lanes and samples of a flowcell and write run metrics.

    Run details are taken from ``run_info_yaml`` when that file exists,
    otherwise from the Galaxy instance named in ``config``.  Lane and sample
    processing is distributed with ``utils.cpmap`` over
    ``algorithm.num_cores`` cores.
    """
    work_dir = os.getcwd()
    fc_name, fc_date = get_flowcell_info(fc_dir)

    use_yaml = run_info_yaml and os.path.exists(run_info_yaml)
    if use_yaml:
        log.info("Found YAML samplesheet, using %s instead of Galaxy API"
                 % run_info_yaml)
        # NOTE(review): yaml.load without an explicit Loader can build
        # arbitrary objects -- only feed it trusted run_info files.
        with open(run_info_yaml) as in_handle:
            run_details = yaml.load(in_handle)
        run_info = dict(details=run_details, run_id="")
    else:
        log.info("Fetching run details from Galaxy instance")
        galaxy_api = GalaxyApiAccess(config['galaxy_url'],
                                     config['galaxy_api_key'])
        run_info = galaxy_api.run_details(fc_name, fc_date)

    fastq_dir = get_fastq_dir(fc_dir)
    run_items = _add_multiplex_across_lanes(run_info["details"], fastq_dir,
                                            fc_name)
    align_dir = os.path.join(work_dir, "alignments")

    # process each flowcell lane
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        lane_args = ((lane_item, fastq_dir, fc_name, fc_date, align_dir,
                      config, config_file)
                     for lane_item in run_items)
        # Results are not needed; iterating just drives the work.
        for _ in cpmap(process_lane, lane_args):
            pass

    # process samples, potentially multiplexed across multiple lanes
    sample_files, sample_fastq, sample_info = organize_samples(
        align_dir, fastq_dir, work_dir, fc_name, fc_date, run_items)
    with utils.cpmap(config["algorithm"]["num_cores"]) as cpmap:
        sample_args = ((name, sample_fastq[name], sample_info[name],
                        bam_files, work_dir, config, config_file)
                       for name, bam_files in sample_files)
        for _ in cpmap(process_sample, sample_args):
            pass

    write_metrics(run_info, work_dir, fc_dir, fc_name, fc_date, fastq_dir)
def main(run_id, config_file, run_info_file=None, dryrun=False):
    """Create a demultiplex report on Google Docs for a sequencing run.

    The flowcell (archive) and work (analysis) directories are derived from
    the ``analysis`` section of the configuration; a temporary work directory
    is created when no analysis directory exists.  When ``run_info_file`` is
    not given, ``run_info.yaml`` in the flowcell directory is used, or one is
    generated from the samplesheet found there.  If the work directory holds
    no bc metrics files, they are generated from the CASAVA demultiplex
    report.  With ``dryrun`` set, everything is prepared but nothing is
    uploaded.

    Raises AssertionError with a descriptive message when a precondition
    fails.  The checks are explicit raises rather than ``assert`` statements
    so that they are still enforced under ``python -O``.
    """
    def require(condition, message):
        # Same exception type and message as the assert statements this
        # replaces, but not stripped by the optimizer.
        if not condition:
            raise AssertionError(message)

    require(run_id, "No run id was specified")
    require(os.path.exists(config_file),
            "The configuration file, {}, could not be found".format(
                config_file))

    config = load_config(config_file)
    require("gdocs_upload" in config,
            "The configuration file, {}, has no section specifying the Google docs details".format(config_file))

    analysis_cfg = config.get("analysis", {})
    if "store_dir" in analysis_cfg:
        archive_dir = os.path.join(analysis_cfg["store_dir"], run_id)
    else:
        archive_dir = os.getcwd()

    analysis_dir = None
    if "base_dir" in analysis_cfg:
        analysis_dir = os.path.join(analysis_cfg["base_dir"], run_id)
    if analysis_dir is None or not os.path.exists(analysis_dir):
        # No analysis output available: work in a scratch directory.
        analysis_dir = tempfile.mkdtemp()

    dirs = {
        "work": os.path.normpath(analysis_dir),
        "flowcell": os.path.normpath(archive_dir)
    }
    require(os.path.exists(dirs["flowcell"]),
            "The flowcell directory, {}, could not be found".format(
                dirs["flowcell"]))
    require(os.path.exists(dirs["work"]),
            "The work directory, {}, could not be found".format(
                dirs["work"]))

    if run_info_file is None:
        run_info_file = os.path.join(dirs["flowcell"], "run_info.yaml")
        if not os.path.exists(run_info_file):
            # Locate the samplesheet and convert to yaml
            samplesheet = _find_samplesheet(dirs["flowcell"])
            require(samplesheet,
                    "Could not locate samplesheet in {}, aborting..".format(
                        dirs["flowcell"]))
            fh, run_info_file = tempfile.mkstemp()
            os.close(fh)
            run_info_file = ssheet.csv2yaml(samplesheet, run_info_file)

    require(os.path.exists(run_info_file),
            "The run info configuration file, {}, could not be found".format(
                run_info_file))

    fc_name, fc_date = fc.get_flowcell_info(dirs["flowcell"])

    # If we have no bc_metrics files in the workdir, we may be looking at a
    # Casava run. In that case, attempt to parse the Demultiplex_Stats.htm
    # file and create bc_metrics files
    metric_files = glob.glob(
        os.path.join(dirs["work"], "*_barcode", "*bc[_.]metrics")) + glob.glob(
        os.path.join(dirs["work"], "*bc[_.]metrics"))
    if len(metric_files) == 0:
        casava_report = _find_casava_report(dirs["flowcell"])
        require(len(casava_report) > 0,
                "Could not locate CASAVA demultiplex report in {}, aborting..".format(dirs["flowcell"]))
        metric_files = _casava_report_to_metrics(run_info_file,
                                                 casava_report, dirs)

    require(len(metric_files) > 0,
            "Could not locate or create required metric files, aborting..")

    print(
        "A report will be created on Google Docs based on the demultiplexed data in {}"
        .format(dirs["work"]))
    print("The configuration file is {0} and the run info file is {1}".format(
        config_file, run_info_file))
    print("The run was started on {0} and has flowcell id {1}".format(
        fc_date, fc_name))

    if not dryrun:
        create_report_on_gdocs(fc_date, fc_name, run_info_file, dirs, config)
    else:
        print("DRY-RUN: nothing uploaded")
def generate_report(proj_conf):
    """Collect the fields of the delivery report for one project.

    Reads ``id``, ``config``, ``qual_scale``, ``flowcell``, ``lanes``,
    ``archive_dir`` and ``analysis_dir`` from ``proj_conf`` and builds the
    general info table, the lane table, the per-read quality tables, the QC
    plot references and the sequence yield table.

    Returns a dict of template fields, or None when the run's
    ``run_info.yaml`` cannot be found.  Calls ``sys.exit`` when the barcode
    metrics file of a lane cannot be opened.
    """

    # --- Metadata fetched from the 'Genomics project list' on Google Docs ---
    uppnex_proj = ''
    min_reads_per_sample = ''
    try:
        proj_data = ProjectMetaData(proj_conf['id'], proj_conf['config'])
        uppnex_proj = proj_data.uppnex_id
        project_id = proj_data.project_id
        queue_date = proj_data.queue_date
        no_samples = proj_data.no_samples
        lanes_plates = proj_data.lanes_plates
        min_reads_per_sample = proj_data.min_reads_per_sample
        customer_reference = proj_data.customer_reference
        application = proj_data.application
        no_finished_samples = proj_data.no_finished_samples
    except:
        # Best effort: the report is still produced without the metadata.
        # NOTE(review): the bare except also hides programming errors.
        print("WARNING: Could not fetch meta data from Google Docs")

    # Template fields; the empty ones are filled in section by section below.
    d = {
        'project_id': proj_conf['id'],
        'latex_opt': "",
        'summary': "",
        'infotable': "",
        'lanetable': "",
        'read1table': "",
        'read2table': "",
        'qcplots': "",
        'qc30plots': "",
        'errorrate': "",
        'yieldtable': "",
        'qualscale': proj_conf['qual_scale'],
    }

    ## Latex option (no of floats per page)
    floats_per_page = '.. raw:: latex\n\n \setcounter{totalnumber}{8}'
    d.update(latex_opt=floats_per_page)

    ## General info table
    tab = Texttable()
    # Fall back to a placeholder when the UPPNEX id is missing or does not
    # start with 'b201'.
    if not uppnex_proj or len(uppnex_proj) < 4 or uppnex_proj[0:4] != 'b201':
        uppnex_proj = "b201YXXX"
        print "WARNING: Could not find UPPNEX project"

    # The flowcell name is split on '_'; fields 0 and 3 form the short run
    # name and field 1 is reported as the instrument id.
    run_name_comp = proj_conf['flowcell'].split('_')
    simple_run_name = run_name_comp[0] + "_" + run_name_comp[3]
    proj_level_dir = fixProjName(proj_conf['id'])
    instr_id = run_name_comp[1]
    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    tab.add_row(["Run name:", proj_conf['flowcell']])
    del_base = "/proj/"
    proj_id = proj_conf['id']
    try:
        # customer_reference is undefined when the metadata lookup failed;
        # the resulting NameError is swallowed here.
        if len(customer_reference) > 1:
            proj_id += ' (' + customer_reference + ')'
    except:
        pass
    if len(proj_id) > 30:
        print "Project ID + customer reference too long: ", proj_id
    tab.add_rows([["Project id:", proj_id], ["Date:", fc_date], ["Instrument ID:", instr_id], ["Flow cell ID:", fc_name], ["Uppnex project:", uppnex_proj], ["Delivery directory:", del_base + uppnex_proj + "/INBOX/" + proj_level_dir + "/" + simple_run_name]])
    d.update(infotable=tab.draw())

    ## Lane table
    tab = Texttable()
    tab.add_row(["Lane", "Sample(s)"])
    for l in proj_conf['lanes']:
        # NOTE(review): main_proj is computed but not used afterwards.
        main_proj = l['description'].split(',')[1].strip()
        samples = []
        if 'multiplex' in l:
            # Only list the samples that belong to this project.
            for mp in l['multiplex']:
                if 'sample_prj' in mp:
                    if mp['sample_prj'] == proj_conf['id']:
                        samples.append(mp['name'])
            tab.add_row([l['lane'], ", ".join(samples)])
        else:
            tab.add_row([l['lane'], "Non-multiplexed lane"])
    d.update(lanetable=tab.draw())

    # Per-read quality tables (one row per lane)
    tab_r1 = Texttable()
    tab_r2 = Texttable()
    tab_r1.set_cols_width([2, 12, 12, 12, 12, 12, 12, 30])
    tab_r2.set_cols_width([2, 12, 12, 12, 12, 12, 12, 30])
    tab_r1.add_row(["Lane", "Clu. dens. #/mm2", "% PF clusters", "Clu. PF #/mm2", "% phas/prephas", "% aln PhiX", "% error rate", "Comment"])
    tab_r2.add_row(["Lane", "Clu. dens. #/mm2", "% PF clusters", "Clu. PF #/mm2", "% phas/prephas", "% aln PhiX", "% error rate", "Comment"])

    # These should be moved to a cfg file. ( + perhaps provide an alternative for v1.5 FC )
    # NOTE(review): `options` is not defined in this function; presumably a
    # module-level object holding command line options -- confirm.
    if (options.v1_5_fc):
        min_clupf = 300
    else:
        min_clupf = 475
    # NOTE(review): max_phas and max_prephas are set but not checked below.
    max_phas = 0.4
    max_prephas = 1.0  # 0.5
    max_mean_err = 2

    statspath = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "Data", "reports", "Summary")
    stats = summ.getQCstats(statspath)

    # Check quality criteria and add comments
    comm_r1 = ''
    comm_r2 = ''
    # These flags accumulate over all lanes and drive the summary text.
    ok_r1 = True
    ok_r2 = True
    ok_cludens_r1 = True
    ok_cludens_r2 = True
    ok_err_rate = True
    ok_err_r1 = True
    ok_err_r2 = True

    for l in proj_conf['lanes']:

        # Cluster densities
        clu_dens_r1 = stats['raw_cluster_dens']['read1'][l['lane']]
        clu_dens_r2 = stats['raw_cluster_dens']['read2'][l['lane']]
        clu_dens_sd_r1 = stats['raw_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_sd_r2 = stats['raw_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_string_r1 = str(clu_dens_r1) + '+/-' + str(clu_dens_sd_r1)
        clu_dens_string_r2 = str(clu_dens_r2) + '+/-' + str(clu_dens_sd_r2)

        # Cluster PF densities
        clu_dens_pf_r1 = stats['pf_cluster_dens']['read1'][l['lane']]
        clu_dens_pf_r2 = stats['pf_cluster_dens']['read2'][l['lane']]
        clu_dens_pf_sd_r1 = stats['pf_cluster_dens_sd']['read1'][l['lane']]
        clu_dens_pf_sd_r2 = stats['pf_cluster_dens_sd']['read2'][l['lane']]
        clu_dens_pf_string_r1 = str(clu_dens_pf_r1) + '+/-' + str(clu_dens_pf_sd_r1)
        clu_dens_pf_string_r2 = str(clu_dens_pf_r2) + '+/-' + str(clu_dens_pf_sd_r2)

        # % PF clusters
        prc_pf_r1 = stats['prc_pf']['read1'][l['lane']]
        prc_pf_r2 = stats['prc_pf']['read2'][l['lane']]
        prc_pf_sd_r1 = stats['prc_pf_sd']['read1'][l['lane']]
        prc_pf_sd_r2 = stats['prc_pf_sd']['read2'][l['lane']]
        prc_pf_string_r1 = str(prc_pf_r1) + '+/-' + str(prc_pf_sd_r1)
        prc_pf_string_r2 = str(prc_pf_r2) + '+/-' + str(prc_pf_sd_r2)

        # % phasing and prephasing
        phas_r1 = stats['phasing']['read1'][l['lane']]
        phas_r2 = stats['phasing']['read2'][l['lane']]
        prephas_r1 = stats['prephasing']['read1'][l['lane']]
        prephas_r2 = stats['prephasing']['read2'][l['lane']]
        phas_string_r1 = str(phas_r1) + '/' + str(prephas_r1)
        phas_string_r2 = str(phas_r2) + '/' + str(prephas_r2)

        # % aligned
        aln_r1 = stats['prc_aligned']['read1'][l['lane']]
        aln_r2 = stats['prc_aligned']['read2'][l['lane']]
        aln_sd_r1 = stats['prc_aligned_sd']['read1'][l['lane']]
        aln_sd_r2 = stats['prc_aligned_sd']['read2'][l['lane']]
        aln_string_r1 = str(aln_r1) + '+/-' + str(aln_sd_r1)
        aln_string_r2 = str(aln_r2) + '+/-' + str(aln_sd_r2)

        # error rate
        err_r1 = stats['error_rate']['read1'][l['lane']]
        err_r2 = stats['error_rate']['read2'][l['lane']]
        err_sd_r1 = stats['error_rate_sd']['read1'][l['lane']]
        err_sd_r2 = stats['error_rate_sd']['read2'][l['lane']]
        err_str_r1 = str(err_r1) + '+/-' + str(err_sd_r1)
        err_str_r2 = str(err_r2) + '+/-' + str(err_sd_r2)

        # Per-lane comments start out empty for every lane.
        comm_r1 = ""
        comm_r2 = ""

        # check criteria
        # The last character of the PF density value is dropped before the
        # numeric comparison -- presumably a unit suffix; TODO confirm.
        if float(clu_dens_pf_r1[:-1]) < min_clupf:
            ok_r1 = False
            ok_cludens_r1 = False
            comm_r1 += "Low cluster density. "
        if float(clu_dens_pf_r2[:-1]) < min_clupf:
            ok_r2 = False
            ok_cludens_r2 = False
            comm_r2 += "Low cluster density. "
        avg_error_rate = (float(err_r1) + float(err_r2)) / 2
        if avg_error_rate > max_mean_err:
            ok_err_rate = False
        if float(err_r1) > max_mean_err:
            comm_r1 += "High error rate. "
            ok_err_r1 = False
        if float(err_r2) > max_mean_err:
            comm_r2 += "High error rate. "
            ok_err_r2 = False

        if comm_r1 == "":
            comm_r1 = "OK"
        if comm_r2 == "":
            comm_r2 = "OK"

        tab_r1.add_row([l['lane'], clu_dens_string_r1, prc_pf_string_r1, clu_dens_pf_string_r1, phas_string_r1, aln_string_r1, err_str_r1, comm_r1])
        tab_r2.add_row([l['lane'], clu_dens_string_r2, prc_pf_string_r2, clu_dens_pf_string_r2, phas_string_r2, aln_string_r2, err_str_r2, comm_r2])

    # Reinitialize comments for the summary. (Which will be for several lanes, potentially)
    comm_r1 = ""
    comm_r2 = ""

    if not ok_cludens_r1:
        comm_r1 += "Low cluster density. "
    if not ok_cludens_r2:
        comm_r2 += "Low cluster density. "
    # A read only fails on error rate when the averaged error rate of some
    # lane was too high as well.
    if not ok_err_rate:
        if not ok_err_r1:
            ok_r1 = False
            comm_r1 += "High error rate. "
        if not ok_err_r2:
            ok_r2 = False
            comm_r2 += "High error rate. "

    if (ok_r1 and ok_r2):
        comm_r1 = comm_r2 = "OK"
        d.update(summary = "Successful run in terms of error rate. ")
    else:
        if (ok_r1):
            comm_r1 = "OK"
            d.update(summary="Read 2 did not pass quality criteria: " + comm_r2)
        elif (ok_r2):
            comm_r2 = "OK"
            d.update(summary="Read 1 did not pass quality criteria: " + comm_r1)
        else:
            d.update(summary="Did not pass quality criteria. Read 1: " + comm_r1 + " Read 2: " + comm_r2)

    d.update(read1table=tab_r1.draw())
    d.update(read2table=tab_r2.draw())

    ## qcplots (per-lane QScore images)
    byCycleDir = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "Data", "reports", "ByCycle")
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "QScore_L%s.png" % (l['lane']))), width="100%"))
    d.update(qcplots="\n".join(res))

    ## qc30plots (per-lane NumGT30 images)
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "NumGT30_L%s.png" % (l['lane']))), width="100%"))
    d.update(qc30plots="\n".join(res))

    ## error rate plots (per-lane ErrRate images)
    res = []
    for l in proj_conf['lanes']:
        res.append(m2r.image(os.path.relpath(os.path.join(byCycleDir, "ErrRate_L%s.png" % (l['lane']))), width="100%"))
    d.update(errorrate="\n".join(res))

    ## Sequence yield table
    target_yield_per_lane = 143000000.0
    if (options.v1_5_fc):
        target_yield_per_lane = 60000000.0
    tab = Texttable()
    tab.add_row(['Lane', 'Sample', 'Number of sequences', 'Million sequences ordered', 'Comment'])

    run_info_yaml = os.path.join(proj_conf['archive_dir'], proj_conf['flowcell'], "run_info.yaml")

    if not os.path.exists(run_info_yaml):
        print("WARNING: could not find required run_info.yaml configuration file at '%s'" % run_info_yaml)
        # Callers get None instead of the field dict in this case.
        return

    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects -- only feed it trusted run_info files.
    with open(run_info_yaml) as in_handle:
        run_info = yaml.load(in_handle)

    fc_name, fc_date = get_flowcell_info(proj_conf['flowcell'])
    low_yield = False

    bc_multiplier = 0.75  # Should move to cfg file

    ok_samples = []
    low_samples = []

    for l in proj_conf['lanes']:
        # The metrics file is expected in the lane's "nophix_barcode"
        # directory, named either "*.bc_metrics" or "*_bc.metrics".
        bc_file_name_prefix = os.path.join(proj_conf['analysis_dir'], proj_conf['flowcell'], '_'.join([l['lane'], fc_date, fc_name, "nophix_barcode"]), '_'.join([l['lane'], fc_date, fc_name, "nophix"]))
        bc_file = bc_file_name_prefix + ".bc_metrics"
        if not os.path.exists(bc_file):
            bc_file = bc_file_name_prefix + "_bc.metrics"
        try:
            # NOTE(review): the opened file is never closed explicitly.
            bc_file = open(bc_file)
        except:
            sys.exit("Could not find bc metrics file " + bc_file)
        # Map first column -> "<count> (~<count in millions> million)"
        bc_count = {}
        for line in bc_file:
            c = line.strip().split()
            bc_count[c[0]]=c[1] + ' (~' + str (int ( round (float(c[1])/1000000) ) ) + " million)"
        # One of the entries is not counted as a sample.
        no_samples = len(bc_count) - 1
        if no_samples == 0:
            print("WARNING: did not find a BC metrics file... Skipping lane %s for %s" % (l['lane'], proj_conf['id']))
            continue

        target_yield_per_sample = ''
        try:
            # Use the ordered amount (in millions) when it is numeric ...
            min_reads_per_sample = round(float(str(min_reads_per_sample)))
            target_yield_per_sample = min_reads_per_sample * 1000000
        except ValueError:
            # ... otherwise derive a target from the expected lane yield.
            min_reads_per_sample = ''
            target_yield_per_sample = bc_multiplier * target_yield_per_lane / no_samples

        sample_name = {}
        is_multiplexed = True
        is_rerun = False
        # Check here for each sample if it belongs to the project
        for entry in run_info:
            if entry['lane'] == l['lane']:
                projs = set()
                if 'multiplex' in entry:
                    for sample in entry['multiplex']:
                        if 'sample_prj' in sample:
                            projs.add(sample['sample_prj'])
                            if sample['sample_prj'].strip() == proj_conf['id']:
                                sample_name[sample['barcode_id']] = sample['name']
                else:
                    is_multiplexed = False
                # More than one project on the lane marks it as a rerun lane.
                if len(projs) > 1:
                    is_rerun = True
        # Counts of this project's samples, keyed by sample name.  Only
        # numeric barcode ids are considered.
        samp_count = {}

        for k in bc_count.keys():
            if not k.isdigit():
                pass
            else:
                if int(k) in sample_name:
                    samp_count[sample_name[int(k)]] = bc_count[k]

        print "DEBUG: Target yield per sample = ", target_yield_per_sample
        print "DEBUG: Min reads per sample = ", min_reads_per_sample
        print "DEBUG: No samples: ", no_samples

        for k in sorted(samp_count.keys()):
            comment = ''
            # The raw count is the part of the entry before the '('.
            if int(samp_count[k].split('(')[0]) < target_yield_per_sample:
                comment = 'Low. '
                low_yield = True
                low_samples.append(k)
            else:
                ok_samples.append(k)
            if is_rerun:
                comment += '(rerun lane)'
            tab.add_row([l['lane'], k, samp_count[k], min_reads_per_sample, comment])

        if is_multiplexed:
            comment = ''
            try:
                if int(bc_count['unmatched'].split('(')[0]) > target_yield_per_sample:
                    comment = 'High.'
                if is_rerun:
                    comment += '(rerun lane)'
                tab.add_row([l['lane'], 'unmatched', bc_count['unmatched'], min_reads_per_sample, comment])
            except:
                # E.g. no 'unmatched' entry in the metrics file.
                print('WARNING: insufficient or no barcode metrics for lane')
        else:
            comment = ''
            for k in bc_count.keys():
                if int(bc_count[k].split('(')[0]) < bc_multiplier * target_yield_per_lane:
                    comment = 'Low.'
                tab.add_row([l['lane'], "Non-multiplexed lane", bc_count[k], min_reads_per_sample, comment])

    # NOTE(review): delivery_type is assigned but not used afterwards.
    delivery_type = "Final delivery. "
    if low_yield:
        delivery_type = "Partial delivery. "
        fail_comm = "Samples " + ", ".join(low_samples) + " yielded fewer sequences than expected. These will be re-run unless this was already a re-run and the total yield is now sufficient. "
    else:
        fail_comm = ""

    if low_yield:
        if len(ok_samples) > 0:
            ok_comm = "Samples " + ", ".join(ok_samples) + " yielded the expected number of sequences or more. "
        else:
            ok_comm = ""
    else:
        ok_comm = "All samples yielded the expected number of sequences or more. "

    # The yield remarks are appended to the quality summary set above.
    comm = d['summary'] + fail_comm + ok_comm
    d.update(summary=comm)

    d.update(yieldtable=tab.draw())
    return d
def main(flowcell_id, qual_scale, archive_dir, analysis_dir, config_file):
    """Write delivery note sources for every project on a flowcell.

    Projects are collected from the lane descriptions and the ``sample_prj``
    entries of the run's ``run_info.yaml``.  For each project a ``.mako``
    template and the rendered ``.rst`` report are written to the current
    directory.  Finally ``conf.py`` in the current directory, if present, is
    extended with the new latex documents, the preamble and the logo.

    Exits via ``sys.exit`` when ``qual_scale`` is neither ``phred64`` nor
    ``phred33``.  Raises ValueError when a lane description is not of the
    form ``"<name>, <project>"`` or when ``conf.py`` lacks one of the marker
    lines that are searched for.
    """
    if qual_scale not in ["phred64", "phred33"]:
        sys.exit("You must provide either 'phred64' or 'phred33' as the quality scale! Exiting ...")

    fp = os.path.join(archive_dir, flowcell_id, "run_info.yaml")
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # objects -- only feed it trusted run_info files.
    with open(fp) as in_handle:
        run_info = yaml.load(in_handle)

    if config_file:
        config = load_config(config_file)
    else:
        config = {}

    # Map project id -> list of lane entries the project appears on.
    project_ids = dict()
    for lane in run_info:
        _, proj_id = [x.strip() for x in lane['description'].split(",")]
        if proj_id in project_ids:
            if lane not in project_ids[proj_id]:
                project_ids[proj_id].append(lane)
        else:
            project_ids[proj_id] = [lane]
        # Check here if project is a "sub project" of the lane
        if 'multiplex' not in lane:
            continue
        for s in lane['multiplex']:
            if 'sample_prj' in s:
                if s['sample_prj'] in project_ids:
                    if lane not in project_ids[s['sample_prj']]:
                        project_ids[s['sample_prj']].append(lane)
                else:
                    project_ids[s['sample_prj']] = [lane]

    sphinx_defs = []
    for k in project_ids.keys():
        lanes = [x['lane'] for x in project_ids[k]]
        # Resolve the flowcell info once per project instead of twice; the
        # tag is "<project>_<date><first character of the flowcell name>".
        fc_name, fc_date = get_flowcell_info(flowcell_id)
        proj_file_tag = k + "_" + fc_date + fc_name[0]
        print("INFO: saw project %s in lanes %s" % (k, ", ".join(lanes)))
        sphinx_defs.append("('%s', '%s_delivery.tex', 'Raw data delivery note', u'SciLifeLab Stockholm', 'howto'),\n" % (proj_file_tag, proj_file_tag))

        projectfile = "%s.mako" % (proj_file_tag)
        # Context managers close the handles even when a write fails.
        with open(projectfile, "w") as out_handle:
            out_handle.write(TEMPLATE)
        mylookup = TemplateLookup(directories=['./'])
        tmpl = Template(filename=projectfile, lookup=mylookup)

        proj_conf = {
            'id': k,
            'lanes': project_ids[k],
            'archive_dir': archive_dir,
            'analysis_dir': analysis_dir,
            'flowcell': flowcell_id,
            'config': config,
            'qual_scale': qual_scale,
        }
        d = generate_report(proj_conf)

        rstfile = "%s.rst" % (proj_file_tag)
        with open(rstfile, "w") as out_handle:
            out_handle.write(tmpl.render(**d))

    sphinxconf = os.path.join(os.getcwd(), "conf.py")
    if not os.path.exists(sphinxconf):
        print("WARNING: no sphinx configuration file conf.py found: you have to edit conf.py yourself!")
    else:
        with open(sphinxconf) as in_handle:
            lines = in_handle.readlines()

        # Only add document definitions that are not in conf.py already.
        sdout = [sd for sd in sphinx_defs if sd not in lines]
        if sdout:
            # New entries go three lines below the list opener.
            i = lines.index("latex_documents = [\n")
            newconf = lines[:i+3] + sdout + lines[i+3:]
            ## Change the preamble
            i = newconf.index("#'preamble': '',\n")
            newconf = newconf[:i+1] + _latex_preamble() + newconf[i+1:]
            ## Set the logo
            i = newconf.index("#latex_logo = None\n")
            newconf = newconf[:i+1] + _latex_logo() + newconf[i+1:]
            with open("conf.py", "w") as out_handle:
                out_handle.write("".join(newconf))