def run_folder_for_run_id(runid_and_flowcellid, site=None, basedir_map=SEQDIR_BASE):
    """Return the raw sequencing run folder (AKA $RAWSEQDIR) for a run.

    The run id has to contain the flowcell id, e.g.
    'HS004-PE-R00139_BC6A7HANXX', which for a base directory of
    /mnt/seq/userrig gives
    "/mnt/seq/userrig/HS004/HS004-PE-R00139_BC6A7HANXX".
    Machine ids starting with 'MS00' get an extra "MiSeqOutput" path
    component between the machine id and the run folder name.

    Args:
        runid_and_flowcellid: run id including the flowcell id.
        site: key into basedir_map; when falsy, get_site() is used.
        basedir_map: mapping of site to base directory.

    Returns:
        The run folder path as a string.

    Raises:
        ValueError: if the site is not a key of basedir_map.
    """
    site = site or get_site()
    if site not in basedir_map:
        raise ValueError(site)
    seq_root = basedir_map[site]

    machineid, runid, flowcellid = get_machine_run_flowcell_id(
        runid_and_flowcellid)

    if machineid.startswith('MS00'):
        # FIXME untested and unclear for NSCC
        template = "{}/{}/MiSeqOutput/{}_{}"
    else:
        template = "{}/{}/{}_{}"
    return template.format(seq_root, machineid, runid, flowcellid)
def run_folder_for_run_id(runid_and_flowcellid):
    """Return the raw sequencing run folder (AKA $RAWSEQDIR) for a run.

    The run id has to contain the flowcell id, e.g.
    'HS004-PE-R00139_BC6A7HANXX', which for a base directory of
    /mnt/seq/userrig gives
    "/mnt/seq/userrig/HS004/HS004-PE-R00139_BC6A7HANXX".

    The base directory is read from site_cfg['bcl2fastq_seqdir_base'].
    Machine ids starting with 'MS00' get an extra "MiSeqOutput" path
    component; for machine ids starting with 'NG0' the substring
    "userrig" in the base directory is replaced by "novogene".

    Args:
        runid_and_flowcellid: run id including the flowcell id.

    Returns:
        The run folder path as a string.
    """
    seq_root = site_cfg['bcl2fastq_seqdir_base']
    machineid, runid, flowcellid = get_machine_run_flowcell_id(
        runid_and_flowcellid)

    if machineid.startswith('MS00'):
        # FIXME needs proper cfg handling
        # FIXME untested and unclear for NSCC
        return "{}/{}/MiSeqOutput/{}_{}".format(
            seq_root, machineid, runid, flowcellid)

    if machineid.startswith('NG0'):
        # FIXME needs proper cfg handling
        seq_root = seq_root.replace("userrig", "novogene")
    return "{}/{}/{}_{}".format(seq_root, machineid, runid, flowcellid)
def runs_from_db(connection, testing, win=14):
    """Yield, per run, the MUXes that still await downstream submission.

    Queries connection.gisds.runcomplete for runs whose name starts with
    'NG00' and whose timestamp lies inside the window returned by
    generate_window(win). For every such run that has analysis records,
    the run details are fetched from the ELM REST service and reduced to
    a library list via get_sg10_lib_list(). A MUX is reported when

    * its per-MUX status is "SUCCESS",
    * its 'DownstreamSubmission' is "TODO",
    * the analysis output directory exists, and
    * the MUX id is part of the library list.

    A MUX that qualifies in more than one analysis of the same run is
    dropped for that run and stays dropped, no matter how many further
    qualifying analyses follow (previously a third occurrence re-added it).

    Args:
        connection: MongoDB connection exposing gisds.runcomplete.
        testing: use the 'testing' REST URL instead of 'production'.
        win: window size handed to generate_window().

    Yields:
        dict mapping mux_id to a tuple
        (run_number, downstream_id, analysis_id, out_dir); only non-empty
        dicts are yielded, one per run.

    Raises:
        requests.HTTPError: via raise_for_status() on an error response.
    """
    db = connection.gisds.runcomplete
    epoch_present, epoch_back = generate_window(win)
    results = db.find({"run": {"$regex": "^NG00"},
                       "timestamp": {"$gt": epoch_back, "$lt": epoch_present}})
    logger.info("Found %d runs", results.count())

    server = 'testing' if testing else 'production'
    for record in results:
        run_number = record['run']
        logger.debug("record: %s", record)
        if not record.get('analysis'):
            continue

        # Check if Novogene run_mode
        _, run_id, _ = get_machine_run_flowcell_id(run_number)
        rest_url = rest_services['run_details'][server].replace("run_num", run_id)
        response = requests.get(rest_url)
        if response.status_code != requests.codes.ok:
            response.raise_for_status()
        rest_data = response.json()
        sg10k_lib_list = get_sg10_lib_list(rest_data)

        run_records = {}
        # MUXes seen more than once; they must not be re-added later on
        duplicate_muxes = set()
        for analysis_count, analysis in enumerate(record['analysis']):
            analysis_id = analysis['analysis_id']
            per_mux_status = analysis.get("per_mux_status", None)
            if per_mux_status is None:
                continue

            for mux_count, mux_status in enumerate(per_mux_status):
                # sanity checks against corrupted DB entries
                if mux_status is None or mux_status.get('mux_id') is None:
                    logger.warning(
                        "mux_status is None or incomplete for run %s analysis %s."
                        " Requires fix in DB. Skipping entry for now.",
                        run_number, analysis_id)
                    continue
                if mux_status.get('Status', None) != "SUCCESS":
                    continue

                mux_id = mux_status['mux_id']
                out_dir = analysis['out_dir']
                if not os.path.exists(out_dir):
                    logger.warning("Direcotry does not exists %s", out_dir)
                    continue
                if mux_status.get('DownstreamSubmission') != "TODO":
                    continue

                downstream_id = "analysis.{}.per_mux_status.{}.DownstreamSubmission".format(
                    analysis_count, mux_count)
                if mux_id in run_records or mux_id in duplicate_muxes:
                    logger.info(
                        "MUX %s from %s has been analyzed more than 1 time"
                        " successfully, please check", mux_id, run_number)
                    run_records.pop(mux_id, None)
                    duplicate_muxes.add(mux_id)
                elif mux_id in sg10k_lib_list:
                    run_records[mux_id] = (
                        run_number, downstream_id, analysis_id, out_dir)

        if run_records:
            yield run_records
def get_sample_info(child, rows, mux_analysis_list, mux_id, fastq_data_dir,
                    run_num_flowcell, sample_info):
    """Collect the sample configuration for one library from ELM JSON.

    Builds a per-sample configuration (requestor, creation time, site,
    pipeline name/version, references, optional command line and
    readunits) and stores it in sample_info under the library id. If the
    sample is already present with readunits, only its readunits are
    extended. A sample is registered only when check_fastq() reports
    FASTQ files for it.

    Args:
        child: ELM entry of the library (for non-multiplexed lanes the
            caller may pass the lane entry itself).
        rows: ELM entry of the lane the library belongs to.
        mux_analysis_list: set of MUX ids; mux_id is added to it.
        mux_id: id of the MUX being processed.
        fastq_data_dir: directory searched for FASTQ files.
        run_num_flowcell: run id including flowcell id.
        sample_info: dict of already collected samples; updated in place.

    Returns:
        sample_info (the same object that was passed in). It is returned
        unchanged when the analysis has no entry in the legacy pipeline
        mapper, when no reference info is available, or when no FASTQ
        files were found.
    """
    site = get_site()
    ctime, _ = generate_window(1)
    _, _, flowcellid = get_machine_run_flowcell_id(run_num_flowcell)
    mux_analysis_list.add(mux_id)
    sample_id = child['libraryId']
    requestor = rows['requestor']

    try:
        pipeline_name = legacy_mapper['pipeline_mapper'][child['Analysis']]
    except KeyError as e:
        logger.warning("%s Pipeline not mappped to newer version", e)
        return sample_info

    # NOTE(review): presence of the version is decided on the lane entry
    # (rows) but the value is taken from the library entry (child). This is
    # kept as is; .get() merely avoids a KeyError when the library entry
    # lacks the key. Confirm which entry is supposed to carry the version.
    if 'pipeline_version' in rows:
        requested_version = child.get('pipeline_version')
    else:
        requested_version = None
    pipeline_version = get_pipeline_version(requested_version)

    ref_info = get_reference_info(child['Analysis'], pipeline_version,
                                  child['genome'])
    if not ref_info:
        logger.info("ref_info not available")
        return sample_info

    sample_cfg = {
        'requestor': requestor,
        'ctime': ctime,
        'site': site,
        'pipeline_name': pipeline_name,
        'pipeline_version': pipeline_version,
        'references_cfg': ref_info,
    }
    cmdline_info = get_cmdline_info(child)
    if cmdline_info:
        sample_cfg['cmdline'] = cmdline_info

    status, fq1, fq2 = check_fastq(fastq_data_dir, child['libraryId'],
                                   rows['laneId'])
    if status:
        ru = ReadUnit(run_num_flowcell, flowcellid, child['libraryId'],
                      rows['laneId'], None, fq1, fq2)
        readunits_dict = {key_for_readunit(ru): dict(ru._asdict())}
        sample_cfg['readunits'] = readunits_dict
        if sample_info.get(sample_id, {}).get('readunits', {}):
            # same library seen before (e.g. on another lane): just add
            # the new readunit to the existing sample
            sample_info[sample_id]['readunits'].update(readunits_dict)
        else:
            sample_info[sample_id] = sample_cfg
    return sample_info
def runs_from_db(db, mail_to, ccaddr, win=34):
    """Mail a list of recent runs for which ELM holds no run information.

    Looks up all records in db without an 'analysis' field whose
    timestamp lies inside the window returned by generate_window(win).
    For each of them the REST data is fetched with get_rest_data(); runs
    whose REST data has no 'runId' are listed in a mail sent through
    send_mail(). No mail is sent if there is no such run.

    Args:
        db: collection object providing find().
        mail_to: recipient, passed to send_mail() as toaddr.
        ccaddr: cc recipient, passed to send_mail() as ccaddr.
        win: window size handed to generate_window().
    """
    epoch_present, epoch_back = generate_window(win)
    query = {
        "analysis": {"$exists": False},
        "timestamp": {"$gt": epoch_back, "$lt": epoch_present},
    }
    results = db.find(query)
    logger.info("Found %d runs for last %s days", results.count(), win)

    subject = "Runs with missing ELM information"
    header = "Dear NGSP, " + "\n"
    header += subject + " for the following runs. Please include in the ELM." + "\n"

    missing_lines = []
    for record in results:
        logger.debug("record: %s", record)
        _, runid, _ = get_machine_run_flowcell_id(record.get('run'))
        if get_rest_data(runid).get('runId'):
            continue
        missing_lines.append(record.get('run') + "\n")

    if missing_lines:
        send_mail(subject, header + "".join(missing_lines),
                  toaddr=mail_to, ccaddr=ccaddr)
def get_bcl2fastq_outdir(runid_and_flowcellid):
    """Return the directory bcl2fastq output should be written to.

    The base directory comes from site_cfg['bcl2fastq_outdir_base'],
    using its 'devel' entry if is_devel_version() is true and its
    'production' entry otherwise. The result has the form
    <basedir>/<machineid>/<runid>_<flowcellid>/bcl2fastq_<timestamp>,
    where the timestamp is taken from generate_timestamp().

    Args:
        runid_and_flowcellid: run id including the flowcell id.

    Returns:
        The output directory path as a string.
    """
    flavour = 'devel' if is_devel_version() else 'production'
    out_root = site_cfg['bcl2fastq_outdir_base'][flavour]

    machineid, runid, flowcellid = get_machine_run_flowcell_id(
        runid_and_flowcellid)

    return "{basedir}/{mid}/{rid}_{fid}/bcl2fastq_{ts}".format(
        basedir=out_root,
        mid=machineid,
        rid=runid,
        fid=flowcellid,
        ts=generate_timestamp())
def get_bcl2fastq_outdir(runid_and_flowcellid, site=None, basedir_map=OUTDIR_BASE):
    """Return the directory bcl2fastq output should be written to.

    The base directory is basedir_map[site]['devel'] if
    is_devel_version() is true and basedir_map[site]['production']
    otherwise. The result has the form
    <basedir>/<machineid>/<runid>_<flowcellid>/bcl2fastq_<timestamp>,
    where the timestamp is taken from generate_timestamp().

    Args:
        runid_and_flowcellid: run id including the flowcell id.
        site: key into basedir_map; when falsy, get_site() is used.
        basedir_map: mapping of site to a dict with 'devel' and
            'production' base directories.

    Returns:
        The output directory path as a string.

    Raises:
        ValueError: if the site is not a key of basedir_map.
    """
    site = site or get_site()
    if site not in basedir_map:
        raise ValueError(site)

    flavour = 'devel' if is_devel_version() else 'production'
    out_root = basedir_map[site][flavour]

    machineid, runid, flowcellid = get_machine_run_flowcell_id(
        runid_and_flowcellid)

    return "{basedir}/{mid}/{rid}_{fid}/bcl2fastq_{ts}".format(
        basedir=out_root,
        mid=machineid,
        rid=runid,
        fid=flowcellid,
        ts=generate_timestamp())
def get_lib_details(run_num_flowcell, mux_list, testing):
    """Collect library info from ELM for all given MUXes of one run.

    Fetches the run details from the ELM REST service and, for every MUX
    whose FASTQ directory (<out_dir[0]>/out/Project_<mux_id>) exists,
    passes each matching library that is not marked "Sequence only" to
    get_sample_info().

    Args:
        run_num_flowcell: run id including flowcell id.
        mux_list: iterable of (mux_id, out_dir) pairs; out_dir[0] is used
            as the analysis output directory.
        testing: use the 'testing' REST URL instead of 'production'.

    Returns:
        A tuple (sample_info, mux_analysis_list): a dict of sample
        configurations and the set of MUX ids handed to
        get_sample_info(). Both are empty if the REST data carries no
        'runId'. (That case used to return the bare dict, which broke
        callers unpacking two values.)

    Raises:
        requests.HTTPError: via raise_for_status() on an error response.
    """
    _, run_num, _ = get_machine_run_flowcell_id(run_num_flowcell)

    # Call rest service to get component libraries
    if testing:
        rest_url = rest_services['run_details']['testing'].replace(
            "run_num", run_num)
        logger.info("development server")
    else:
        rest_url = rest_services['run_details']['production'].replace(
            "run_num", run_num)
        logger.info("production server")
    response = requests.get(rest_url)
    if response.status_code != requests.codes.ok:
        response.raise_for_status()
    rest_data = response.json()
    logger.debug("rest_data from %s: %s", rest_url, rest_data)

    sample_info = {}
    mux_analysis_list = set()
    if rest_data.get('runId') is None:
        logger.info("JSON data is empty for run num %s", run_num)
        # same shape as the regular return value
        return sample_info, mux_analysis_list

    for mux_id, out_dir in mux_list:
        fastq_data_dir = os.path.join(out_dir[0], 'out', "Project_" + mux_id)
        if not os.path.exists(fastq_data_dir):
            continue
        for rows in rest_data['lanes']:
            if mux_id not in rows['libraryId']:
                continue
            logger.info("Checking the pipeline params for %s from run number %s",
                        rows['libraryId'], run_num)
            if "MUX" in rows['libraryId']:
                # multiplexed lane: every child library is a sample
                for child in rows['Children']:
                    if child['Analysis'] != "Sequence only":
                        sample_info = get_sample_info(
                            child, rows, mux_analysis_list, mux_id,
                            fastq_data_dir, run_num_flowcell, sample_info)
            elif rows['Analysis'] != "Sequence only":
                # non-multiplexed lane: the lane entry itself is the sample
                sample_info = get_sample_info(
                    rows, rows, mux_analysis_list, mux_id,
                    fastq_data_dir, run_num_flowcell, sample_info)
    return sample_info, mux_analysis_list
def get_mux_details(run_number, mux_id, fastq_dest):
    """Gather samples and readunits (FASTQ details) for one MUX.

    Every directory matching <fastq_dest>/*<mux_id>/Sample_* is passed to
    readunits_for_sampledir(). Each returned readunit is tagged with the
    run id and flowcell id derived from run_number, since those cannot be
    inferred from the file names.

    Args:
        run_number: run id including flowcell id.
        mux_id: id of the MUX.
        fastq_dest: directory holding the per-MUX output directories.

    Returns:
        dict with two keys: 'samples' maps each library id to the list of
        its readunit keys, 'readunits' maps each readunit key to the
        readunit.

    Raises:
        AssertionError: if a sample directory does not yield exactly one
            library id, or if a library id or readunit key occurs twice.
    """
    pattern = os.path.join(fastq_dest, "*"+ mux_id, 'Sample_*')
    sample_dirs = glob.glob(pattern)
    _, run_id, flowcell_id = get_machine_run_flowcell_id(run_number)

    all_readunits = {}
    samples = {}
    for sample_dir in sample_dirs:
        units = readunits_for_sampledir(sample_dir)

        # insert run id and flowcell id which can't be inferred from filename
        for unit in units.values():
            unit['run_id'] = run_id
            unit['flowcell_id'] = flowcell_id

        library_ids = [unit['library_id'] for unit in units.values()]
        assert len(set(library_ids)) == 1
        library_id = library_ids[0]

        assert library_id not in samples
        samples[library_id] = list(units.keys())

        for unit_key, unit in units.items():
            assert unit_key not in all_readunits
            all_readunits[unit_key] = unit

    return {'samples': samples, 'readunits': all_readunits}
def get_lib_details(run_num_flowcell, mux_list, testing):
    """Collect library info from ELM for all given MUXes of one run.

    Fetches the run details from the ELM REST service and, for every MUX
    whose FASTQ directory (<out_dir[0]>/out/Project_<mux_id>) exists,
    builds a sample entry for each matching library that is not marked
    "Sequence only". A sample is registered only when check_fastq()
    reports FASTQ files for it.

    Libraries of multiplexed lanes get a full entry (requestor, ctime,
    pipeline name/version/params, site, out_dir, readunits); libraries of
    non-multiplexed lanes only get their readunits.

    Args:
        run_num_flowcell: run id including flowcell id.
        mux_list: iterable of (mux_id, out_dir) pairs; out_dir[0] is used
            as the analysis output directory.
        testing: use the 'testing' REST URL instead of 'production'.

    Returns:
        dict mapping library id to its sample entry; empty if the REST
        data carries no 'runId'.

    Raises:
        requests.HTTPError: via raise_for_status() on an error response.
    """
    _, run_num, flowcellid = get_machine_run_flowcell_id(run_num_flowcell)

    # Call rest service to get component libraries
    if testing:
        # was a stray print() to stdout
        logger.debug("run_num: %s", run_num)
        rest_url = rest_services['run_details']['testing'].replace("run_num", run_num)
        logger.info("development server")
    else:
        rest_url = rest_services['run_details']['production'].replace("run_num", run_num)
        logger.info("production server")
    response = requests.get(rest_url)
    if response.status_code != requests.codes.ok:
        response.raise_for_status()
    rest_data = response.json()
    logger.debug("rest_data from %s: %s", rest_url, rest_data)

    sample_info = {}
    if rest_data.get('runId') is None:
        logger.info("JSON data is empty for run num %s", run_num)
        return sample_info

    # the loop variable is named mux_out_dir so that the downstream output
    # directory computed below cannot clobber it
    for mux_id, mux_out_dir in mux_list:
        fastq_data_dir = os.path.join(mux_out_dir[0], 'out', "Project_"+mux_id)
        if not os.path.exists(fastq_data_dir):
            continue
        for rows in rest_data['lanes']:
            if mux_id not in rows['libraryId']:
                continue

            if "MUX" in rows['libraryId']:
                for child in rows['Children']:
                    if child['Analysis'] == "Sequence only":
                        continue
                    ctime, _ = generate_window(1)
                    sample = child['libraryId']
                    sample_dict = {}
                    sample_dict['requestor'] = rows['requestor']
                    sample_dict['ctime'] = ctime
                    sample_dict['pipeline_name'] = child['Analysis']
                    # NOTE(review): presence of the version is decided on
                    # the lane entry (rows) but the value is taken from
                    # the library entry (child). Kept as is; .get() merely
                    # avoids a KeyError if the library entry lacks the key.
                    if 'pipeline_version' in rows:
                        sample_dict['pipeline_version'] = child.get('pipeline_version')
                    else:
                        sample_dict['pipeline_version'] = None
                    sample_dict['pipeline_params'] = 'params'
                    sample_dict['site'] = get_site()
                    sample_dict['out_dir'] = get_downstream_outdir(
                        sample_dict['requestor'],
                        sample_dict['pipeline_version'],
                        sample_dict['pipeline_name'])

                    status, fq1, fq2 = check_fastq(
                        fastq_data_dir, child['libraryId'], rows['laneId'])
                    if not status:
                        continue
                    ru = ReadUnit(run_num_flowcell, flowcellid, child['libraryId'],
                                  rows['laneId'], None, fq1, fq2)
                    readunits_dict = {key_for_read_unit(ru): dict(ru._asdict())}
                    sample_dict['readunits'] = readunits_dict
                    if sample_info.get(sample, {}).get('readunits'):
                        # library already seen (e.g. on another lane):
                        # only add the new readunit
                        sample_info[sample]['readunits'].update(readunits_dict)
                    else:
                        sample_info[sample] = sample_dict

            elif rows['Analysis'] != "Sequence only":
                sample = rows['libraryId']
                status, fq1, fq2 = check_fastq(
                    fastq_data_dir, rows['libraryId'], rows['laneId'])
                if status:
                    ru = ReadUnit(run_num_flowcell, flowcellid, rows['libraryId'],
                                  rows['laneId'], None, fq1, fq2)
                    readunits_dict = {key_for_read_unit(ru): dict(ru._asdict())}
                    sample_info[sample] = {'readunits': readunits_dict}
    return sample_info
def main():
    """Write the bcl2fastq samplesheet, use-bases and MUX info files.

    Parses the command line, queries the ELM REST service for the run
    found in the run directory and writes three files into the output
    directory: the master samplesheet (SAMPLESHEET_CSV), the use-bases
    configuration (USEBASES_CFG) and the MUX info (MUXINFO_CFG).

    Exits with status 1 if an output file exists and --force-overwrite
    was not given. Exits with status 0 without writing anything if the
    run did not pass; exit 0 plus missing output files is the upstream
    signal for a failed run.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--force-overwrite", action="store_true",
                        help="Force overwriting of output files")
    parser.add_argument("-r", "--rundir", dest="rundir", required=True,
                        help="rundir, e.g. /mnt/seq/userrig/HS004/HS004-PE-R00139_BC6A7HANXX")
    parser.add_argument('-t', "--test_server", action='store_true')
    parser.add_argument("-o", "--outdir", required=True, dest="outdir",
                        help="Output directory")
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")
    args = parser.parse_args()

    # Repeateable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10*args.quiet - 10*args.verbose)

    (rundir, outdir, runinfo) = getdirs(args)
    samplesheet_csv = os.path.join(outdir, SAMPLESHEET_CSV)
    usebases_cfg = os.path.join(outdir, USEBASES_CFG)
    muxinfo_cfg = os.path.join(outdir, MUXINFO_CFG)
    for f in [samplesheet_csv, usebases_cfg, muxinfo_cfg]:
        if not args.force_overwrite and os.path.exists(f):
            logger.fatal("Refusing to overwrite existing file %s", f)
            sys.exit(1)

    _, run_num, flowcellid = get_machine_run_flowcell_id(rundir)
    logger.info("Querying ELM for %s", run_num)
    if args.test_server:
        rest_url = rest_services['run_details']['testing'].replace("run_num", run_num)
        logger.info("development server")
    else:
        rest_url = rest_services['run_details']['production'].replace("run_num", run_num)
        logger.info("production server")
    response = requests.get(rest_url)
    if response.status_code != requests.codes.ok:
        response.raise_for_status()
    rest_data = response.json()
    logger.debug("rest_data from %s: %s", rest_url, rest_data)

    run_id = rest_data['runId']
    if rest_data['runPass'] != 'Pass':
        logger.warning("Skipping non-passed run")
        # NOTE: exit 0 and missing output files is the upstream signal for a failed run
        sys.exit(0)

    # this is the master samplesheet
    logger.info("Writing to %s", samplesheet_csv)
    # keys: lanes, values are barcode lens in lane (always two tuples, -1 if not present)
    barcode_lens = {}
    mux_units = dict()
    with open(samplesheet_csv, 'w') as fh_out:
        fh_out.write(SAMPLESHEET_HEADER + '\n')
        for rows in rest_data['lanes']:
            if rows['lanePass'] != 'Pass':
                continue
            lane_id = rows['laneId']
            mux_id = rows['libraryId']
            project = 'Project_' + mux_id
            requestor = rows['requestor'] if 'requestor' in rows else None
            bcl_mismatches = []

            if "MUX" in mux_id:
                # multiplexed
                for child in rows['Children']:
                    # older samples have no values and that's okay
                    if 'BCL_Mismatch' in child:
                        bcl_mismatches.append(child['BCL_Mismatch'])
                    lib_id = child['libraryId']
                    barcode = child['barcode']
                    if "-" in barcode:
                        # dual index
                        index = barcode.split('-')
                        index1, index2 = index[0], index[1]
                        index_lens = (len(index1), len(index2))
                    else:
                        index1, index2 = barcode, ''
                        index_lens = (len(barcode), -1)
                    sample = ','.join([
                        lane_id, 'Sample_' + lib_id, lib_id + '-' + barcode,
                        '', '', '', index1, '', index2, project,
                        child['libtech']])
                    barcode_lens.setdefault(lane_id, []).append(index_lens)
                    fh_out.write(sample + '\n')
            else:
                # non-multiplexed
                sample = ','.join([
                    lane_id, 'Sample_' + mux_id, mux_id + '-NoIndex',
                    '', '', '', '', '', '', project, rows['libtech']])
                index_lens = (-1, -1)
                barcode_lens.setdefault(lane_id, []).append(index_lens)
                fh_out.write(sample + '\n')

            # Barcode mismatch has to be the same for all the libraries in one MUX.
            # Otherwise default mismatch value to be used
            if len(set(bcl_mismatches)) == 1:
                barcode_mismatches = bcl_mismatches[0]
            else:
                barcode_mismatches = DEFAULT_BARCODE_MISMATCHES

            mu = MuxUnit._make([run_id, flowcellid, mux_id, [lane_id],
                                project, barcode_mismatches, requestor])
            # merge lane into existing mux if needed
            if mu.mux_id in mux_units:
                mu_orig = mux_units[mu.mux_id]
                assert mu.barcode_mismatches == mu_orig.barcode_mismatches
                # is a list by design but just one element
                assert len(mu.lane_ids) == 1
                # list.extend() returns None, so build the merged list
                # explicitly and store the updated unit back
                mux_units[mu.mux_id] = mu_orig._replace(
                    lane_ids=mu_orig.lane_ids + mu.lane_ids)
            else:
                mux_units[mu.mux_id] = mu

    logger.info("Writing to %s", usebases_cfg)
    usebases = generate_usebases(barcode_lens, runinfo)
    with open(usebases_cfg, 'w') as fh:
        fh.write(yaml.dump(dict(usebases=usebases), default_flow_style=True))

    logger.info("Writing to %s", muxinfo_cfg)
    with open(muxinfo_cfg, 'w') as fh:
        fh.write(yaml.dump([dict(mu._asdict()) for mu in mux_units.values()],
                           default_flow_style=True))
def main():
    """Generate the samplesheet for a run and record why if it can't be used.

    Parses the command line, fetches the run's data with get_rest_data()
    and hands it to generate_samplesheet(). Two situations are recorded
    in the status file (STATUS_CFG) in the output directory:

    * "SEQRUNFAILED" if the run did not pass (the script then exits
      with status 0),
    * "NON-BCL" if generate_samplesheet() returns a false value.

    Exits with status 1 if the samplesheet or MUX info file already
    exists and --force-overwrite was not given.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--force-overwrite", action="store_true",
                     help="Force overwriting of output files")
    cli.add_argument(
        "-r", "--rundir", dest="rundir", required=True,
        help="rundir, e.g. /mnt/seq/userrig/HS004/HS004-PE-R00139_BC6A7HANXX")
    cli.add_argument('-t', "--test-server", action='store_true')
    cli.add_argument("-o", "--outdir", required=True, dest="outdir",
                     help="Output directory")
    cli.add_argument('-v', '--verbose', action='count', default=0,
                     help="Increase verbosity")
    cli.add_argument('-q', '--quiet', action='count', default=0,
                     help="Decrease verbosity")
    args = cli.parse_args()

    # Each -q raises and each -v lowers the threshold by one level:
    #   -vv -> DEBUG, -v -> INFO, (none) -> WARNING,
    #   -q -> ERROR, -qq -> CRITICAL, -qqq -> no logging at all
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    logger.setLevel(logging.WARN + 10 * (args.quiet - args.verbose))

    rundir, outdir, runinfo = getdirs(args)

    def write_status(path, text):
        """Write text as the sole content of the status file."""
        with open(path, 'w') as fh_out:
            fh_out.write(text)

    if not args.force_overwrite:
        protected = (os.path.join(outdir, SAMPLESHEET_CSV),
                     os.path.join(outdir, MUXINFO_CFG))
        for existing in protected:
            if os.path.exists(existing):
                logger.fatal("Refusing to overwrite existing file %s", existing)
                sys.exit(1)

    _, run_num, flowcellid = get_machine_run_flowcell_id(rundir)
    logger.info("Querying ELM for %s", run_num)
    rest_data = get_rest_data(run_num, args.test_server)
    status_cfg = os.path.join(outdir, STATUS_CFG)

    assert rest_data['runId'], (
        "Rest data from ELM does not have runId {}".format(run_num))

    run_passed = rest_data['runPass'] == 'Pass'
    if not run_passed:
        logger.warning("Skipping non-passed run")
        write_status(status_cfg, "SEQRUNFAILED")
        sys.exit(0)

    if not generate_samplesheet(rest_data, flowcellid, outdir, runinfo):
        write_status(status_cfg, "NON-BCL")
def main():
    """Write the bcl2fastq samplesheet, use-bases and MUX info files.

    Parses the command line, fetches the run's data with get_rest_data()
    and writes three files into the output directory: the master
    samplesheet (SAMPLESHEET_CSV), the use-bases configuration
    (USEBASES_CFG) and the MUX info (MUXINFO_CFG).

    Exits with status 1 if an output file exists and --force-overwrite
    was not given. Exits with status 0 without writing anything if the
    run did not pass; exit 0 plus missing output files is the upstream
    signal for a failed run.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--force-overwrite", action="store_true",
                        help="Force overwriting of output files")
    parser.add_argument(
        "-r", "--rundir", dest="rundir", required=True,
        help="rundir, e.g. /mnt/seq/userrig/HS004/HS004-PE-R00139_BC6A7HANXX")
    parser.add_argument('-t', "--test-server", action='store_true')
    parser.add_argument("-o", "--outdir", required=True, dest="outdir",
                        help="Output directory")
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")
    args = parser.parse_args()

    # Repeateable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    (rundir, outdir, runinfo) = getdirs(args)
    samplesheet_csv = os.path.join(outdir, SAMPLESHEET_CSV)
    usebases_cfg = os.path.join(outdir, USEBASES_CFG)
    muxinfo_cfg = os.path.join(outdir, MUXINFO_CFG)
    for f in [samplesheet_csv, usebases_cfg, muxinfo_cfg]:
        if not args.force_overwrite and os.path.exists(f):
            logger.fatal("Refusing to overwrite existing file %s", f)
            sys.exit(1)

    _, run_num, flowcellid = get_machine_run_flowcell_id(rundir)
    logger.info("Querying ELM for %s", run_num)
    rest_data = get_rest_data(run_num, args.test_server)
    assert rest_data['runId'], (
        "Rest data from ELM does not have runId {}".format(run_num))
    run_id = rest_data['runId']

    if rest_data['runPass'] != 'Pass':
        logger.warning("Skipping non-passed run")
        # NOTE: exit 0 and missing output files is the upstream signal for a failed run
        sys.exit(0)

    # this is the master samplesheet
    logger.info("Writing to %s", samplesheet_csv)
    # keys: lanes, values are barcode lens in lane (always two tuples, -1 if not present)
    barcode_lens = {}
    mux_units = dict()
    with open(samplesheet_csv, 'w') as fh_out:
        fh_out.write(SAMPLESHEET_HEADER + '\n')
        for rows in rest_data['lanes']:
            if rows['lanePass'] != 'Pass':
                continue
            lane_id = rows['laneId']
            mux_id = rows['libraryId']
            project = 'Project_' + mux_id
            requestor = rows['requestor'] if 'requestor' in rows else None
            bcl_mismatches = []

            if "MUX" in mux_id:
                # multiplexed
                for child in rows['Children']:
                    # older samples have no values and that's okay
                    if 'BCL_Mismatch' in child:
                        bcl_mismatches.append(child['BCL_Mismatch'])
                    lib_id = child['libraryId']
                    barcode = child['barcode']
                    if "-" in barcode:
                        # dual index
                        index = barcode.split('-')
                        index1, index2 = index[0], index[1]
                        index_lens = (len(index1), len(index2))
                    else:
                        index1, index2 = barcode, ''
                        index_lens = (len(barcode), -1)
                    sample = ','.join([
                        lane_id, 'Sample_' + lib_id, lib_id + '-' + barcode,
                        '', '', '', index1, '', index2, project,
                        child['libtech']])
                    barcode_lens.setdefault(lane_id, []).append(index_lens)
                    fh_out.write(sample + '\n')
            else:
                # non-multiplexed
                sample = ','.join([
                    lane_id, 'Sample_' + mux_id, mux_id + '-NoIndex',
                    '', '', '', '', '', '', project, rows['libtech']])
                index_lens = (-1, -1)
                barcode_lens.setdefault(lane_id, []).append(index_lens)
                fh_out.write(sample + '\n')

            # Barcode mismatch has to be the same for all the libraries in one MUX.
            # Otherwise default mismatch value to be used
            if len(set(bcl_mismatches)) == 1:
                barcode_mismatches = bcl_mismatches[0]
            else:
                barcode_mismatches = DEFAULT_BARCODE_MISMATCHES

            mu = MuxUnit._make([run_id, flowcellid, mux_id, [lane_id],
                                project, barcode_mismatches, requestor])
            # merge lane into existing mux if needed
            if mu.mux_id in mux_units:
                mu_orig = mux_units[mu.mux_id]
                assert mu.barcode_mismatches == mu_orig.barcode_mismatches
                # is a list by design but just one element
                assert len(mu.lane_ids) == 1
                # list.extend() returns None, so build the merged list
                # explicitly and store the updated unit back
                mux_units[mu.mux_id] = mu_orig._replace(
                    lane_ids=mu_orig.lane_ids + mu.lane_ids)
            else:
                mux_units[mu.mux_id] = mu

    logger.info("Writing to %s", usebases_cfg)
    usebases = generate_usebases(barcode_lens, runinfo)
    with open(usebases_cfg, 'w') as fh:
        fh.write(yaml.dump(dict(usebases=usebases), default_flow_style=True))

    logger.info("Writing to %s", muxinfo_cfg)
    with open(muxinfo_cfg, 'w') as fh:
        fh.write(yaml.dump([dict(mu._asdict()) for mu in mux_units.values()],
                           default_flow_style=True))
def main():
    """Set up and submit a bcl2fastq pipeline run.

    Steps:

    1. Parse and validate the command line (mismatches 0-2, lanes 1-8,
       exactly one of run id / run directory).
    2. Determine the run directory and the output directory; the output
       directory must not exist yet and is created here.
    3. Call generate_bcl2fastq_cfg.py (expected next to this script) to
       create the configuration files in the output directory.
    4. If the MUX info file is missing afterwards, read the status file,
       report it through update_run_status() and exit with status 0.
    5. Otherwise build the pipeline configuration from the MUX info and
       hand over to PipelineHandler for setup and submission.

    Exits with status 1 on invalid arguments, a missing run directory, an
    existing output directory or a failing generate_bcl2fastq_cfg.py.
    """
    # FIXME ugly and code duplication in bcl2fastq_dbupdate.py
    mongo_status_script = os.path.abspath(
        os.path.join(os.path.dirname(sys.argv[0]), "mongo_status.py"))
    assert os.path.exists(mongo_status_script)

    default_parser = default_argparser(CFG_DIR,
                                       allow_missing_cfgfile=True,
                                       allow_missing_outdir=True,
                                       default_db_logging=True)
    parser = argparse.ArgumentParser(
        description=__doc__.format(PIPELINE_NAME=PIPELINE_NAME,
                                   PIPELINE_VERSION=get_pipeline_version()),
        parents=[default_parser])
    parser._optionals.title = "Arguments"
    # pipeline specific args
    parser.add_argument('-r', "--runid",
                        help="Run ID plus flowcell ID (clashes with -d)")
    parser.add_argument(
        '-d', "--rundir",
        help="BCL input directory (clashes with -r; you also probably want to disable logging)")
    parser.add_argument('-t', "--testing", action='store_true',
                        help="Use MongoDB test server")
    parser.add_argument('--no-archive', action='store_true',
                        help="Don't archieve this analysis")
    parser.add_argument(
        '-l', '--lanes', type=int, nargs="*",
        help="Limit run to given lane/s (multiples separated by space")
    parser.add_argument(
        '-i', '--mismatches', type=int,
        help="Max. number of allowed barcode mismatches (0>=x<=2)"
        " setting a value here overrides the default settings read from ELM)")
    args = parser.parse_args()

    # Repeateable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)
    aux_logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if args.mismatches is not None:
        if args.mismatches > 2 or args.mismatches < 0:
            logger.fatal("Number of mismatches must be between 0-2")
            sys.exit(1)

    lane_info = ''
    lane_nos = []
    if args.lanes:
        for lane in args.lanes:
            if lane > 8 or lane < 1:
                logger.fatal("Lane number must be between 1-8")
                sys.exit(1)
        # e.g. "--tiles s_1,s_2"
        lane_info = '--tiles ' + ','.join(
            's_{}'.format(lane) for lane in args.lanes)
        lane_nos = list(args.lanes)

    if args.runid and args.rundir:
        logger.fatal(
            "Cannot use run-id and input directory arguments simultaneously")
        sys.exit(1)
    elif args.runid:
        rundir = run_folder_for_run_id(args.runid)
    elif args.rundir:
        rundir = os.path.abspath(args.rundir)
    else:
        logger.fatal("Need either run-id or input directory")
        sys.exit(1)
    if not os.path.exists(rundir):
        logger.fatal("Expected run directory %s does not exist", rundir)
        # previously execution carried on with the missing directory
        sys.exit(1)
    logger.info("Rundir is %s", rundir)

    if not args.outdir:
        if not args.runid:
            # the default output directory is derived from the run id
            logger.fatal("Need an output directory if no run-id is given")
            sys.exit(1)
        outdir = get_bcl2fastq_outdir(args.runid)
        args.outdir = outdir
    else:
        outdir = args.outdir
    if os.path.exists(outdir):
        logger.fatal("Output directory %s already exists", outdir)
        sys.exit(1)
    # create now so that generate_bcl2fastq_cfg.py can run
    os.makedirs(outdir)

    # catch cases where rundir was user provided and looks weird
    try:
        _, runid, flowcellid = get_machine_run_flowcell_id(rundir)
        run_num = runid + "_" + flowcellid
    except Exception:
        # not a bare except: SystemExit/KeyboardInterrupt must propagate
        run_num = "UNKNOWN-" + rundir.split("/")[-1]

    # call generate_bcl2fastq_cfg
    #
    # FIXME ugly assumes same directory (just like import above). better to import and run main()?
    generate_bcl2fastq = os.path.join(os.path.dirname(sys.argv[0]),
                                      "generate_bcl2fastq_cfg.py")
    assert os.path.exists(generate_bcl2fastq)
    cmd = [generate_bcl2fastq, '-r', rundir, '-o', outdir]
    if args.testing:
        cmd.append("-t")
    logger.debug("Executing %s", ' '.join(cmd))
    try:
        res = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logger.fatal("The following command failed with return code %s: %s",
                     e.returncode, ' '.join(cmd))
        logger.fatal("Output: %s", e.output.decode())
        logger.fatal("Exiting")
        os.rmdir(outdir)
        sys.exit(1)
    # generate_bcl2fastq is normally quiet. if there's output, make caller aware of it
    # use sys instead of logger to avoid double logging
    if res:
        sys.stderr.write(res.decode())

    # just created files
    muxinfo_cfg = os.path.join(outdir, MUXINFO_CFG)
    status_cfg = os.path.join(outdir, STATUS_CFG)
    # NOTE: signal for failed runs is exit 0 from generate_bcl2fastq and missing output files
    if not os.path.exists(muxinfo_cfg):
        # Check status as seqrunfailed or non-bcl run
        with open(status_cfg, 'r') as fh:
            status = fh.read().strip()
        update_run_status(mongo_status_script, run_num, outdir, status,
                          args.testing)
        sys.exit(0)

    # turn arguments into cfg_dict that gets merged into pipeline config
    cfg_dict = {
        'rundir': rundir,
        'lanes_arg': lane_info,
        'no_archive': args.no_archive,
        'run_num': run_num,
    }

    mux_units = get_mux_units_from_cfgfile(muxinfo_cfg, lane_nos)
    if args.mismatches is not None:
        # command line value overrides what was read from the MUX info file
        mux_units = [mu._replace(barcode_mismatches=args.mismatches)
                     for mu in mux_units]
    os.unlink(muxinfo_cfg)

    # one unit per mux directory
    cfg_dict['units'] = {mu.mux_dir: dict(mu._asdict()) for mu in mux_units}

    # create mongodb update command, used later, after submission
    mongo_update_cmd = "{} -r {} -s STARTED".format(mongo_status_script,
                                                    cfg_dict['run_num'])
    # $ANALYSIS_ID is set in run.sh
    mongo_update_cmd += " -a $ANALYSIS_ID -o {}".format(outdir)
    if args.testing:
        mongo_update_cmd += " -t"

    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR, args, cfg_dict,
        logger_cmd=mongo_update_cmd,
        cluster_cfgfile=get_cluster_cfgfile(CFG_DIR))
    pipeline_handler.setup_env()
    pipeline_handler.submit(args.no_run)
def main():
    """Prepare and submit a bcl2fastq run.

    Steps, in order:

    1. Parse and validate the command line (mismatches 0-2, lanes 1-8,
       exactly one of --runid / --rundir).
    2. Create the output directory (it must not exist yet).
    3. Call generate_bcl2fastq_cfg.py, which is expected to write the
       muxinfo, samplesheet and usebases files into the output directory.
       If any of them is missing, the run is reported via seqrunfailed()
       and the script exits with status 0.
    4. Merge the collected settings into the pipeline config, patch the
       MongoDB update command into the generated run script and submit.

    Exits with status 1 on invalid arguments, a missing run directory, an
    already existing output directory or a failing helper script.
    """
    # FIXME ugly and code duplication in bcl2fastq_dbupdate.py
    mongo_status_script = os.path.abspath(os.path.join(
        os.path.dirname(sys.argv[0]), "mongo_status.py"))
    assert os.path.exists(mongo_status_script)

    parser = argparse.ArgumentParser(description=__doc__.format(
        PIPELINE_NAME=PIPELINE_NAME, PIPELINE_VERSION=get_pipeline_version()))
    parser.add_argument('-r', "--runid",
                        help="Run ID plus flowcell ID (clashes with -d)")
    parser.add_argument('-d', "--rundir",
                        help="BCL input directory (clashes with -r)")
    parser.add_argument('-o', "--outdir",
                        help="Output directory (must not exist; required if called by user)")
    parser.add_argument('-t', "--testing", action='store_true',
                        help="Use MongoDB test server")
    parser.add_argument('--no-archive', action='store_true',
                        help="Don't archieve this analysis")
    parser.add_argument('--no-mail', action='store_true',
                        help="Don't send mail on completion")
    site = get_site()
    default = DEFAULT_SLAVE_Q.get(site, None)
    parser.add_argument('-w', '--slave-q', default=default,
                        help="Queue to use for slave jobs (default: {})".format(default))
    default = DEFAULT_MASTER_Q.get(site, None)
    parser.add_argument('-m', '--master-q', default=default,
                        help="Queue to use for master job (default: {})".format(default))
    parser.add_argument('-l', '--lanes', type=int, nargs="*",
                        help="Limit run to given lane/s (multiples separated by space")
    parser.add_argument('-i', '--mismatches', type=int,
                        help="Max. number of allowed barcode mismatches (0>=x<=2)"
                        " setting a value here overrides the default settings read from ELM)")
    parser.add_argument('-n', '--no-run', action='store_true')
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")
    args = parser.parse_args()

    # Repeateable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    log_level = logging.WARN + 10*args.quiet - 10*args.verbose
    logger.setLevel(log_level)
    aux_logger.setLevel(log_level)

    if args.mismatches is not None and not 0 <= args.mismatches <= 2:
        logger.fatal("Number of mismatches must be between 0-2")
        sys.exit(1)

    # lane_info becomes e.g. "--tiles s_1,s_3"; it stays empty when no lanes
    # were requested.
    lane_info = ''
    lane_nos = []
    if args.lanes:
        if any(lane > 8 or lane < 1 for lane in args.lanes):
            logger.fatal("Lane number must be between 1-8")
            sys.exit(1)
        lane_info = '--tiles ' + ','.join(
            's_{}'.format(lane) for lane in args.lanes)
        lane_nos = list(args.lanes)

    if args.runid and args.rundir:
        logger.fatal("Cannot use run-id and input directory arguments simultaneously")
        sys.exit(1)
    elif args.runid:
        rundir = run_folder_for_run_id(args.runid)
    elif args.rundir:
        rundir = os.path.abspath(args.rundir)
    else:
        logger.fatal("Need either run-id or input directory")
        sys.exit(1)

    if not os.path.exists(rundir):
        logger.fatal("Expected run directory %s does not exist", rundir)
        # Stop here instead of creating an output directory for a run that
        # cannot be processed.
        sys.exit(1)
    logger.info("Rundir is %s", rundir)

    if not args.outdir:
        # NOTE(review): args.runid is None here when only -d was given; what
        # get_bcl2fastq_outdir does with that is not visible here - confirm.
        outdir = get_bcl2fastq_outdir(args.runid)
    else:
        outdir = args.outdir
    if os.path.exists(outdir):
        logger.fatal("Output directory %s already exists", outdir)
        sys.exit(1)
    # create now so that generate_bcl2fastq_cfg.py can run
    os.makedirs(outdir)

    # catch cases where rundir was user provided and looks weird
    try:
        _, runid, flowcellid = get_machine_run_flowcell_id(rundir)
        run_num = runid + "_" + flowcellid
    except Exception:
        # The exception type raised by the helper is not visible here, so
        # catch broadly, but no longer swallow SystemExit/KeyboardInterrupt.
        run_num = "UNKNOWN-" + rundir.split("/")[-1]

    # call generate_bcl2fastq_cfg
    #
    # FIXME ugly assumes same directory (just like import above). better to import and run main()?
    generate_bcl2fastq = os.path.join(
        os.path.dirname(sys.argv[0]), "generate_bcl2fastq_cfg.py")
    assert os.path.exists(generate_bcl2fastq)
    cmd = [generate_bcl2fastq, '-r', rundir, '-o', outdir]
    if args.testing:
        cmd.append("-t")
    logger.debug("Executing %s", ' '.join(cmd))
    try:
        res = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        logger.fatal("The following command failed with return code %s: %s",
                     e.returncode, ' '.join(cmd))
        logger.fatal("Output: %s", e.output.decode())
        logger.fatal("Exiting")
        sys.exit(1)
    # generate_bcl2fastq is normally quiet. if there's output, make caller aware of it
    # use sys instead of logger to avoid double logging
    if res:
        sys.stderr.write(res.decode())

    # just created files
    muxinfo_cfg = os.path.join(outdir, MUXINFO_CFG)
    samplesheet_csv = os.path.join(outdir, SAMPLESHEET_CSV)
    usebases_cfg = os.path.join(outdir, USEBASES_CFG)
    expected_files = [muxinfo_cfg, samplesheet_csv, usebases_cfg]

    # NOTE: signal for failed runs is exit 0 from generate_bcl2fastq and missing output files
    if any(not os.path.exists(x) for x in expected_files):
        # one missing means all should be missing
        assert all(not os.path.exists(x) for x in expected_files)
        seqrunfailed(mongo_status_script, run_num, outdir, args.testing)
        sys.exit(0)

    # turn arguments into user_data that gets merged into pipeline config
    user_data = {'rundir': rundir,
                 'lanes_arg': lane_info,
                 'samplesheet_csv': samplesheet_csv,
                 'no_archive': args.no_archive,
                 'mail_on_completion': not args.no_mail,
                 'run_num': run_num}

    with open(usebases_cfg, 'r') as stream:
        try:
            # safe_load: the file holds plain data only, and yaml.load()
            # without an explicit Loader is rejected by current PyYAML.
            d = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            logger.fatal(exc)
            raise
    assert 'usebases' in d
    assert len(d) == 1  # make sure usebases is only key
    # every mask is followed by a single space, including the last one
    user_data['usebases_arg'] = ''.join(
        '--use-bases-mask {} '.format(ub) for ub in d['usebases'])
    os.unlink(usebases_cfg)

    mux_units = get_mux_units_from_cfgfile(muxinfo_cfg, lane_nos)
    if args.mismatches is not None:
        # a command line value overrides whatever the config file said
        mux_units = [mu._replace(barcode_mismatches=args.mismatches)
                     for mu in mux_units]
    os.unlink(muxinfo_cfg)

    # special case: mux split across multiple lanes. make lanes a list
    # and add in extra lanes if needed.
    user_data['units'] = dict()
    for mu in mux_units:
        user_data['units'][mu.mux_dir] = dict(mu._asdict())

    # create mongodb update command, used later, after queueing
    mongo_update_cmd = "{} -r {} -s STARTED".format(
        mongo_status_script, user_data['run_num'])
    mongo_update_cmd += " -a $ANALYSIS_ID -o {}".format(outdir)  # set in run.sh
    if args.testing:
        mongo_update_cmd += " -t"

    # NOTE: bcl2fastq has a special run template, so we need to
    # interfer with the default pipeline_handler. plenty of
    # opportunity to shoot yourself in the foot
    pipeline_handler = PipelineHandler(
        PIPELINE_NAME, PIPELINE_BASEDIR, outdir, user_data, site=site,
        master_q=args.master_q, slave_q=args.slave_q)
    # use local run template
    pipeline_handler.run_template = os.path.join(
        PIPELINE_BASEDIR, "run.template.{}.sh".format(pipeline_handler.site))
    assert os.path.exists(pipeline_handler.run_template)
    pipeline_handler.setup_env()

    # final mongo update line in run_out: rewrite through a temp file and
    # move it over the original once complete
    tmp_run_out = pipeline_handler.run_out + ".tmp"
    with open(pipeline_handler.run_out) as fh_in, \
         open(tmp_run_out, 'w') as fh_out:
        for line in fh_in:
            fh_out.write(line.replace("@MONGO_UPDATE_CMD@", mongo_update_cmd))
    shutil.move(tmp_run_out, pipeline_handler.run_out)

    pipeline_handler.submit(args.no_run)
def main():
    """Upload the bcl2fastq stats page of one MUX to the stats REST service.

    Reads ``conf.yaml`` from the given output directory, locates the
    ``lane.html`` report of the requested MUX and posts its path together
    with the bcl2fastq version and run id as JSON.

    Exits with status 1 if the output directory or conf.yaml is missing,
    the bcl2fastq version or the MUX is not listed in conf.yaml, or the
    service does not answer with HTTP 201. A missing report is not an
    error: the MUX is logged as incomplete and skipped.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-o", "--out_dir", required=True, help="out_dir")
    parser.add_argument("-m", "--mux_id", required=True, help="mux_id")
    parser.add_argument('-t', "--test-server", action='store_true',
                        help="Use test-server for stats uploading")
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")
    args = parser.parse_args()

    # Repeateable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    if not os.path.exists(args.out_dir):
        logger.fatal("out_dir %s does not exist", args.out_dir)
        sys.exit(1)
    logger.info("out_dir is %s", args.out_dir)

    confinfo = os.path.join(args.out_dir, 'conf.yaml')
    if not os.path.exists(confinfo):
        logger.fatal("conf info '%s' does not exist under Run directory.\n", confinfo)
        sys.exit(1)

    if args.test_server:
        rest_url = rest_services['stats_upload']['testing']
        logger.info("send status to development server")
    else:
        rest_url = rest_services['stats_upload']['production']
        logger.info("send status to production server")

    with open(confinfo) as fh_cfg:
        yaml_data = yaml.safe_load(fh_cfg)

    assert "run_num" in yaml_data
    _, runid, _ = get_machine_run_flowcell_id(yaml_data["run_num"])

    assert "modules" in yaml_data
    soft_ver = yaml_data["modules"].get('bcl2fastq')
    if not soft_ver:
        logger.fatal("bclpath software version %s does not exist", soft_ver)
        sys.exit(1)

    assert "units" in yaml_data
    unit_key = "Project_" + args.mux_id
    if unit_key not in yaml_data["units"]:
        logger.fatal("mux_id %s does not exist in conf.yaml under %s",
                     args.mux_id, args.out_dir)
        sys.exit(1)

    # The key is known to exist, so look the unit up directly.
    mux_dir = yaml_data["units"][unit_key].get('mux_dir')
    index_html_paths = glob.glob(os.path.join(
        args.out_dir, "out", mux_dir, "html/*/all/all/all/lane.html"))

    # FIXME should use the snakemake trigger to decide if complete
    # An empty glob result means the report has not been written yet; guard
    # it so that case is skipped instead of raising IndexError.
    if not index_html_paths or not os.path.exists(index_html_paths[0]):
        logger.info("Skipping incomplete (html missing) bcl2fastq in %s", mux_dir)
        return
    index_html = index_html_paths[0]

    logger.info("Uploading stats for completed bcl2fastq %s", mux_dir)
    data = {'path': index_html, 'software': soft_ver, 'runid': runid}
    # All backslashes are stripped from the serialized JSON before sending.
    # NOTE(review): this would mangle any value that needs JSON escaping -
    # confirm the service really requires it.
    data_json = json.dumps(data).replace("\\", "")
    headers = {'content-type': 'application/json'}
    response = requests.post(rest_url, data=data_json, headers=headers)

    # Response Code is 201 for STATs posting
    if response.status_code != 201:
        logger.error("Uploading %s failed", index_html)
        sys.exit(1)
    logger.info("Uploading %s completed successfully", index_html)
    logger.info("JSON request was %s", data_json)
    logger.info("Response was %s", response.status_code)
def main():
    """Start downstream mapping for successfully demultiplexed runs.

    Queries the ``runcomplete`` collection for records inside the look-back
    window that contain an analysis with status SUCCESS. For every such
    analysis whose output directory does not yet contain
    ``config_casava-1.8.2.txt`` it

    1. writes that config file (output of the CONFIG script),
    2. generates the generic sample sheet (SAMPLESHEET script),
    3. builds the BWA and RNA submission command and runs it, or only logs
       it and removes the config file again when --dry-run is given.

    Runs whose id contains "NG00" are skipped. The script exits with
    status 1 as soon as one of the helper commands fails, and with status 0
    after the first trigger when --break-after-first is given. The MongoDB
    connection is closed on every exit path.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-1', "--break-after-first", action='store_true',
                        help="Only process first run returned")
    parser.add_argument('-n', "--dry-run", action='store_true',
                        help="Don't run anything")
    parser.add_argument('-t', "--testing", action='store_true',
                        help="Use MongoDB test-server")
    default = 14
    parser.add_argument('-w', '--win', type=int, default=default,
                        help="Number of days to look back (default {})".format(default))
    parser.add_argument('-v', '--verbose', action='count', default=0,
                        help="Increase verbosity")
    parser.add_argument('-q', '--quiet', action='count', default=0,
                        help="Decrease verbosity")
    args = parser.parse_args()

    # Repeateable -v and -q for setting logging level.
    # See https://www.reddit.com/r/Python/comments/3nctlm/what_python_tools_should_i_be_using_on_every/
    # and https://gist.github.com/andreas-wilm/b6031a84a33e652680d4
    # script -vv -> DEBUG
    # script -v -> INFO
    # script -> WARNING
    # script -q -> ERROR
    # script -qq -> CRITICAL
    # script -qqq -> no logging at all
    logger.setLevel(logging.WARN + 10 * args.quiet - 10 * args.verbose)

    # Check if pipeline scripts are available
    assert os.path.exists(BWA)
    assert os.path.exists(RNA)

    connection = mongodb_conn(args.testing)
    if connection is None:
        sys.exit(1)

    num_triggers = 0
    try:
        db = connection.gisds.runcomplete
        epoch_present, epoch_back = generate_window(args.win)
        results = db.find({
            "analysis.Status": "SUCCESS",
            "timestamp": {"$gt": epoch_back, "$lt": epoch_present},
        })
        logger.info("Found %s runs", results.count())

        for record in results:
            run_number = record['run']
            # Downstream analysis will not be intiated for Novogene (NG00*) runs
            if "NG00" in run_number:
                continue

            for analysis in record['analysis']:
                out_dir = analysis.get("out_dir")
                status = analysis.get("Status")
                if status == "FAILED":
                    logger.debug("BCL2FASTQ FAILED for %s under %s",
                                 run_number, out_dir)
                    continue
                # Check if bcl2Fastq is completed successfully
                if status != "SUCCESS":
                    continue
                if not out_dir or not os.path.exists(out_dir):
                    logger.critical(
                        "Following directory listed in DB doesn't exist: %s",
                        out_dir)
                    continue

                # The config file doubles as the marker that downstream
                # analysis was already started for this directory.
                config_file = os.path.join(out_dir, "config_casava-1.8.2.txt")
                if os.path.exists(config_file):
                    continue

                # Id passed to the status mails; the key name follows
                # runs_from_db in this file.
                # NOTE(review): confirm every analysis record carries it.
                analysis_id = analysis.get("analysis_id")

                logger.info("Start the downstream analysis at %s", out_dir)
                os.makedirs(os.path.join(out_dir, LOG_DIR_REL), exist_ok=True)

                # generate config file; check_call (unlike call) raises on a
                # non-zero exit so that the failure is actually reported
                config_cmd = [CONFIG, '-r', run_number]
                try:
                    with open(config_file, "w") as fh_config:
                        subprocess.check_call(config_cmd,
                                              stderr=subprocess.STDOUT,
                                              stdout=fh_config)
                except subprocess.CalledProcessError as e:
                    logger.fatal(
                        "The following command failed with return code %s: %s",
                        e.returncode, ' '.join(config_cmd))
                    # stdout/stderr were redirected, so there is nothing to
                    # decode from the exception itself
                    logger.fatal("Output was written to %s", config_file)
                    logger.fatal("Exiting")
                    sys.exit(1)

                # generic sample sheet
                samplesheet_cmd = 'cd {} && {} -r {}'.format(
                    out_dir, SAMPLESHEET, run_number)
                try:
                    subprocess.check_output(samplesheet_cmd, shell=True)
                except subprocess.CalledProcessError as e:
                    logger.fatal(
                        "The following command failed with return code %s: %s",
                        e.returncode, samplesheet_cmd)
                    logger.fatal("Output: %s", e.output.decode())
                    logger.fatal("Exiting")
                    sys.exit(1)

                # Generate and Submit BWA and RNAseq mapping pipeline
                _, runid, _ = get_machine_run_flowcell_id(run_number)
                generic_samplesheet = os.path.join(
                    out_dir, runid + "_SampleSheet.csv")
                if not os.path.exists(generic_samplesheet):
                    logger.info("samplesheet.csv missing for %s under %s",
                                run_number, out_dir)
                    send_status_mail(PIPELINE_NAME, False, analysis_id,
                                     os.path.abspath(out_dir))
                    continue

                submission_log = os.path.join(out_dir, SUBMISSIONLOG)
                # both tools get identical arguments and append to the same log
                tool_args = "-r {} -f {} -s {} -j 0 -p Production -c 5 >> {}".format(
                    run_number, out_dir, generic_samplesheet, submission_log)
                cmd = "cd {} && {} {}".format(
                    os.path.join(out_dir, "out"), BWA, tool_args)
                cmd += "&& {} {}".format(RNA, tool_args)

                if args.dry_run:
                    logger.warning("Skipped following run: %s", cmd)
                    # Remove config txt so that a real run picks this up again
                    os.remove(config_file)
                else:
                    # record the command first; the command itself appends
                    # its output to the same file afterwards
                    with open(submission_log, 'w') as fh:
                        fh.write(cmd)
                    try:
                        subprocess.check_output(cmd, shell=True)
                    except subprocess.CalledProcessError as e:
                        logger.fatal(
                            "The following command failed with return code %s: %s",
                            e.returncode, cmd)
                        logger.fatal("Output: %s", e.output.decode())
                        logger.fatal("Exiting")
                        send_status_mail(
                            PIPELINE_NAME, False, analysis_id,
                            os.path.join(out_dir, LOG_DIR_REL,
                                         "mapping_submission.log"))
                        sys.exit(1)

                num_triggers += 1
                if args.break_after_first:
                    logger.info("Stopping after first sequencing run")
                    sys.exit(0)
    finally:
        # close the connection to MongoDB, also when leaving via sys.exit()
        connection.close()

    logger.info("%s dirs with triggers", num_triggers)