def parse_samplesheet_csv(self):
    """Parse the flowcell samplesheet csv file and store its contents (as a JSON string) under run_info_csv."""
    log.info("parse_samplesheet_csv: going to read {}.csv in directory {}".format(self["RunInfo"]["Flowcell"][1:], self.path))
    infile = os.path.join(os.path.abspath(self.path), "{}.csv".format(self["RunInfo"]["Flowcell"][1:]))
    try:
        fp = open(infile)
        runinfo = json.dumps([x for x in csv.reader(fp)])
        fp.close()
        self["run_info_csv"] = runinfo
    except:
        log.warn("No such file {}".format(infile))

def _parseRunInfo(self, fn="RunInfo.xml"):
    """Parse the RunInfo.xml file and store the result under RunInfo."""
    log.info("_parseRunInfo: going to read {} in directory {}".format(fn, self.path))
    try:
        fp = open(os.path.join(os.path.abspath(self.path), fn))
        parser = RunInfoParser()
        data = parser.parse(fp)
        fp.close()
        self["RunInfo"] = data
    except:
        log.warn("No such file %s" % os.path.join(os.path.abspath(self.path), fn))

def read_picard_metrics(self):
    """Read Picard metrics (align, hs, insert, dup) for the sample and store them under picard_metrics."""
    log.info("read_picard_metrics for sample {}, project {}, lane {} in run {}".format(
        self["barcode_name"], self["sample_prj"], self["lane"], self["flowcell"]))
    picard_parser = ExtendedPicardMetricsParser()
    pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}-.*.(align|hs|insert|dup)_metrics".format(self["lane"], self["barcode_id"])
    try:
        files = self.filter_files(pattern)
        metrics = picard_parser.extract_metrics(files)
        self["picard_metrics"] = metrics
    except:
        log.warn("no picard metrics for sample {}".format(self["barcode_name"]))

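# Illustration (hedged): the pattern built above is a regular expression that
# filter_files() is assumed to match against metrics file names. The file name,
# lane ("1") and barcode_id ("3") below are invented for illustration.
import re

pattern = "1_[0-9]+_[0-9A-Za-z]+(_nophix)?_3-.*.(align|hs|insert|dup)_metrics"
print(bool(re.search(pattern, "1_120101_AB0CDEACXX_nophix_3-sort.align_metrics")))  # True
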
def parse_run_info_yaml(self, run_info_yaml="run_info.yaml"):
    """Read the run_info.yaml file and store its contents under run_info_yaml."""
    log.info("parse_run_info_yaml: going to read {} in directory {}".format(run_info_yaml, self.path))
    infile = os.path.join(os.path.abspath(self.path), run_info_yaml)
    try:
        fp = open(infile)
        runinfo = yaml.load(fp)
        fp.close()
        self["run_info_yaml"] = runinfo
    except:
        log.warn("No such file {}".format(infile))

def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare lanes, potentially splitting based on barcodes.
    """
    lane_name = "%s_%s_%s" % (lane_items[0]['lane'], fc_date, fc_name)
    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], dirs["work"],
                                               lane_items[0], fc_name, config=config)

    # Filter phiX
    custom_config = _update_config_w_custom(config, lane_items[0])
    if custom_config["algorithm"].get("filter_phix", False):
        # If we are starting from demultiplexed material, we will skip a lane-wise screening.
        # Screening will be performed on a sample basis.
        if custom_config["algorithm"].get("demultiplexed", False):
            logger.warn("Will not filter phix lane-wise on already demultiplexed files. "
                        "You will have to specify genomes_filter_out option for each sample")
        else:
            logger.info("Filtering phiX from %s" % lane_name)
            info = {"genomes_filter_out": "spiked_phix", "description": lane_name}
            processed = remove_contaminants(full_fastq1, full_fastq2, info, lane_name,
                                            info["description"], dirs, custom_config)
            (full_fastq1, full_fastq2, _, lane_name) = processed[0][0:4]

    logger.info("Demultiplexing %s" % lane_name)
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                lane_name, dirs, config)

    out = []
    for item in lane_items:
        config = _update_config_w_custom(config, item)
        # Can specify all barcodes but might not have actual sequences.
        # Would be nice to have a good way to check this is okay here.
        if item["barcode_id"] in bc_files:
            fastq1, fastq2 = bc_files[item["barcode_id"]]
            cur_lane_name = lane_name
            cur_lane_desc = item["description"]
            if item.get("name", "") and config["algorithm"].get("include_short_name", True):
                cur_lane_desc = "%s : %s" % (item["name"], cur_lane_desc)
            if item["barcode_id"] is not None:
                cur_lane_name += "_%s" % (item["barcode_id"])
            if config["algorithm"].get("trim_reads", False):
                trim_info = brun_trim_fastq([x for x in [fastq1, fastq2] if x is not None],
                                            dirs, config)
                fastq1 = trim_info[0]
                if fastq2 is not None:
                    fastq2 = trim_info[1]
            out.append((fastq1, fastq2, item, cur_lane_name, cur_lane_desc,
                        dirs, config))
    return out

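# Illustration (hedged): how the lane name used throughout process_lane is
# composed. The lane, date and flowcell values below are invented.
lane_items = [{"lane": "3", "description": "Lane 3"}]
fc_date, fc_name = "120101", "AB0CDEACXX"
lane_name = "%s_%s_%s" % (lane_items[0]['lane'], fc_date, fc_name)
print(lane_name)  # 3_120101_AB0CDEACXX
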
def parse_fastq_screen(self):
    """Parse fastq_screen output for the sample and store the result under metrics['fastq_scr']."""
    log.info("parse_fastq_screen for sample {}, lane {} in run {}".format(
        self["barcode_name"], self["lane"], self["flowcell"]))
    parser = MetricsParser()
    pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}_[12]_fastq_screen.txt".format(self["lane"], self["barcode_id"])
    files = self.filter_files(pattern)
    try:
        fp = open(files[0])
        data = parser.parse_fastq_screen_metrics(fp)
        fp.close()
        self["metrics"]["fastq_scr"] = data
    except:
        log.warn("no fastq screen metrics for sample {}".format(self["barcode_name"]))

def parse_bc_metrics(self):
    """Parse bc metrics at lane level."""
    log.info("parse_bc_metrics for lane {} in flowcell {}".format(self["lane"], self["flowcell"]))
    pattern = "{}*barcode/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?.bc_metrics".format(self["lane"], self["lane"])
    files = self.filter_files(pattern)
    try:
        parser = MetricsParser()
        fp = open(files[0])
        data = parser.parse_bc_metrics(fp)
        fp.close()
        self["bc_metrics"] = data
    except:
        log.warn("No bc_metrics info for lane {}".format(self["lane"]))

def parse_filter_metrics(self, re_str="*filter[_.]metrics"):
    """Parse nophix filter metrics at lane level."""
    log.info("parse_filter_metrics for lane {} in flowcell {}".format(self["lane"], self["flowcell"]))
    pattern = "nophix/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?.filter_metrics".format(self["lane"])
    files = self.filter_files(pattern)
    self["filter_metrics"] = {"reads": None, "reads_aligned": None, "reads_fail_align": None}
    try:
        fp = open(files[0])
        parser = MetricsParser()
        data = parser.parse_filter_metrics(fp)
        fp.close()
        self["filter_metrics"] = data
    except:
        log.warn("No filter nophix metrics for lane {}".format(self["lane"]))

def parse_bc_metrics(self):
    """Parse bc metrics at sample level"""
    log.info("parse_bc_metrics for sample {}, project {} in flowcell {}".format(
        self["barcode_name"], self["sample_prj"], self["flowcell"]))
    pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?[\._]bc[\._]metrics".format(self["lane"])
    files = self.filter_files(pattern)
    try:
        parser = MetricsParser()
        fp = open(files[0])
        data = parser.parse_bc_metrics(fp)
        fp.close()
        self["bc_count"] = data[str(self["barcode_id"])]
    except:
        log.warn("No bc_metrics info for lane {}".format(self["lane"]))

def read_fastqc_metrics(self):
    """Read the FastQC output for the sample and store the summary under fastqc['stats']."""
    log.info("read_fastqc_metrics for sample {}, project {}, lane {} in run {}".format(
        self["barcode_name"], self["sample_prj"], self["lane"], self["flowcell"]))
    if self["barcode_name"] == "unmatched":
        return
    self["fastqc"] = {'stats': None}
    pattern = "fastqc/{}_[0-9]+_[0-9A-Za-z]+(_nophix)?_{}-*".format(self["lane"], self["barcode_id"])
    files = self.filter_files(pattern)
    try:
        fastqc_dir = os.path.dirname(files[0])
        fqparser = ExtendedFastQCParser(fastqc_dir)
        stats = fqparser.get_fastqc_summary()
        self["fastqc"] = {'stats': stats}
    except:
        log.warn("no fastqc metrics for sample {}".format(self["barcode_name"]))

def parse_filter_metrics(self):
    """CASAVA: Parse filter metrics at sample level"""
    log.info("parse_filter_metrics for lane {}, project {} in flowcell {}".format(
        self["lane"], self["sample_prj"], self["flowcell"]))
    pattern = "{}_[0-9]+_[0-9A-Za-z]+_{}(_nophix)?.filter_metrics".format(self["lane"], self["barcode_id"])
    files = self.filter_files(pattern)
    self["filter_metrics"] = {"reads": None, "reads_aligned": None, "reads_fail_align": None}
    try:
        fp = open(files[0])
        parser = MetricsParser()
        data = parser.parse_filter_metrics(fp)
        fp.close()
        self["filter_metrics"] = data
    except:
        log.warn("No filter nophix metrics for lane {}".format(self["lane"]))

def _write_to_worksheet(client, ssheet, wsheet_title, rows, header, append, keys=[]):
    """Generic method to write a set of rows to a worksheet on google docs.
    """
    # Convert the worksheet title to unicode
    wsheet_title = _to_unicode(wsheet_title)

    # Add a new worksheet, possibly appending or replacing a pre-existing
    # worksheet according to the append-flag.
    wsheet = g_spreadsheet.add_worksheet(client, ssheet, wsheet_title,
                                         len(rows) + 1, len(header), append)
    if wsheet is None:
        logger2.error("ERROR: Could not add a worksheet {!r} to "
                      "spreadsheet {!r}".format(wsheet_title, ssheet.title.text))
        return False

    # If keys are specified (will correspond to indexes in the header),
    # delete pre-existing rows with matching keys
    if append and len(keys) > 0:
        wsheet_data = g_spreadsheet.get_cell_content(client, ssheet, wsheet, '2')
        wsheet_header = g_spreadsheet.get_header(client, ssheet, wsheet)
        try:
            wsheet_indexes = [wsheet_header.index(key) for key in keys]
            header_indexes = [header.index(key) for key in keys]
        except ValueError:
            logger2.warn("WARNING: Could not identify correct header for duplicate detection")
        else:
            for row in rows:
                try:
                    key = "#".join([row[i] for i in header_indexes])
                    for i, wrow in enumerate(wsheet_data):
                        wkey = "#".join([wrow[j] for j in wsheet_indexes])
                        if wkey == key:
                            g_spreadsheet.delete_row(client, ssheet, wsheet, i + 1)
                            wsheet_data.pop(i)
                            break
                except:
                    logger2.warn("WARNING: Could not identify/replace duplicate rows")

    # Write the data to the worksheet
    success = g_spreadsheet.write_rows(client, ssheet, wsheet, header, rows)
    if success:
        logger2.info("Wrote data to the {!r}:{!r} "
                     "worksheet".format(ssheet.title.text, wsheet_title))
    else:
        logger2.error("ERROR: Could not write data to the {!r}:{!r} "
                      "worksheet".format(ssheet.title.text, wsheet_title))
    return success

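# Illustration (hedged): the duplicate-row detection above builds a composite
# key from the columns named in `keys`. Plain lists stand in for the worksheet
# content here; header names and values are invented.
header = ["flowcell", "lane", "sample", "count"]
keys = ["flowcell", "lane", "sample"]
header_indexes = [header.index(key) for key in keys]
row = ["AB0CDEACXX", "1", "P001_101", "1000"]
print("#".join([row[i] for i in header_indexes]))  # AB0CDEACXX#1#P001_101
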
def parse_bc_metrics(self):
    """Parse bc metrics at flowcell level"""
    log.info("parse_bc_metrics for flowcell {}".format(self["RunInfo"]["Flowcell"]))
    for lane in self._lanes:
        pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?[\._]bc[\._]metrics".format(lane)
        self["lanes"][str(lane)]["bc_metrics"] = {"reads": None, "reads_aligned": None, "reads_fail_align": None}
        files = self.filter_files(pattern)
        try:
            parser = MetricsParser()
            fp = open(files[0])
            data = parser.parse_bc_metrics(fp)
            fp.close()
            self["lanes"][str(lane)]["bc_metrics"] = data
        except:
            log.warn("No bc_metrics info for lane {}".format(lane))

def parse_filter_metrics(self):
    """pre-CASAVA: Parse filter metrics at flowcell level"""
    log.info("parse_filter_metrics for flowcell {}".format(self["RunInfo"]["Flowcell"]))
    for lane in self._lanes:
        pattern = "{}_[0-9]+_[0-9A-Za-z]+(_nophix)?.filter_metrics".format(lane)
        self["lanes"][str(lane)]["filter_metrics"] = {"reads": None, "reads_aligned": None, "reads_fail_align": None}
        files = self.filter_files(pattern)
        try:
            fp = open(files[0])
            parser = MetricsParser()
            data = parser.parse_filter_metrics(fp)
            fp.close()
            self["lanes"][str(lane)]["filter_metrics"] = data
        except:
            log.warn("No filter nophix metrics for lane {}".format(lane))

def get_spreadsheet(ssheet_title, encoded_credentials):
    """Connect to Google docs and get a spreadsheet"""
    # Convert the spreadsheet title to unicode
    ssheet_title = _to_unicode(ssheet_title)
    # Create a client class which will make HTTP requests with Google Docs server.
    client = g_spreadsheet.get_client()
    bcbio.google.connection.authenticate(client, encoded_credentials)
    # Locate the spreadsheet
    ssheet = g_spreadsheet.get_spreadsheet(client, ssheet_title)
    # Check that we got a result back
    if not ssheet:
        logger2.warn("No document with specified title '%s' found in "
                     "GoogleDocs repository" % ssheet_title)
        return (None, None)
    return (client, ssheet)

def parse_run_info_yaml(self, run_info_yaml):
    """Parse run_info.yaml and create LaneQCMetrics and SampleQCMetrics objects
    for each lane and multiplexed sample, including an 'unmatched' sample per lane."""
    log.info("parse_run_info_yaml: going to read {} in directory {}".format(run_info_yaml, self.path))
    fp = open(run_info_yaml)
    runinfo = yaml.load(fp)
    fp.close()
    for info in runinfo:
        if not self["lane"].has_key(info["lane"]):
            lane = LaneQCMetrics(self.get_full_flowcell(), self.get_date(), info["lane"])
            self["lane"][info["lane"]] = lane
            ## Add sample for unmatched data
            sample = SampleQCMetrics(self.get_full_flowcell(), self.get_date(), info["lane"],
                                     "unmatched", "unmatched", "NA", "NA", "NA", "NA")
            bc_index = "%s_%s" % (info["lane"], "unmatched")
            self.sample[bc_index] = sample
        ## Lane could be empty
        try:
            for mp in info["multiplex"]:
                sample = SampleQCMetrics(self.get_full_flowcell(), self.get_date(), info["lane"],
                                         mp["name"], mp["barcode_id"], mp.get("sample_prj", None),
                                         mp["sequence"], mp["barcode_type"],
                                         mp.get("genomes_filter_out", None))
                bc_index = "%s_%s" % (info["lane"], mp["barcode_id"])
                self.sample[bc_index] = sample
        except:
            log.warn("No multiplexing information for lane %s" % info['lane'])
    self["metrics"]["run_info_yaml"] = runinfo

def split_sample_name(sample_name):
    """Split a sample name into parts consisting of
       - project_name [PNNN]
       - sample_number [NNN]
       - reception_qc [F]
       - prep_version [B]
       - index_id [indexN]
    """
    splits = sample_name.split("_")
    prep = ""
    try:
        if len(splits) < 2:
            raise ValueError()
        if splits[0][0] != 'P':
            raise ValueError()
        if type(int(splits[0][1:])) != int:
            raise ValueError()
        while splits[1][-1] in "FB":
            prep = "%c%s" % (splits[1][-1], prep)
            splits[1] = splits[1][0:-1]
        if type(int(splits[1])) != int:
            raise ValueError()
    except:
        logger2.warn("Sample name '%s' does not follow the expected format PXXX_XXX[FB]" % sample_name)

    if len(prep) > 0:
        splits[1] = "%s%s" % (splits[1], prep)

    name = []
    index = []
    for s in splits:
        if len(index) == 0 and s.find('index') < 0:
            name.append(s)
        else:
            index.append(s)

    return "_".join(name), "_".join(index)

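# Usage sketch (hedged): the sample names below are invented, but follow the
# PXXX_XXX[FB][_indexN] convention that split_sample_name() expects.
print(split_sample_name("P101_123F_index5"))        # ('P101_123F', 'index5')
print(split_sample_name("P101_123_index2-TAAGGC"))  # ('P101_123', 'index2-TAAGGC')
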
def split_by_barcode(fastq1, fastq2, multiplex, base_name, dirs, config):
    """Split a fastq file into multiplex pieces using barcode details.
    """
    unmatched_str = "unmatched"
    demultiplexed = config["algorithm"].get("demultiplexed", False)
    if len(multiplex) == 1 and multiplex[0]["barcode_id"] is None:
        return {None: (fastq1, fastq2)}
    bc_dir = os.path.join(dirs["work"], "%s_barcode" % base_name)
    nomatch_file = "%s_%s_1_fastq.txt" % (base_name, unmatched_str)
    metrics_file = "%s.bc_metrics" % base_name
    out_files = []
    for info in multiplex:
        if demultiplexed:
            out_tuple = [info["barcode_id"]]
            # If the data is already demultiplexed, the sequence files must
            # have been specified in the config
            out_tuple.extend(get_fastq_files(dirs["fastq"], dirs["work"],
                                             info, "", config=config))
            #out_tuple.extend([fastq1,fastq2])
            out_files.append(tuple(out_tuple))
            continue
        fq_fname = lambda x: os.path.join(bc_dir, "%s_%s_%s_fastq.txt"
                                          % (base_name, info["barcode_id"], x))
        bc_file1 = fq_fname("1")
        bc_file2 = fq_fname("2") if fastq2 else None
        out_files.append((info["barcode_id"], bc_file1, bc_file2))

    if not utils.file_exists(bc_dir) and not demultiplexed:
        with file_transaction(bc_dir) as tx_bc_dir:
            with utils.chdir(tx_bc_dir):
                tag_file, need_trim = _make_tag_file(multiplex, unmatched_str, config)
                cl = [config["program"]["barcode"], tag_file,
                      "%s_--b--_--r--_fastq.txt" % base_name, fastq1]
                if fastq2:
                    cl.append(fastq2)
                cl.append("--mismatch=%s" % config["algorithm"]["bc_mismatch"])
                cl.append("--metrics=%s" % metrics_file)
                if int(config["algorithm"]["bc_read"]) > 1:
                    cl.append("--read=%s" % config["algorithm"]["bc_read"])
                if int(config["algorithm"]["bc_position"]) == 5:
                    cl.append("--five")
                if config["algorithm"].get("bc_allow_indels", True) is False:
                    cl.append("--noindel")
                if "bc_offset" in config["algorithm"]:
                    cl.append("--bc_offset=%s" % config["algorithm"]["bc_offset"])
                subprocess.check_call(cl)
    else:
        with utils.curdir_tmpdir() as tmp_dir:
            with utils.chdir(tmp_dir):
                _, need_trim = _make_tag_file(multiplex, unmatched_str, config)

    out = {}
    for b, f1, f2 in out_files:
        if os.path.exists(f1):
            if b in need_trim:
                f1, f2 = _basic_trim(f1, f2, need_trim[b], config)
            out[b] = (f1, f2)

    if not demultiplexed:
        return out

    casava_stats = _find_demultiplex_stats_htm(base_name, config)
    if not casava_stats:
        logger2.warn("Demultiplex_Stats.htm not found! "
                     "Barcode stats will be meaningless.")
        bc_metrics = {int(multiplex[0]["lane"]):
                      {None: {"read_count": 0,
                              "name": None,
                              "barcode_id": None}}}
    else:
        bc_metrics = _parse_demultiplex_stats_htm(casava_stats)
    _write_demultiplex_metrics(multiplex, bc_metrics, metrics_file)
    return out

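# Illustration (hedged): the per-barcode fastq names produced by the fq_fname
# lambda above, with an invented work directory, base name and barcode id.
import os

base_name = "3_120101_AB0CDEACXX"
bc_dir = os.path.join("/path/to/work", "%s_barcode" % base_name)
fq_fname = lambda x: os.path.join(bc_dir, "%s_%s_%s_fastq.txt" % (base_name, "1", x))
print(fq_fname("1"))  # /path/to/work/3_120101_AB0CDEACXX_barcode/3_120101_AB0CDEACXX_1_1_fastq.txt
print(fq_fname("2"))  # /path/to/work/3_120101_AB0CDEACXX_barcode/3_120101_AB0CDEACXX_1_2_fastq.txt
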
def report_to_statusdb(fc_name, fc_date, run_info_yaml, dirs, config):
    """Create statusdb report on a couchdb server.

    A FlowcellQCMetrics object holds information about a flowcell. QC results are
    stored at the flowcell level and sample level depending on analysis. Lane
    level QC data are stored in the FlowcellQCMetrics object.
    """
    success = True
    try:
        statusdb_config = config.get("statusdb", None)
        if statusdb_config is None:
            log.info("Could not find statusdb section in configuration. No statusdb reporting will be done")
            return False
        statusdb_url = statusdb_config.get("url", None)
        if statusdb_url is None:
            log.warn("No url field found in statusdb configuration section.")
            return False
        # Add email notification
        email = statusdb_config.get("statusdb_email_notification", None)
        smtp_host = config.get("smtp_host", "")
        smtp_port = config.get("smtp_port", "")
        log_handler = create_log_handler({'email': email,
                                          'smtp_host': smtp_host,
                                          'smtp_port': smtp_port}, True)
        with log_handler.applicationbound():
            with logbook.Processor(lambda record: record.extra.__setitem__(
                    'run', "%s_%s" % (fc_date, fc_name))):
                log.info("Started creating QC Metrics report on statusdb for %s_%s on %s"
                         % (fc_date, fc_name, datetime.now().isoformat()))
                # Create object and parse all available metrics; no checking
                # is currently done for missing files
                try:
                    qc_obj = FlowcellQCMetrics(fc_date, fc_name, run_info_yaml,
                                               dirs.get("work", None),
                                               dirs.get("flowcell", None))
                except:
                    qc_obj = None
                # FIXME: error checking!
                if qc_obj is not None:
                    try:
                        # Save data at a sample level
                        log.info("Connecting to server at %s" % statusdb_url)
                        try:
                            couch = couchdb.Server(url="http://%s" % statusdb_url)
                        except:
                            log.warn("Connecting to server at %s failed" % statusdb_url)
                        log.info("Connecting to server at %s succeeded" % statusdb_url)
                        db = couch['qc']
                        # Save samples
                        for s in qc_obj.sample.keys():
                            obj = qc_obj.sample[s]
                            log.info("Saving sample %s" % obj.name())
                            _save_obj(db, obj, statusdb_url)
                        # Save flowcell object
                        _save_obj(db, qc_obj, statusdb_url)
                    except Exception as e:
                        success = False
                else:
                    log.warn("Couldn't populate FlowcellQCMetrics object. No QC data "
                             "written to statusdb for %s_%s" % (fc_date, fc_name))
                    success = False
        if success:
            log.info("QC Metrics report successfully written to statusdb for %s_%s on %s"
                     % (fc_date, fc_name, datetime.now().isoformat()))
        else:
            log.warn("Encountered exception when writing to statusdb for %s_%s on %s"
                     % (fc_date, fc_name, datetime.now().isoformat()))
    except Exception as e:
        success = False
        log.warn("Encountered exception when writing QC metrics to statusdb: %s" % e)
    return success

def create_report_on_gdocs(fc_date, fc_name, run_info_yaml, dirs, config):
    """Create reports on gdocs containing both demultiplexed read counts and QC data.
    """
    success = True
    try:
        # Inject the fc_date and fc_name in the email subject
        def record_processor(record):
            return record.extra.__setitem__('run', "%s_%s" % (fc_date, fc_name))

        # Parse the run_info.yaml file
        log.debug("Loading this run_info: {}".format(run_info_yaml))
        with open(run_info_yaml, "r") as fh:
            run_info = yaml.load(fh)

        # Get the gdocs account credentials
        encoded_credentials = get_credentials(config)
        if not encoded_credentials:
            log.warn("Could not find Google Docs account credentials in configuration. "
                     "No sequencing report was written")
            return False

        # Get the required parameters from the post_process.yaml configuration file
        gdocs = config.get("gdocs_upload", None)

        # Add email notification
        email = gdocs.get("gdocs_email_notification", None)
        smtp_host = config.get("smtp_host", "")
        smtp_port = config.get("smtp_port", "")
        log_handler = create_log_handler({'email': email,
                                          'smtp_host': smtp_host,
                                          'smtp_port': smtp_port}, True)

    except Exception as e:
        success = False
        log.warn("Encountered exception when writing sequencing report to Google Docs: %s" % e)

    with log_handler.applicationbound(), logbook.Processor(record_processor):
        try:
            log.info("Started creating sequencing report on Google docs for %s_%s on %s"
                     % (fc_date, fc_name, datetime.datetime.now().isoformat()))

            # Get a flowcell object
            fc = Flowcell(fc_name, fc_date, run_info, dirs.get("work", None))

            # Get the GDocs demultiplex result file title
            gdocs_dmplx_spreadsheet = gdocs.get("gdocs_dmplx_file", None)
            # Get the GDocs QC file title
            gdocs_qc_spreadsheet = gdocs.get("gdocs_qc_file", None)

            # FIXME: Make the bc stuff use the Flowcell module
            if gdocs_dmplx_spreadsheet is not None:
                # Upload the data
                bc_metrics.write_run_report_to_gdocs(fc, fc_date, fc_name,
                                                     gdocs_dmplx_spreadsheet,
                                                     encoded_credentials, append=True)
            else:
                log.warn("Could not find Google Docs demultiplex results file "
                         "title in configuration. No demultiplex counts were "
                         "written to Google Docs for %s_%s" % (fc_date, fc_name))

            # Parse the QC metrics
            try:
                qc = RTAQCMetrics(dirs.get("flowcell", None))
            except:
                qc = None

            if gdocs_qc_spreadsheet is not None and qc is not None:
                qc_metrics.write_run_report_to_gdocs(fc, qc, gdocs_qc_spreadsheet, encoded_credentials)
            else:
                log.warn("Could not find Google Docs QC file title in configuration. "
                         "No QC data were written to Google Docs "
                         "for %s_%s" % (fc_date, fc_name))

            # Get the projects parent folder
            projects_folder = gdocs.get("gdocs_projects_folder", None)

            # Write the bc project summary report
            if projects_folder is not None:
                create_project_report_on_gdocs(fc, qc, encoded_credentials, projects_folder)

        except Exception as e:
            success = False
            log.warn("Encountered exception when writing sequencing report "
                     "to Google Docs: {}".format(e))

    if success:
        log.info("Sequencing report successfully created on Google "
                 "docs for {}_{} on {}".format(fc_date, fc_name, datetime.datetime.now().isoformat()))
    else:
        log.warn("Encountered exception when writing sequencing "
                 "report for %s_%s to Google docs on %s"
                 % (fc_date, fc_name, datetime.datetime.now().isoformat()))

    return success
