def test_get_bc_count_demux_stats(self):
    """Barcode count should be taken from demultiplex stats when supplied."""
    sample_run_path = os.path.join(project_dir, "J.Doe_00_01",
                                   "P001_101_index3", "120924_AC003CCCXX")
    srm_parser = SampleRunMetricsParser(sample_run_path)
    # First call without demultiplex stats (exercises the fallback path).
    bc_count = srm_parser.get_bc_count(**self.sample_kw)
    fcrm_parser = FlowcellRunMetricsParser(self.fcdir)
    demux_data = fcrm_parser.parse_demultiplex_stats_htm(**self.fc_kw)
    # Second call with the parsed stats; this is the value we pin down.
    bc_count = srm_parser.get_bc_count(demultiplex_stats=demux_data,
                                       **self.sample_kw)
    self.assertEqual(str(bc_count), str(19517198))
def _parse_samplesheet(self, runinfo, qc_objects, fc_date, fc_name, fcdir, as_yaml=False, demultiplex_stats=None):
    """Parse samplesheet information and populate sample run metrics object

    :param runinfo: parsed run information; a list of lane dicts when
        ``as_yaml`` is True, otherwise tabular samplesheet rows where
        ``runinfo[0]`` is the header row
    :param qc_objects: list that collected ``SampleRunMetricsDocument``
        objects are appended to (mutated in place)
    :param fc_date: flowcell date used for the sample keyword dict
    :param fc_name: flowcell name used for the sample keyword dict
    :param fcdir: flowcell directory handed to the metrics parser
    :param as_yaml: treat ``runinfo`` as bcbb-style yaml lane info
    :param demultiplex_stats: optional pre-parsed demultiplex statistics,
        forwarded to ``get_bc_count`` in the samplesheet branch
    :returns: ``qc_objects`` with one document appended per sample
    """
    if as_yaml:
        for info in runinfo:
            if not info.get("multiplex", None):
                # No per-barcode entries for this lane: warn and build a
                # lane-level fallback sample_kw (note the extra path=fcdir key).
                self.app.log.warn("No multiplex information for lane {}".format(info.get("lane")))
                sample = {}
                sample.update({k: info.get(k, None) for k in ('analysis', 'description', 'flowcell_id', 'lane')})
                sample_kw = dict(path=fcdir, flowcell=fc_name, date=fc_date, lane=sample.get('lane', None), barcode_name=sample.get('name', None), sample_prj=sample.get('sample_prj', None), barcode_id=sample.get('barcode_id', None), sequence=sample.get('sequence', "NoIndex"))
            # NOTE(review): if "multiplex" is missing this lookup raises
            # KeyError despite the fallback above -- confirm intended behavior.
            for sample in info["multiplex"]:
                # Merge lane-level fields into each multiplexed sample entry.
                sample.update({k: info.get(k, None) for k in ('analysis', 'description', 'flowcell_id', 'lane')})
                sample_kw = dict(flowcell=fc_name, date=fc_date, lane=sample['lane'], barcode_name=sample['name'], sample_prj=sample.get('sample_prj', None), barcode_id=sample['barcode_id'], sequence=sample.get('sequence', "NoIndex"))
                # Collect per-sample metrics and store them on the document.
                parser = SampleRunMetricsParser(fcdir)
                obj = SampleRunMetricsDocument(**sample_kw)
                obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
                obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
                obj["bc_count"] = parser.get_bc_count(**sample_kw)
                obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
                qc_objects.append(obj)
    else:
        # Tabular samplesheet: first row is the header, remaining rows are samples.
        for sample in runinfo[1:]:
            LOG.debug("Getting information for sample defined by {}".format(sample))
            d = dict(zip(runinfo[0], sample))
            # Honor --project_name / --sample command-line filters.
            if self.app.pargs.project_name and self.app.pargs.project_name != d['SampleProject']:
                continue
            if self.app.pargs.sample and self.app.pargs.sample != d['SampleID']:
                continue
            # "__" in the project name encodes a "." on disk.
            sampledir = os.path.join(os.path.abspath(self._meta.production_root_path), d['SampleProject'].replace("__", "."), d['SampleID'])
            if not os.path.exists(sampledir):
                self.app.log.warn("No such sample directory: {}".format(sampledir))
                continue
            sample_fcdir = os.path.join(sampledir, fc_fullname(self.pargs.flowcell))
            if not os.path.exists(sample_fcdir):
                self.app.log.warn("No such sample flowcell directory: {}".format(sample_fcdir))
                continue
            # Skip directories not touched within the requested time window.
            if not modified_within_days(sample_fcdir, self.pargs.mtime):
                continue
            runinfo_yaml_file = os.path.join(sample_fcdir, "{}-bcbb-config.yaml".format(d['SampleID']))
            if not os.path.exists(runinfo_yaml_file):
                # A missing config here is fatal, unlike the soft skips above.
                self.app.log.warn("No such yaml file for sample: {}".format(runinfo_yaml_file))
                raise IOError(2, "No such yaml file for sample: {}".format(runinfo_yaml_file), runinfo_yaml_file)
            with open(runinfo_yaml_file) as fh:
                # NOTE(review): yaml.load without an explicit Loader is unsafe
                # on untrusted input; consider yaml.safe_load.
                runinfo_yaml = yaml.load(fh)
            if not runinfo_yaml['details'][0].get("multiplex", None):
                self.app.log.warn("No multiplex information for sample {}".format(d['SampleID']))
                continue
            # Barcode id/sequence are taken from the first multiplex entry only.
            sample_kw = dict(flowcell=fc_name, date=fc_date, lane=d['Lane'], barcode_name=d['SampleID'], sample_prj=d['SampleProject'].replace("__", "."), barcode_id=runinfo_yaml['details'][0]['multiplex'][0]['barcode_id'], sequence=runinfo_yaml['details'][0]['multiplex'][0]['sequence'])
            parser = SampleRunMetricsParser(sample_fcdir)
            obj = SampleRunMetricsDocument(**sample_kw)
            obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
            obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
            obj["bc_count"] = parser.get_bc_count(demultiplex_stats=demultiplex_stats, **sample_kw)
            obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
            qc_objects.append(obj)
    return qc_objects
def test_get_bc_count_demux_stats(self):
    """Check the bc count obtained when demultiplex stats are passed in."""
    run_dir = os.path.join(project_dir, "J.Doe_00_01", "P001_101_index3",
                           "120924_AC003CCCXX")
    parser = SampleRunMetricsParser(run_dir)
    # Call once without stats, then with the stats parsed from the flowcell.
    bc_count = parser.get_bc_count(**self.sample_kw)
    stats = FlowcellRunMetricsParser(self.fcdir).parse_demultiplex_stats_htm(
        **self.fc_kw)
    bc_count = parser.get_bc_count(demultiplex_stats=stats, **self.sample_kw)
    self.assertEqual(str(bc_count), str(19517198))
def upload_analysis(self):
    """Collect per-sample analysis metrics from the production folder
    hierarchy and save one ``AnalysisDocument`` per project to the
    analysis database.

    Uses the command-line arguments (``self.pargs``) for the flowcell name
    (defaulting to "TOTAL"), the modification-time window and the database
    connection keywords.
    """
    kw = vars(self.pargs)
    if not kw.get("flowcell"):
        kw["flowcell"] = "TOTAL"
    # Get a connection to the analysis database
    acon = AnalysisConnection(**kw)
    # Traverse the folder hierarchy and determine paths to process
    to_process = {}
    for pdir in os.listdir(self._meta.root_path):
        pdir = os.path.join(self._meta.root_path, pdir)
        if not os.path.isdir(pdir):
            continue
        plist = []
        # Sample directories look like e.g. P001_101 (P + >=3 digits + _ + digits).
        for sdir in [d for d in os.listdir(pdir) if re.match(r'^P[0-9]{3,}_[0-9]+', d)]:
            fdir = os.path.join(pdir, sdir, kw.get("flowcell"))
            # Only process flowcell dirs modified within the mtime window.
            if not os.path.exists(fdir) or not modified_within_days(fdir, self.pargs.mtime):
                continue
            plist.append(fdir)
        if plist:
            to_process[os.path.basename(pdir)] = plist
    # Collect the data from each folder
    for project_name, sdirs in to_process.items():
        self.log.info("Processing {}".format(project_name))
        samples = {}
        for sdir in sdirs:
            config = glob.glob(os.path.join(sdir, "*-bcbb-config.yaml"))
            if not config:
                self.log.error("Could not find sample configuration file in {}. Skipping sample.".format(sdir))
                continue
            if len(config) > 1:
                self.log.warn("Multiple sample configuration files found in {}. Will only use {}.".format(sdir, os.path.basename(config[0])))
            # Parse the config file and get the flowcell, lane and index sequence that may be needed to parse
            info = {}
            sinfos = []
            with open(config[0]) as fh:
                # NOTE(review): yaml.load without an explicit Loader is
                # unsafe on untrusted input; consider yaml.safe_load.
                info = yaml.load(fh)
            fcdate = info.get("fc_date")
            fcname = info.get("fc_name")
            for laneinfo in info.get("details", []):
                # Fall back to the lane entry itself when no multiplex list exists.
                for sampleinfo in laneinfo.get("multiplex", [laneinfo]):
                    # NOTE(review): linfo aliases (does not copy) laneinfo, so
                    # each update leaks keys into subsequent iterations -- confirm.
                    linfo = laneinfo
                    linfo.update(sampleinfo)
                    name = linfo.get("name", linfo.get("description", "unknown"))
                    # Normalize the sample name to the project naming scheme
                    # (P followed by digits/underscores, ending in a digit).
                    m = re.match(r'(P[0-9_]{4,}[0-9])', name)
                    if m:
                        name = m.group(1)
                    sample_kw = {'flowcell': linfo.get("flowcell_id") if not fcname else fcname,
                                 'date': fcdate,
                                 'lane': linfo.get("lane"),
                                 'barcode_name': name,
                                 'sample_prj': linfo.get("sample_prj", project_name),
                                 'barcode_id': linfo.get("barcode_id", "1"),
                                 'sequence': linfo.get("sequence", "NoIndex")}
                    sinfos.append(sample_kw)
            # Create a parser object and collect the metrics
            parser = SampleRunMetricsParser(sdir)
            sinfo = sinfos[0]
            # NOTE(review): 'name' comes from sinfos[0] only, so metrics from
            # all entries below are stored under the first sample's key and
            # later results overwrite earlier ones -- confirm this is intended.
            name = sinfo.get("barcode_name", "unknown")
            samples[name] = {}
            samples[name]["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sinfo)
            samples[name]["software_versions"] = parser.parse_software_versions(**sinfo)
            samples[name]["project_summary"] = parser.parse_project_summary(**sinfo)
            samples[name]["snpeff_genes"] = parser.parse_snpeff_genes(**sinfo)
            for sinfo in sinfos:
                # Only record a metric when the parser returned something truthy.
                picard = parser.read_picard_metrics(**sinfo)
                if picard:
                    samples[name]["picard_metrics"] = picard
                fq_scr = parser.parse_fastq_screen(**sinfo)
                if fq_scr:
                    samples[name]["fastq_scr"] = fq_scr
                fastqc = parser.read_fastqc_metrics(**sinfo)
                if fastqc.get("stats"):
                    samples[name]["fastqc"] = fastqc
                gteval = parser.parse_eval_metrics(**sinfo)
                if gteval:
                    samples[name]["gatk_variant_eval"] = gteval
        # Store the collected metrics in an analysis document
        obj = AnalysisDocument(**{'project_name': project_name, 'name': project_name, 'samples': samples})
        # dry() presumably respects a dry-run flag before saving -- confirm.
        dry("Saving object {}".format(repr(obj)), acon.save(obj))
def _parse_samplesheet(self, runinfo, qc_objects, fc_date, fc_name, fcdir, as_yaml=False, demultiplex_stats=None, setup=None):
    """Parse samplesheet information and populate sample run metrics object

    :param runinfo: parsed run information; a list of lane dicts when
        ``as_yaml`` is True, otherwise a list of samplesheet row dicts
    :param qc_objects: list that collected ``SampleRunMetricsDocument``
        objects are appended to (mutated in place)
    :param fc_date: flowcell date used for the sample keyword dict
    :param fc_name: flowcell name used for the sample keyword dict
    :param fcdir: flowcell directory handed to the metrics parser
    :param as_yaml: treat ``runinfo`` as bcbb-style yaml lane info
    :param demultiplex_stats: optional pre-parsed demultiplex statistics,
        forwarded to ``get_bc_count`` in the samplesheet branch
    :param setup: run setup forwarded to ``get_bc_count`` as ``run_setup``
    :returns: ``qc_objects`` with one document appended per sample
    """
    if as_yaml:
        for info in runinfo:
            if not info.get("multiplex", None):
                # No per-barcode entries for this lane: warn and build a
                # lane-level fallback sample_kw (note the extra path=fcdir key).
                self.app.log.warn("No multiplex information for lane {}".format(info.get("lane")))
                sample = {}
                sample.update({k: info.get(k, None) for k in ('analysis', 'description', 'flowcell_id', 'lane')})
                sample_kw = dict(path=fcdir, flowcell=fc_name, date=fc_date, lane=sample.get('lane', None), barcode_name=sample.get('name', None), sample_prj=sample.get('sample_prj', None), barcode_id=sample.get('barcode_id', None), sequence=sample.get('sequence', "NoIndex"))
            # NOTE(review): if "multiplex" is missing this lookup raises
            # KeyError despite the fallback above -- confirm intended behavior.
            for sample in info["multiplex"]:
                # Merge lane-level fields into each multiplexed sample entry.
                sample.update({k: info.get(k, None) for k in ('analysis', 'description', 'flowcell_id', 'lane')})
                sample_kw = dict(flowcell=fc_name, date=fc_date, lane=sample['lane'], barcode_name=sample['name'], sample_prj=sample.get('sample_prj', None), barcode_id=sample['barcode_id'], sequence=sample.get('sequence', "NoIndex"))
                # Collect per-sample metrics and store them on the document.
                parser = SampleRunMetricsParser(fcdir)
                obj = SampleRunMetricsDocument(**sample_kw)
                obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
                obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
                obj["bc_count"] = parser.get_bc_count(run_setup=setup, **sample_kw)
                obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
                obj["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sample_kw)
                qc_objects.append(obj)
    else:
        # Samplesheet rows already come as dicts in this variant.
        for d in runinfo:
            LOG.debug("Getting information for sample defined by {}".format(d.values()))
            # Honor --project_name / --sample command-line filters.
            if self.app.pargs.project_name and self.app.pargs.project_name != d['SampleProject']:
                continue
            if self.app.pargs.sample and self.app.pargs.sample != d['SampleID']:
                continue
            # "__" in the project name encodes a "." on disk.
            sampledir = os.path.join(os.path.abspath(self._meta.production_root_path), d['SampleProject'].replace("__", "."), d['SampleID'])
            if not os.path.exists(sampledir):
                self.app.log.warn("No such sample directory: {}".format(sampledir))
                continue
            sample_fcdir = os.path.join(sampledir, fc_fullname(self.pargs.flowcell))
            if not os.path.exists(sample_fcdir):
                self.app.log.warn("No such sample flowcell directory: {}".format(sample_fcdir))
                continue
            # Skip directories not touched within the requested time window.
            if not modified_within_days(sample_fcdir, self.pargs.mtime):
                continue
            runinfo_yaml_file = os.path.join(sample_fcdir, "{}-bcbb-config.yaml".format(d['SampleID']))
            if not os.path.exists(runinfo_yaml_file):
                # A missing config here is fatal, unlike the soft skips above.
                self.app.log.warn("No such yaml file for sample: {}".format(runinfo_yaml_file))
                raise IOError(2, "No such yaml file for sample: {}".format(runinfo_yaml_file), runinfo_yaml_file)
            with open(runinfo_yaml_file) as fh:
                # NOTE(review): yaml.load without an explicit Loader is unsafe
                # on untrusted input; consider yaml.safe_load.
                runinfo_yaml = yaml.load(fh)
            if not runinfo_yaml['details'][0].get("multiplex", None):
                # Unlike the other variant, a missing multiplex section is
                # patched with a NoIndex placeholder instead of skipping.
                self.app.log.warn("No multiplex information for sample {}".format(d['SampleID']))
                runinfo_yaml['details'][0]['multiplex'] = [{'barcode_id': 0, 'sequence': 'NoIndex'}]
            # Barcode id/sequence are taken from the first multiplex entry only.
            sample_kw = dict(flowcell=fc_name, date=fc_date, lane=d['Lane'], barcode_name=d['SampleID'], sample_prj=d['SampleProject'].replace("__", "."), barcode_id=runinfo_yaml['details'][0]['multiplex'][0]['barcode_id'], sequence=runinfo_yaml['details'][0]['multiplex'][0]['sequence'])
            parser = SampleRunMetricsParser(sample_fcdir)
            obj = SampleRunMetricsDocument(**sample_kw)
            obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
            obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
            obj["bc_count"] = parser.get_bc_count(demultiplex_stats=demultiplex_stats, run_setup=setup, **sample_kw)
            obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
            obj["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sample_kw)
            qc_objects.append(obj)
    return qc_objects
def test_get_bc_count(self):
    """Without demultiplex stats the barcode count should be 0."""
    sample_run_dir = os.path.join(project_dir, "J.Doe_00_01",
                                  "P001_101_index3", "120924_AC003CCCXX")
    metrics_parser = SampleRunMetricsParser(sample_run_dir)
    observed = metrics_parser.get_bc_count(**self.sample_kw)
    self.assertEqual(observed, 0)
def test_get_bc_count(self):
    """get_bc_count returns 0 for this sample run when no stats are given."""
    parser = SampleRunMetricsParser(
        os.path.join(project_dir, "J.Doe_00_01", "P001_101_index3",
                     "120924_AC003CCCXX"))
    self.assertEqual(parser.get_bc_count(**self.sample_kw), 0)
def upload_analysis(self):
    """Collect per-sample analysis metrics from the production folder
    hierarchy and save one ``AnalysisDocument`` per project to the
    analysis database.

    Uses the command-line arguments (``self.pargs``) for the flowcell name
    (defaulting to "TOTAL"), the modification-time window and the database
    connection keywords.
    """
    kw = vars(self.pargs)
    if not kw.get("flowcell"):
        kw["flowcell"] = "TOTAL"
    # Get a connection to the analysis database
    acon = AnalysisConnection(**kw)
    # Traverse the folder hierarchy and determine paths to process
    to_process = {}
    for pdir in os.listdir(self._meta.root_path):
        pdir = os.path.join(self._meta.root_path, pdir)
        if not os.path.isdir(pdir):
            continue
        plist = []
        # Sample directories look like e.g. P001_101 (P + >=3 digits + _ + digits).
        for sdir in [
                d for d in os.listdir(pdir)
                if re.match(r'^P[0-9]{3,}_[0-9]+', d)
        ]:
            fdir = os.path.join(pdir, sdir, kw.get("flowcell"))
            # Only process flowcell dirs modified within the mtime window.
            if not os.path.exists(fdir) or not modified_within_days(
                    fdir, self.pargs.mtime):
                continue
            plist.append(fdir)
        if plist:
            to_process[os.path.basename(pdir)] = plist
    # Collect the data from each folder
    for project_name, sdirs in to_process.items():
        self.log.info("Processing {}".format(project_name))
        samples = {}
        for sdir in sdirs:
            config = glob.glob(os.path.join(sdir, "*-bcbb-config.yaml"))
            if not config:
                self.log.error(
                    "Could not find sample configuration file in {}. Skipping sample."
                    .format(sdir))
                continue
            if len(config) > 1:
                self.log.warn(
                    "Multiple sample configuration files found in {}. Will only use {}."
                    .format(sdir, os.path.basename(config[0])))
            # Parse the config file and get the flowcell, lane and index sequence that may be needed to parse
            info = {}
            sinfos = []
            with open(config[0]) as fh:
                # NOTE(review): yaml.load without an explicit Loader is
                # unsafe on untrusted input; consider yaml.safe_load.
                info = yaml.load(fh)
            fcdate = info.get("fc_date")
            fcname = info.get("fc_name")
            for laneinfo in info.get("details", []):
                # Fall back to the lane entry itself when no multiplex list exists.
                for sampleinfo in laneinfo.get("multiplex", [laneinfo]):
                    # NOTE(review): linfo aliases (does not copy) laneinfo, so
                    # each update leaks keys into subsequent iterations -- confirm.
                    linfo = laneinfo
                    linfo.update(sampleinfo)
                    name = linfo.get("name", linfo.get("description", "unknown"))
                    # Normalize the sample name to the project naming scheme
                    # (P followed by digits/underscores, ending in a digit).
                    m = re.match(r'(P[0-9_]{4,}[0-9])', name)
                    if m:
                        name = m.group(1)
                    sample_kw = {
                        'flowcell':
                        linfo.get("flowcell_id") if not fcname else fcname,
                        'date': fcdate,
                        'lane': linfo.get("lane"),
                        'barcode_name': name,
                        'sample_prj': linfo.get("sample_prj", project_name),
                        'barcode_id': linfo.get("barcode_id", "1"),
                        'sequence': linfo.get("sequence", "NoIndex")
                    }
                    sinfos.append(sample_kw)
            # Create a parser object and collect the metrics
            parser = SampleRunMetricsParser(sdir)
            sinfo = sinfos[0]
            # NOTE(review): 'name' comes from sinfos[0] only, so metrics from
            # all entries below are stored under the first sample's key and
            # later results overwrite earlier ones -- confirm this is intended.
            name = sinfo.get("barcode_name", "unknown")
            samples[name] = {}
            samples[name][
                "bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(
                    **sinfo)
            samples[name][
                "software_versions"] = parser.parse_software_versions(
                    **sinfo)
            samples[name][
                "project_summary"] = parser.parse_project_summary(**sinfo)
            samples[name]["snpeff_genes"] = parser.parse_snpeff_genes(
                **sinfo)
            for sinfo in sinfos:
                # Only record a metric when the parser returned something truthy.
                picard = parser.read_picard_metrics(**sinfo)
                if picard:
                    samples[name]["picard_metrics"] = picard
                fq_scr = parser.parse_fastq_screen(**sinfo)
                if fq_scr:
                    samples[name]["fastq_scr"] = fq_scr
                fastqc = parser.read_fastqc_metrics(**sinfo)
                if fastqc.get("stats"):
                    samples[name]["fastqc"] = fastqc
                gteval = parser.parse_eval_metrics(**sinfo)
                if gteval:
                    samples[name]["gatk_variant_eval"] = gteval
        # Store the collected metrics in an analysis document
        obj = AnalysisDocument(
            **{
                'project_name': project_name,
                'name': project_name,
                'samples': samples
            })
        # dry() presumably respects a dry-run flag before saving -- confirm.
        dry("Saving object {}".format(repr(obj)), acon.save(obj))