def _collect_casava_qc(self):
    """Collect flowcell QC objects for a CASAVA-demultiplexed run.

    Reads the samplesheet (``<flowcell-id>.csv``, falling back to
    ``SampleSheet.csv``), parses run metrics only when the flowcell
    directory was modified within ``self.pargs.mtime`` days, and returns
    the qc objects produced by ``self._parse_samplesheet``.

    :returns: list of qc objects (possibly empty)
    :raises IOError: if the samplesheet cannot be opened
    """
    qc_objects = []
    runinfo_csv = os.path.join(os.path.join(self._meta.root_path, self.pargs.flowcell),
                               "{}.csv".format(fc_id(self.pargs.flowcell)))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(os.path.join(self._meta.root_path, self.pargs.flowcell),
                                   "SampleSheet.csv")
    try:
        with open(runinfo_csv) as fh:
            runinfo_reader = csv.reader(fh)
            runinfo = [x for x in runinfo_reader]
    except IOError as e:
        self.app.log.warn(str(e))
        raise e
    fcdir = os.path.join(os.path.abspath(self._meta.root_path), self.pargs.flowcell)
    (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
    # BUGFIX: the _parse_samplesheet call below previously dereferenced
    # fcobj["illumina"]["Demultiplex_Stats"] directly, but fcobj is only
    # bound inside the mtime branch — flowcells older than the cutoff
    # raised NameError. Default to None and assign inside the branch.
    demux_stats = None
    ## Check modification time
    if modified_within_days(fcdir, self.pargs.mtime):
        fc_kw = dict(fc_date=fc_date, fc_name=fc_name)
        parser = FlowcellRunMetricsParser(fcdir)
        fcobj = FlowcellRunMetricsDocument(fc_date, fc_name)
        fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
        fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
        fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
        fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
        fcobj["undemultiplexed_barcodes"] = parser.parse_undemultiplexed_barcode_metrics(**fc_kw)
        fcobj["illumina"].update({"Demultiplex_Stats" : parser.parse_demultiplex_stats_htm(**fc_kw)})
        fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv, **fc_kw)
        demux_stats = fcobj["illumina"]["Demultiplex_Stats"]
        qc_objects.append(fcobj)
    qc_objects = self._parse_samplesheet(runinfo, qc_objects, fc_date, fc_name, fcdir,
                                         demultiplex_stats=demux_stats)
    return qc_objects
def _collect_casava_qc(self):
    """Gather QC documents for a CASAVA-demultiplexed flowcell.

    Locates the samplesheet (preferring ``<flowcell-id>.csv`` with
    ``SampleSheet.csv`` as fallback), builds a flowcell run metrics
    document when the run directory is recent enough, and delegates
    sample-level processing to ``self._parse_samplesheet``.
    """
    qc_docs = []
    flowcell_path = os.path.join(self._meta.root_path, self.pargs.flowcell)
    sheet_path = os.path.join(flowcell_path, "{}.csv".format(fc_id(self.pargs.flowcell)))
    if not os.path.exists(sheet_path):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(sheet_path))
        sheet_path = os.path.join(flowcell_path, "SampleSheet.csv")
    try:
        with open(sheet_path) as fh:
            runinfo = list(csv.reader(fh))
    except IOError as e:
        self.app.log.warn(str(e))
        raise e
    fcdir = os.path.join(os.path.abspath(self._meta.root_path), self.pargs.flowcell)
    fc_date, fc_name = fc_parts(self.pargs.flowcell)
    # Demultiplex stats stay None when the flowcell is older than the cutoff
    demux_stats = None
    if modified_within_days(fcdir, self.pargs.mtime):
        fc_kw = dict(fc_date=fc_date, fc_name=fc_name)
        metrics_parser = FlowcellRunMetricsParser(fcdir)
        doc = FlowcellRunMetricsDocument(fc_date, fc_name)
        doc["RunInfo"] = metrics_parser.parseRunInfo(**fc_kw)
        doc["RunParameters"] = metrics_parser.parseRunParameters(**fc_kw)
        doc["illumina"] = metrics_parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
        doc["bc_metrics"] = metrics_parser.parse_bc_metrics(**fc_kw)
        doc["undemultiplexed_barcodes"] = metrics_parser.parse_undemultiplexed_barcode_metrics(**fc_kw)
        doc["illumina"].update({"Demultiplex_Stats" : metrics_parser.parse_demultiplex_stats_htm(**fc_kw)})
        doc["samplesheet_csv"] = metrics_parser.parse_samplesheet_csv(runinfo_csv=sheet_path, **fc_kw)
        demux_stats = doc["illumina"]["Demultiplex_Stats"]
        qc_docs.append(doc)
    return self._parse_samplesheet(runinfo, qc_docs, fc_date, fc_name, fcdir,
                                   demultiplex_stats=demux_stats)
def _collect_casava_qc(self):
    """Collect flowcell-level QC objects for a CASAVA-demultiplexed run.

    Flowcell name/date are taken from RunInfo.xml and the flowcell
    position from runParameters; the position is pre-pended to the
    flowcell id because downstream code expects that form.

    :returns: list of qc objects produced by ``self._parse_samplesheet``
    """
    qc_objects = []
    # Defaults used when the mtime check below fails and no run metrics
    # document is built.
    read_setup = None
    demux_stats = None
    fcdir = os.path.join(os.path.abspath(self._meta.root_path),
                         self.pargs.flowcell)
    # Get the fc_name, fc_date from RunInfo
    parser = FlowcellRunMetricsParser(fcdir)
    runinfo_xml = parser.parseRunInfo()
    runparams = parser.parseRunParameters()
    fc_date = runinfo_xml.get('Date', None)
    fc_name = runinfo_xml.get('Flowcell', None)
    fc_pos = runparams.get('FCPosition', '')
    # Prefer the <flowcell-id>.csv samplesheet; fall back to SampleSheet.csv
    runinfo_csv = os.path.join(
        os.path.join(self._meta.root_path, self.pargs.flowcell),
        "{}.csv".format(fc_name))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(
            runinfo_csv))
        runinfo_csv = os.path.join(
            os.path.join(self._meta.root_path, self.pargs.flowcell),
            "SampleSheet.csv")
    runinfo = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv)
    # Only collect run metrics for recently modified flowcell directories
    if modified_within_days(fcdir, self.pargs.mtime):
        # Most of the code expects to have the flowcell position pre-pended to the flowcell id
        fc_kw = dict(fc_date=fc_date, fc_name="{}{}".format(fc_pos, fc_name))
        fcobj = FlowcellRunMetricsDocument(**fc_kw)
        fcobj["RunInfo"] = runinfo_xml
        fcobj["RunParameters"] = runparams
        fcobj["DemultiplexConfig"] = parser.parseDemultiplexConfig(**fc_kw)
        fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False,
                                                          **fc_kw)
        fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
        fcobj[
            "undemultiplexed_barcodes"] = parser.parse_undemultiplexed_barcode_metrics(
                **fc_kw)
        fcobj["illumina"].update({
            "Demultiplex_Stats":
            parser.parse_demultiplex_stats_htm(**fc_kw)
        })
        fcobj["samplesheet_csv"] = runinfo
        # Read setup (e.g. number of reads/cycles) comes from RunInfo
        read_setup = fcobj["RunInfo"].get('Reads', [])
        fcobj["run_setup"] = self._run_setup(read_setup)
        demux_stats = fcobj["illumina"]["Demultiplex_Stats"]
        qc_objects.append(fcobj)
    qc_objects = self._parse_samplesheet(runinfo, qc_objects, fc_date,
                                         "{}{}".format(fc_pos, fc_name),
                                         fcdir,
                                         demultiplex_stats=demux_stats,
                                         setup=read_setup)
    return qc_objects
def __init__(self, run_dir, samplesheet=None):
    """Initialize from a sequencing run directory.

    :param run_dir: path to the run directory; must exist
    :param samplesheet: optional explicit samplesheet path; when falsy,
        the samplesheet is located inside the run directory
    """
    self._run_dir = os.path.normpath(run_dir)
    assert os.path.exists(self._run_dir), "The path %s is invalid" % self._run_dir
    # Read run parameters and run info up front
    metrics_parser = FlowcellRunMetricsParser(self._run_dir)
    self.run_config = metrics_parser.parseRunParameters()
    self.run_info = metrics_parser.parseRunInfo()
    self.samplesheet_file = samplesheet or IlluminaRun.get_samplesheet(self._run_dir)
def __init__(self, run_dir, samplesheet=None):
    """Set up run metadata by parsing the given run directory.

    Falls back to locating the samplesheet within the run directory when
    none is supplied.
    """
    normalized = os.path.normpath(run_dir)
    assert os.path.exists(normalized), "The path %s is invalid" % normalized
    self._run_dir = normalized
    # Parse run configuration files once, at construction time
    run_parser = FlowcellRunMetricsParser(self._run_dir)
    self.run_config = run_parser.parseRunParameters()
    self.run_info = run_parser.parseRunInfo()
    self.samplesheet_file = samplesheet or IlluminaRun.get_samplesheet(self._run_dir)
def _collect_pre_casava_qc(self):
    """Collect flowcell QC objects for a pre-CASAVA run.

    Sample information is read from the samplesheet csv when one exists,
    otherwise from ``run_info.yaml`` (``as_yaml`` records which source
    was used so ``_parse_samplesheet`` can interpret it correctly).

    :returns: list of qc objects (empty when the flowcell directory is
        older than the mtime cutoff)
    :raises IOError: if neither information file can be opened
    """
    qc_objects = []
    as_yaml = False
    runinfo_csv = os.path.join(
        os.path.join(self._meta.root_path, self.pargs.flowcell),
        "{}.csv".format(fc_id(self.pargs.flowcell)))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(
            runinfo_csv))
        runinfo_csv = os.path.join(
            os.path.join(self._meta.root_path, self.pargs.flowcell),
            "SampleSheet.csv")
    runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell),
                                "run_info.yaml")
    try:
        if os.path.exists(runinfo_csv):
            with open(runinfo_csv) as fh:
                runinfo_reader = csv.reader(fh)
                runinfo = [x for x in runinfo_reader]
        else:
            as_yaml = True
            with open(runinfo_yaml) as fh:
                # NOTE(review): yaml.load without an explicit Loader is
                # deprecated and unsafe on untrusted input; run_info.yaml
                # is presumably pipeline-generated — confirm before changing.
                runinfo = yaml.load(fh)
    except IOError as e:
        self.app.log.warn(str(e))
        raise e
    fcdir = os.path.abspath(self.pargs.flowcell)
    (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
    ## Check modification time
    if modified_within_days(fcdir, self.pargs.mtime):
        fc_kw = dict(fc_date=fc_date, fc_name=fc_name)
        parser = FlowcellRunMetricsParser(fcdir)
        fcobj = FlowcellRunMetricsDocument(**fc_kw)
        fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
        fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
        fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False,
                                                          **fc_kw)
        fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
        fcobj["filter_metrics"] = parser.parse_filter_metrics(**fc_kw)
        fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(
            runinfo_csv=runinfo_csv, **fc_kw)
        fcobj["run_info_yaml"] = parser.parse_run_info_yaml(**fc_kw)
        qc_objects.append(fcobj)
    else:
        # Flowcell too old: skip without parsing the samplesheet entries
        return qc_objects
    qc_objects = self._parse_samplesheet(runinfo, qc_objects, fc_date,
                                         fc_name, fcdir, as_yaml=as_yaml)
    return qc_objects
def _collect_pre_casava_qc(self):
    """Collect flowcell QC objects for a pre-CASAVA run.

    Flowcell name/date come from RunInfo.xml and the flowcell position
    from runParameters; the position is pre-pended to the flowcell id as
    expected downstream. Sample information is read from the samplesheet
    csv, falling back to ``run_info.yaml`` when the csv parse is empty.

    :returns: list of qc objects (empty when the flowcell directory is
        older than the mtime cutoff)
    :raises IOError: if the fallback run_info.yaml cannot be opened
    """
    qc_objects = []
    as_yaml = False
    read_setup = None
    fcdir = os.path.abspath(self.pargs.flowcell)
    ## Check modification time
    if not modified_within_days(fcdir, self.pargs.mtime):
        return qc_objects
    # Get the fc_name, fc_date from RunInfo
    parser = FlowcellRunMetricsParser(fcdir)
    runinfo_xml = parser.parseRunInfo()
    runparams = parser.parseRunParameters()
    fc_date = runinfo_xml.get('Date',None)
    fc_name = runinfo_xml.get('Flowcell',None)
    fc_pos = runparams.get('FCPosition','')
    # Prefer the <flowcell-id>.csv samplesheet; fall back to SampleSheet.csv
    runinfo_csv = os.path.join(os.path.join(self._meta.root_path, self.pargs.flowcell), "{}.csv".format(fc_name))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(os.path.join(self._meta.root_path, self.pargs.flowcell), "SampleSheet.csv")
    runinfo = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv)
    # No usable samplesheet: fall back to the pre-CASAVA run_info.yaml
    if len(runinfo) == 0:
        runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell), "run_info.yaml")
        as_yaml = True
        try:
            with open(runinfo_yaml) as fh:
                # NOTE(review): yaml.load without an explicit Loader is
                # deprecated and unsafe on untrusted input; run_info.yaml
                # is presumably pipeline-generated — confirm before changing.
                runinfo = yaml.load(fh)
        except IOError as e:
            self.app.log.warn(str(e))
            raise e
    # Most of the code expects to have the flowcell position pre-pended to the flowcell id
    fc_kw = dict(fc_date = fc_date, fc_name="{}{}".format(fc_pos,fc_name))
    fcobj = FlowcellRunMetricsDocument(**fc_kw)
    fcobj["RunInfo"] = runinfo_xml
    fcobj["RunParameters"] = runparams
    fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
    fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
    fcobj["filter_metrics"] = parser.parse_filter_metrics(**fc_kw)
    fcobj["samplesheet_csv"] = runinfo
    fcobj["run_info_yaml"] = parser.parse_run_info_yaml(**fc_kw)
    # Read setup (e.g. reads/cycles) comes from RunInfo
    read_setup = fcobj["RunInfo"].get('Reads',[])
    fcobj["run_setup"] = self._run_setup(read_setup)
    qc_objects.append(fcobj)
    qc_objects = self._parse_samplesheet(runinfo, qc_objects, fc_date, "{}{}".format(fc_pos,fc_name), fcdir, as_yaml=as_yaml, setup=read_setup)
    return qc_objects
def collect_metrics(path, log):
    """Parse demultiplexing metrics for the run at *path*.

    Returns the demultiplex statistics dict augmented with the parsed
    RunInfo and the undemultiplexed barcode metrics, or an empty dict
    when no flowcell id can be read from RunInfo.xml.
    """
    run_parser = FlowcellRunMetricsParser(path)
    info = run_parser.parseRunInfo()
    flowcell_id = info.get('Flowcell', None)
    if flowcell_id is None:
        log.error("Could not parse flowcell id from RunInfo.xml")
        return {}
    # Insert a dummy character as the parse method expects a flowcell position
    metrics = run_parser.parse_demultiplex_stats_htm(flowcell_id)
    metrics['RunInfo'] = info
    # Attach the undemultiplexed index metrics as well
    metrics['Undemultiplexed'] = run_parser.parse_undemultiplexed_barcode_metrics(flowcell_id)
    return metrics
def collect_metrics(path):
    """Parse demultiplexing metrics for the run at *path*.

    Combines demultiplex statistics with the parsed RunInfo and the
    undemultiplexed barcode metrics; an empty dict is returned when the
    flowcell id is missing from RunInfo.xml.
    """
    fc_parser = FlowcellRunMetricsParser(path)
    info = fc_parser.parseRunInfo()
    fc = info.get('Flowcell', None)
    if fc is None:
        LOG.error("Could not parse flowcell id from RunInfo.xml")
        return {}
    # Insert a dummy character as the parse method expects a flowcell position
    stats = fc_parser.parse_demultiplex_stats_htm(fc)
    stats['RunInfo'] = info
    # Attach the undemultiplexed index metrics as well
    stats['Undemultiplexed'] = fc_parser.parse_undemultiplexed_barcode_metrics(fc)
    return stats
def parse_casava_directory(fc_dir):
    """Traverse a bcl2fastq v2.17 generated directory structure and return a dictionary.

    :param fc_dir: path to the flowcell directory
    :returns: dict with keys ``fc_dir``, ``fc_name`` (position-prefixed),
        ``fc_date``, ``basecall_stats_dir`` and ``projects``, where each
        project carries its sample directories, fastq files and samplesheet
    :raises AssertionError: if flowcell name or date cannot be parsed
    """
    projects = []
    fc_dir = os.path.abspath(fc_dir)
    parser = FlowcellRunMetricsParser(fc_dir)
    run_info = parser.parseRunInfo()
    runparams = parser.parseRunParameters()
    fc_name = run_info.get('Flowcell',None)
    fc_date = run_info.get('Date',None)
    fc_pos = runparams.get('FCPosition','')
    assert fc_name is not None and fc_date is not None, "Could not parse flowcell name and flowcell date"
    unaligned_dir_pattern = os.path.join(fc_dir,"{}".format(CASAVA_OUTPUT_DIR))
    basecall_stats_dir_pattern = os.path.join(unaligned_dir_pattern,"Basecall_Stats_*")
    basecall_stats_dir = [os.path.relpath(d,fc_dir) for d in glob.glob(basecall_stats_dir_pattern)]
    # bcl2fastq v2 project directories match <name>__<...>_<...>_<...>
    project_dir_pattern = os.path.join(unaligned_dir_pattern,"*__*_*_*")
    data=read_ssheet_csv(fc_dir)
    for project_dir in glob.glob(project_dir_pattern):
        project_samples = []
        sample_dir_pattern = os.path.join(project_dir,"Sample_*")
        for sample_dir in glob.glob(sample_dir_pattern):
            fastq_file_pattern = os.path.join(sample_dir,"*.fastq.gz")
            # NOTE(review): 'file' shadows the Python 2 builtin — harmless
            # here, but a rename would be cleaner.
            fastq_files = [os.path.basename(file) for file in glob.glob(fastq_file_pattern)]
            # Sample_<name> with '__' as the encoded '.' separator
            sample_name = os.path.basename(sample_dir).replace("Sample_","").replace('__','.')
            samplesheet_pattern = os.path.join(sample_dir, "SampleSheet.csv")
            # Generate a per-sample SampleSheet.csv when one is missing
            if not os.path.exists(samplesheet_pattern):
                write_samplesheet(samplesheet_pattern,data,sample_name,fc_name,fc_dir)
            # NOTE(review): assumes write_samplesheet created the file, so
            # samplesheet[0] below cannot IndexError — confirm.
            samplesheet = glob.glob(samplesheet_pattern)
            project_samples.append({'sample_dir': os.path.basename(sample_dir), 'sample_name': sample_name, 'files': fastq_files, 'samplesheet': os.path.basename(samplesheet[0])})
        project_name = os.path.basename(project_dir).replace('__','.')
        projects.append({'data_dir': os.path.relpath(os.path.dirname(project_dir),fc_dir), 'project_dir': os.path.basename(project_dir), 'project_name': project_name, 'samples': project_samples})
    return {'fc_dir': fc_dir, 'fc_name': '{}{}'.format(fc_pos,fc_name), 'fc_date': fc_date,
            'basecall_stats_dir': basecall_stats_dir, 'projects': projects}
def _collect_pre_casava_qc(self):
    """Build QC objects for a pre-CASAVA flowcell run.

    Reads sample information from the samplesheet csv when present,
    otherwise from run_info.yaml, and parses run metrics only when the
    flowcell directory was modified within the configured number of days.
    """
    qc_docs = []
    from_yaml = False
    flowcell_path = os.path.join(self._meta.root_path, self.pargs.flowcell)
    sheet_csv = os.path.join(flowcell_path, "{}.csv".format(fc_id(self.pargs.flowcell)))
    if not os.path.exists(sheet_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(sheet_csv))
        sheet_csv = os.path.join(flowcell_path, "SampleSheet.csv")
    sheet_yaml = os.path.join(os.path.abspath(self.pargs.flowcell), "run_info.yaml")
    try:
        if os.path.exists(sheet_csv):
            with open(sheet_csv) as fh:
                runinfo = list(csv.reader(fh))
        else:
            from_yaml = True
            with open(sheet_yaml) as fh:
                runinfo = yaml.load(fh)
    except IOError as e:
        self.app.log.warn(str(e))
        raise e
    fcdir = os.path.abspath(self.pargs.flowcell)
    fc_date, fc_name = fc_parts(self.pargs.flowcell)
    # Skip flowcells that have not been touched recently
    if not modified_within_days(fcdir, self.pargs.mtime):
        return qc_docs
    fc_kw = dict(fc_date=fc_date, fc_name=fc_name)
    metrics_parser = FlowcellRunMetricsParser(fcdir)
    doc = FlowcellRunMetricsDocument(**fc_kw)
    doc["RunInfo"] = metrics_parser.parseRunInfo(**fc_kw)
    doc["RunParameters"] = metrics_parser.parseRunParameters(**fc_kw)
    doc["illumina"] = metrics_parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
    doc["bc_metrics"] = metrics_parser.parse_bc_metrics(**fc_kw)
    doc["filter_metrics"] = metrics_parser.parse_filter_metrics(**fc_kw)
    doc["samplesheet_csv"] = metrics_parser.parse_samplesheet_csv(runinfo_csv=sheet_csv, **fc_kw)
    doc["run_info_yaml"] = metrics_parser.parse_run_info_yaml(**fc_kw)
    qc_docs.append(doc)
    return self._parse_samplesheet(runinfo, qc_docs, fc_date, fc_name, fcdir, as_yaml=from_yaml)
def parse_casava_directory(fc_dir):
    """Traverse a CASAVA 1.8+ generated directory structure and return a dictionary.

    :param fc_dir: path to the flowcell directory
    :returns: dict with keys ``fc_dir``, ``fc_name`` (position-prefixed),
        ``fc_date``, ``basecall_stats_dir`` and ``projects``, where each
        project carries its sample directories, fastq files and samplesheet
    :raises AssertionError: if flowcell name/date cannot be parsed or a
        sample samplesheet cannot be unambiguously located
    """
    projects = []
    fc_dir = os.path.abspath(fc_dir)
    parser = FlowcellRunMetricsParser(fc_dir)
    run_info = parser.parseRunInfo()
    runparams = parser.parseRunParameters()
    fc_name = run_info.get('Flowcell',None)
    fc_date = run_info.get('Date',None)
    fc_pos = runparams.get('FCPosition','')
    assert fc_name is not None and fc_date is not None, "Could not parse flowcell name and flowcell date"
    unaligned_dir_pattern = os.path.join(fc_dir,"{}*".format(CASAVA_OUTPUT_DIR))
    basecall_stats_dir_pattern = os.path.join(unaligned_dir_pattern,"Basecall_Stats_*")
    basecall_stats_dir = [os.path.relpath(d,fc_dir) for d in glob.glob(basecall_stats_dir_pattern)]
    # CASAVA 1.8 layout: Unaligned*/Project_*/Sample_*
    project_dir_pattern = os.path.join(unaligned_dir_pattern,"Project_*")
    for project_dir in glob.glob(project_dir_pattern):
        project_samples = []
        sample_dir_pattern = os.path.join(project_dir,"Sample_*")
        for sample_dir in glob.glob(sample_dir_pattern):
            fastq_file_pattern = os.path.join(sample_dir,"*.fastq.gz")
            samplesheet_pattern = os.path.join(sample_dir,"*.csv")
            # NOTE(review): 'file' shadows the Python 2 builtin — harmless
            # here, but a rename would be cleaner.
            fastq_files = [os.path.basename(file) for file in glob.glob(fastq_file_pattern)]
            # Exactly one csv per sample directory is expected
            samplesheet = glob.glob(samplesheet_pattern)
            assert len(samplesheet) == 1, "ERROR: Could not unambiguously locate samplesheet in %s" % sample_dir
            # Sample_<name> with '__' as the encoded '.' separator
            sample_name = os.path.basename(sample_dir).replace("Sample_","").replace('__','.')
            project_samples.append({'sample_dir': os.path.basename(sample_dir), 'sample_name': sample_name, 'files': fastq_files, 'samplesheet': os.path.basename(samplesheet[0])})
        project_name = os.path.basename(project_dir).replace("Project_","").replace('__','.')
        projects.append({'data_dir': os.path.relpath(os.path.dirname(project_dir),fc_dir), 'project_dir': os.path.basename(project_dir), 'project_name': project_name, 'samples': project_samples})
    return {'fc_dir': fc_dir, 'fc_name': '{}{}'.format(fc_pos,fc_name), 'fc_date': fc_date,
            'basecall_stats_dir': basecall_stats_dir, 'projects': projects}
def _collect_casava_qc(self):
    """Collect flowcell-level QC objects for a CASAVA-demultiplexed run.

    Flowcell name/date are taken from RunInfo.xml and the flowcell
    position from runParameters; the position is pre-pended to the
    flowcell id because downstream code expects that form.

    :returns: list of qc objects produced by ``self._parse_samplesheet``
    """
    qc_objects = []
    # Defaults used when the mtime check below fails and no run metrics
    # document is built.
    read_setup = None
    demux_stats = None
    fcdir = os.path.join(os.path.abspath(self._meta.root_path), self.pargs.flowcell)
    # Get the fc_name, fc_date from RunInfo
    parser = FlowcellRunMetricsParser(fcdir)
    runinfo_xml = parser.parseRunInfo()
    runparams = parser.parseRunParameters()
    fc_date = runinfo_xml.get('Date',None)
    fc_name = runinfo_xml.get('Flowcell',None)
    fc_pos = runparams.get('FCPosition','')
    # Prefer the <flowcell-id>.csv samplesheet; fall back to SampleSheet.csv
    runinfo_csv = os.path.join(os.path.join(self._meta.root_path, self.pargs.flowcell), "{}.csv".format(fc_name))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(os.path.join(self._meta.root_path, self.pargs.flowcell), "SampleSheet.csv")
    runinfo = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv)
    # Only collect run metrics for recently modified flowcell directories
    if modified_within_days(fcdir, self.pargs.mtime):
        # Most of the code expects to have the flowcell position pre-pended to the flowcell id
        fc_kw = dict(fc_date = fc_date, fc_name="{}{}".format(fc_pos,fc_name))
        fcobj = FlowcellRunMetricsDocument(**fc_kw)
        fcobj["RunInfo"] = runinfo_xml
        fcobj["RunParameters"] = runparams
        fcobj["DemultiplexConfig"] = parser.parseDemultiplexConfig(**fc_kw)
        fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
        fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
        fcobj["undemultiplexed_barcodes"] = parser.parse_undemultiplexed_barcode_metrics(**fc_kw)
        fcobj["illumina"].update({"Demultiplex_Stats" : parser.parse_demultiplex_stats_htm(**fc_kw)})
        fcobj["samplesheet_csv"] = runinfo
        # Read setup (e.g. reads/cycles) comes from RunInfo
        read_setup = fcobj["RunInfo"].get('Reads',[])
        fcobj["run_setup"] = self._run_setup(read_setup)
        demux_stats = fcobj["illumina"]["Demultiplex_Stats"]
        qc_objects.append(fcobj)
    qc_objects = self._parse_samplesheet(runinfo, qc_objects, fc_date, "{}{}".format(fc_pos,fc_name), fcdir, demultiplex_stats=demux_stats, setup=read_setup)
    return qc_objects