def _collect_casava_qc(self):
    """Collect qc objects for a casava-based flowcell run.

    Locates the samplesheet as ``<root>/<flowcell>/<fc_id>.csv``, falling
    back to ``SampleSheet.csv``. If the flowcell directory was modified
    within ``self.pargs.mtime`` days, run metrics are parsed into a
    FlowcellRunMetricsDocument; sample-level objects are then added by
    ``self._parse_samplesheet``.

    :returns: list of qc objects (possibly empty)
    :raises IOError: if the samplesheet cannot be opened
    """
    qc_objects = []
    runinfo_csv = os.path.join(self._meta.root_path, self.pargs.flowcell,
                               "{}.csv".format(fc_id(self.pargs.flowcell)))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(self._meta.root_path, self.pargs.flowcell,
                                   "SampleSheet.csv")
    try:
        with open(runinfo_csv) as fh:
            runinfo = list(csv.reader(fh))
    except IOError as e:
        self.app.log.warn(str(e))
        raise  # bare raise preserves the original traceback
    fcdir = os.path.join(os.path.abspath(self._meta.root_path), self.pargs.flowcell)
    (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
    ## Check modification time: stale flowcells get no flowcell document
    demux_stats = None
    if modified_within_days(fcdir, self.pargs.mtime):
        fc_kw = dict(fc_date=fc_date, fc_name=fc_name)
        parser = FlowcellRunMetricsParser(fcdir)
        fcobj = FlowcellRunMetricsDocument(fc_date, fc_name)
        fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
        fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
        fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
        fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
        fcobj["undemultiplexed_barcodes"] = parser.parse_undemultiplexed_barcode_metrics(**fc_kw)
        fcobj["illumina"].update({"Demultiplex_Stats" : parser.parse_demultiplex_stats_htm(**fc_kw)})
        fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv, **fc_kw)
        demux_stats = fcobj["illumina"]["Demultiplex_Stats"]
        qc_objects.append(fcobj)
    qc_objects = self._parse_samplesheet(runinfo, qc_objects, fc_date, fc_name,
                                         fcdir, demultiplex_stats=demux_stats)
    return qc_objects
def _collect_casava_qc(self):
    """Collect qc objects for a casava-based flowcell run.

    Locates the samplesheet as ``<root>/<flowcell>/<fc_id>.csv``, falling
    back to ``SampleSheet.csv``. If the flowcell directory was modified
    within ``self.pargs.mtime`` days, run metrics are parsed into a
    FlowcellRunMetricsDocument; sample-level objects are then added by
    ``self._parse_samplesheet``.

    :returns: list of qc objects (possibly empty)
    :raises IOError: if the samplesheet cannot be opened
    """
    qc_objects = []
    runinfo_csv = os.path.join(self._meta.root_path, self.pargs.flowcell,
                               "{}.csv".format(fc_id(self.pargs.flowcell)))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(self._meta.root_path, self.pargs.flowcell,
                                   "SampleSheet.csv")
    try:
        with open(runinfo_csv) as fh:
            runinfo = list(csv.reader(fh))
    except IOError as e:
        self.app.log.warn(str(e))
        raise  # bare raise preserves the original traceback
    fcdir = os.path.join(os.path.abspath(self._meta.root_path), self.pargs.flowcell)
    (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
    ## Check modification time: stale flowcells get no flowcell document.
    ## BUGFIX: the original referenced fcobj["illumina"]["Demultiplex_Stats"]
    ## unconditionally in the _parse_samplesheet call, raising NameError when
    ## the flowcell was not modified within mtime days; track it in a local
    ## that defaults to None instead.
    demux_stats = None
    if modified_within_days(fcdir, self.pargs.mtime):
        fc_kw = dict(fc_date=fc_date, fc_name=fc_name)
        parser = FlowcellRunMetricsParser(fcdir)
        fcobj = FlowcellRunMetricsDocument(fc_date, fc_name)
        fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
        fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
        fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
        fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
        fcobj["undemultiplexed_barcodes"] = parser.parse_undemultiplexed_barcode_metrics(**fc_kw)
        fcobj["illumina"].update({"Demultiplex_Stats" : parser.parse_demultiplex_stats_htm(**fc_kw)})
        fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv, **fc_kw)
        demux_stats = fcobj["illumina"]["Demultiplex_Stats"]
        qc_objects.append(fcobj)
    qc_objects = self._parse_samplesheet(runinfo, qc_objects, fc_date, fc_name,
                                         fcdir, demultiplex_stats=demux_stats)
    return qc_objects
def upload_qc(self):
    """Upload flowcell and sample qc objects to the statusdb databases.

    Chooses a pre-casava or casava collection strategy from the flowcell
    date, then saves each collected object through the matching database
    connection. Sample documents get their project sample name resolved
    via the project summary connection before saving.
    """
    if not self._check_pargs(['flowcell']):
        return
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    if not validate_fc_directory_format(self.pargs.flowcell):
        self.app.log.warn(
            "Path '{}' does not conform to bcbio flowcell directory format; aborting"
            .format(self.pargs.flowcell))
        return
    # NOTE: the original also computed runinfo_csv/runinfo_yaml here, but
    # never read them; the _collect_* helpers derive their own paths.
    (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
    # Flowcells dated before 2012-08-15 use the pre-casava layout
    if int(fc_date) < 120815:
        self.log.info("Assuming pre-casava based file structure for {}".format(
            fc_id(self.pargs.flowcell)))
        qc_objects = self._collect_pre_casava_qc()
    else:
        self.log.info("Assuming casava based file structure for {}".format(
            fc_id(self.pargs.flowcell)))
        qc_objects = self._collect_casava_qc()
    if len(qc_objects) == 0:
        self.log.info("No out-of-date qc objects for {}".format(
            fc_id(self.pargs.flowcell)))
        return
    self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))
    s_con = SampleRunMetricsConnection(
        dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
    fc_con = FlowcellRunMetricsConnection(
        dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
    p_con = ProjectSummaryConnection(
        dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
    for obj in qc_objects:
        if self.app.pargs.debug:
            self.log.debug("{}: {}".format(str(obj), obj["_id"]))
        if isinstance(obj, FlowcellRunMetricsDocument):
            dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
        if isinstance(obj, SampleRunMetricsDocument):
            project_sample = p_con.get_project_sample(
                obj.get("sample_prj", None), obj.get("barcode_name", None),
                self.pargs.extensive_matching)
            if project_sample:
                obj["project_sample_name"] = project_sample['sample_name']
            # BUGFIX: message had a stray embedded newline ("Saving object \n{}");
            # use the same "Saving object {}" form as the flowcell branch
            dry("Saving object {}".format(repr(obj)), s_con.save(obj))
def _collect_pre_casava_qc(self):
    """Collect qc objects for a pre-casava flowcell run.

    Reads the samplesheet (``<fc_id>.csv``, falling back to
    ``SampleSheet.csv``) or, when no csv exists, ``run_info.yaml``.
    If the flowcell directory was modified within ``self.pargs.mtime``
    days, flowcell metrics are parsed; otherwise an empty list is
    returned without calling ``_parse_samplesheet``.

    :returns: list of qc objects (empty for out-of-date flowcells)
    :raises IOError: if neither samplesheet nor run_info.yaml can be read
    """
    qc_objects = []
    as_yaml = False
    runinfo_csv = os.path.join(self._meta.root_path, self.pargs.flowcell,
                               "{}.csv".format(fc_id(self.pargs.flowcell)))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(self._meta.root_path, self.pargs.flowcell,
                                   "SampleSheet.csv")
    runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell), "run_info.yaml")
    try:
        if os.path.exists(runinfo_csv):
            with open(runinfo_csv) as fh:
                runinfo = list(csv.reader(fh))
        else:
            as_yaml = True
            with open(runinfo_yaml) as fh:
                # NOTE(review): yaml.load without an explicit Loader can
                # construct arbitrary Python objects; switch to
                # yaml.safe_load if run_info.yaml holds only plain data.
                runinfo = yaml.load(fh)
    except IOError as e:
        self.app.log.warn(str(e))
        raise  # bare raise preserves the original traceback
    fcdir = os.path.abspath(self.pargs.flowcell)
    (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
    ## Check modification time; out-of-date flowcells yield nothing
    if not modified_within_days(fcdir, self.pargs.mtime):
        return qc_objects
    fc_kw = dict(fc_date=fc_date, fc_name=fc_name)
    parser = FlowcellRunMetricsParser(fcdir)
    fcobj = FlowcellRunMetricsDocument(**fc_kw)
    fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
    fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
    fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
    fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
    fcobj["filter_metrics"] = parser.parse_filter_metrics(**fc_kw)
    fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv, **fc_kw)
    fcobj["run_info_yaml"] = parser.parse_run_info_yaml(**fc_kw)
    qc_objects.append(fcobj)
    qc_objects = self._parse_samplesheet(runinfo, qc_objects, fc_date, fc_name,
                                         fcdir, as_yaml=as_yaml)
    return qc_objects
def upload_qc(self):
    """Upload flowcell and sample qc objects to the statusdb databases.

    Chooses a pre-casava or casava collection strategy from the flowcell
    date, then saves each collected object through the matching database
    connection. Sample documents get their project sample name resolved
    via the project summary connection before saving.
    """
    if not self._check_pargs(["flowcell"]):
        return
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return
    if not validate_fc_directory_format(self.pargs.flowcell):
        self.app.log.warn(
            "Path '{}' does not conform to bcbio flowcell directory format; aborting".format(self.pargs.flowcell)
        )
        return
    # NOTE: the original also computed runinfo_csv/runinfo_yaml here, but
    # never read them; the _collect_* helpers derive their own paths.
    (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
    # Flowcells dated before 2012-08-15 use the pre-casava layout
    if int(fc_date) < 120815:
        self.log.info("Assuming pre-casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
        qc_objects = self._collect_pre_casava_qc()
    else:
        self.log.info("Assuming casava based file structure for {}".format(fc_id(self.pargs.flowcell)))
        qc_objects = self._collect_casava_qc()
    if len(qc_objects) == 0:
        self.log.info("No out-of-date qc objects for {}".format(fc_id(self.pargs.flowcell)))
        return
    self.log.info("Retrieved {} updated qc objects".format(len(qc_objects)))
    s_con = SampleRunMetricsConnection(dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
    fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
    p_con = ProjectSummaryConnection(dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
    for obj in qc_objects:
        if self.app.pargs.debug:
            self.log.debug("{}: {}".format(str(obj), obj["_id"]))
        if isinstance(obj, FlowcellRunMetricsDocument):
            dry("Saving object {}".format(repr(obj)), fc_con.save(obj))
        if isinstance(obj, SampleRunMetricsDocument):
            project_sample = p_con.get_project_sample(
                obj.get("sample_prj", None), obj.get("barcode_name", None), self.pargs.extensive_matching
            )
            if project_sample:
                obj["project_sample_name"] = project_sample["sample_name"]
            dry("Saving object {}".format(repr(obj)), s_con.save(obj))
def _collect_pre_casava_qc(self):
    """Collect qc objects for a pre-casava flowcell run.

    Reads the samplesheet (``<fc_id>.csv``, falling back to
    ``SampleSheet.csv``) or, when no csv exists, ``run_info.yaml``.
    If the flowcell directory was modified within ``self.pargs.mtime``
    days, flowcell metrics are parsed; otherwise an empty list is
    returned without calling ``_parse_samplesheet``.

    :returns: list of qc objects (empty for out-of-date flowcells)
    :raises IOError: if neither samplesheet nor run_info.yaml can be read
    """
    qc_objects = []
    as_yaml = False
    runinfo_csv = os.path.join(self._meta.root_path, self.pargs.flowcell,
                               "{}.csv".format(fc_id(self.pargs.flowcell)))
    if not os.path.exists(runinfo_csv):
        LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
        runinfo_csv = os.path.join(self._meta.root_path, self.pargs.flowcell, "SampleSheet.csv")
    runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell), "run_info.yaml")
    try:
        if os.path.exists(runinfo_csv):
            with open(runinfo_csv) as fh:
                runinfo = list(csv.reader(fh))
        else:
            as_yaml = True
            with open(runinfo_yaml) as fh:
                # NOTE(review): yaml.load without an explicit Loader can
                # construct arbitrary Python objects; switch to
                # yaml.safe_load if run_info.yaml holds only plain data.
                runinfo = yaml.load(fh)
    except IOError as e:
        self.app.log.warn(str(e))
        raise  # bare raise preserves the original traceback
    fcdir = os.path.abspath(self.pargs.flowcell)
    (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
    ## Check modification time; out-of-date flowcells yield nothing
    if not modified_within_days(fcdir, self.pargs.mtime):
        return qc_objects
    fc_kw = dict(fc_date=fc_date, fc_name=fc_name)
    parser = FlowcellRunMetricsParser(fcdir)
    fcobj = FlowcellRunMetricsDocument(**fc_kw)
    fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
    fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
    fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
    fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
    fcobj["filter_metrics"] = parser.parse_filter_metrics(**fc_kw)
    fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv, **fc_kw)
    fcobj["run_info_yaml"] = parser.parse_run_info_yaml(**fc_kw)
    qc_objects.append(fcobj)
    qc_objects = self._parse_samplesheet(runinfo, qc_objects, fc_date, fc_name, fcdir, as_yaml=as_yaml)
    return qc_objects
def _make_casava_archive_files(fc, ssname, prefix, startiter=1, nseqout=1000):
    """Create a mock casava flowcell archive under ``ARCHIVE/<fc>``.

    Writes the samplesheet, RunInfo.xml and runParameters.xml, creates the
    ``Unaligned`` directory structure for each samplesheet row, and fills
    per-sample R1/R2 fastq.gz files with reads taken from
    ``<prefix>_1.fastq`` / ``<prefix>_2.fastq``.

    :param fc: flowcell directory name (``<date>_<instrument>_..._<fc_id>``)
    :param ssname: key into the SAMPLESHEETS mapping
    :param prefix: path prefix of the source fastq pair
    :param startiter: first sequence index passed to _write_sample_fastq
    :param nseqout: number of sequences to write per sample
    """
    fc_dir = os.path.join(ARCHIVE, fc)
    if not os.path.exists(fc_dir):
        safe_makedir(fc_dir)
    with open(os.path.join(fc_dir, "{}.csv".format(ssname)), "w") as fh:
        fh.write(SAMPLESHEETS[ssname])
    # Both templates render from the same flowcell metadata
    render_kw = {'flowcell': os.path.basename(fc), 'fc_id': fc_id(fc),
                 'date': fc_parts(fc)[0], 'instrument': fc.split("_")[1]}
    with open(os.path.join(fc_dir, "RunInfo.xml"), "w") as fh:
        fh.write(RUNINFO.render(**render_kw))
    with open(os.path.join(fc_dir, "runParameters.xml"), "w") as fh:
        fh.write(RUNPARAMETERS.render(**render_kw))
    outf1 = []
    outf2 = []
    basecall_stats_dir = os.path.join(fc_dir, "Unaligned", "Basecall_Stats_{}".format(ssname))
    if not os.path.exists(basecall_stats_dir):
        safe_makedir(basecall_stats_dir)
    for d in [os.path.join(basecall_stats_dir, x) for x in ["css", "Plots"]]:
        if not os.path.exists(d):
            safe_makedir(d)
    header = None
    for row in SAMPLESHEETS[ssname].split("\n"):
        vals = row.split(",")
        if vals[0] == "FCID":
            header = row
            continue
        # BUGFIX: ''.split(',') == [''] so len(vals) is never 0; the old
        # `len(vals) == 0` guard was dead and blank lines crashed at
        # vals[5]. Skip any row too short to index.
        if len(vals) < 6:
            continue
        outdir = os.path.join(fc_dir, "Unaligned",
                              "Project_{}".format(vals[5]), "Sample_{}".format(vals[2]))
        if not os.path.exists(outdir):
            safe_makedir(outdir)
        with open(os.path.join(outdir, "SampleSheet.csv"), "w") as fh:
            LOG.info("Writing to {}".format(os.path.join(outdir, "SampleSheet.csv")))
            fh.write("{}\n".format(header))
            fh.write("{}\n".format(row))
        r1 = os.path.join(outdir, "{}_{}_L00{}_R1_001.fastq.gz".format(vals[2], vals[4], vals[1]))
        r2 = os.path.join(outdir, "{}_{}_L00{}_R2_001.fastq.gz".format(vals[2], vals[4], vals[1]))
        if os.path.exists(r1):
            # Deliberate: abort the whole generation if output already exists
            LOG.info("{} already exists: if you want to rerun file generation remove {}".format(r1, r1))
            return
        outf1.append(r1)
        outf2.append(r2)
    ## Write sequences
    with open("{}_1.fastq".format(prefix), "r") as fh:
        _write_sample_fastq(fh, outf1, startiter=startiter, nseqout=nseqout)
    with open("{}_2.fastq".format(prefix), "r") as fh:
        _write_sample_fastq(fh, outf2, startiter=startiter, nseqout=nseqout)