Exemple #1
0
    def _collect_casava_qc(self):
        """Collect flowcell- and sample-level QC metrics for a CASAVA run.

        The samplesheet ``<flowcell>/<fc_id>.csv`` is read first. If the
        flowcell directory below the analysis root passes the
        ``modified_within_days`` check, a ``FlowcellRunMetrics`` object is
        collected. After that, one ``SampleRunMetrics`` object is collected
        for each samplesheet row that passes the optional project/sample
        filters and whose sample flowcell directory exists and passes the
        same modification check.

        Returns:
            List of the collected metrics objects.

        Raises:
            IOError: if the samplesheet cannot be opened, or if a sample's
                ``<SampleID>-bcbb-config.yaml`` file does not exist.
        """
        collected = []
        samplesheet = os.path.join(
            os.path.abspath(self.pargs.flowcell),
            "{}.csv".format(self._fc_id()))
        try:
            with open(samplesheet) as handle:
                rows = list(csv.reader(handle))
        except IOError as e:
            self.app.log.warn(str(e))
            raise e
        fcdir = os.path.join(
            os.path.abspath(self.pargs.analysis), self.pargs.flowcell)
        fc_date, fc_name = self._fc_parts()

        # Flowcell-level metrics are only gathered when the mtime check passes
        if modified_within_days(fcdir, self.pargs.mtime):
            flowcell_metrics = FlowcellRunMetrics(
                path=fcdir, fc_date=fc_date, fc_name=fc_name)
            flowcell_metrics.parse_illumina_metrics(fullRTA=False)
            flowcell_metrics.parse_bc_metrics()
            flowcell_metrics.parse_demultiplex_stats_htm()
            flowcell_metrics.parse_samplesheet_csv()
            collected.append(flowcell_metrics)

        # Row 0 supplies the column names; every later row is one sample
        for row in rows[1:]:
            d = dict(zip(rows[0], row))
            project_filter = self.app.pargs.project
            if project_filter and project_filter != d['SampleProject']:
                continue
            sample_filter = self.app.pargs.sample
            if sample_filter and sample_filter != d['SampleID']:
                continue

            sampledir = os.path.join(
                os.path.abspath(self.pargs.analysis),
                d['SampleProject'].replace("__", "."),
                d['SampleID'])
            if not os.path.exists(sampledir):
                self.app.log.warn("No such sample directory: {}".format(sampledir))
                continue
            sample_fcdir = os.path.join(sampledir, self._fc_fullname())
            if not os.path.exists(sample_fcdir):
                self.app.log.warn("No such sample flowcell directory: {}".format(sample_fcdir))
                continue
            if not modified_within_days(sample_fcdir, self.pargs.mtime):
                continue

            config_path = os.path.join(
                sample_fcdir, "{}-bcbb-config.yaml".format(d['SampleID']))
            if not os.path.exists(config_path):
                self.app.log.warn("No such yaml file for sample: {}".format(config_path))
                raise IOError(2, "No such yaml file for sample: {}".format(config_path), config_path)
            with open(config_path) as handle:
                config = yaml.load(handle)
            lane_details = config['details'][0]
            if not lane_details.get("multiplex", None):
                self.app.log.warn("No multiplex information for sample {}".format(d['SampleID']))
                continue

            # Only the first multiplex entry of the sample config is used
            obj = SampleRunMetrics(
                path=sample_fcdir,
                flowcell=fc_name,
                date=fc_date,
                lane=d['Lane'],
                barcode_name=d['SampleID'],
                sample_prj=d['SampleProject'].replace("__", "."),
                barcode_id=lane_details['multiplex'][0]['barcode_id'],
                sequence=lane_details['multiplex'][0]['sequence'])
            obj.read_picard_metrics()
            obj.parse_fastq_screen()
            obj.parse_bc_metrics()
            obj.read_fastqc_metrics()
            collected.append(obj)
        return collected
Exemple #2
0
 def _parse_samplesheet(self, runinfo, qc_objects, fc_date, fc_name, fcdir, as_yaml=False, demultiplex_stats=None, setup=None):
     """Parse samplesheet information and populate sample run metrics object"""
     if as_yaml:
         for info in runinfo:
             if not info.get("multiplex", None):
                 self.app.log.warn("No multiplex information for lane {}".format(info.get("lane")))
                 sample = {}
                 sample.update({k: info.get(k, None) for k in ('analysis', 'description', 'flowcell_id', 'lane')})
                 sample_kw = dict(path=fcdir, flowcell=fc_name, date=fc_date, lane=sample.get('lane', None), barcode_name=sample.get('name', None), sample_prj=sample.get('sample_prj', None),
                                  barcode_id=sample.get('barcode_id', None), sequence=sample.get('sequence', "NoIndex"))
             for sample in info["multiplex"]:
                 sample.update({k: info.get(k, None) for k in ('analysis', 'description', 'flowcell_id', 'lane')})
                 sample_kw = dict(flowcell=fc_name, date=fc_date, lane=sample['lane'], barcode_name=sample['name'], sample_prj=sample.get('sample_prj', None),
                                  barcode_id=sample['barcode_id'], sequence=sample.get('sequence', "NoIndex"))
             
                 parser = SampleRunMetricsParser(fcdir)
                 obj = SampleRunMetricsDocument(**sample_kw)
                 obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
                 obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
                 obj["bc_count"] = parser.get_bc_count(run_setup=setup, **sample_kw)
                 obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
                 obj["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sample_kw)
                 qc_objects.append(obj)
     else:
         for d in runinfo:
             LOG.debug("Getting information for sample defined by {}".format(d.values()))
             if self.app.pargs.project_name and self.app.pargs.project_name != d['SampleProject']:
                 continue
             if self.app.pargs.sample and self.app.pargs.sample != d['SampleID']:
                 continue
             
             sampledir = os.path.join(os.path.abspath(self._meta.production_root_path), d['SampleProject'].replace("__", "."), d['SampleID'])
             if not os.path.exists(sampledir):
                 self.app.log.warn("No such sample directory: {}".format(sampledir))
                 continue
             sample_fcdir = os.path.join(sampledir, fc_fullname(self.pargs.flowcell))
             if not os.path.exists(sample_fcdir):
                 self.app.log.warn("No such sample flowcell directory: {}".format(sample_fcdir))
                 continue
             if not modified_within_days(sample_fcdir, self.pargs.mtime):
                 continue
             runinfo_yaml_file = os.path.join(sample_fcdir, "{}-bcbb-config.yaml".format(d['SampleID']))
             if not os.path.exists(runinfo_yaml_file):
                 self.app.log.warn("No such yaml file for sample: {}".format(runinfo_yaml_file))
                 raise IOError(2, "No such yaml file for sample: {}".format(runinfo_yaml_file), runinfo_yaml_file)
             with open(runinfo_yaml_file) as fh:
                 runinfo_yaml = yaml.load(fh)
             if not runinfo_yaml['details'][0].get("multiplex", None):
                 self.app.log.warn("No multiplex information for sample {}".format(d['SampleID']))
                 runinfo_yaml['details'][0]['multiplex'] = [{'barcode_id': 0, 'sequence': 'NoIndex'}]
             sample_kw = dict(flowcell=fc_name, date=fc_date, lane=d['Lane'], barcode_name=d['SampleID'], sample_prj=d['SampleProject'].replace("__", "."), barcode_id=runinfo_yaml['details'][0]['multiplex'][0]['barcode_id'], sequence=runinfo_yaml['details'][0]['multiplex'][0]['sequence'])
             parser = SampleRunMetricsParser(sample_fcdir)
             obj = SampleRunMetricsDocument(**sample_kw)
             obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
             obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
             obj["bc_count"] = parser.get_bc_count(demultiplex_stats=demultiplex_stats, run_setup=setup, **sample_kw)
             obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
             obj["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sample_kw)
             qc_objects.append(obj)
     return qc_objects
Exemple #3
0
 def _parse_samplesheet(self, runinfo, qc_objects, fc_date, fc_name, fcdir, as_yaml=False, demultiplex_stats=None, setup=None):
     """Parse samplesheet information and populate sample run metrics object"""
     if as_yaml:
         for info in runinfo:
             if not info.get("multiplex", None):
                 self.app.log.warn("No multiplex information for lane {}".format(info.get("lane")))
                 sample = {}
                 sample.update({k: info.get(k, None) for k in ('analysis', 'description', 'flowcell_id', 'lane')})
                 sample_kw = dict(path=fcdir, flowcell=fc_name, date=fc_date, lane=sample.get('lane', None), barcode_name=sample.get('name', None), sample_prj=sample.get('sample_prj', None),
                                  barcode_id=sample.get('barcode_id', None), sequence=sample.get('sequence', "NoIndex"))
             for sample in info["multiplex"]:
                 sample.update({k: info.get(k, None) for k in ('analysis', 'description', 'flowcell_id', 'lane')})
                 sample_kw = dict(flowcell=fc_name, date=fc_date, lane=sample['lane'], barcode_name=sample['name'], sample_prj=sample.get('sample_prj', None),
                                  barcode_id=sample['barcode_id'], sequence=sample.get('sequence', "NoIndex"))
             
                 parser = SampleRunMetricsParser(fcdir)
                 obj = SampleRunMetricsDocument(**sample_kw)
                 obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
                 obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
                 obj["bc_count"] = parser.get_bc_count(run_setup=setup, **sample_kw)
                 obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
                 obj["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sample_kw)
                 qc_objects.append(obj)
     else:
         for d in runinfo:
             LOG.debug("Getting information for sample defined by {}".format(d.values()))
             if self.app.pargs.project_name and self.app.pargs.project_name != d['SampleProject']:
                 continue
             if self.app.pargs.sample and self.app.pargs.sample != d['SampleID']:
                 continue
             
             sampledir = os.path.join(os.path.abspath(self._meta.production_root_path), d['SampleProject'].replace("__", "."), d['SampleID'])
             if not os.path.exists(sampledir):
                 self.app.log.warn("No such sample directory: {}".format(sampledir))
                 continue
             sample_fcdir = os.path.join(sampledir, fc_fullname(self.pargs.flowcell))
             if not os.path.exists(sample_fcdir):
                 self.app.log.warn("No such sample flowcell directory: {}".format(sample_fcdir))
                 continue
             if not modified_within_days(sample_fcdir, self.pargs.mtime):
                 continue
             runinfo_yaml_file = os.path.join(sample_fcdir, "{}-bcbb-config.yaml".format(d['SampleID']))
             if not os.path.exists(runinfo_yaml_file):
                 self.app.log.warn("No such yaml file for sample: {}".format(runinfo_yaml_file))
                 raise IOError(2, "No such yaml file for sample: {}".format(runinfo_yaml_file), runinfo_yaml_file)
             with open(runinfo_yaml_file) as fh:
                 runinfo_yaml = yaml.load(fh)
             if not runinfo_yaml['details'][0].get("multiplex", None):
                 self.app.log.warn("No multiplex information for sample {}".format(d['SampleID']))
                 runinfo_yaml['details'][0]['multiplex'] = [{'barcode_id': 0, 'sequence': 'NoIndex'}]
             sample_kw = dict(flowcell=fc_name, date=fc_date, lane=d['Lane'], barcode_name=d['SampleID'], sample_prj=d['SampleProject'].replace("__", "."), barcode_id=runinfo_yaml['details'][0]['multiplex'][0]['barcode_id'], sequence=runinfo_yaml['details'][0]['multiplex'][0]['sequence'])
             parser = SampleRunMetricsParser(sample_fcdir)
             obj = SampleRunMetricsDocument(**sample_kw)
             obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
             obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
             obj["bc_count"] = parser.get_bc_count(demultiplex_stats=demultiplex_stats, run_setup=setup, **sample_kw)
             obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
             obj["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**sample_kw)
             qc_objects.append(obj)
     return qc_objects
Exemple #4
0
 def _collect_casava_qc(self):
     """Collect flowcell and sample QC documents for a CASAVA flowcell.

     Reads the samplesheet ``<fc_id>.csv`` from the flowcell directory
     under the root path, falling back to ``SampleSheet.csv`` when that
     file does not exist. A flowcell-level document is built only when the
     flowcell directory passes the ``modified_within_days`` check; sample
     documents are delegated to ``_parse_samplesheet`` in either case.

     Returns:
         The list returned by ``_parse_samplesheet``.

     Raises:
         IOError: if the chosen samplesheet cannot be opened.
     """
     collected = []
     flowcell_dir = os.path.join(self._meta.root_path, self.pargs.flowcell)
     samplesheet = os.path.join(
         flowcell_dir, "{}.csv".format(fc_id(self.pargs.flowcell)))
     if not os.path.exists(samplesheet):
         LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(samplesheet))
         samplesheet = os.path.join(flowcell_dir, "SampleSheet.csv")
     try:
         with open(samplesheet) as handle:
             runinfo = list(csv.reader(handle))
     except IOError as e:
         self.app.log.warn(str(e))
         raise e

     fcdir = os.path.join(
         os.path.abspath(self._meta.root_path), self.pargs.flowcell)
     fc_date, fc_name = fc_parts(self.pargs.flowcell)

     # Stays None unless the flowcell-level document is built below
     demux_stats = None
     if modified_within_days(fcdir, self.pargs.mtime):
         fc_kw = {"fc_date": fc_date, "fc_name": fc_name}
         parser = FlowcellRunMetricsParser(fcdir)
         fcobj = FlowcellRunMetricsDocument(fc_date, fc_name)
         fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
         fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
         fcobj["illumina"] = parser.parse_illumina_metrics(
             fullRTA=False, **fc_kw)
         fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
         fcobj["undemultiplexed_barcodes"] = (
             parser.parse_undemultiplexed_barcode_metrics(**fc_kw))
         fcobj["illumina"].update(
             {"Demultiplex_Stats": parser.parse_demultiplex_stats_htm(**fc_kw)})
         fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(
             runinfo_csv=samplesheet, **fc_kw)
         demux_stats = fcobj["illumina"]["Demultiplex_Stats"]
         collected.append(fcobj)

     return self._parse_samplesheet(
         runinfo, collected, fc_date, fc_name, fcdir,
         demultiplex_stats=demux_stats)
Exemple #5
0
 def _collect_casava_qc(self):
     """Collect flowcell and sample QC documents for a CASAVA flowcell.

     Reads the samplesheet ``<fc_id>.csv`` from the flowcell directory
     under the root path, falling back to ``SampleSheet.csv`` when that
     file does not exist. A flowcell-level document is built only when the
     flowcell directory passes the ``modified_within_days`` check; sample
     documents are delegated to ``_parse_samplesheet`` in either case.

     Returns:
         The list returned by ``_parse_samplesheet``.

     Raises:
         IOError: if the chosen samplesheet cannot be opened.
     """
     qc_objects = []
     flowcell_dir = os.path.join(self._meta.root_path, self.pargs.flowcell)
     runinfo_csv = os.path.join(flowcell_dir, "{}.csv".format(fc_id(self.pargs.flowcell)))
     if not os.path.exists(runinfo_csv):
         LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(runinfo_csv))
         runinfo_csv = os.path.join(flowcell_dir, "SampleSheet.csv")
     try:
         with open(runinfo_csv) as fh:
             runinfo = list(csv.reader(fh))
     except IOError as e:
         self.app.log.warn(str(e))
         raise e
     fcdir = os.path.join(os.path.abspath(self._meta.root_path), self.pargs.flowcell)
     (fc_date, fc_name) = fc_parts(self.pargs.flowcell)
     # Default used when the flowcell document is not built. Reading
     # fcobj["illumina"] unconditionally after the branch below used to
     # raise UnboundLocalError for flowcells failing the mtime check.
     demux_stats = None
     ## Check modification time
     if modified_within_days(fcdir, self.pargs.mtime):
         fc_kw = dict(fc_date=fc_date, fc_name=fc_name)
         parser = FlowcellRunMetricsParser(fcdir)
         fcobj = FlowcellRunMetricsDocument(fc_date, fc_name)
         fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
         fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
         fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
         fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
         fcobj["undemultiplexed_barcodes"] = parser.parse_undemultiplexed_barcode_metrics(**fc_kw)
         fcobj["illumina"].update({"Demultiplex_Stats" : parser.parse_demultiplex_stats_htm(**fc_kw)})
         fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(runinfo_csv=runinfo_csv, **fc_kw)
         demux_stats = fcobj["illumina"]["Demultiplex_Stats"]
         qc_objects.append(fcobj)
     qc_objects = self._parse_samplesheet(runinfo, qc_objects, fc_date, fc_name, fcdir, demultiplex_stats=demux_stats)
     return qc_objects
Exemple #6
0
    def _collect_casava_qc(self):
        """Collect flowcell and sample QC documents for a CASAVA flowcell.

        The flowcell date, id and position are taken from the parsed RunInfo
        and RunParameters. The samplesheet is ``<flowcell id>.csv`` in the
        flowcell directory, with ``SampleSheet.csv`` as fallback. A
        flowcell-level document is built only when the flowcell directory
        passes the ``modified_within_days`` check; the call to
        ``_parse_samplesheet`` is made in either case.

        Returns:
            The list returned by ``_parse_samplesheet``.
        """
        collected = []
        reads = None
        demultiplex = None

        fcdir = os.path.join(
            os.path.abspath(self._meta.root_path), self.pargs.flowcell)

        # Get the fc_name, fc_date from RunInfo
        parser = FlowcellRunMetricsParser(fcdir)
        run_info = parser.parseRunInfo()
        run_parameters = parser.parseRunParameters()
        fc_date = run_info.get('Date', None)
        fc_name = run_info.get('Flowcell', None)
        fc_pos = run_parameters.get('FCPosition', '')

        flowcell_dir = os.path.join(self._meta.root_path, self.pargs.flowcell)
        samplesheet = os.path.join(flowcell_dir, "{}.csv".format(fc_name))
        if not os.path.exists(samplesheet):
            LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(samplesheet))
            samplesheet = os.path.join(flowcell_dir, "SampleSheet.csv")
        runinfo = parser.parse_samplesheet_csv(runinfo_csv=samplesheet)

        # Most of the code expects to have the flowcell position pre-pended
        # to the flowcell id
        positioned_name = "{}{}".format(fc_pos, fc_name)

        if modified_within_days(fcdir, self.pargs.mtime):
            fc_kw = {"fc_date": fc_date, "fc_name": positioned_name}
            fcobj = FlowcellRunMetricsDocument(**fc_kw)
            fcobj["RunInfo"] = run_info
            fcobj["RunParameters"] = run_parameters
            fcobj["DemultiplexConfig"] = parser.parseDemultiplexConfig(**fc_kw)
            fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
            fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
            fcobj["undemultiplexed_barcodes"] = (
                parser.parse_undemultiplexed_barcode_metrics(**fc_kw))
            fcobj["illumina"].update(
                {"Demultiplex_Stats": parser.parse_demultiplex_stats_htm(**fc_kw)})
            fcobj["samplesheet_csv"] = runinfo
            reads = fcobj["RunInfo"].get('Reads', [])
            fcobj["run_setup"] = self._run_setup(reads)
            demultiplex = fcobj["illumina"]["Demultiplex_Stats"]
            collected.append(fcobj)

        return self._parse_samplesheet(
            runinfo, collected, fc_date, positioned_name, fcdir,
            demultiplex_stats=demultiplex, setup=reads)
Exemple #7
0
 def _collect_pre_casava_qc(self):
     """Collect flowcell and sample QC documents for a pre-CASAVA flowcell.

     Run information comes from the samplesheet CSV (``<fc_id>.csv`` or
     the ``SampleSheet.csv`` fallback under the root path) when one exists,
     otherwise from ``run_info.yaml`` in the flowcell directory. Nothing
     beyond an empty list is returned when the flowcell directory fails
     the ``modified_within_days`` check.

     Returns:
         The list returned by ``_parse_samplesheet``, or an empty list.

     Raises:
         IOError: if the selected run information file cannot be opened.
     """
     collected = []
     as_yaml = False
     flowcell_dir = os.path.join(self._meta.root_path, self.pargs.flowcell)
     samplesheet = os.path.join(
         flowcell_dir, "{}.csv".format(fc_id(self.pargs.flowcell)))
     if not os.path.exists(samplesheet):
         LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(samplesheet))
         samplesheet = os.path.join(flowcell_dir, "SampleSheet.csv")
     run_info_path = os.path.join(
         os.path.abspath(self.pargs.flowcell), "run_info.yaml")
     try:
         if os.path.exists(samplesheet):
             with open(samplesheet) as handle:
                 runinfo = list(csv.reader(handle))
         else:
             # No samplesheet at all: use the yaml run description instead
             as_yaml = True
             with open(run_info_path) as handle:
                 runinfo = yaml.load(handle)
     except IOError as e:
         self.app.log.warn(str(e))
         raise e

     fcdir = os.path.abspath(self.pargs.flowcell)
     fc_date, fc_name = fc_parts(self.pargs.flowcell)

     # Check modification time
     if not modified_within_days(fcdir, self.pargs.mtime):
         return collected

     fc_kw = {"fc_date": fc_date, "fc_name": fc_name}
     parser = FlowcellRunMetricsParser(fcdir)
     fcobj = FlowcellRunMetricsDocument(**fc_kw)
     fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
     fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
     fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
     fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
     fcobj["filter_metrics"] = parser.parse_filter_metrics(**fc_kw)
     fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(
         runinfo_csv=samplesheet, **fc_kw)
     fcobj["run_info_yaml"] = parser.parse_run_info_yaml(**fc_kw)
     collected.append(fcobj)

     return self._parse_samplesheet(
         runinfo, collected, fc_date, fc_name, fcdir, as_yaml=as_yaml)
Exemple #8
0
 def _collect_pre_casava_qc(self):
     """Collect flowcell and sample QC documents for a pre-CASAVA flowcell.

     Returns an empty list straight away when the flowcell directory fails
     the ``modified_within_days`` check. Otherwise the flowcell date, id
     and position are taken from the parsed RunInfo and RunParameters, the
     samplesheet is parsed, and ``run_info.yaml`` is loaded instead when
     the parsed samplesheet is empty.

     Returns:
         The list returned by ``_parse_samplesheet``, or an empty list.

     Raises:
         IOError: if ``run_info.yaml`` is needed but cannot be opened.
     """
     collected = []
     as_yaml = False
     reads = None

     fcdir = os.path.abspath(self.pargs.flowcell)

     # Check modification time
     if not modified_within_days(fcdir, self.pargs.mtime):
         return collected

     # Get the fc_name, fc_date from RunInfo
     parser = FlowcellRunMetricsParser(fcdir)
     run_info = parser.parseRunInfo()
     run_parameters = parser.parseRunParameters()
     fc_date = run_info.get('Date', None)
     fc_name = run_info.get('Flowcell', None)
     fc_pos = run_parameters.get('FCPosition', '')

     flowcell_dir = os.path.join(self._meta.root_path, self.pargs.flowcell)
     samplesheet = os.path.join(flowcell_dir, "{}.csv".format(fc_name))
     if not os.path.exists(samplesheet):
         LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(samplesheet))
         samplesheet = os.path.join(flowcell_dir, "SampleSheet.csv")
     runinfo = parser.parse_samplesheet_csv(runinfo_csv=samplesheet)
     if len(runinfo) == 0:
         # Empty samplesheet result: use the yaml run description instead
         run_info_path = os.path.join(
             os.path.abspath(self.pargs.flowcell), "run_info.yaml")
         as_yaml = True
         try:
             with open(run_info_path) as handle:
                 runinfo = yaml.load(handle)
         except IOError as e:
             self.app.log.warn(str(e))
             raise e

     # Most of the code expects to have the flowcell position pre-pended to
     # the flowcell id
     positioned_name = "{}{}".format(fc_pos, fc_name)
     fc_kw = {"fc_date": fc_date, "fc_name": positioned_name}
     fcobj = FlowcellRunMetricsDocument(**fc_kw)
     fcobj["RunInfo"] = run_info
     fcobj["RunParameters"] = run_parameters
     fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
     fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
     fcobj["filter_metrics"] = parser.parse_filter_metrics(**fc_kw)
     fcobj["samplesheet_csv"] = runinfo
     fcobj["run_info_yaml"] = parser.parse_run_info_yaml(**fc_kw)
     reads = fcobj["RunInfo"].get('Reads', [])
     fcobj["run_setup"] = self._run_setup(reads)
     collected.append(fcobj)

     return self._parse_samplesheet(
         runinfo, collected, fc_date, positioned_name, fcdir,
         as_yaml=as_yaml, setup=reads)
Exemple #9
0
 def _collect_pre_casava_qc(self):
     """Collect flowcell and sample QC documents for a pre-CASAVA flowcell.

     Returns an empty list straight away when the flowcell directory fails
     the ``modified_within_days`` check. Otherwise the flowcell date, id
     and position are taken from the parsed RunInfo and RunParameters, the
     samplesheet is parsed, and ``run_info.yaml`` is loaded instead when
     the parsed samplesheet is empty.

     Returns:
         The list returned by ``_parse_samplesheet``, or an empty list.

     Raises:
         IOError: if ``run_info.yaml`` is needed but cannot be opened.
     """
     collected = []
     as_yaml = False
     reads = None

     fcdir = os.path.abspath(self.pargs.flowcell)

     # Check modification time
     if not modified_within_days(fcdir, self.pargs.mtime):
         return collected

     # Get the fc_name, fc_date from RunInfo
     parser = FlowcellRunMetricsParser(fcdir)
     run_info = parser.parseRunInfo()
     run_parameters = parser.parseRunParameters()
     fc_date = run_info.get('Date', None)
     fc_name = run_info.get('Flowcell', None)
     fc_pos = run_parameters.get('FCPosition', '')

     flowcell_dir = os.path.join(self._meta.root_path, self.pargs.flowcell)
     samplesheet = os.path.join(flowcell_dir, "{}.csv".format(fc_name))
     if not os.path.exists(samplesheet):
         LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(samplesheet))
         samplesheet = os.path.join(flowcell_dir, "SampleSheet.csv")
     runinfo = parser.parse_samplesheet_csv(runinfo_csv=samplesheet)
     if len(runinfo) == 0:
         # Empty samplesheet result: use the yaml run description instead
         run_info_path = os.path.join(
             os.path.abspath(self.pargs.flowcell), "run_info.yaml")
         as_yaml = True
         try:
             with open(run_info_path) as handle:
                 runinfo = yaml.load(handle)
         except IOError as e:
             self.app.log.warn(str(e))
             raise e

     # Most of the code expects to have the flowcell position pre-pended to
     # the flowcell id
     positioned_name = "{}{}".format(fc_pos, fc_name)
     fc_kw = {"fc_date": fc_date, "fc_name": positioned_name}
     fcobj = FlowcellRunMetricsDocument(**fc_kw)
     fcobj["RunInfo"] = run_info
     fcobj["RunParameters"] = run_parameters
     fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
     fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
     fcobj["filter_metrics"] = parser.parse_filter_metrics(**fc_kw)
     fcobj["samplesheet_csv"] = runinfo
     fcobj["run_info_yaml"] = parser.parse_run_info_yaml(**fc_kw)
     reads = fcobj["RunInfo"].get('Reads', [])
     fcobj["run_setup"] = self._run_setup(reads)
     collected.append(fcobj)

     return self._parse_samplesheet(
         runinfo, collected, fc_date, positioned_name, fcdir,
         as_yaml=as_yaml, setup=reads)
Exemple #10
0
 def _collect_pre_casava_qc(self):
     """Collect flowcell and sample QC documents for a pre-CASAVA flowcell.

     Run information comes from the samplesheet CSV (``<fc_id>.csv`` or
     the ``SampleSheet.csv`` fallback under the root path) when one exists,
     otherwise from ``run_info.yaml`` in the flowcell directory. Nothing
     beyond an empty list is returned when the flowcell directory fails
     the ``modified_within_days`` check.

     Returns:
         The list returned by ``_parse_samplesheet``, or an empty list.

     Raises:
         IOError: if the selected run information file cannot be opened.
     """
     collected = []
     as_yaml = False
     flowcell_dir = os.path.join(self._meta.root_path, self.pargs.flowcell)
     samplesheet = os.path.join(
         flowcell_dir, "{}.csv".format(fc_id(self.pargs.flowcell)))
     if not os.path.exists(samplesheet):
         LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(samplesheet))
         samplesheet = os.path.join(flowcell_dir, "SampleSheet.csv")
     run_info_path = os.path.join(
         os.path.abspath(self.pargs.flowcell), "run_info.yaml")
     try:
         if os.path.exists(samplesheet):
             with open(samplesheet) as handle:
                 runinfo = list(csv.reader(handle))
         else:
             # No samplesheet at all: use the yaml run description instead
             as_yaml = True
             with open(run_info_path) as handle:
                 runinfo = yaml.load(handle)
     except IOError as e:
         self.app.log.warn(str(e))
         raise e

     fcdir = os.path.abspath(self.pargs.flowcell)
     fc_date, fc_name = fc_parts(self.pargs.flowcell)

     # Check modification time
     if not modified_within_days(fcdir, self.pargs.mtime):
         return collected

     fc_kw = {"fc_date": fc_date, "fc_name": fc_name}
     parser = FlowcellRunMetricsParser(fcdir)
     fcobj = FlowcellRunMetricsDocument(**fc_kw)
     fcobj["RunInfo"] = parser.parseRunInfo(**fc_kw)
     fcobj["RunParameters"] = parser.parseRunParameters(**fc_kw)
     fcobj["illumina"] = parser.parse_illumina_metrics(fullRTA=False, **fc_kw)
     fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
     fcobj["filter_metrics"] = parser.parse_filter_metrics(**fc_kw)
     fcobj["samplesheet_csv"] = parser.parse_samplesheet_csv(
         runinfo_csv=samplesheet, **fc_kw)
     fcobj["run_info_yaml"] = parser.parse_run_info_yaml(**fc_kw)
     collected.append(fcobj)

     return self._parse_samplesheet(
         runinfo, collected, fc_date, fc_name, fcdir, as_yaml=as_yaml)
Exemple #11
0
 def _collect_pre_casava_qc(self):
     """Collect flowcell and sample QC metrics for a pre-CASAVA flowcell.

     Loads ``run_info.yaml`` from the flowcell directory. When that
     directory passes the ``modified_within_days`` check, one
     ``FlowcellRunMetrics`` object is collected, followed by one
     ``SampleRunMetrics`` object per multiplex entry of every lane in the
     run info. Lanes without multiplex entries are logged and skipped.

     Returns:
         List of collected metrics objects; empty when the flowcell
         directory fails the modification check.

     Raises:
         IOError: if ``run_info.yaml`` cannot be opened.
     """
     qc_objects = []
     runinfo_yaml = os.path.join(os.path.abspath(self.pargs.flowcell), "run_info.yaml")
     try:
         with open(runinfo_yaml) as fh:
             # NOTE(review): yaml.load can construct arbitrary objects;
             # consider yaml.safe_load if this file may be untrusted.
             runinfo = yaml.load(fh)
     except IOError as e:
         self.app.log.warn(str(e))
         raise e
     fcdir = os.path.abspath(self.pargs.flowcell)
     (fc_date, fc_name) = self._fc_parts()
     ## Check modification time
     if not modified_within_days(fcdir, self.pargs.mtime):
         return qc_objects

     fc_kw = dict(path=fcdir, fc_date=fc_date, fc_name=fc_name)
     fcobj = FlowcellRunMetrics(**fc_kw)
     fcobj.parse_illumina_metrics(fullRTA=False)
     fcobj.parse_bc_metrics()
     fcobj.parse_filter_metrics()
     # run_info.yaml is only parsed when the samplesheet parse yields nothing
     if not fcobj.parse_samplesheet_csv():
         fcobj.parse_run_info_yaml()
     qc_objects.append(fcobj)

     for info in runinfo:
         multiplex = info.get("multiplex", None)
         if not multiplex:
             # No samples to build for this lane. The previous code updated
             # an unbound (or stale, from the prior lane) ``sample`` here
             # and then indexed info["multiplex"], raising NameError,
             # KeyError or TypeError instead of moving on.
             self.app.log.warn("No multiplex information for lane {}".format(info.get("lane")))
             continue
         # Lane-level fields copied onto every sample of the lane
         lane_info = {k: info.get(k, None) for k in ('analysis', 'description', 'flowcell_id', 'lane')}
         for sample in multiplex:
             sample.update(lane_info)
             sample_kw = dict(path=fcdir, flowcell=fc_name, date=fc_date, lane=sample['lane'], barcode_name=sample['name'], sample_prj=sample.get('sample_prj', None),
                              barcode_id=sample['barcode_id'], sequence=sample.get('sequence', "NoIndex"))
             obj = SampleRunMetrics(**sample_kw)
             obj.read_picard_metrics()
             obj.parse_fastq_screen()
             obj.parse_bc_metrics()
             obj.read_fastqc_metrics()
             qc_objects.append(obj)
     return qc_objects
Exemple #12
0
    def _collect_casava_qc(self):
        """Collect flowcell and sample QC documents for a CASAVA flowcell.

        The flowcell date, id and position are taken from the parsed RunInfo
        and RunParameters. The samplesheet is ``<flowcell id>.csv`` in the
        flowcell directory, with ``SampleSheet.csv`` as fallback. A
        flowcell-level document is built only when the flowcell directory
        passes the ``modified_within_days`` check; the call to
        ``_parse_samplesheet`` is made in either case.

        Returns:
            The list returned by ``_parse_samplesheet``.
        """
        collected = []
        reads = None
        demultiplex = None

        fcdir = os.path.join(
            os.path.abspath(self._meta.root_path), self.pargs.flowcell)

        # Get the fc_name, fc_date from RunInfo
        parser = FlowcellRunMetricsParser(fcdir)
        run_info = parser.parseRunInfo()
        run_parameters = parser.parseRunParameters()
        fc_date = run_info.get('Date', None)
        fc_name = run_info.get('Flowcell', None)
        fc_pos = run_parameters.get('FCPosition', '')

        flowcell_dir = os.path.join(self._meta.root_path, self.pargs.flowcell)
        samplesheet = os.path.join(flowcell_dir, "{}.csv".format(fc_name))
        if not os.path.exists(samplesheet):
            LOG.warn("No such file {}: trying fallback SampleSheet.csv".format(samplesheet))
            samplesheet = os.path.join(flowcell_dir, "SampleSheet.csv")
        runinfo = parser.parse_samplesheet_csv(runinfo_csv=samplesheet)

        # Most of the code expects to have the flowcell position pre-pended
        # to the flowcell id
        positioned_name = "{}{}".format(fc_pos, fc_name)

        if modified_within_days(fcdir, self.pargs.mtime):
            fc_kw = {"fc_date": fc_date, "fc_name": positioned_name}
            fcobj = FlowcellRunMetricsDocument(**fc_kw)
            fcobj["RunInfo"] = run_info
            fcobj["RunParameters"] = run_parameters
            fcobj["DemultiplexConfig"] = parser.parseDemultiplexConfig(**fc_kw)
            fcobj["illumina"] = parser.parse_illumina_metrics(
                fullRTA=False, **fc_kw)
            fcobj["bc_metrics"] = parser.parse_bc_metrics(**fc_kw)
            fcobj["undemultiplexed_barcodes"] = (
                parser.parse_undemultiplexed_barcode_metrics(**fc_kw))
            fcobj["illumina"].update(
                {"Demultiplex_Stats": parser.parse_demultiplex_stats_htm(**fc_kw)})
            fcobj["samplesheet_csv"] = runinfo
            reads = fcobj["RunInfo"].get('Reads', [])
            fcobj["run_setup"] = self._run_setup(reads)
            demultiplex = fcobj["illumina"]["Demultiplex_Stats"]
            collected.append(fcobj)

        return self._parse_samplesheet(
            runinfo, collected, fc_date, positioned_name, fcdir,
            demultiplex_stats=demultiplex, setup=reads)
Exemple #13
0
 def upload_analysis(self):
     """Gather per-sample analysis metrics and save one document per project.

     Scans ``self._meta.root_path`` for ``<project>/<sample>/<flowcell>``
     directories, where the sample directory name matches
     ``P<3+ digits>_<digits>`` and the flowcell directory exists and passes
     ``modified_within_days(fdir, self.pargs.mtime)``. For every such
     directory the first ``*-bcbb-config.yaml`` file is read to build the
     keyword arguments handed to the ``SampleRunMetricsParser`` methods.
     The parsed metrics are grouped per project into an
     ``AnalysisDocument`` that is passed to ``acon.save``.

     Sample directories without a configuration file, or whose
     configuration yields no sample entries, are logged and skipped.
     """
     # vars() returns the live attribute dict of self.pargs, so the default
     # set below is also visible through self.pargs.
     kw = vars(self.pargs)
     if not kw.get("flowcell"):
         kw["flowcell"] = "TOTAL"

     # Get a connection to the analysis database
     acon = AnalysisConnection(**kw)

     # Traverse the folder hierarchy and determine paths to process
     to_process = {}
     for pdir in os.listdir(self._meta.root_path):
         pdir = os.path.join(self._meta.root_path, pdir)
         if not os.path.isdir(pdir):
             continue
         plist = []
         for sdir in os.listdir(pdir):
             if not re.match(r'^P[0-9]{3,}_[0-9]+', sdir):
                 continue
             fdir = os.path.join(pdir, sdir, kw.get("flowcell"))
             if not os.path.exists(fdir) or not modified_within_days(fdir, self.pargs.mtime):
                 continue
             plist.append(fdir)
         if plist:
             to_process[os.path.basename(pdir)] = plist

     # Collect the data from each folder
     for project_name, sdirs in to_process.items():
         self.log.info("Processing {}".format(project_name))
         samples = {}
         for sdir in sdirs:
             config = glob.glob(os.path.join(sdir, "*-bcbb-config.yaml"))
             if not config:
                 self.log.error("Could not find sample configuration file in {}. Skipping sample.".format(sdir))
                 continue
             if len(config) > 1:
                 self.log.warn("Multiple sample configuration files found in {}. Will only use {}.".format(sdir, os.path.basename(config[0])))

             # Parse the config file and get the flowcell, lane and index
             # sequence that may be needed to parse.
             # NOTE(review): yaml.load without an explicit Loader can build
             # arbitrary Python objects; consider yaml.safe_load if these
             # files are not fully trusted.
             with open(config[0]) as fh:
                 # An empty YAML file loads as None; treat it as "no data".
                 info = yaml.load(fh) or {}
             fcdate = info.get("fc_date")
             fcname = info.get("fc_name")
             sinfos = []
             for laneinfo in info.get("details", []):
                 # A lane without a "multiplex" list is treated as one sample.
                 for sampleinfo in laneinfo.get("multiplex", [laneinfo]):
                     # Merge into a copy: updating laneinfo itself would let
                     # keys of one sample leak into the following samples.
                     linfo = dict(laneinfo)
                     linfo.update(sampleinfo)
                     name = linfo.get("name", linfo.get("description", "unknown"))
                     m = re.match(r'(P[0-9_]{4,}[0-9])', name)
                     if m:
                         name = m.group(1)
                     sinfos.append({'flowcell': fcname or linfo.get("flowcell_id"),
                                    'date': fcdate,
                                    'lane': linfo.get("lane"),
                                    'barcode_name': name,
                                    'sample_prj': linfo.get("sample_prj", project_name),
                                    'barcode_id': linfo.get("barcode_id", "1"),
                                    'sequence': linfo.get("sequence", "NoIndex")})

             if not sinfos:
                 # Nothing to key the metrics on; skip instead of failing on
                 # sinfos[0] below.
                 self.log.warn("No sample information found in {}. Skipping sample.".format(config[0]))
                 continue

             # Create a parser object and collect the metrics. Metrics that
             # are parsed once per directory use the first sample entry.
             parser = SampleRunMetricsParser(sdir)
             first = sinfos[0]
             name = first.get("barcode_name", "unknown")
             metrics = {}
             metrics["bcbb_checkpoints"] = parser.parse_bcbb_checkpoints(**first)
             metrics["software_versions"] = parser.parse_software_versions(**first)
             metrics["project_summary"] = parser.parse_project_summary(**first)
             metrics["snpeff_genes"] = parser.parse_snpeff_genes(**first)
             # For the remaining metrics every sample entry is tried; a later
             # non-empty result replaces an earlier one.
             for sinfo in sinfos:
                 picard = parser.read_picard_metrics(**sinfo)
                 if picard:
                     metrics["picard_metrics"] = picard
                 fq_scr = parser.parse_fastq_screen(**sinfo)
                 if fq_scr:
                     metrics["fastq_scr"] = fq_scr
                 fastqc = parser.read_fastqc_metrics(**sinfo)
                 if fastqc.get("stats"):
                     metrics["fastqc"] = fastqc
                 gteval = parser.parse_eval_metrics(**sinfo)
                 if gteval:
                     metrics["gatk_variant_eval"] = gteval
             samples[name] = metrics

         # Store the collected metrics in an analysis document
         obj = AnalysisDocument(**{'project_name': project_name,
                                   'name': project_name,
                                   'samples': samples})
         # NOTE(review): acon.save(obj) is evaluated before dry() is called;
         # confirm dry() expects the result here rather than a callable.
         dry("Saving object {}".format(repr(obj)), acon.save(obj))
Exemple #14
0
    def _parse_samplesheet(self, runinfo, qc_objects, fc_date, fc_name, fcdir, as_yaml=False, demultiplex_stats=None):
        """Parse samplesheet information and populate sample run metrics object"""
        if as_yaml:
            for info in runinfo:
                if not info.get("multiplex", None):
                    self.app.log.warn("No multiplex information for lane {}".format(info.get("lane")))
                    sample = {}
                    sample.update({k: info.get(k, None) for k in ("analysis", "description", "flowcell_id", "lane")})
                    sample_kw = dict(
                        path=fcdir,
                        flowcell=fc_name,
                        date=fc_date,
                        lane=sample.get("lane", None),
                        barcode_name=sample.get("name", None),
                        sample_prj=sample.get("sample_prj", None),
                        barcode_id=sample.get("barcode_id", None),
                        sequence=sample.get("sequence", "NoIndex"),
                    )
                for sample in info["multiplex"]:
                    sample.update({k: info.get(k, None) for k in ("analysis", "description", "flowcell_id", "lane")})
                    sample_kw = dict(
                        flowcell=fc_name,
                        date=fc_date,
                        lane=sample["lane"],
                        barcode_name=sample["name"],
                        sample_prj=sample.get("sample_prj", None),
                        barcode_id=sample["barcode_id"],
                        sequence=sample.get("sequence", "NoIndex"),
                    )

                    parser = SampleRunMetricsParser(fcdir)
                    obj = SampleRunMetricsDocument(**sample_kw)
                    obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
                    obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
                    obj["bc_count"] = parser.get_bc_count(**sample_kw)
                    obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
                    qc_objects.append(obj)
        else:
            for sample in runinfo[1:]:
                LOG.debug("Getting information for sample defined by {}".format(sample))
                d = dict(zip(runinfo[0], sample))
                if self.app.pargs.project_name and self.app.pargs.project_name != d["SampleProject"]:
                    continue
                if self.app.pargs.sample and self.app.pargs.sample != d["SampleID"]:
                    continue

                sampledir = os.path.join(
                    os.path.abspath(self._meta.production_root_path),
                    d["SampleProject"].replace("__", "."),
                    d["SampleID"],
                )
                if not os.path.exists(sampledir):
                    self.app.log.warn("No such sample directory: {}".format(sampledir))
                    continue
                sample_fcdir = os.path.join(sampledir, fc_fullname(self.pargs.flowcell))
                if not os.path.exists(sample_fcdir):
                    self.app.log.warn("No such sample flowcell directory: {}".format(sample_fcdir))
                    continue
                if not modified_within_days(sample_fcdir, self.pargs.mtime):
                    continue
                runinfo_yaml_file = os.path.join(sample_fcdir, "{}-bcbb-config.yaml".format(d["SampleID"]))
                if not os.path.exists(runinfo_yaml_file):
                    self.app.log.warn("No such yaml file for sample: {}".format(runinfo_yaml_file))
                    raise IOError(2, "No such yaml file for sample: {}".format(runinfo_yaml_file), runinfo_yaml_file)
                with open(runinfo_yaml_file) as fh:
                    runinfo_yaml = yaml.load(fh)
                if not runinfo_yaml["details"][0].get("multiplex", None):
                    self.app.log.warn("No multiplex information for sample {}".format(d["SampleID"]))
                    continue
                sample_kw = dict(
                    flowcell=fc_name,
                    date=fc_date,
                    lane=d["Lane"],
                    barcode_name=d["SampleID"],
                    sample_prj=d["SampleProject"].replace("__", "."),
                    barcode_id=runinfo_yaml["details"][0]["multiplex"][0]["barcode_id"],
                    sequence=runinfo_yaml["details"][0]["multiplex"][0]["sequence"],
                )
                parser = SampleRunMetricsParser(sample_fcdir)
                obj = SampleRunMetricsDocument(**sample_kw)
                obj["picard_metrics"] = parser.read_picard_metrics(**sample_kw)
                obj["fastq_scr"] = parser.parse_fastq_screen(**sample_kw)
                obj["bc_count"] = parser.get_bc_count(demultiplex_stats=demultiplex_stats, **sample_kw)
                obj["fastqc"] = parser.read_fastqc_metrics(**sample_kw)
                qc_objects.append(obj)
        return qc_objects
Exemple #15
0
    def upload_analysis(self):
        """Collect analysis metrics for each project and save them as documents.

        Project directories are read from ``self._meta.root_path``. Inside
        each one, sample directories named ``P<3+ digits>_<digits>`` are
        checked for a flowcell sub-directory (``self.pargs.flowcell``, or
        "TOTAL" when unset) that exists and passes
        ``modified_within_days(fdir, self.pargs.mtime)``. The first
        ``*-bcbb-config.yaml`` in such a directory supplies the keyword
        arguments for the ``SampleRunMetricsParser`` calls. One
        ``AnalysisDocument`` per project is built and passed to
        ``acon.save``.

        Sample directories that lack a configuration file, or whose
        configuration describes no samples, are logged and skipped.
        """
        # vars() exposes the live attribute dict of self.pargs, so the
        # default assigned here is also seen through self.pargs.
        kw = vars(self.pargs)
        if not kw.get("flowcell"):
            kw["flowcell"] = "TOTAL"

        # Get a connection to the analysis database
        acon = AnalysisConnection(**kw)

        # Traverse the folder hierarchy and determine paths to process
        to_process = {}
        for pdir in os.listdir(self._meta.root_path):
            pdir = os.path.join(self._meta.root_path, pdir)
            if not os.path.isdir(pdir):
                continue
            plist = []
            for sdir in os.listdir(pdir):
                if not re.match(r'^P[0-9]{3,}_[0-9]+', sdir):
                    continue
                fdir = os.path.join(pdir, sdir, kw.get("flowcell"))
                if not os.path.exists(fdir):
                    continue
                if not modified_within_days(fdir, self.pargs.mtime):
                    continue
                plist.append(fdir)
            if plist:
                to_process[os.path.basename(pdir)] = plist

        # Collect the data from each folder
        for project_name, sdirs in to_process.items():
            self.log.info("Processing {}".format(project_name))
            samples = {}
            for sdir in sdirs:
                config = glob.glob(os.path.join(sdir, "*-bcbb-config.yaml"))
                if not config:
                    self.log.error(
                        "Could not find sample configuration file in {}. Skipping sample."
                        .format(sdir))
                    continue
                if len(config) > 1:
                    self.log.warn(
                        "Multiple sample configuration files found in {}. Will only use {}."
                        .format(sdir, os.path.basename(config[0])))

                # Parse the config file and get the flowcell, lane and index
                # sequence that may be needed to parse.
                # NOTE(review): yaml.load without an explicit Loader can
                # build arbitrary Python objects; consider yaml.safe_load if
                # these files are not fully trusted.
                with open(config[0]) as fh:
                    # An empty YAML file loads as None; treat it as no data.
                    info = yaml.load(fh) or {}
                fcdate = info.get("fc_date")
                fcname = info.get("fc_name")
                sinfos = []
                for laneinfo in info.get("details", []):
                    # A lane without a "multiplex" list counts as one sample.
                    for sampleinfo in laneinfo.get("multiplex", [laneinfo]):
                        # Work on a copy so that keys of one sample do not
                        # persist in laneinfo and leak into the next sample.
                        linfo = dict(laneinfo)
                        linfo.update(sampleinfo)
                        name = linfo.get("name",
                                         linfo.get("description", "unknown"))
                        m = re.match(r'(P[0-9_]{4,}[0-9])', name)
                        if m:
                            name = m.group(1)
                        sinfos.append({
                            'flowcell': fcname or linfo.get("flowcell_id"),
                            'date': fcdate,
                            'lane': linfo.get("lane"),
                            'barcode_name': name,
                            'sample_prj': linfo.get("sample_prj", project_name),
                            'barcode_id': linfo.get("barcode_id", "1"),
                            'sequence': linfo.get("sequence", "NoIndex"),
                        })

                if not sinfos:
                    # Without any sample entry there is nothing to key the
                    # metrics on; skip rather than fail on sinfos[0].
                    self.log.warn(
                        "No sample information found in {}. Skipping sample."
                        .format(config[0]))
                    continue

                # Create a parser object and collect the metrics. Metrics
                # parsed once per directory use the first sample entry.
                parser = SampleRunMetricsParser(sdir)
                first = sinfos[0]
                name = first.get("barcode_name", "unknown")
                metrics = {
                    "bcbb_checkpoints": parser.parse_bcbb_checkpoints(**first),
                    "software_versions": parser.parse_software_versions(**first),
                    "project_summary": parser.parse_project_summary(**first),
                    "snpeff_genes": parser.parse_snpeff_genes(**first),
                }
                # The remaining metrics are tried for every sample entry; a
                # later non-empty result replaces an earlier one.
                for sinfo in sinfos:
                    picard = parser.read_picard_metrics(**sinfo)
                    if picard:
                        metrics["picard_metrics"] = picard
                    fq_scr = parser.parse_fastq_screen(**sinfo)
                    if fq_scr:
                        metrics["fastq_scr"] = fq_scr
                    fastqc = parser.read_fastqc_metrics(**sinfo)
                    if fastqc.get("stats"):
                        metrics["fastqc"] = fastqc
                    gteval = parser.parse_eval_metrics(**sinfo)
                    if gteval:
                        metrics["gatk_variant_eval"] = gteval
                samples[name] = metrics

            # Store the collected metrics in an analysis document
            obj = AnalysisDocument(
                **{
                    'project_name': project_name,
                    'name': project_name,
                    'samples': samples
                })
            # NOTE(review): acon.save(obj) is evaluated before dry() is
            # called; confirm dry() expects the result, not a callable.
            dry("Saving object {}".format(repr(obj)), acon.save(obj))