def sample_status_note(project_name=None, flowcell=None, username=None, password=None, url=None,
                       ordered_million_reads=None, uppnex_id=None, customer_reference=None,
                       bc_count=None, project_alias=[], projectdb="projects", samplesdb="samples",
                       flowcelldb="flowcells", phix=None, is_paired=True, **kw):
    """Make a sample status note (one PDF per sample run, plus a concatenated
    PDF and an rst summary). Collects per-sample run metrics from statusdb,
    compares them against PhiX-error and quality cutoffs, and renders notes.

    Used keywords:
    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param bc_count: barcode count override (literal-eval'd like the others)
    :param project_alias: project alias name
    :param phix: phix error rate
    :param is_paired: True if run is paired-end, False for single-end

    Returns a dict of StringIO buffers ('stdout', 'stderr', 'debug').
    NOTE(review): project_alias=[] is a mutable default argument; it appears
    to be read-only here (passed through to _set_sample_run_list) but a
    None-sentinel would be safer — confirm before changing.
    """
    # Cutoffs: report flags "HIGH" PhiX error above phix_err_cutoff (%) and
    # "LOW" quality below qv_cutoff (avg quality score).
    cutoffs = {
        "phix_err_cutoff": 2.0,
        "qv_cutoff": 30,
    }
    # Instrument metadata keyed by instrument id; 'default' entry is the
    # fallback when statusdb lacks the RunInfo -> Instrument field.
    instrument = _parse_instrument_config(
        os.path.expanduser(kw.get("instrument_config", "")))
    instrument_dict = {i['instrument_id']: i for i in instrument}

    # parameters: template dict of per-sample report fields; copied into each
    # sample's s_param before being filled in.
    parameters = {
        "project_name": None,
        "start_date": None,
        "FC_id": None,
        "scilifelab_name": None,
        "rounded_read_count": None,
        "phix_error_rate": None,
        "avg_quality_score": None,
        "pct_q30_bases": None,
        "success": None,
        "run_mode": None,
        "is_paired": True
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {
        "project_name": "sample_prj",
        "FC_id": "flowcell",
        "scilifelab_name": "barcode_name",
        "start_date": "date",
        "rounded_read_count": "bc_count",
        "lane": "lane"
    }
    LOG.debug("got parameters {}".format(parameters))
    # All user-visible output is buffered; callers decide where it goes.
    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }
    if not _assert_flowcell_format(flowcell):
        LOG.warn("Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9\-]+\")".format(flowcell))
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run: three statusdb views (sample runs, flowcells, projects)
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    source = p_con.get_info_source(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell, project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn("No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?".format(project_name, flowcell))
        return output_data

    # Set options: these CLI overrides arrive as strings and may encode
    # dicts/lists, hence the literal_eval helper.
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information
    s_param_out = []
    # fcdoc is cached across iterations and only re-fetched when the
    # flowcell name changes (samples are grouped per flowcell).
    fcdoc = None
    for s in sample_run_list:
        s_param = {}
        LOG.debug("working on sample '{}', sample run metrics name '{}', id '{}'".format(s.get("barcode_name", None), s.get("name", None), s.get("_id", None)))
        s_param.update(parameters)
        s_param.update({key: s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))
        # Get instrument
        # NOTE(review): bare except silently covers any failure (missing key,
        # connection error, ...) and falls back to the 'default' instrument —
        # consider narrowing to KeyError/TypeError.
        try:
            s_param.update(instrument_dict[fc_con.get_instrument(str(fc))])
        except:
            LOG.warn("Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report.".format(fc))
            s_param.update(instrument_dict['default'])
        # Get run mode
        if not fcdoc or fcdoc.get("name") != fc:
            fcdoc = fc_con.get_entry(fc)
        runp = fcdoc.get("RunParameters", {})
        # Platform inference: presence of MCSVersion in RunParameters is used
        # as the MiSeq marker; everything else is treated as HiSeq2500.
        s_param["sequencing_platform"] = "MiSeq" if "MCSVersion" in runp else "HiSeq2500"
        s_param["clustering_method"] = "onboard clustering" if runp.get("ClusteringChoice", "") == "OnBoardClustering" or s_param["sequencing_platform"] == "MiSeq" else "cBot"
        s_param["sequencing_setup"] = fcdoc.get("run_setup")
        s_param["sequencing_mode"] = runp.get("RunMode", "High Output")
        s_param["sequencing_software"] = "RTA {}".format(runp.get("RTAVersion"))
        if s_param["sequencing_platform"] == "MiSeq":
            s_param["sequencing_software"] = "MCS {}/{}".format(runp.get("MCSVersion"), s_param["sequencing_software"])
        else:
            s_param["sequencing_software"] = "{} {}/{}".format(runp.get("ApplicationName"), runp.get("ApplicationVersion"), s_param["sequencing_software"])
        # Flowcell metadata wins over the is_paired argument; only an
        # undeterminable setup falls back to paired-end.
        s_param["is_paired"] = fc_con.is_paired_end(str(fc))
        if s_param["is_paired"] is None:
            LOG.warn("Could not determine run setup for flowcell {}. Will assume paired-end.".format(fc))
            s_param["is_paired"] = True
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        # Command-line phix option overrides the database value per lane.
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        # Get quality score from demultiplex stats, if that fails
        # (which it shouldn't), fall back on fastqc data.
        (avg_quality_score, pct_q30_bases) = fc_con.get_barcode_lane_statistics(project_name, s.get("barcode_name"), fc, s["lane"])
        s_param['avg_quality_score'] = avg_quality_score if avg_quality_score else calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn("Setting average quality failed for sample {}, id {}".format(s.get("name"), s.get("_id")))
        s_param['pct_q30_bases'] = pct_q30_bases
        if not s_param['pct_q30_bases']:
            LOG.warn("Setting % of >= Q30 Bases (PF) failed for sample {}, id {}".format(s.get("name"), s.get("_id")))
        # Compare phix error and qv to cutoffs.
        # A phix_error_rate of -1 marks "no data" (reported as N/A); the HIGH
        # check runs first, so -1 never trips it.
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write("{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(s["barcode_name"], s["lane"], s_param["phix_error_rate"], err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaning sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get('ordered_amount', p_con.get_ordered_amount(project_name, samples=p_con.get_entry(project_name, 'samples')))
        s_param['customer_reference'] = s_param.get('customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(_get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item, source)
            if not project_sample_d:
                LOG.warn("No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}".format(s["name"], s["barcode_name"], s["_id"], project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn("no such sample run metrics '{}' in project sample run metrics dictionary".format(s["name"]))
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn("inconsistent mapping for '{}': '{}' != '{}' (project summary id)".format(s["name"], s["_id"], project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get("customer_name", None)
            # Always normalize submitted id, since module textttable does not support unicode
            # (Python 2 `unicode` check; ascii-transliterate via NFKD.)
            if type(s_param['customer_name']) is unicode:
                s_param['customer_name'] = unicodedata.normalize('NFKD', s_param['customer_name']).encode('ascii', 'ignore')

        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn("No project sample name found for sample run name '{}'".format(s["barcode_name"]))
            LOG.info("Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names ")
            LOG.info("or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names.")
            LOG.info("Please refer to the pm documentation for examples.")
            # Interactive confirmation gate unless force was passed.
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs.
        # Any still-missing/sentinel values are rendered as "N/A" in the note.
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({k: "N/A" for k in s_param.keys() if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0})
        # Lane-specific filename only when the sample ran more than once on
        # this flowcell (see sample_count above).
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files.
    # NOTE(review): the summary filenames reuse `s` from the loop above, i.e.
    # the date/flowcell of the LAST sample processed — presumably all samples
    # share one flowcell here; confirm for multi-flowcell projects.
    output_data["debug"].write(json.dumps({'s_param': s_param_out, 'sample_runs': {s["name"]: s["barcode_name"] for s in sample_run_list}}))
    notes = [make_note(headers=headers, paragraphs=paragraphs, **sp) for sp in s_param_out]
    rest_notes = make_sample_rest_notes("{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None), s.get("flowcell", None)), s_param_out)
    concatenate_notes(notes, "{}_{}_{}_sample_summary.pdf".format(project_name, s.get("date", None), s.get("flowcell", None)))
    return output_data
def multiplex_qc(self):
    """Run demultiplexing QC checks for one flowcell and write a TSV report.

    Fetches the flowcell's run metrics document from statusdb and performs a
    series of PASS/FAIL checks — samplesheet/Demultiplex_Stats consistency,
    per-lane PhiX error rate, per-sample %>=Q30, lane and sample yields,
    undetermined-read fraction, and overrepresented undetermined indexes —
    appending one row per check to the app's stdout output buffer.
    Returns None; all results go to logs and self.app._output_data['stdout'].
    """
    # QC thresholds. EXPECTED_LANE_YIELD is in reads (read pairs are handled
    # by the yield helpers via `read_pairs`).
    MAX_UNDEMULTIPLEXED_INDEX_COUNT = 1000000
    EXPECTED_LANE_YIELD = 143000000
    MAX_PHIX_ERROR_RATE = 2.0
    MIN_PHIX_ERROR_RATE = 0.0
    MIN_GTQ30 = 80.0
    read_pairs = True

    out_data = []

    if not self._check_pargs(['flowcell']):
        return

    # db url: CLI argument takes precedence over app config.
    url = self.pargs.url if self.pargs.url else self.app.config.get("db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return

    # Construct the short form of the fcid: first and last "_"-separated
    # fields of the run-folder basename (date and flowcell id).
    sp = os.path.basename(self.pargs.flowcell).split("_")
    fcid = "_".join([sp[0], sp[-1]])

    # Get a connection to the flowcell database and fetch the corresponding document
    # NOTE(review): the format string below has no placeholder, so fcid is
    # never interpolated into this debug message.
    self.log.debug("Connecting to flowcell database".format(fcid))
    fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get("db", "flowcells"), **vars(self.app.pargs))
    self.log.debug("Fetching run metrics entry for flowcell {}".format(fcid))
    fc_doc = fc_con.get_entry(fcid)
    if not fc_doc:
        self.log.warn("Could not fetch run metrics entry for flowcell {}".format(fcid))
        return

    # Adjust the read pairs variable according to the run setup
    read_pairs = fc_con.is_paired_end(fcid)

    # Get the yield per sample from the Demultiplex_Stats
    self.log.debug("Getting yield for flowcell {}".format(fcid))
    sample_yield = self._get_yield_per_sample(fc_doc, read_pairs)

    # Get the yield per lane from the Demultiplex_Stats
    self.log.debug("Getting lane yield for flowcell {}".format(fcid))
    lane_yield = self._get_yield_per_lane(fc_doc, read_pairs)
    lanes = lane_yield.keys()

    # Get the number of samples in the pools from the Demultiplex_Stats
    self.log.debug("Getting lane pool sizes for flowcell {}".format(fcid))
    pool_size = self._get_pool_size(fc_doc)

    # Get the sample information from the csv samplesheet
    self.log.debug("Getting csv samplesheet data for flowcell {}".format(fcid))
    ssheet_samples = self._get_samplesheet_sample_data(fc_doc)
    if len(ssheet_samples) == 0:
        self.log.warn("No samplesheet data available for flowcell {}".format(fcid))

    # Verify that all samples in samplesheet have reported metrics.
    # Samples with yields get a 'verified' marker appended; keys are
    # "<lane>_<index>" strings throughout.
    for id in ssheet_samples.keys():
        for key in ssheet_samples[id].keys():
            lane, index = key.split("_")
            project = ssheet_samples[id][key][0]
            if id not in sample_yield or \
            key not in sample_yield[id]:
                self.log.warn("Sample {} from project {} is in samplesheet but no yield was reported in " \
                              "Demultiplex_Stats.htm for lane {} and index {}".format(id, project, lane, index))
                continue
            sample_yield[id][key].append('verified')

    # Check that all samples in Demultiplex_Stats have entries in Samplesheet
    for id in sample_yield.keys():
        for key in sample_yield[id].keys():
            lane, index = key.split("_")
            if "verified" not in sample_yield[id][key] and \
            index != "Undetermined":
                self.log.warn("Sample {} from project {}, with index {} on lane {} is in Demultiplex_Stats " \
                              "but no corresponding entry is present in SampleSheet".format(id, sample_yield[id][key][1], index, lane))

    # Check the PhiX error rate for each lane.
    # A negative rate means "unavailable" and is reported as N/A; note that a
    # rate exactly equal to MIN (0.0) is treated as FAIL.
    self.log.debug("Getting PhiX error rates for flowcell {}".format(fcid))
    for lane in lanes:
        status = "N/A"
        err_rate = fc_con.get_phix_error_rate(fcid, lane)
        if err_rate < 0:
            self.log.warn("Could not get PhiX error rate for lane {} on flowcell {}".format(lane, fcid))
        elif err_rate <= MIN_PHIX_ERROR_RATE or err_rate > MAX_PHIX_ERROR_RATE:
            status = "FAIL"
        else:
            status = "PASS"
        out_data.append([status, "PhiX error rate", lane, err_rate, "{} < PhiX e (%) <= {}".format(MIN_PHIX_ERROR_RATE, MAX_PHIX_ERROR_RATE)])

    # Check the %>=Q30 value for each sample
    sample_quality = self._get_quality_per_sample(fc_doc)
    for id in sample_quality.keys():
        for key in sample_quality[id].keys():
            lane, index = key.split("_")
            status = "FAIL"
            if float(sample_quality[id][key][0]) >= MIN_GTQ30:
                status = "PASS"
            out_data.append([status, "Sample quality", lane, sample_quality[id][key][2], id, sample_quality[id][key][0], "[%>=Q30 >= {}%]".format(MIN_GTQ30)])

    # Check that each lane received the minimum amount of reads
    for lane, reads in lane_yield.items():
        status = "FAIL"
        if reads >= EXPECTED_LANE_YIELD:
            status = "PASS"
        out_data.append([status, "Lane yield", lane, reads, "[Yield >= {}]".format(EXPECTED_LANE_YIELD)])

    # Check that all samples in the pool have received a minimum number of
    # reads: half the expected lane yield divided evenly over the pool.
    for id in sample_yield.keys():
        for key in sample_yield[id].keys():
            lane, index = key.split("_")
            if index == "Undetermined":
                continue
            status = "FAIL"
            mplx_min = int(0.5*EXPECTED_LANE_YIELD/pool_size[lane])
            if sample_yield[id][key][0] >= mplx_min:
                status = "PASS"
            out_data.append([status, "Sample yield", lane, sample_yield[id][key][1], id, sample_yield[id][key][0], "[Yield >= {}]".format(mplx_min)])

    # Check that the number of undetermined reads in each lane is below 10% of the total yield for the lane
    for lane, reads in lane_yield.items():
        status = "FAIL"
        key = "_".join([lane, "Undetermined"])
        # get(key,[0])[0] yields 0 for samples with no undetermined entry.
        undetermined = sum([counts.get(key, [0])[0] for counts in sample_yield.values()])
        cutoff = 0.1*reads
        if undetermined < cutoff:
            status = "PASS"
        out_data.append([status, "Index read", lane, undetermined, "[Undetermined < {}]".format(cutoff)])

    # Check that no overrepresented index sequence exists in undemultiplexed output
    self.log.debug("Fetching undemultiplexed barcode data for flowcell {}".format(fcid))
    undemux_data = self._get_undetermined_index_counts(fc_doc)
    if len(undemux_data) == 0:
        self.log.warn("No undemultiplexed barcode data available for flowcell {}".format(fcid))

    for lane, counts in undemux_data.items():
        # Same per-sample minimum as above, capped by the absolute
        # undemultiplexed-index ceiling; max(1, ...) guards empty pools.
        mplx_min = int(min(MAX_UNDEMULTIPLEXED_INDEX_COUNT, 0.5*EXPECTED_LANE_YIELD/max(1, pool_size[lane])))
        status = "N/A"
        if len(counts) > 0:
            for i in range(len(counts)):
                status = "FAIL"
                if int(counts[i][0]) < mplx_min:
                    status = "PASS"
                out_data.append([status, "Index", lane, counts[i][1], counts[i][2], counts[i][0], "[Undetermined index < {}]".format(mplx_min)])
        else:
            out_data.append([status, "Index", lane, "", "", mplx_min, "-"])

    # Emit the whole report as tab-separated rows.
    self.app._output_data['stdout'].write("\n".join(["\t".join([str(r) for r in row]) for row in out_data]))
def sample_status_note(project_name=None, flowcell=None, username=None, password=None, url=None,
                       ordered_million_reads=None, uppnex_id=None, customer_reference=None,
                       bc_count=None, project_alias=[], projectdb="projects", samplesdb="samples",
                       flowcelldb="flowcells", phix=None, is_paired=True, **kw):
    """Make a sample status note. Used keywords:

    :param project_name: project name
    :param flowcell: flowcell id
    :param username: db username
    :param password: db password
    :param url: db url
    :param ordered_million_reads: number of ordered reads in millions
    :param uppnex_id: the uppnex id
    :param customer_reference: customer project name
    :param project_alias: project alias name
    :param phix: phix error rate
    :param is_paired: True if run is paired-end, False for single-end

    Returns a dict of StringIO buffers ('stdout', 'stderr', 'debug').
    NOTE(review): this definition is token-identical to an earlier
    sample_status_note in this file (the earlier copy is shadowed at import
    time); the duplication itself should be resolved separately.
    NOTE(review): project_alias=[] is a mutable default argument (read-only
    here, but a None-sentinel would be safer).
    """
    # Cutoffs for flagging PhiX error ("HIGH") and quality ("LOW") in output.
    cutoffs = {
        "phix_err_cutoff" : 2.0,
        "qv_cutoff" : 30,
    }
    # Instrument metadata keyed by instrument id; 'default' is the fallback.
    instrument = _parse_instrument_config(os.path.expanduser(kw.get("instrument_config","")))
    instrument_dict = {i['instrument_id']: i for i in instrument}

    # parameters: template of per-sample report fields, copied per sample.
    parameters = {
        "project_name" : None,
        "start_date" : None,
        "FC_id" : None,
        "scilifelab_name" : None,
        "rounded_read_count" : None,
        "phix_error_rate" : None,
        "avg_quality_score" : None,
        "pct_q30_bases" : None,
        "success" : None,
        "run_mode":None,
        "is_paired":True
    }
    # key mapping from sample_run_metrics to parameter keys
    srm_to_parameter = {"project_name":"sample_prj", "FC_id":"flowcell",
                        "scilifelab_name":"barcode_name", "start_date":"date",
                        "rounded_read_count":"bc_count", "lane": "lane"}

    LOG.debug("got parameters {}".format(parameters))
    output_data = {'stdout':StringIO(), 'stderr':StringIO(), 'debug':StringIO()}
    if not _assert_flowcell_format(flowcell):
        LOG.warn("Wrong flowcell format {}; skipping. Please use the flowcell id (format \"[A-Z0-9\-]+\")".format(flowcell) )
        return output_data
    output_data = _update_sample_output_data(output_data, cutoffs)

    # Connect and run: sample-run, flowcell and project statusdb views.
    s_con = SampleRunMetricsConnection(dbname=samplesdb, username=username, password=password, url=url)
    fc_con = FlowcellRunMetricsConnection(dbname=flowcelldb, username=username, password=password, url=url)
    p_con = ProjectSummaryConnection(dbname=projectdb, username=username, password=password, url=url)

    # Set up paragraphs
    paragraphs = sample_note_paragraphs()
    headers = sample_note_headers()

    # Get project
    project = p_con.get_entry(project_name)
    source = p_con.get_info_source(project_name)
    if not project:
        LOG.warn("No such project '{}'".format(project_name))
        return output_data

    # Set samples list
    sample_run_list = _set_sample_run_list(project_name, flowcell, project_alias, s_con)
    if len(sample_run_list) == 0:
        LOG.warn("No samples for project '{}', flowcell '{}'. Maybe there are no sample run metrics in statusdb?".format(project_name, flowcell))
        return output_data

    # Set options: CLI overrides arrive as strings that may encode literals.
    ordered_million_reads = _literal_eval_option(ordered_million_reads)
    bc_count = _literal_eval_option(bc_count)
    phix = _literal_eval_option(phix)

    # Count number of times a sample has been run on a flowcell; if several, make lane-specific reports
    sample_count = Counter([x.get("barcode_name") for x in sample_run_list])

    # Loop samples and collect information.
    # fcdoc is cached; re-fetched only when the flowcell name changes.
    s_param_out = []
    fcdoc = None
    for s in sample_run_list:
        s_param = {}
        LOG.debug("working on sample '{}', sample run metrics name '{}', id '{}'".format(s.get("barcode_name", None), s.get("name", None), s.get("_id", None)))
        s_param.update(parameters)
        s_param.update({key:s[srm_to_parameter[key]] for key in srm_to_parameter.keys()})
        fc = "{}_{}".format(s.get("date"), s.get("flowcell"))
        # Get instrument
        # NOTE(review): bare except falls back to the 'default' instrument on
        # ANY failure; consider narrowing the exception type.
        try:
            s_param.update(instrument_dict[fc_con.get_instrument(str(fc))])
        except:
            LOG.warn("Failed to set instrument and software versions for flowcell {} in report due to missing RunInfo -> Instrument field in statusdb. Either rerun 'pm qc update-qc' or search-and-replace 'NN' in the sample report.".format(fc))
            s_param.update(instrument_dict['default'])
        # Get run mode
        if not fcdoc or fcdoc.get("name") != fc:
            fcdoc = fc_con.get_entry(fc)
        runp = fcdoc.get("RunParameters",{})
        # MCSVersion presence is used as the MiSeq marker.
        s_param["sequencing_platform"] = "MiSeq" if "MCSVersion" in runp else "HiSeq2500"
        s_param["clustering_method"] = "onboard clustering" if runp.get("ClusteringChoice","") == "OnBoardClustering" or s_param["sequencing_platform"] == "MiSeq" else "cBot"
        s_param["sequencing_setup"] = fcdoc.get("run_setup")
        s_param["sequencing_mode"] = runp.get("RunMode","High Output")
        s_param["sequencing_software"] = "RTA {}".format(runp.get("RTAVersion"))
        if s_param["sequencing_platform"] == "MiSeq":
            s_param["sequencing_software"] = "MCS {}/{}".format(runp.get("MCSVersion"),s_param["sequencing_software"])
        else:
            s_param["sequencing_software"] = "{} {}/{}".format(runp.get("ApplicationName"),runp.get("ApplicationVersion"),s_param["sequencing_software"])
        # Flowcell metadata wins over the is_paired argument.
        s_param["is_paired"] = fc_con.is_paired_end(str(fc))
        if s_param["is_paired"] is None:
            LOG.warn("Could not determine run setup for flowcell {}. Will assume paired-end.".format(fc))
            s_param["is_paired"] = True
        s_param.update(software_versions)
        s_param["phix_error_rate"] = fc_con.get_phix_error_rate(str(fc), s["lane"])
        # Command-line phix option overrides the database value per lane.
        if phix:
            s_param["phix_error_rate"] = _get_phix_error_rate(s["lane"], phix)
        # Get quality score from demultiplex stats, if that fails
        # (which it shouldn't), fall back on fastqc data.
        (avg_quality_score, pct_q30_bases) = fc_con.get_barcode_lane_statistics(project_name, s.get("barcode_name"), fc, s["lane"])
        s_param['avg_quality_score'] = avg_quality_score if avg_quality_score else calc_avg_qv(s)
        if not s_param['avg_quality_score']:
            LOG.warn("Setting average quality failed for sample {}, id {}".format(s.get("name"), s.get("_id")))
        s_param['pct_q30_bases'] = pct_q30_bases
        if not s_param['pct_q30_bases']:
            LOG.warn("Setting % of >= Q30 Bases (PF) failed for sample {}, id {}".format(s.get("name"), s.get("_id")))
        # Compare phix error and qv to cutoffs; -1 is the "no data" sentinel.
        err_stat = "OK"
        qv_stat = "OK"
        if s_param["phix_error_rate"] > cutoffs["phix_err_cutoff"]:
            err_stat = "HIGH"
        elif s_param["phix_error_rate"] == -1:
            err_stat = "N/A"
        if s_param["avg_quality_score"] < cutoffs["qv_cutoff"]:
            qv_stat = "LOW"
        output_data["stdout"].write("{:>18}\t{:>6}\t{:>12}\t{:>12}\t{:>12}\t{:>12}\n".format(s["barcode_name"], s["lane"], s_param["phix_error_rate"], err_stat, s_param["avg_quality_score"], qv_stat))

        # Update/set remaning sample run parameters, falling back on project defaults if *key* is missing
        s_param['ordered_amount'] = s_param.get('ordered_amount', p_con.get_ordered_amount(project_name, samples=p_con.get_entry(project_name,'samples')))
        s_param['customer_reference'] = s_param.get('customer_reference', project.get('customer_reference'))
        s_param['uppnex_project_id'] = s_param.get('uppnex_project_id', project.get('uppnex_id'))

        # Override database settings if options passed at command line
        if ordered_million_reads:
            s_param["ordered_amount"] = _get_ordered_million_reads(s["barcode_name"], ordered_million_reads)
        if bc_count:
            s_param["rounded_read_count"] = _round_read_count_in_millions(_get_bc_count(s["barcode_name"], bc_count, s))
        else:
            s_param["rounded_read_count"] = _round_read_count_in_millions(s_param["rounded_read_count"])
        if uppnex_id:
            s_param["uppnex_project_id"] = uppnex_id
        if customer_reference:
            s_param["customer_reference"] = customer_reference

        # Get the project sample name corresponding to the sample run
        project_sample = p_con.get_project_sample(project_name, s.get("project_sample_name", None))
        if project_sample:
            LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample["sample_name"]))
            project_sample_item = project_sample['project_sample']
            # Set project_sample_d: a dictionary mapping from sample run metrics name to sample run metrics database id
            project_sample_d = _set_project_sample_dict(project_sample_item, source)
            if not project_sample_d:
                LOG.warn("No sample_run_metrics information for sample '{}', barcode name '{}', id '{}'\n\tProject summary information {}".format(s["name"], s["barcode_name"], s["_id"], project_sample))
            # Check if sample run metrics name present in project database: if so, verify that database ids are consistent
            if s["name"] not in project_sample_d.keys():
                LOG.warn("no such sample run metrics '{}' in project sample run metrics dictionary".format(s["name"]) )
            else:
                if s["_id"] == project_sample_d[s["name"]]:
                    LOG.debug("project sample run metrics mapping found: '{}' : '{}'".format(s["name"], project_sample_d[s["name"]]))
                else:
                    LOG.warn("inconsistent mapping for '{}': '{}' != '{}' (project summary id)".format(s["name"], s["_id"], project_sample_d[s["name"]]))
            s_param['customer_name'] = project_sample_item.get("customer_name", None)
            # Always normalize submitted id, since module textttable does not support unicode
            # (Python 2 `unicode` check; ascii-transliterate via NFKD.)
            if type(s_param['customer_name']) is unicode:
                s_param['customer_name'] = unicodedata.normalize('NFKD', s_param['customer_name']).encode('ascii', 'ignore')

        # No project sample found. Manual upload to database necessary.
        else:
            s_param['customer_name'] = None
            LOG.warn("No project sample name found for sample run name '{}'".format(s["barcode_name"]))
            LOG.info("Please run 'pm qc upload-qc FLOWCELL_ID --extensive-matching' to update project sample names ")
            LOG.info("or 'pm qc update --sample_prj PROJECT_NAME --names BARCODE_TO_SAMPLE_MAP to update project sample names.")
            LOG.info("Please refer to the pm documentation for examples.")
            # Interactive confirmation gate unless force was passed.
            query_ok(force=kw.get("force", False))

        # Finally assess sequencing success, update parameters and set outputs.
        # Missing/sentinel values become "N/A" in the rendered note.
        s_param['success'] = sequencing_success(s_param, cutoffs)
        s_param.update({k:"N/A" for k in s_param.keys() if s_param[k] is None or s_param[k] == "" or s_param[k] == -1.0})
        # Lane-specific filename only for samples run more than once here.
        if sample_count[s.get("barcode_name")] > 1:
            outfile = "{}_{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"], s["lane"])
        else:
            outfile = "{}_{}_{}.pdf".format(s["barcode_name"], s["date"], s["flowcell"])
        s_param["outfile"] = outfile
        s_param_out.append(s_param)

    # Write final output to reportlab and rst files.
    # NOTE(review): summary filenames reuse `s` (last loop sample); fine when
    # all samples share one flowcell — confirm for multi-flowcell projects.
    output_data["debug"].write(json.dumps({'s_param': s_param_out, 'sample_runs':{s["name"]:s["barcode_name"] for s in sample_run_list}}))
    notes = [make_note(headers=headers, paragraphs=paragraphs, **sp) for sp in s_param_out]
    rest_notes = make_sample_rest_notes("{}_{}_{}_sample_summary.rst".format(project_name, s.get("date", None), s.get("flowcell", None)), s_param_out)
    concatenate_notes(notes, "{}_{}_{}_sample_summary.pdf".format(project_name, s.get("date", None), s.get("flowcell", None)))
    return output_data
def multiplex_qc(self):
    """Run demultiplexing QC checks for one flowcell and write a TSV report.

    Performs PASS/FAIL checks on the flowcell's run metrics document:
    samplesheet vs Demultiplex_Stats consistency, per-lane PhiX error rate,
    per-sample %>=Q30, lane/sample yields, undetermined-read fraction, and
    overrepresented undetermined indexes. One row per check is appended to
    self.app._output_data['stdout']. Returns None.

    NOTE(review): this definition is token-identical to an earlier
    multiplex_qc in this file; the duplication should be resolved separately.
    """
    # QC thresholds; EXPECTED_LANE_YIELD is in reads.
    MAX_UNDEMULTIPLEXED_INDEX_COUNT = 1000000
    EXPECTED_LANE_YIELD = 143000000
    MAX_PHIX_ERROR_RATE = 2.0
    MIN_PHIX_ERROR_RATE = 0.0
    MIN_GTQ30 = 80.0
    read_pairs = True

    out_data = []

    if not self._check_pargs(['flowcell']):
        return

    # db url: CLI argument beats app config.
    url = self.pargs.url if self.pargs.url else self.app.config.get(
        "db", "url")
    if not url:
        self.app.log.warn("Please provide a valid url: got {}".format(url))
        return

    # Construct the short form of the fcid (first and last "_" fields of
    # the run-folder basename: date and flowcell id).
    sp = os.path.basename(self.pargs.flowcell).split("_")
    fcid = "_".join([sp[0], sp[-1]])

    # Get a connection to the flowcell database and fetch the corresponding document
    # NOTE(review): format string below has no placeholder; fcid is dropped.
    self.log.debug("Connecting to flowcell database".format(fcid))
    fc_con = FlowcellRunMetricsConnection(dbname=self.app.config.get(
        "db", "flowcells"), **vars(self.app.pargs))
    self.log.debug(
        "Fetching run metrics entry for flowcell {}".format(fcid))
    fc_doc = fc_con.get_entry(fcid)
    if not fc_doc:
        self.log.warn(
            "Could not fetch run metrics entry for flowcell {}".format(
                fcid))
        return

    # Adjust the read pairs variable according to the run setup
    read_pairs = fc_con.is_paired_end(fcid)

    # Get the yield per sample from the Demultiplex_Stats
    self.log.debug("Getting yield for flowcell {}".format(fcid))
    sample_yield = self._get_yield_per_sample(fc_doc, read_pairs)

    # Get the yield per lane from the Demultiplex_Stats
    self.log.debug("Getting lane yield for flowcell {}".format(fcid))
    lane_yield = self._get_yield_per_lane(fc_doc, read_pairs)
    lanes = lane_yield.keys()

    # Get the number of samples in the pools from the Demultiplex_Stats
    self.log.debug("Getting lane pool sizes for flowcell {}".format(fcid))
    pool_size = self._get_pool_size(fc_doc)

    # Get the sample information from the csv samplesheet
    self.log.debug(
        "Getting csv samplesheet data for flowcell {}".format(fcid))
    ssheet_samples = self._get_samplesheet_sample_data(fc_doc)
    if len(ssheet_samples) == 0:
        self.log.warn(
            "No samplesheet data available for flowcell {}".format(fcid))

    # Verify that all samples in samplesheet have reported metrics;
    # matched entries get a 'verified' marker. Keys are "<lane>_<index>".
    for id in ssheet_samples.keys():
        for key in ssheet_samples[id].keys():
            lane, index = key.split("_")
            project = ssheet_samples[id][key][0]
            if id not in sample_yield or \
            key not in sample_yield[id]:
                self.log.warn("Sample {} from project {} is in samplesheet but no yield was reported in " \
                              "Demultiplex_Stats.htm for lane {} and index {}".format(id, project, lane, index))
                continue
            sample_yield[id][key].append('verified')

    # Check that all samples in Demultiplex_Stats have entries in Samplesheet
    for id in sample_yield.keys():
        for key in sample_yield[id].keys():
            lane, index = key.split("_")
            if "verified" not in sample_yield[id][key] and \
            index != "Undetermined":
                self.log.warn("Sample {} from project {}, with index {} on lane {} is in Demultiplex_Stats " \
                              "but no corresponding entry is present in SampleSheet".format(id, sample_yield[id][key][1], index, lane))

    # Check the PhiX error rate for each lane. Negative rate = unavailable
    # (reported N/A); a rate equal to MIN (0.0) is treated as FAIL.
    self.log.debug("Getting PhiX error rates for flowcell {}".format(fcid))
    for lane in lanes:
        status = "N/A"
        err_rate = fc_con.get_phix_error_rate(fcid, lane)
        if err_rate < 0:
            self.log.warn(
                "Could not get PhiX error rate for lane {} on flowcell {}".
                format(lane, fcid))
        elif err_rate <= MIN_PHIX_ERROR_RATE or err_rate > MAX_PHIX_ERROR_RATE:
            status = "FAIL"
        else:
            status = "PASS"
        out_data.append([
            status, "PhiX error rate", lane, err_rate,
            "{} < PhiX e (%) <= {}".format(MIN_PHIX_ERROR_RATE,
                                           MAX_PHIX_ERROR_RATE)
        ])

    # Check the %>=Q30 value for each sample
    sample_quality = self._get_quality_per_sample(fc_doc)
    for id in sample_quality.keys():
        for key in sample_quality[id].keys():
            lane, index = key.split("_")
            status = "FAIL"
            if float(sample_quality[id][key][0]) >= MIN_GTQ30:
                status = "PASS"
            out_data.append([
                status, "Sample quality", lane, sample_quality[id][key][2],
                id, sample_quality[id][key][0],
                "[%>=Q30 >= {}%]".format(MIN_GTQ30)
            ])

    # Check that each lane received the minimum amount of reads
    for lane, reads in lane_yield.items():
        status = "FAIL"
        if reads >= EXPECTED_LANE_YIELD:
            status = "PASS"
        out_data.append([
            status, "Lane yield", lane, reads,
            "[Yield >= {}]".format(EXPECTED_LANE_YIELD)
        ])

    # Check that all samples in the pool have received a minimum number of
    # reads: half the expected lane yield spread evenly over the pool.
    for id in sample_yield.keys():
        for key in sample_yield[id].keys():
            lane, index = key.split("_")
            if index == "Undetermined":
                continue
            status = "FAIL"
            mplx_min = int(0.5 * EXPECTED_LANE_YIELD / pool_size[lane])
            if sample_yield[id][key][0] >= mplx_min:
                status = "PASS"
            out_data.append([
                status, "Sample yield", lane, sample_yield[id][key][1], id,
                sample_yield[id][key][0], "[Yield >= {}]".format(mplx_min)
            ])

    # Check that the number of undetermined reads in each lane is below 10% of the total yield for the lane
    for lane, reads in lane_yield.items():
        status = "FAIL"
        key = "_".join([lane, "Undetermined"])
        # get(key,[0])[0] yields 0 for samples lacking an undetermined entry.
        undetermined = sum(
            [counts.get(key, [0])[0] for counts in sample_yield.values()])
        cutoff = 0.1 * reads
        if undetermined < cutoff:
            status = "PASS"
        out_data.append([
            status, "Index read", lane, undetermined,
            "[Undetermined < {}]".format(cutoff)
        ])

    # Check that no overrepresented index sequence exists in undemultiplexed output
    self.log.debug(
        "Fetching undemultiplexed barcode data for flowcell {}".format(
            fcid))
    undemux_data = self._get_undetermined_index_counts(fc_doc)
    if len(undemux_data) == 0:
        self.log.warn(
            "No undemultiplexed barcode data available for flowcell {}".
            format(fcid))

    for lane, counts in undemux_data.items():
        # Per-sample minimum, capped by the absolute undemultiplexed-index
        # ceiling; max(1, ...) guards against an empty pool.
        mplx_min = int(
            min(MAX_UNDEMULTIPLEXED_INDEX_COUNT,
                0.5 * EXPECTED_LANE_YIELD / max(1, pool_size[lane])))
        status = "N/A"
        if len(counts) > 0:
            for i in range(len(counts)):
                status = "FAIL"
                if int(counts[i][0]) < mplx_min:
                    status = "PASS"
                out_data.append([
                    status, "Index", lane, counts[i][1], counts[i][2],
                    counts[i][0],
                    "[Undetermined index < {}]".format(mplx_min)
                ])
        else:
            out_data.append([status, "Index", lane, "", "", mplx_min, "-"])

    # Emit the whole report as tab-separated rows.
    self.app._output_data['stdout'].write("\n".join(
        ["\t".join([str(r) for r in row]) for row in out_data]))