Beispiel #1
0
def getPeptideLengthMetrics(
    identification_sequence_metrics: mzqc.QualityMetric
) -> List[mzqc.QualityMetric]:
    """
    getPeptideLengthMetrics calculates the descriptive statistics metrics for identified sequences' length

    From the proto-metrics on identification sequences, the function calculates descriptive statistics metrics for
    the distribution of peptide lengths (number of residue letters, modifications excluded).
    Namely, mean, standard deviation, Quartiles, and 1.5*IQR outliers.

    Modification annotations in parentheses are removed before counting, including nested ones
    such as 'R(Label:13C(6)15N(4))'; any remaining non-letter character (terminal dots, digits,
    brackets) is dropped as well.

    Parameters
    ----------
    identification_sequence_metrics : mzqc.QualityMetric
        QualityMetric with 'peptide' value, filtered for final outcome

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics
    """
    metrics: List[mzqc.QualityMetric] = list()

    # Matches only an innermost parenthesised group; applied repeatedly this peels
    # nested modification names from the inside out instead of leaving fragments behind.
    innermost_mod = re.compile(r'\([^()]*\)')
    non_residue = re.compile(r'[^A-Za-z]')

    def residue_count(sequence: str) -> int:
        """Number of letters left once all parenthesised annotations are removed."""
        stripped, removed = innermost_mod.subn('', sequence)
        while removed:
            stripped, removed = innermost_mod.subn('', stripped)
        return len(non_residue.sub('', stripped))

    # e.g. '.(iTRAQ4plex)M(Oxidation)C(Carbamidomethyl)HNVNR' -> 'MCHNVNR' -> 7
    lengths = np.array([
        residue_count(x)
        for x in identification_sequence_metrics.value['peptide']
    ])

    q1, q2, q3, s, m, ol = utils.extractDistributionStats(lengths)
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Identified peptide lengths Q1, Q2, Q3",
                           value=[q1, q2, q3]))

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Identified peptide lengths sigma",
                           value=s))

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Identified peptide lengths mean",
                           value=m))

    metrics.append(
        mzqc.QualityMetric(
            cvRef="QC",
            accession="QC:0000000",
            name="Identified peptide lengths +/-1.5*IQR outlier",
            value=ol))

    return metrics
Beispiel #2
0
def describeMSdensity(spectrum_acquisition_metrics_MS:mzqc.QualityMetric, start_time: datetime.datetime, ms_level: int) -> List[mzqc.QualityMetric]:
    """
    describeMSdensity calculates the descriptive statistics metrics for spectra's peak density from a given level.

    From the proto-metrics on spectrum acquisition for a given MS level, the function calculates descriptive statistics metrics for
    the distribution of peak density from all involved mass spectra.
    Namely, mean, standard deviation, Quartiles, and 1.5*IQR outliers.

    Parameters
    ----------
    spectrum_acquisition_metrics_MS : mzqc.QualityMetric
        Proto-metric containing 'peakcount' values for all involved spectra
    start_time : datetime.datetime
        MS run start time. Kept for interface compatibility; the statistics do not depend on it.
    ms_level : int
        The MS level, inserted into the names of the produced metrics

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics (quartiles, sigma, mean, outliers - in that order)
    """
    metrics: List[mzqc.QualityMetric] = list()

    # Only the peak counts feed the statistics; absolute acquisition times are not needed here.
    arr = np.array(spectrum_acquisition_metrics_MS.value['peakcount'])
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(arr)

    named_values = (
        ("Q1, Q2, Q3 of peak density for MS level {ms_level} collection", [q1, q2, q3]),
        ("Sigma of peak density for MS level {ms_level} collection", s),
        ("Mean of peak density for MS level {ms_level} collection", m),
        ("Peak density for MS level {ms_level} collection +/-1.5*IQR outlier", ol),
    )
    for name_template, value in named_values:
        metrics.append(
            mzqc.QualityMetric(cvRef="QC",
                    accession="QC:0000000",
                    name=name_template.format(ms_level=ms_level),
                    value=value)
        )

    return metrics
Beispiel #3
0
def describeIdentificationScores(identification_scoring_metrics:mzqc.QualityMetric, score_type:str) -> List[mzqc.QualityMetric]:
    """
    describeIdentificationScores summarises the identification scores of identified tandem spectra.

    The scores stored under `score_type` are reduced to quartiles, standard deviation, mean and the
    values lying more than 1.5*IQR below Q1 or above Q3.

    Parameters
    ----------
    identification_scoring_metrics : mzqc.QualityMetric
        The proto-metrics on identification scores containing `score_type` values.
    score_type : str
        Key under which the score values are stored in `identification_scoring_metrics.value`

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics (quartiles, sigma, mean, outliers - in that order)
    """
    metrics: List[mzqc.QualityMetric] = list()

    raw_scores = identification_scoring_metrics.value[score_type]
    quartiles = np.quantile(raw_scores, [.25,.5,.75])
    sigma = np.std(raw_scores)
    mean = np.mean(raw_scores)

    # Tukey fences: anything beyond 1.5 interquartile ranges from Q1/Q3 is an outlier.
    score_array = np.array(raw_scores)
    first_q, third_q = quartiles[0], quartiles[2]
    lower_fence = first_q-(1.5*(third_q-first_q))
    upper_fence = third_q+(1.5*(third_q-first_q))
    outside = (score_array<lower_fence) | (score_array>upper_fence)
    outliers = score_array[outside]

    for metric_name, metric_value in (
        ("scores Q1, Q2, Q3", list(quartiles)),
        ("score sigma", sigma),
        ("score mean", mean),
        ("score +/-1.5*IQR outlier", outliers),
    ):
        metrics.append(mzqc.QualityMetric(cvRef="QC",
                    accession="QC:0000000",
                    name=metric_name,
                    value=metric_value)
        )

    return metrics
Beispiel #4
0
def describeMSCollectionTime(trap_metrics:mzqc.QualityMetric, ms_level: int) -> List[mzqc.QualityMetric]:
    """
    describeMSCollectionTime calculates the descriptive statistics metrics for ion collection times of spectra from a given level.

    From the proto-metrics on ion collection for a given MS level, the function calculates descriptive statistics metrics for
    the distribution of ion collection times from all involved mass spectra.
    Namely, mean, standard deviation, Quartiles, and 1.5*IQR outliers.

    Parameters
    ----------
    trap_metrics : mzqc.QualityMetric
        The proto-metrics on ion collection times from the respective MS level containing 'traptime' values.
        A plain mapping with a 'traptime' key is accepted as well.
    ms_level : int
        The MS level, inserted into the names of the produced metrics

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics (quartiles, sigma, mean, outliers - in that order)
    """
    metrics: List[mzqc.QualityMetric] = list()

    # The documented input is a QualityMetric, whose table is reached through `.value`;
    # indexing the metric object itself is presumably unsupported. Objects without a
    # `.value` attribute (e.g. a bare dict) are still indexed directly.
    table = getattr(trap_metrics, 'value', trap_metrics)
    arr = np.array(table['traptime'])
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(arr)

    # NOTE(review): the last two names say "frequency" although the values are trap times -
    # looks like a copy/paste leftover. Left unchanged because consumers may look metrics up by name.
    named_values = (
        ("Q1, Q2, Q3 for MS level {ms_level} trap time collection", [q1, q2, q3]),
        ("Sigma for MS level {ms_level} trap time collection", s),
        ("Mean of frequency for MS level {ms_level} collection", m),
        ("Frequency for MS level {ms_level} collection +/-1.5*IQR outlier", ol),
    )
    for name_template, value in named_values:
        metrics.append(
            mzqc.QualityMetric(cvRef="QC",
                    accession="QC:0000000",
                    name=name_template.format(ms_level=ms_level),
                    value=value)
        )

    return metrics
Beispiel #5
0
def getSamplingRatios(identification_sequence_metrics:mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    getSamplingRatios tabulates how often peptide sequences were identified repeatedly.

    For every sampling rate r from 1 up to the highest observed one, the result states how many
    distinct peptide sequences occur exactly r times in the 'peptide' column. Rates that were
    never observed are listed with a frequency of 0.

    Parameters
    ----------
    identification_sequence_metrics : mzqc.QualityMetric
        The proto-metrics on identified sequences containing 'peptide' (sequence) values.

    Returns
    -------
    List[mzqc.QualityMetric]
        A single-element list holding the 'Sampling frequencies' metric
    """
    metrics: List[mzqc.QualityMetric] = list()

    # Occurrences per distinct peptide, then how many peptides share each occurrence count.
    _, occurrences = np.unique(identification_sequence_metrics.value['peptide'], return_counts=True)
    observed_rates, peptides_per_rate = np.unique(occurrences, return_counts=True)

    # Dense axis 1..max so that unobserved rates show up explicitly with zero peptides.
    highest_rate = np.max(observed_rates)
    dense_counts = np.zeros( highest_rate )
    dense_rates = np.arange(1, highest_rate+1)
    # Rate r sits at position r-1 of the dense axis.
    dense_counts[observed_rates - 1] = peptides_per_rate

    sampling_table = {'sampling rate': list(dense_rates),
                      'frequencies': list(dense_counts)}
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Sampling frequencies",
                value=sampling_table)
    )

    return metrics
Beispiel #6
0
 def test_NumpyValues(self):
     """A metric whose value holds a float32 numpy array must serialise to the NPQM JSON string."""
     metric = qc.QualityMetric()
     metric.accession = "QC:123"
     metric.name = "einszweidrei"
     fractions = np.array([1 / 9, 2 / 8, 3 / 7], dtype=np.float32)
     metric.value = {"np": fractions}
     serialised = qc.JsonSerialisable.ToJson(metric)
     assert serialised == NPQM
Beispiel #7
0
 def test_DateTime(self):
     """Building an MzQcFile with an ISO-formatted creation date must not raise."""
     try:
         # Everything, including the timestamp, is evaluated inside the guarded region.
         file_fields = dict(version="0.1.0",
                            creationDate=datetime.now().isoformat(),
                            runQualities=[],
                            setQualities=[],
                            controlledVocabularies=[])
         qc.MzQcFile(**file_fields)
     except Exception as error:
         raise AssertionError(f"An unexpected exception {error} raised.")
Beispiel #8
0
def mzqc_assembly(rqs, sqs, out):
    """
    Wrap run and set qualities into an MzQcFile object.

    The file is stamped with the current time and always references the QC and MS
    controlled vocabularies.

    Parameters
    ----------
    rqs
        Passed on as `runQualities`
    sqs
        Passed on as `setQualities`
    out
        Not used by this function

    Returns
    -------
    qc.MzQcFile
        The assembled file object
    """
    # TODO check all the metrics to see which ontologies were used
    vocabularies = [
        qc.ControlledVocabulary(
            ref="QC",
            name="Proteomics Standards Initiative Quality Control Ontology",
            version="0.1.0",
            uri="https://github.com/HUPO-PSI/qcML-development/blob/master/cv/v0_1_0/qc-cv.obo"),
        qc.ControlledVocabulary(
            ref="MS",
            name="Proteomics Standards Initiative Mass Spectrometry Ontology",
            version="4.1.7",
            uri="https://github.com/HUPO-PSI/psi-ms-CV/blob/master/psi-ms.obo"),
    ]
    created = datetime.now().isoformat()

    return qc.MzQcFile(version="0.1.0",
                       creationDate=created,
                       runQualities=rqs,
                       setQualities=sqs,
                       controlledVocabularies=vocabularies)
Beispiel #9
0
def describePrecursorIntensity(tandem_spectrum_metrics_MS2:mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describePrecursorIntensity calculates the descriptive statistics metrics for the precursor intensities of tandem spectra.

    From the proto-metrics on tandem spectra, the function calculates descriptive statistics metrics for
    the distribution of precursor intensity. Namely, maximum, minimum, mean, standard deviation, Quartiles,
    and 1.5*IQR outliers.

    Parameters
    ----------
    tandem_spectrum_metrics_MS2 : mzqc.QualityMetric
        Proto-metric of tandem spectra containing values for 'precursor_intensity'

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics (maximum, minimum, quartiles, sigma, mean, outliers - in that order)

    Raises
    ------
    ValueError
        If there are no precursor intensities (maximum/minimum of an empty array)
    """
    metrics: List[mzqc.QualityMetric] = list()

    arr = np.array(tandem_spectrum_metrics_MS2.value['precursor_intensity'])
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(arr)

    # numpy reductions instead of the builtins: a single vectorised pass, and a NaN in the
    # data yields NaN rather than a result that depends on where the NaN happens to sit.
    # NOTE(review): "Minmum" is a typo in the emitted metric name; kept as-is because
    # consumers may look the metric up by its name.
    named_values = (
        ("Maximum precursor intensity", np.max(arr)),
        ("Minmum precursor intensity", np.min(arr)),
        ("Q1, Q2, Q3 of precursor intensities", [q1, q2, q3]),
        ("Sigma of precursor intensities", s),
        ("Mean of precursor intensities", m),
        ("Precursor intensity +/-1.5*IQR outlier", ol),
    )
    for name, value in named_values:
        metrics.append(mzqc.QualityMetric(cvRef="QC",
                    accession="QC:0000000",
                    name=name,
                    value=value)
        )

    return metrics
Beispiel #10
0
def getESIstability(ion_intensity_metric:mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    getESIstability calculates the count of signal jumps and falls during the course of a mass-spectrometry run's acquisition time.

    Counts the number of signal jumps/falls of more than 10-fold magnitude between consecutive
    intensity values. A jump is an increase (the later value exceeds ten times the earlier one),
    a fall is a decrease (the earlier value exceeds ten times the later one).

    Parameters
    ----------
    ion_intensity_metric : mzqc.QualityMetric
        Proto-metric containing the 'int' values of signal intensity in timely order

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics: the jump count followed by the fall count
    """
    metrics: List[mzqc.QualityMetric] = list()

    intensities = np.asarray(ion_intensity_metric.value['int'], dtype=float)
    earlier = intensities[:-1]
    later = intensities[1:]

    # Compare by multiplication rather than by dividing consecutive values: the direction
    # of the change is explicit and zero intensities need no division (no inf/NaN warnings).
    # With fewer than two values both slices are empty and both counts are 0.
    jumps = int(np.count_nonzero(later > 10 * earlier))
    falls = int(np.count_nonzero(earlier > 10 * later))

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="signal jump (10x) count",
                value=jumps)
    )

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="signal fall (10x) count",
                value=falls)
    )
    return metrics
Beispiel #11
0
def getAnalysedSignalMetrics(tandem_spectrum_metrics_MS2:mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    getAnalysedSignalMetrics calculates a metric on the proportion of signal analysed with subsequent tandem spectra.

    Among the tandem spectra whose survey scan maximum intensity lies strictly below the median of
    all survey scan maxima, the function takes the median of (survey scan maximum intensity /
    sampled precursor intensity).

    Parameters
    ----------
    tandem_spectrum_metrics_MS2 : mzqc.QualityMetric
        Proto-metric of tandem spectra containing values for 'RT', 'precursor_mz', 'precursor_intensity', 'surveyscan_intensity_sum', 'surveyscan_intensity_max'.

    Returns
    -------
    List[mzqc.QualityMetric]
        A single-element list holding the median ratio metric
    """
    metrics: List[mzqc.QualityMetric] = list()

    # One row per column name; the row positions below refer to this order.
    column_order = ('RT', 'precursor_mz', 'precursor_intensity',
                    'surveyscan_intensity_sum', 'surveyscan_intensity_max')
    precursor_row = 2
    survey_max_row = 4
    table = np.array([tandem_spectrum_metrics_MS2.value[column] for column in column_order])

    # DS-3B reimpl.: median( (surv max / prec int) for bottom 50% of all precursors )
    ascending = table[survey_max_row].argsort()
    table = table[:, ascending]

    # Ratio of MS1 maximum to MS1 value at sampling for bottom 50% of analytes by MS1 maximum intensity (1 = sampled at peak maxima)
    survey_maxima = table[survey_max_row]
    in_lower_half = survey_maxima < np.median(survey_maxima)
    lower_half = table[:, in_lower_half]
    ratios = lower_half[survey_max_row] / lower_half[precursor_row]

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Median ratio of max survey scan intensity over sampled precursor intensity for the bottom (by MS1 max) half of MS2",
                value=np.median(ratios))
    )

    return metrics
Beispiel #12
0
def getCoverageRatios(pro_ids: oms.ProteinIdentification,
                      pep_ids: List[oms.PeptideIdentification],
                      fasta: Dict[str, SeqRecord.SeqRecord] = None,
                      fetch=False) -> List[mzqc.QualityMetric]:
    """
    getCoverageRatios calculates the coverage ratios per protein from the identified searchspace.

    Calculating the coverage from all individual peptide identification also requires all protein
    sequences expected to be known. Sequences already present on the protein hits are used as is;
    for the others there are two options, either take the sequences from the originally used
    fasta, or try to retrieve the sequences via UniProt through the accessions of the hits.
    Proteins that end up without a sequence are reported with 'NA' coverage and length.

    Note that the hits of `pro_ids` are replaced by the hits that have a sequence, and that
    coverage is computed on `pro_ids` in place.

    Parameters
    ----------
    pro_ids : oms.ProteinIdentification
        The PyOpenMS ProteinIdentification as from reading a common identification file
    pep_ids : List[oms.PeptideIdentification]
        List of PyOpenMS PeptideIdentification as from reading a common identification file
    fasta : Dict[str, SeqRecord.SeqRecord], optional
        Dictionary of sequences from a fasta file keyed by accession, by default None
    fetch : bool, optional
        If set true, will attempt to retrieve sequences by accession, is ignored if `fasta` is provided, by default False

    Returns
    -------
    List[mzqc.QualityMetric]
        A single-element list holding the 'Protein coverage' table with the columns
        'Accession', 'Coverage', 'Length' and 'TD' (target/decoy)
    """
    metrics: List[mzqc.QualityMetric] = list()

    def target_or_decoy(accession: str) -> str:
        # TODO figure out decoy string by fasta
        return 'decoy' if 'decoy' in accession.lower() else 'target'

    # Sequences are filled in on copies; the caller's hits are only replaced further below.
    hits = [oms.ProteinHit(p) for p in pro_ids.getHits()]

    if fasta:
        for hit in hits:
            if not hit.getSequence():
                record = fasta.get(hit.getAccession())
                if record is not None:
                    hit.setSequence(str(record.seq))

    # Decide on the copies (which may just have been filled), not on the original hits.
    missing_acc = [hit.getAccession() for hit in hits if not hit.getSequence()]

    if missing_acc and fetch and not fasta:
        # Index each retrieved record by its full id and, for ids of the form 'db|ACCESSION|name',
        # additionally by the accession between the pipes.
        accession_in_id = re.compile(r'\w*\|(\w*)\|\w*')
        uniprot = dict()
        for record in utils.getUniProtSequences(missing_acc):
            uniprot[record.id] = record
            found = accession_in_id.search(record.id)
            if found and found.group(1):
                uniprot.setdefault(found.group(1), record)
        for hit in hits:
            if not hit.getSequence():
                record = uniprot.get(hit.getAccession())
                if record is not None:
                    hit.setSequence(str(record.seq))

    coverage_tab: Dict[str, List[Any]] = defaultdict(list)
    na = [hit.getAccession() for hit in hits if not hit.getSequence()]

    # Coverage can only be computed for hits with a known sequence.
    pro_ids.setHits([hit for hit in hits if hit.getSequence()])
    pro_ids.computeCoverage(pep_ids)

    for p in pro_ids.getHits():
        coverage_tab['Accession'].append(p.getAccession())
        coverage_tab['Coverage'].append(p.getCoverage())
        coverage_tab['Length'].append(len(p.getSequence()))
        coverage_tab['TD'].append(target_or_decoy(p.getAccession()))

    # `na` holds plain accession strings of the proteins without a sequence.
    for accession in na:
        coverage_tab['Accession'].append(accession)
        coverage_tab['Coverage'].append('NA')
        coverage_tab['Length'].append('NA')
        coverage_tab['TD'].append(target_or_decoy(accession))

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Protein coverage",
                           value=coverage_tab))

    return metrics
Beispiel #13
0
def describeChargeRatios(identification_scoring_metrics:mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describeChargeRatios calculates the charge ratio metrics of identified tandem spectra.

    From the proto-metrics on identification scores, the function calculates the ratios of the number of
    1+, 3+ and 4+ identifications over the number of 2+ identifications (IS3A, IS3B, IS3C), as well as
    the median (IS3X) and mean (IS3Y) charge. The ratios are NaN if there is no 2+ identification.

    Parameters
    ----------
    identification_scoring_metrics : mzqc.QualityMetric
        The proto-metrics on identification scores containing 'c' values.

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics; empty (with a warning) if there are no 'c' values
    """
    metrics: List[mzqc.QualityMetric] = list()
    # The charge column lives in the metric's value table, so membership is tested there.
    if 'c' not in identification_scoring_metrics.value:
        warnings.warn("No charges in given annotation, ignoring charge ratio metrics.", Warning)
        return metrics

    charges = identification_scoring_metrics.value['c']
    unique_charges, charge_freq = np.unique(charges, return_counts=True)
    # charge state -> number of identifications; absent charge states count as 0
    count_by_charge = dict(zip(unique_charges.tolist(), charge_freq.tolist()))
    c1n = count_by_charge.get(1, 0)
    c2n = count_by_charge.get(2, 0)
    c3n = count_by_charge.get(3, 0)
    c4n = count_by_charge.get(4, 0)

    def over_doubly_charged(count):
        """Ratio of `count` to the number of 2+ identifications, NaN if there are none."""
        return (count / c2n) if c2n > 0 else np.nan

    # IS3A <- c1n / c2n
    # IS3B <- c3n / c2n
    # IS3C <- c4n / c2n
    named_values = (
        ("IS3A", over_doubly_charged(c1n)),
        ("IS3B", over_doubly_charged(c3n)),
        ("IS3C", over_doubly_charged(c4n)),
        ("IS3X", np.median(charges)),
        ("IS3Y", np.mean(charges)),
    )
    for name, value in named_values:
        metrics.append(mzqc.QualityMetric(cvRef="QC",
                    accession="QC:0000000",
                    name=name,
                    value=value)
        )

    return metrics
Beispiel #14
0
Unit tests for the MZQCFile library
"""

# String comparison -as in TestSerialisation- needs the 'empty' attributes, too, whereas Object comparison -as in TestDeserialisation- only compares 'non-empty' attributes
QM = '{"cvRef": "QC", "accession": "QC:4000053", "name": "RT duration", "description": "", "value": 99, "unit": ""}'
CV = '{"ref": "REF", "name": "TEST", "uri": "www.eff.off", "version": ""}'
CVT = '{"cvRef": "REF", "accession": "TEST:123", "name": "testname", "description": "", "value": 99, "unit": ""}'
ANSO = '{"cvRef": "QC", "accession": "QC:9999999", "name": "bigwhopqc", "description": "", "value": "", "unit": "", "version": "1.2.3", "uri": "file:///dev/null"}'
INFI = '{"location": "file:///dev/null", "name": "file.raw", "fileFormat": {"cvRef": "MS", "accession": "MS:1000584", "name": "mzML format"}, "fileProperties": [{"cvRef": "MS", "accession": "MS:1000747", "name": "completion time", "value": "2017-12-08-T15:38:57Z"}]}'
META = '{"inputFiles": [{"location": "file:///dev/null", "name": "file.raw", "fileFormat": {"cvRef": "MS", "accession": "MS:1000584", "name": "mzML format"}, "fileProperties": [{"cvRef": "MS", "accession": "MS:1000747", "name": "completion time", "value": "2017-12-08-T15:38:57Z"}]}], "analysisSoftware": [{"cvRef": "QC", "accession": "QC:9999999", "name": "bigwhopqc", "version": "1.2.3", "uri": "file:///dev/null"}]}'
RUQU = '{"metadata": {"inputFiles": [{"location": "file:///dev/null", "name": "file.raw", "fileFormat": {"cvRef": "MS", "accession": "MS:1000584", "name": "mzML format"}, "fileProperties": [{"cvRef": "MS", "accession": "MS:1000747", "name": "completion time", "value": "2017-12-08-T15:38:57Z"}]}], "analysisSoftware": [{"cvRef": "QC", "accession": "QC:9999999", "name": "bigwhopqc", "version": "1.2.3", "uri": "file:///dev/null"}]}, "qualityMetrics": [{"cvRef": "QC", "accession": "QC:4000053", "name": "RT duration", "value": 99}]}'
SEQU = '{"metadata": {"inputFiles": [{"location": "file:///dev/null", "name": "file.raw", "fileFormat": {"cvRef": "MS", "accession": "MS:1000584", "name": "mzML format"}, "fileProperties": [{"cvRef": "MS", "accession": "MS:1000747", "name": "completion time", "value": "2017-12-08-T15:38:57Z"}]}], "analysisSoftware": [{"cvRef": "QC", "accession": "QC:9999999", "name": "bigwhopqc", "version": "1.2.3", "uri": "file:///dev/null"}]}, "qualityMetrics": [{"cvRef": "QC", "accession": "QC:4000053", "name": "RT duration", "value": 99}]}'
NPQM = '{"cvRef": "", "accession": "QC:123", "name": "einszweidrei", "description": "", "value": {"np": [0.1111111119389534, 0.25, 0.4285714328289032]}, "unit": ""}'

# Object fixtures. The field values below are the same ones that appear in the
# JSON string fixtures CVT and INFI defined above.

# A CvParameter with an integer value.
cvt = qc.CvParameter(cvRef="REF",
                     accession="TEST:123",
                     name="testname",
                     value=99)
# An InputFile whose file format is given positionally (cvRef, accession, name)
# and which carries a single file property (a completion time string).
infi = qc.InputFile(name="file.raw",
                    location="file:///dev/null",
                    fileFormat=qc.CvParameter("MS", "MS:1000584",
                                              "mzML format"),
                    fileProperties=[
                        qc.CvParameter(cvRef="MS",
                                       accession="MS:1000747",
                                       name="completion time",
                                       value="2017-12-08-T15:38:57Z")
                    ])
anso = qc.AnalysisSoftware(
    cvRef="QC",
    accession="QC:9999999",
    name="bigwhopqc",
Beispiel #15
0
def getIDQuality(exp: oms.MSExperiment, pro_ids: List[oms.ProteinIdentification], pep_ids: List[oms.PeptideIdentification], ms2num: int = 0) -> List[mzqc.QualityMetric]:
    """
    getIDQuality calculates the id-based QualityMetrics from a mass spectrometry peak file and associated identification file.

    Calculated are the id-based QC metrics and proto-metrics necessary to calculate more elaborate QC metrics with even more additional data (e.g. multiple runs).

    Parameters
    ----------
    exp : oms.MSExperiment
        The mass spectrometry peak file to calculate metrics from (currently not read by this function)
    pro_ids : List[oms.ProteinIdentification]
        List of PyOpenMS ProteinIdentification as from reading a common identification file.
        The search parameters are taken from the first entry.
    pep_ids : List[oms.PeptideIdentification]
        List of PyOpenMS PeptideIdentification as from reading a common identification file.
        The score type is taken from the first entry; only the first hit of each entry
        contributes to the proto-metric tables.
    ms2num : int, optional
        The total number of tandem spectra as from the id-free metrics, by default 0.
        If 0, the identification to tandem spectra ratio is reported as NaN (with a warning).

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics
    """
    metrics: List[mzqc.QualityMetric] = list()

    def add_metric(name: str, value: Any) -> None:
        """Append one metric; every metric of this function shares the placeholder accession."""
        metrics.append(
            mzqc.QualityMetric(cvRef="QC",
                    accession="QC:0000000",
                    name=name,
                    value=value)
        )

    params = pro_ids[0].getSearchParameters()
    # var_mods = params.variable_modifications
    add_metric("Sequence database name", params.db)
    add_metric("Sequence database version", params.db_version)
    add_metric("Sequence database taxonomy", params.taxonomy)

    # TODO call mc functions (missed cleavages)
    spectrum_count: int = 0
    psm_count: int = 0
    peptides: Set[str] = set()
    for pepi in pep_ids:
        if pepi.empty():
            continue
        # TODO if not decoy and not under threshold
        hits = pepi.getHits()
        spectrum_count += 1
        psm_count += len(hits)
        if hits:
            peptides.add(hits[0].getSequence().toString())

    protein_evidence_count: int = 0
    proteins: Set[str] = set()
    for proid in pro_ids:
        protein_hits = proid.getHits()
        protein_evidence_count += len(protein_hits)
        proteins.update(p.getAccession() for p in protein_hits)

    add_metric("Total number of protein evidences", protein_evidence_count)
    # TODO not yet factoring in protein inference, one psm might still account for several evidences
    add_metric("Total number of identified proteins", len(proteins))
    add_metric("Total number of PSM", psm_count)
    add_metric("Total number of peptide spectra", spectrum_count)
    add_metric("Total number identified unique peptide sequences", len(peptides))

    identification_accuracy_metrics: Dict[str,List[Any]] = defaultdict(list)
    identification_scoring_metrics: Dict[str,List[Any]] = defaultdict(list)
    identification_sequence_metrics: Dict[str,List[Any]] = defaultdict(list)
    hydrophobicity_metrics: Dict[str,List[Any]] = defaultdict(list)

    # Name the score column after the ontology term id of the score type where one exists,
    # otherwise fall back to the raw score type string.
    score_type = pep_ids[0].getScoreType()
    psims = utils.obtainOntology("psi-ms")
    name_indexed = {psims[x].name: psims[x] for x in psims}
    score_indexed = {x.name: x for x in chain(psims['MS:1001143'].subclasses(),psims['MS:1001153'].subclasses(),psims['MS:1002347'].subclasses(),psims['MS:1002363'].subclasses())}

    if score_type in name_indexed:
        if not score_type in score_indexed:
            warnings.warn("Score type does not correspond to a score type in the OBO, proceed at own risk.", Warning)
            score_col_name = name_indexed[score_type].id
        else:
            score_col_name = score_indexed[score_type].id
    else:
        warnings.warn("OBO does not contain any entry matching the identification score, proceed at own risk.", Warning)
        score_col_name = score_type

    for pepi in pep_ids:
        pid = utils.pep_native_id(pepi)
        if not pepi.getHits():
            continue
        tmp = pepi.getHits()[0]  # TODO apply custom filters and also consider 'pass_threshold'
        rt = pepi.getRT()
        mz = pepi.getMZ()
        charge = tmp.getCharge()
        sequence = tmp.getSequence()

        identification_scoring_metrics['RT'].append(rt)
        identification_scoring_metrics['c'].append(charge)
        identification_scoring_metrics[score_col_name].append(tmp.getScore())

        # theoretical m/z of the hit: (mono weight + charge * proton mass) / charge
        tw = (sequence.getMonoWeight(0,0) + charge * oms.Constants.PROTON_MASS_U) / charge
        identification_accuracy_metrics['RT'].append(rt)
        identification_accuracy_metrics['MZ'].append(mz)
        identification_accuracy_metrics['delta_ppm'].append(utils.getMassDifference(tw, mz, True))
        identification_accuracy_metrics['abs_error'].append(utils.getMassDifference(tw, mz, False))

        hydrophobicity_metrics['RT'].append(rt)
        hydrophobicity_metrics['gravy'].append(ProtParam.ProteinAnalysis(sequence.toUnmodifiedString()).gravy())

        identification_sequence_metrics['RT'].append(rt)
        identification_sequence_metrics['peptide'].append(sequence.toString().lstrip().rstrip())
        identification_sequence_metrics['target'].append(tmp.getMetaValue('target_decoy').lower() == 'target')
        identification_sequence_metrics['native_id'].append(pid)

    ## Basic id metrics
    add_metric("Identification scoring metric values", identification_scoring_metrics)
    add_metric("Identifications accuracy metric values", identification_accuracy_metrics)
    add_metric("Hydrophobicity metric values", hydrophobicity_metrics)
    add_metric("Identifications sequence metric values", identification_sequence_metrics)

    ## simple id metrics
    if ms2num:
        id_ratio = float(len(pep_ids))/float(ms2num)
    else:
        # The default ms2num of 0 must not abort the whole metric calculation.
        warnings.warn("Number of tandem spectra is 0, identification to tandem spectra ratio is undefined.", Warning)
        id_ratio = float('nan')
    add_metric("Identification to tandem spectra ratio", id_ratio)

    return metrics
Beispiel #16
0
def describeErrorRates(identification_accuracy_metrics:mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describeErrorRates calculates the descriptive statistics metrics for the mass errors of identified tandem spectra.

    From the proto-metrics on identification accuracy, the function calculates descriptive statistics metrics on the
    error rates from all identified tandem spectra. Namely, mean, standard deviation, Quartiles, and 1.5*IQR outliers.

    Parameters
    ----------
    identification_accuracy_metrics : mzqc.QualityMetric
        The proto-metrics on identification accuracies containing 'delta_ppm' and 'abs_error' values.

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics. The list is empty (and a warning is issued) if the given
        proto-metric carries no 'delta_ppm' values.
    """
    metrics: List[mzqc.QualityMetric] = list()

    # The error columns are stored in the metric's value table, so the presence
    # check has to look there and not at the QualityMetric object itself.
    accuracy_values = identification_accuracy_metrics.value
    if not accuracy_values or 'delta_ppm' not in accuracy_values:
        warnings.warn("No error values in given annotation, ignoring identification error rate metrics.", Warning)
        return metrics

    abs_error = accuracy_values['abs_error']
    delta_ppm = accuracy_values['delta_ppm']

    # Median of the absolute errors
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="MS15A",
                value=np.median(abs_error))
    )
    # Mean of the absolute errors
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="MS15B",
                value=np.mean(abs_error))
    )
    # Median of the ppm errors
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="MS15C",
                value=np.median(delta_ppm))
    )

    # Quartiles, sigma, mean and outliers of the ppm errors, in the order the helper returns them
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(np.array(delta_ppm))

    # Interquartile range of the ppm errors
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="MS15D",
                value=q3-q1)
    )

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Delta ppm Q1, Q2, Q3",
                value=[q1,q2,q3])
    )

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Delta ppm sigma",
                value=s)
    )

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Delta ppm mean",
                value=m)
    )

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Delta ppm +/-1.5*IQR outlier",
                value=ol)
    )

    return metrics
Beispiel #17
0
def test_SyntaxCheck():
    """
    Assemble a small mzQC document from library objects and hand its JSON
    serialisation, wrapped in an "mzQC" envelope, to the syntactic checker.
    """
    # A free-standing CV parameter; it is constructed but not attached to the document.
    standalone_param = qc.CvParameter(cvRef="REF",
                                      accession="TEST:123",
                                      name="testname",
                                      value=99)

    # Metadata: one input file (format + one property) and one analysis software entry.
    mzml_format = qc.CvParameter("MS", "MS:1000584", "mzML format")
    completion_time = qc.CvParameter(cvRef="MS",
                                     accession="MS:1000747",
                                     name="completion time",
                                     value="2017-12-08-T15:38:57Z")
    input_file = qc.InputFile(name="file.raw",
                              location="file:///dev/null",
                              fileFormat=mzml_format,
                              fileProperties=[completion_time])
    # Open question from the original author: is a mandatory uri too strict here?
    software = qc.AnalysisSoftware(cvRef="QC",
                                   accession="QC:9999999",
                                   name="bigwhopqc",
                                   version="1.2.3",
                                   uri="file:///dev/null")
    metadata = qc.MetaDataParameters(inputFiles=[input_file],
                                     analysisSoftware=[software])

    # Two metrics for the run quality, one for the set quality.
    run_metrics = [
        qc.QualityMetric(cvRef="QC",
                         accession="QC:4000053",
                         name="RT duration",
                         value=99),
        qc.QualityMetric(cvRef="QC",
                         accession="QC:4000061",
                         name="Maximal MS2 frequency",
                         value=999),
    ]
    set_metrics = [
        qc.QualityMetric(cvRef="QC",
                         accession="QC:4000055",
                         name="MS1 quantiles RT fraction",
                         value=9),
    ]
    run_quality = qc.RunQuality(metadata=metadata, qualityMetrics=run_metrics)
    set_quality = qc.SetQuality(metadata=metadata, qualityMetrics=set_metrics)

    vocabularies = [
        qc.ControlledVocabulary(ref="QC", name="QCvocab", uri="www.qc.ml"),
        qc.ControlledVocabulary(ref="REF", name="TEST", uri="www.eff.off"),
    ]
    document = qc.MzQcFile(version="0.0.11",
                           runQualities=[run_quality],
                           setQualities=[set_quality],
                           controlledVocabularies=vocabularies)

    # The same wrapped JSON string could also be written to a file for manual inspection.
    checker = sy.SyntacticCheck()
    serialised = qc.JsonSerialisable.ToJson(document)
    checker.validate('{ "mzQC": ' + serialised + ' }')
Beispiel #18
0
def getBasicQuality(exp: oms.MSExperiment, verbose: bool=False) -> mzqc.RunQuality:
    """
    getBasicQuality calculates the basic QualityMetrics from a mass spectrometry peak file and creates the related RunQuality object.

    Calculated basic QC metrics and proto-metrics necessary to calculate more elaborate QC metrics with additional data (e.g. ID).
    The function collects file metadata, calls exp.sortSpectra() on the given experiment, walks over all spectra once
    to fill per-spectrum tables for MS1 and MS2 spectra, and finally wraps these tables and a few summary values
    (spectrum counts per MS level, number of chromatograms, m/z and RT ranges, TIC, chromatogram) into QualityMetric objects.

    Parameters
    ----------
    exp : oms.MSExperiment
        The mass spectrometry peak file to calculate metrics from
    verbose : bool, optional
        switches on verbose logging, by default False (the flag is not referenced in this function body)

    Returns
    -------
    mzqc.RunQuality
        A RunQuality object containing the list of metrics calculated and metadata collected, ready for integration into a mzQC file object.
    """
    metrics: List[mzqc.QualityMetric] = list()
    # Details of the first registered source (parent) file, if any.
    # NOTE(review): these three values are assigned but not used further down in this function.
    if exp.getExperimentalSettings().getSourceFiles():
        parent_base_name: str = basename(exp.getExperimentalSettings().getSourceFiles()[0].getNameOfFile())
        parent_chksm: str = exp.getExperimentalSettings().getSourceFiles()[0].getChecksum()
        parent_chksm_type: str = exp.getExperimentalSettings().getSourceFiles()[0].getChecksumType()

    # Instrument serial number from the instrument meta values, 'unknown' if the meta value is absent
    instr_srl: str = exp.getInstrument().getMetaValue('instrument serial number') \
        if exp.getInstrument().metaValueExists('instrument serial number') else 'unknown'  # MS:1000529 in mzML

    # Location, base name and SHA-256 checksum of the loaded file plus the experiment's date/time
    input_loc: str = exp.getExperimentalSettings().getLoadedFilePath()
    base_name: str = basename(input_loc)
    chksm: str = utils.sha256fromfile(exp.getExperimentalSettings().getLoadedFilePath())
    cmpltn: str = exp.getDateTime().get()
    # strt:datetime.datetime = datetime.datetime.strptime(cmpltn, '%Y-%m-%d %H:%M:%S') - datetime.timedelta(seconds=exp.getChromatograms()[0][exp.getChromatograms()[0].size()-1].getRT()*60)

    # Metadata section of the RunQuality: one input file with its properties and the analysis software used
    meta: mzqc.MetaDataParameters = mzqc.MetaDataParameters(
        inputFiles=[
            mzqc.InputFile(name=base_name,location=input_loc,
                        fileFormat=mzqc.CvParameter("MS", "MS:1000584", "mzML format"),
                        fileProperties=[
                            mzqc.CvParameter(cvRef="MS",
                                accession="MS:1000747",
                                name="completion time",
                                value=cmpltn
                            ),
                            mzqc.CvParameter(cvRef="MS",
                                accession="MS:1000569",
                                name="SHA-256",
                                value=chksm
                            ),
                            mzqc.CvParameter(cvRef="MS",
                                accession="MS:1000031",
                                name="instrument model",
                                value=exp.getInstrument().getName()
                            ),
                            mzqc.CvParameter(cvRef="MS",
                                accession="MS:1000529",
                                name="instrument serial number",
                                value=instr_srl
                            )
                            # TODO integrate parent location and checksum
                            # id: MS:1002846 (Associated raw file URI) N.B. definition is PRIDE specific
                            # fitting checksum cv missing
                        ]
            )
        ],
        analysisSoftware=[
            mzqc.AnalysisSoftware(cvRef="MS", accession="MS:1000752", name="TOPP software", version=oms.__version__, uri="openms.de")
        ]
    )

    # this is mighty important to sort by RT
    # (the rank and survey scan bookkeeping below relies on the iteration order of the spectra)
    exp.sortSpectra()

    # Running minimum/maximum of the MS2 precursor m/z and spectrum counts per MS level
    min_mz: float = sys.maxsize
    max_mz: float = 0
    mslevelcounts: Dict[int,int] = defaultdict(int)

    # Column-wise tables (column name -> list of values), filled with one entry per spectrum
    spectrum_acquisition_metrics_MS1: Dict[str,List[Any]] = defaultdict(list)
    spectrum_acquisition_metrics_MS2: Dict[str,List[Any]] = defaultdict(list)
    # NOTE(review): spectrum_topn is never filled in this function and is emitted empty
    spectrum_topn: Dict[str,List[Any]] = defaultdict(list)
    tandem_spectrum_metrics_MS2: Dict[str,List[Any]] = defaultdict(list)
    trap_metrics_MS1: Dict[str,List[Any]] = defaultdict(list)
    trap_metrics_MS2: Dict[str,List[Any]] = defaultdict(list)
    isolation_window_metrics: Dict[str,List[Any]] = defaultdict(list)
    tic_tab: Dict[str,List[Any]] = defaultdict(list)

    # ActivationMethod look-up dict
    # (maps every int-valued attribute of ActivationMethod back to its attribute name)
    ams = {getattr(ActivationMethod,i): i for i in dir(ActivationMethod) if type(getattr(ActivationMethod,i))==int }

    intens_sum: np.float = 0
    # Index of the most recent MS1 (survey) spectrum seen in the loop below
    last_surveyscan_index:int = 0
    for spin, spec in enumerate(exp):
        mslevelcounts[spec.getMSLevel()] += 1

        iontraptime = utils.getTrapTime(spec)
        # get_peaks()[1] is used as the intensity array, get_peaks()[0] as the m/z array
        # NOTE(review): max()/min() of an empty array presumably raise for spectra without peaks - confirm
        intens_max = spec.get_peaks()[1].max()
        intens_min = spec.get_peaks()[1].min()
        intens_sum = spec.get_peaks()[1].sum()

        if spec.getMSLevel() == 1:
            # Remember this survey scan; following MS2 spectra refer back to it
            last_surveyscan_index = spin
            last_surveyscan_intensity = intens_sum
            last_surveyscan_max = intens_max

            spectrum_acquisition_metrics_MS1['RT'].append(spec.getRT())
            spectrum_acquisition_metrics_MS1['SN'].append(noiseqc.getSN_medianmethod(spec))
            spectrum_acquisition_metrics_MS1['peakcount'].append(spec.size())
            spectrum_acquisition_metrics_MS1['int'].append(intens_sum.item())  # .item() for dtype to pytype

            trap_metrics_MS1['RT'].append(spec.getRT())
            trap_metrics_MS1['traptime'].append(iontraptime)

            # NOTE(review): unlike the 'int' column above, the TIC value is stored without .item()
            tic_tab['RT'].append(spec.getRT())
            tic_tab['int'].append(intens_sum)

        if (spec.getMSLevel() == 2):
            # Only the first precursor of an MS2 spectrum is considered throughout this branch
            if (spec.getPrecursors()[0].getMZ() < min_mz):
                min_mz = spec.getPrecursors()[0].getMZ()
            if (spec.getPrecursors()[0].getMZ() > max_mz):
                max_mz = spec.getPrecursors()[0].getMZ()

            spectrum_acquisition_metrics_MS2['RT'].append(spec.getRT())
            spectrum_acquisition_metrics_MS2['SN'].append(noiseqc.getSN_medianmethod(spec))
            spectrum_acquisition_metrics_MS2['peakcount'].append(spec.size())
            spectrum_acquisition_metrics_MS2['int'].append(intens_sum.item())  # .item() for dtype to pytype
            spectrum_acquisition_metrics_MS2['native_id'].append(utils.spec_native_id(spec))

            # Rank = number of spectra since the last survey scan
            rank = spin - last_surveyscan_index
            spectrum_acquisition_metrics_MS2['rank'].append(rank)

            trap_metrics_MS2['RT'].append(spec.getRT())
            trap_metrics_MS2['traptime'].append(iontraptime)
            # Name of the first activation method of the precursor, 'unknown' if there is none or it is not in the look-up
            trap_metrics_MS2['activation_method'].append(ams.get(next(iter(spec.getPrecursors()[0].getActivationMethods()), None),'unknown'))
            trap_metrics_MS2['activation_energy'].append(spec.getPrecursors()[0].getMetaValue('collision energy') if \
                spec.getPrecursors()[0].metaValueExists('collision energy') else -1)

            # Insertion index of the precursor m/z in the survey scan's m/z array
            # (assumes that array is sorted ascending - TODO confirm); the peak at this index
            # is then the first one with an m/z not smaller than the precursor m/z.
            precursor_index = np.searchsorted(exp[last_surveyscan_index].get_peaks()[0], [exp[spin].getPrecursors()[0].getMZ()])[0]
            # An index equal to the number of peaks means the precursor m/z lies beyond the last survey peak
            if precursor_index != np.array(exp[last_surveyscan_index].get_peaks()).shape[1]:
                precursor_err = spec.getPrecursors()[0].getMZ() - np.array(exp[last_surveyscan_index].get_peaks())[:,precursor_index][0]
                precursor_int = np.array(exp[last_surveyscan_index].get_peaks())[:,precursor_index][1]
            else:
                precursor_err = np.nan
                precursor_int = np.nan

            tandem_spectrum_metrics_MS2['RT'].append(spec.getRT())
            tandem_spectrum_metrics_MS2['precursor_intensity'].append(precursor_int)  # TODO different from mzid->mzml getPrecursors[0].getIntensity() ? YES, latter one usually zero
            tandem_spectrum_metrics_MS2['precursor_error'].append(precursor_err)
            tandem_spectrum_metrics_MS2['precursor_mz'].append(spec.getPrecursors()[0].getMZ())
            tandem_spectrum_metrics_MS2['precursor_c'].append(spec.getPrecursors()[0].getCharge())

            # NOTE(review): last_surveyscan_intensity / last_surveyscan_max are only assigned in the MS1 branch;
            # an MS2 spectrum that precedes every MS1 spectrum would find them unassigned here - confirm inputs.
            tandem_spectrum_metrics_MS2['surveyscan_intensity_sum'].append(last_surveyscan_intensity)
            tandem_spectrum_metrics_MS2['surveyscan_intensity_max'].append(last_surveyscan_max)

            isolation_window_metrics['RT'].append(spec.getRT())
            isolation_window_metrics['isolation_target'].append(spec.getPrecursors()[0].getMZ())  # https://github.com/OpenMS/OpenMS/blob/d17cc251fd0c4068eb253b03c9fb107897771fdc/src/openms/source/FORMAT/HANDLERS/MzMLHandler.cpp#L1992
            isolation_window_metrics['isolation_lower'].append(spec.getPrecursors()[0].getIsolationWindowLowerOffset())
            isolation_window_metrics['isolation_upper'].append(spec.getPrecursors()[0].getIsolationWindowUpperOffset())
            # Absolute m/z bounds of the isolation window
            lower = spec.getPrecursors()[0].getMZ() - spec.getPrecursors()[0].getIsolationWindowLowerOffset()
            upper = spec.getPrecursors()[0].getMZ() + spec.getPrecursors()[0].getIsolationWindowUpperOffset()

            # (m/z, intensity) rows of the survey scan, restricted to peaks inside the isolation window
            s = np.array([(i.getMZ(),i.getIntensity()) for i in exp[last_surveyscan_index]], ndmin = 2)
            s = s[np.where(np.logical_and(s[:, 0]>=lower, s[:, 0]<=upper))[0]]
            isolation_window_metrics['peaks_in_window'].append(np.shape(s)[0])

            # Row order of the window peaks by descending intensity
            int_sort_desc = np.flip(np.argsort(s[:,1]))
            if np.shape(s)[0] > 1:
                # NOTE(review): the trailing [0] binds to the denominator slice, so every value is divided by the
                # second-ranked intensity only, while the comment below describes pairwise ratios - confirm intent.
                isolation_window_metrics['int_ratio_ranked_peaks_in_window'].append(
                    s[int_sort_desc][:-1,1]/s[int_sort_desc][1:,1][0])  # intensity ratio between top1&2, 2&3, ...
            else:
                isolation_window_metrics['int_ratio_ranked_peaks_in_window'].append(0)  # bigger is better, though best is 0

            isolation_window_metrics['summed_window_intensity'].append(np.sum(s[int_sort_desc][:,1]))
            isolation_window_metrics['isolation_target_intensity'].append(spec.getPrecursors()[0].getIntensity())

            # TODO this needs to go outside
            # Absolute m/z tolerance for the comparison below, chosen from the spectrum's 'filter string' meta value
            tol = 0.5
            if spec.metaValueExists('filter string'):
                if 'FTMS' in spec.getMetaValue('filter string'):
                    tol = 0.05
                elif 'ITMS' in spec.getMetaValue('filter string'):
                    tol = 0.5
                elif 'QTOF' in spec.getMetaValue('filter string'):  #TOFMS, SQMS, TQMS, SectorMS
                    tol = 0.1

            # ms2 peaks directly from isolation window?
            # True if any window peak's m/z is within tol of any peak m/z of this MS2 spectrum; stored as a string
            unfragmented = np.any([np.isclose(i[0],[x.getMZ() for x in spec], atol=tol) for i in s])
            isolation_window_metrics['peaks_in_window_in_ms2'].append(str(unfragmented))

    ## Spectra detail numbers
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Spectrum acquisition metric values - MS1",
                value=spectrum_acquisition_metrics_MS1)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Spectrum acquisition metric values - MS2",
                value=spectrum_acquisition_metrics_MS2)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Spectra topn ranks",
                value=spectrum_topn)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Tandem spectrum metric values - MS2",
                value=tandem_spectrum_metrics_MS2)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Trap metric values - MS1",
                value=trap_metrics_MS1)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Trap metric values - MS2",
                value=trap_metrics_MS2)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="isolation window metrics",
                value=isolation_window_metrics)
    )

    ## Spectra numbers
    # One metric per MS level that occurred in the file
    for levels in mslevelcounts.keys():
        metrics.append(
            mzqc.QualityMetric(cvRef="QC",
                    accession="QC:0000000",
                    name="Number of MS{l} spectra".format(l=str(levels)),
                    value=mslevelcounts[levels])
        )

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Number of chromatograms",
                value=len(exp.getChromatograms()))
    )

    ## Ranges
    # Range of MS2 precursor m/z values; stays at the initial [sys.maxsize, 0] if no MS2 spectrum was seen
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="MZ aquisition range",
                value=[min_mz,max_mz])
    )

    # RT of the first and the last spectrum (after sorting)
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="RT aquisition range",
                value=[exp[0].getRT(),exp[exp.size()-1].getRT()])
    )

    # TIC
    # (summed peak intensities of every MS1 spectrum, as collected in the loop above)
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Total ion current",
                value=tic_tab)
    )

    # Chrom
    # RT/intensity table of the first chromatogram typed as total ion current chromatogram;
    # the break leaves the loop after that first match, the table stays empty if there is none.
    chrom_tab: Dict[str,List[Any]] = defaultdict(list)
    chroms = exp.getChromatograms()
    for t in chroms:
      if t.getChromatogramType() == oms.ChromatogramSettings.ChromatogramType.TOTAL_ION_CURRENT_CHROMATOGRAM:
        for chro_peak in t:
            chrom_tab['RT'].append(chro_peak.getRT())
            chrom_tab['int'].append(chro_peak.getIntensity())
        break

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Chromatogram",
                value=chrom_tab)
    )
    # TODO is there a difference between TIC as defined in MS:1000235 and the chromatogram you get from TRP?? In MZML it says its a MS:1000235 (ion current detected in each of a series of mass spectra) but is it?
    # TODO consider collection of spectrum_native_id
    return mzqc.RunQuality(metadata=meta, qualityMetrics=metrics)
Beispiel #19
0
def getEnzymeContaminationMetrics(pep,
                                  pro,
                                  force_enzymes=False
                                  ) -> List[mzqc.QualityMetric]:
    """
    getEnzymeContaminationMetrics derives missed-cleavage and enzyme-match metrics from
    the given identifications.

    Every peptide hit is extended by the flanking amino acids of its first peptide evidence
    and matched against the cleavage pattern of the digestion enzyme named in the search
    parameters of the first protein identification. The results for the top hit of each
    spectrum are collected in a table; the results for all further hits are pooled and
    summarised separately.

    Parameters
    ----------
    pep : List[oms.PeptideIdentification]
        List of PyOpenMS PeptideIdentification as from reading a common identification file
    pro : List[oms.ProteinIdentification]
        List of PyOpenMS ProteinIdentification as from reading a common identification file
    force_enzymes : bool, optional
        Intended to force matching against other known digestion enzyme patterns.
        By default False; currently not evaluated by the function.

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics
    """
    def _metric(name, value):
        # All metrics of this function share the same placeholder CV reference and accession.
        return mzqc.QualityMetric(cvRef="QC",
                                  accession="QC:0000000",
                                  name=name,
                                  value=value)

    # Cleavage pattern of the enzyme used for the identification, keyed by the enzyme name.
    search_enzyme = pro[0].getSearchParameters().digestion_enzyme
    gre = {search_enzyme.getName(): re.compile(search_enzyme.getRegEx())}
    cleavage_pattern = next(iter(gre.values()))

    # TODO pyopenms wrappers for DigestionEnzymeDB etc. are needed before patterns of
    # other enzymes can be collected and matched as well.

    enzymematch_tab = defaultdict(list)
    missed_ranks = []
    matched_ranks = []
    for pep_index, pepi in enumerate(pep):
        pepi.sort()
        # Fall back to the running index if the identification has no spectrum reference.
        if pepi.metaValueExists('spectrum_reference'):
            spec_id = pepi.getMetaValue('spectrum_reference')
        else:
            spec_id = pep_index

        for hit_rank, hit in enumerate(pepi.getHits()):
            first_evidence = hit.getPeptideEvidences()[0]
            pepseq = first_evidence.getAABefore() \
                     + hit.getSequence().toUnmodifiedString() \
                     + first_evidence.getAAAfter()

            is_matched, internal_matches = matchEnzyme(cleavage_pattern, pepseq)
            if hit_rank != 0:
                # every hit below the top hit goes into the pooled lower-rank lists
                missed_ranks.append(internal_matches)
                matched_ranks.append(is_matched)
                continue
            enzymematch_tab['native_id'].append(spec_id)
            enzymematch_tab['matched'].append(is_matched)
            enzymematch_tab['missed'].append(internal_matches)

    metrics: List[mzqc.QualityMetric] = []

    # Distribution of missed cleavage counts over all lower rank hits
    if missed_ranks:
        q1, q2, q3, s, m, ol = utils.extractDistributionStats(np.array(missed_ranks))
        metrics.extend([
            _metric("Q1, Q2, Q3 of missed clevage counts for all lower rank identifications.",
                    [q1, q2, q3]),
            _metric("Sigma of missed clevage counts for all lower rank identifications.",
                    s),
            _metric("Mean of missed clevage counts for all lower rank identifications.",
                    m),
            _metric("Missed clevage count for all lower rank identifications +/-1.5*IQR outlier",
                    ol),
        ])

    # Occurrences of the match codes 2, 1 and 0 over all lower rank hits
    if matched_ranks:
        mdl = defaultdict(int)
        uniq, counts = np.unique(np.array(matched_ranks), return_counts=True)
        mdl.update(dict(zip(uniq, counts)))
        metrics.append(
            _metric("Match/semi/none counts for all lower rank identifications.",
                    [mdl[2], mdl[1], mdl[0]]))

    # Table of the top hit results, followed by its missed cleavage distribution
    metrics.append(_metric("Missed cleavages", enzymematch_tab))

    q1, q2, q3, s, m, ol = utils.extractDistributionStats(
        np.array(enzymematch_tab['missed']))
    metrics.extend([
        _metric("Q1, Q2, Q3 of missed clevage counts for top identifications.",
                [q1, q2, q3]),
        _metric("Sigma of missed clevage counts for top identifications.",
                s),
        _metric("Mean of missed clevage counts for top identifications.",
                m),
        _metric("Missed clevage count for top identifications +/-1.5*IQR outlier",
                ol),
    ])

    return metrics
Beispiel #20
0
def getSNMetrics(spectrum_acquisition_metrics_MS: mzqc.QualityMetric,
                 ms_level: int) -> List[mzqc.QualityMetric]:
    """
    getSNMetrics summarises the signal-to-noise values of one MS level into distribution metrics

    The 'SN' column of the given spectrum acquisition metrics is condensed into its quartiles,
    standard deviation, mean, and the values lying outside the 1.5*IQR fences.

    Parameters
    ----------
    spectrum_acquisition_metrics_MS : mzqc.QualityMetric
        QualityMetric object with the spectrum acquisition metrics, its value holding an 'SN' column
    ms_level : int
        The MS level to which the given spectrum acquisition metrics belong to

    Returns
    -------
    List[mzqc.QualityMetric]
        A list of new QualityMetric objects for mzQC deposition
    """
    sn_values = np.array(spectrum_acquisition_metrics_MS.value['SN'])
    quartiles = np.quantile(sn_values, [.25, .5, .75])

    # Fences at 1.5 times the interquartile range below Q1 and above Q3
    fence_width = 1.5 * (quartiles[2] - quartiles[0])
    lower_fence = quartiles[0] - fence_width
    upper_fence = quartiles[2] + fence_width
    beyond_fences = (sn_values < lower_fence) | (sn_values > upper_fence)

    # (statistic label, value) pairs in the order in which the metrics are reported
    summaries = [
        ("Q1, Q2, Q3", list(quartiles)),
        ("sigma", np.std(sn_values)),
        ("mean", np.mean(sn_values)),
        ("+/-1.5*IQR outlier", np.extract(beyond_fences, sn_values)),
    ]

    return [
        mzqc.QualityMetric(
            cvRef="QC",
            accession="QC:0000000",
            name=f"Signal-to-noise ratio {label} for MS level {ms_level} collection",
            value=summary)
        for label, summary in summaries
    ]
Beispiel #21
0
def getIdentifiedSignalMetrics(tandem_spectrum_metrics_MS2:mzqc.QualityMetric,
        spectrum_acquisition_metrics_MS1: mzqc.QualityMetric,
        identification_accuracy_metrics: mzqc.QualityMetric,
        tic_table: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    getIdentifiedSignalMetrics calculate metrics on the proportions of recorded signal identified.

    The metrics calculated include the median ratio of max survey scan intensity over sampled precursor
    intensity for peptides identified, the fractions of identified MS2 in precursor intensity Quartiles,
    median SN for MS1 spectra in RT range in which the first half of peptides are identified, and
    median TIC value of RT range in which half of peptides are identified.

    Tandem spectra count as identified if their precursor m/z occurs (exact match) among the m/z values
    of the identifications. The quartile fractions are the number of identified tandem spectra divided
    by the number of all tandem spectra within half-open intensity bins [lower, upper), so that every
    spectrum falls into exactly one bin; a bin without tandem spectra yields a fraction of 0.0.

    Parameters
    ----------
    tandem_spectrum_metrics_MS2 : mzqc.QualityMetric
        The proto-metrics on tandem spectra containing 'RT', 'precursor_mz', 'precursor_intensity', 'surveyscan_intensity_sum', 'surveyscan_intensity_max' values.
    spectrum_acquisition_metrics_MS1 : mzqc.QualityMetric
        The proto-metrics on MS1 spectra containing 'RT' and 'SN' values
    identification_accuracy_metrics : mzqc.QualityMetric
        The proto-metrics on identification accuracies containing 'RT' and 'MZ' values
    tic_table : mzqc.QualityMetric
        The proto-metrics on total ion current intensities containing 'RT' and 'int'

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics; empty (after a warning) if none of the tandem spectra is identified.
    """
    import warnings  # local import, only needed for the no-identification guard below

    metrics: List[mzqc.QualityMetric] = list()

    # Rows: 0 = RT, 1 = precursor m/z, 2 = precursor intensity,
    #       3 = survey scan intensity sum, 4 = survey scan intensity max
    np_prec = np.array([tandem_spectrum_metrics_MS2.value['RT'],
                        tandem_spectrum_metrics_MS2.value['precursor_mz'],
                        tandem_spectrum_metrics_MS2.value['precursor_intensity'],
                        tandem_spectrum_metrics_MS2.value['surveyscan_intensity_sum'],
                        tandem_spectrum_metrics_MS2.value['surveyscan_intensity_max']])

    # Select the tandem spectra whose precursor m/z also occurs among the identifications
    id_coord = np.array([identification_accuracy_metrics.value['RT'],identification_accuracy_metrics.value['MZ']])  # TODO make sure intersection is round-proof
    intersected = np.intersect1d(np_prec[1],id_coord[1], assume_unique=False, return_indices=True)
    np_id = np_prec[:,intersected[1]]

    # Without any identified tandem spectrum none of the statistics below is defined
    if np_id.shape[1] == 0:
        warnings.warn("No identified tandem spectra found, ignoring identified signal metrics.", Warning)
        return metrics

    # DS-3A reimpl.: median( (surv max / prec int) for all ident. prec )
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Median ratio of max survey scan intensity over sampled precursor intensity for peptides identified",
                value=np.median(np_id[4] / np_id[2]))
    )

    # MS1-3A reimpl.: Ratio of 95th over 5th percentile MS1 maximum intensity values for identified peptides (approximates dynamic range of signal)
    p05, p95 = np.quantile(np_id[4], [.05, .95])
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Ratio of 95th over 5th percentile of precursor intensity for identified peptides",
                value=p95 / p05 )
    )

    # Quartiles by MS1 maximum intensity
    # Fraction of identified MS2 Spectra within
    # MS2-4A : 0 and Q1
    # MS2-4B : Q1 and Q2
    # MS2-4C : Q2 and Q3
    # MS2-4D : above Q3
    q1,q2,q3 = np.quantile(np_prec[4], [.25, .5, .75])
    quartile_bins = [
        ("Fraction of identified MS2 below Q1 of precursor intensity.", -np.inf, q1),
        ("Fraction of identified MS2 between Q1 and Q2 of precursor intensity.", q1, q2),
        ("Fraction of identified MS2 between Q2 and Q3 of precursor intensity.", q2, q3),
        ("Fraction of identified MS2 above Q3 of precursor intensity.", q3, np.inf),
    ]
    for bin_name, lower, upper in quartile_bins:
        # Half-open bins: a value equal to a quartile is counted in the bin starting at that quartile
        tandem_in_bin = int(np.count_nonzero((lower <= np_prec[4]) & (np_prec[4] < upper)))
        id_in_bin = int(np.count_nonzero((lower <= np_id[4]) & (np_id[4] < upper)))
        # Identified over all tandem spectra of the bin; identified spectra are a subset of the
        # tandem spectra, so an empty bin has no identified spectra either
        fraction = id_in_bin / tandem_in_bin if tandem_in_bin else 0.0
        metrics.append(mzqc.QualityMetric(cvRef="QC",
                    accession="QC:0000000",
                    name=bin_name,
                    value=fraction)
        )

    # MS1-3B reimpl.: Median maximum MS1 value for identified peptides
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Median precursor intensity of identified MS2",
                value=np.median(np_id[4]))
    )

    # MS1-2A Median SN for MS1 spectra in RT range in which half (which half???) of peptides are identified
    np_id = np_id[:,np_id[0].argsort()]  # identified spectra ordered by RT
    msn = np.array([spectrum_acquisition_metrics_MS1.value['RT'], spectrum_acquisition_metrics_MS1.value['SN']])
    median_id_rt = np.quantile(np_id[0], [.5])[0]
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Median SN for MS1 spectra in RT range in which the first half of peptides are identified",
                value=np.median(msn[:, msn[0]<median_id_rt ][1]) )
    )

    # the smallest rt range which contains half of all identified Spectra:
    # slide a window of half_id_size consecutive (RT-sorted) identifications over all
    # possible start positions and keep the one with the smallest RT span
    id_rt = np_id[0]
    id_count = id_rt.shape[0]
    half_id_size = max(1, int(np.round(id_count / 2)))
    window_spans = id_rt[half_id_size - 1:] - id_rt[:id_count - half_id_size + 1]
    min_start_index = int(np.argmin(window_spans))
    min_stop_index = min_start_index + half_id_size - 1
    densest_start_rt = id_rt[min_start_index]
    densest_stop_rt = id_rt[min_stop_index]

    densest = np.median(msn[:,(densest_start_rt<msn[0]) & (msn[0]<densest_stop_rt)][1])
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Median SN for MS1 spectra in densest RT range in which any half of peptides are identified",
                value=densest)
    )

    # is median tic value really meaningful? My guess is ratio of tic sum of RT half identified and rest is a better indicator (>1: most signal is in the most exlained region)
    # Median TIC value of RT range in which half of peptides are identified
    np_tic = np.array([tic_table.value['RT'], tic_table.value['int']])
    densest_id_tic = np.median(np_tic[:,(densest_start_rt<np_tic[0]) & (np_tic[0]<densest_stop_rt)][1])

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Median TIC value of RT range in which half of peptides are identified",
                value=np.median(np_tic[:, np_tic[0]<median_id_rt ][1]) )
    )

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Median TIC value of densest RT range in which any half of peptides are identified",
                value=densest_id_tic )
    )

    return metrics
Beispiel #22
0
def getMQMetrics(target_raw: str,
                 params: pandas.DataFrame,
                 evidence: pandas.DataFrame,
                 ms2num: int = 0) -> List[mzqc.QualityMetric]:
    """
    getMQMetrics calculates id based QC metrics from MaxQuant results as close as possible to the way they are calculated from regular id files.

    For a given raw file (name), the respective rows are selected from the evidence dataframe (rows of that raw file
    which carry an 'ms/ms scan number') and, together with the 'fasta file' entry of the parameters dataframe, turned
    into metrics similar or equal to those calculated from regular id files.

    Parameters
    ----------
    target_raw : str
        The name of the raw file (as per MaxQuant usage without file type extension)
    params : pandas.DataFrame
        Dataframe with data from the parameters result file as produced by MaxQuant and stratified column names
    evidence : pandas.DataFrame
        Dataframe with data from the evidence result file as produced by MaxQuant and stratified column names
    ms2num : int, optional
        The total number of tandem spectra as from the id-free metrics, by default 0. If it is not a positive
        number, the "Identification to tandem spectra ratio" metric cannot be calculated and is left out
        (a warning is issued instead of failing with a division by zero).

    Returns
    -------
    List[mzqc.QualityMetric]
        A list of QualityMetrics close to what is calculated from a regular id-based QC calculation.
        The list is empty if target_raw does not occur in the 'raw file' column of the evidence.
    """
    if target_raw not in evidence['raw file'].unique():
        warnings.warn(
            "Raw file '{}' not found in the MaxQuant evidence, no metrics calculated.".format(target_raw),
            Warning)
        return list()

    mq_metrics: List[mzqc.QualityMetric] = list()
    #https://stackoverflow.com/questions/17071871/how-to-select-rows-from-a-dataframe-based-on-column-values
    # only evidence rows of the requested raw file that are backed by a tandem scan
    target_mq = evidence.loc[(evidence['raw file'] == target_raw)
                             & (evidence['ms/ms scan number'].notnull())]

    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Sequence database name",
                           value=params.loc['fasta file']['value']))

    proteins = len(target_mq['leading proteins'].unique())
    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Total number of identified proteins",
                           value=proteins))

    # "Total number of PSM" has no counterpart in the evidence data and is not reported here.

    mq_metrics.append(
        mzqc.QualityMetric(
            cvRef="QC",
            accession="QC:0000000",
            name="Total number of identified peptide spectra",
            value=len(target_mq)))

    peptides = len(target_mq['sequence'].unique())
    mq_metrics.append(
        mzqc.QualityMetric(
            cvRef="QC",
            accession="QC:0000000",
            name="Total number identified unique peptide sequences",
            value=peptides))

    # Check the score name against the PSI-MS ontology; this only produces warnings.
    # NOTE(review): the ontology id of the score was previously looked up but never used, the
    # score column below is keyed by the score name itself - confirm that this is intended.
    score_type = "Andromeda:score"
    psims = utils.obtainOntology("psi-ms")

    known_names = {psims[x].name for x in psims}
    known_score_names = {
        term.name
        for term in chain(psims['MS:1001143'].subclasses(),
                          psims['MS:1001153'].subclasses(),
                          psims['MS:1002347'].subclasses(),
                          psims['MS:1002363'].subclasses())
    }

    if score_type not in known_names:
        warnings.warn(
            "OBO does not contain any entry matching the identification score, proceed at own risk.",
            Warning)
    elif score_type not in known_score_names:
        warnings.warn(
            "Score type does not correspond to a score type in the OBO, proceed at own risk.",
            Warning)

    identification_scoring_metrics = target_mq[[
        'retention time', 'charge', 'score'
    ]].rename(columns={
        'retention time': 'RT',
        'charge': 'c',
        'score': score_type
    }).to_dict(orient='list')
    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Identification scoring metric values",
                           value=identification_scoring_metrics))

    # TODO comparison column with qccalculator dppm values
    # TODO RT/native id?
    # .assign builds a new frame instead of writing into one derived from a slice of the evidence
    identification_accuracy_metrics = target_mq[[
        'ms/ms m/z', 'mass error [ppm]', 'uncalibrated mass error [da]'
    ]].rename(columns={
        'ms/ms m/z': 'MZ',
        'mass error [ppm]': 'delta_ppm',
        'uncalibrated mass error [da]': 'abs_error'
    }).assign(
        abs_error=lambda df: df['abs_error'].abs()
    ).to_dict(orient='list')
    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Identifications accuracy metric values",
                           value=identification_accuracy_metrics))

    # the peptide column is only needed to derive the gravy values and is dropped afterwards
    hydrophobicity_metrics = target_mq[[
        'retention time', 'sequence'
    ]].rename(columns={
        'retention time': 'RT',
        'sequence': 'peptide'
    }).assign(
        gravy=lambda df: df['peptide'].apply(
            lambda x: ProtParam.ProteinAnalysis(x).gravy())
    )[['RT', 'gravy']].to_dict(orient='list')
    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Hydrophobicity metric values",
                           value=hydrophobicity_metrics))

    # TODO target/decoy info available??
    identification_sequence_metrics = target_mq[[
        'sequence', 'retention time', 'ms/ms scan number'
    ]].rename(
        columns={
            'sequence': 'peptide',
            'retention time': 'RT',
            'ms/ms scan number': 'native_id'
        }).to_dict(orient='list')
    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Identifications sequence metric values",
                           value=identification_sequence_metrics))

    ## simple id metrics
    # ms2num defaults to 0, so the ratio must be guarded against a zero divisor
    if ms2num > 0:
        mq_metrics.append(
            mzqc.QualityMetric(cvRef="QC",
                               accession="QC:0000000",
                               name="Identification to tandem spectra ratio",
                               value=float(len(target_mq)) / float(ms2num)))
    else:
        warnings.warn(
            "No positive number of tandem spectra given, identification to tandem spectra ratio not calculated.",
            Warning)

    return mq_metrics
Beispiel #23
0
def describeIdentifiedPrecursorIntensity(tandem_spectrum_metrics_MS2:mzqc.QualityMetric,
        identification_accuracy_metrics: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describeIdentifiedPrecursorIntensity summarises the precursor intensities of identified tandem spectra.

    Tandem spectra are matched to identifications by intersecting the 'precursor_mz' values of the tandem spectra
    proto-metric with the 'MZ' values of the identification accuracy proto-metric. The 'precursor_intensity' values
    of the matched spectra are then described by max and min, quartiles, sigma, mean, and 1.5*IQR outliers.

    Parameters
    ----------
    tandem_spectrum_metrics_MS2 : mzqc.QualityMetric
        The proto-metrics on tandem spectra containing 'RT', 'precursor_mz', 'precursor_intensity', 'surveyscan_intensity_sum', 'surveyscan_intensity_max' values.
    identification_accuracy_metrics : mzqc.QualityMetric
        The proto-metrics on identification accuracies containing 'RT' and 'MZ' values

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics, in the order maximum, minimum, quartiles, sigma, mean, outliers
    """
    # one row per tandem spectrum property; row 1 holds the precursor m/z, row 2 the precursor intensity
    spectrum_columns = ('RT',
                        'precursor_mz',
                        'precursor_intensity',
                        'surveyscan_intensity_sum',
                        'surveyscan_intensity_max')
    precursor_table = np.array(
        [tandem_spectrum_metrics_MS2.value[column] for column in spectrum_columns])

    # TODO make sure intersection is round-proof
    identified_table = np.array([identification_accuracy_metrics.value[column]
                                 for column in ('RT', 'MZ')])

    # the match is made on exact m/z values; only the indices into the tandem spectra table are needed
    _, matched_spectra, _ = np.intersect1d(precursor_table[1],
                                           identified_table[1],
                                           assume_unique=False,
                                           return_indices=True)
    identified_intensities = precursor_table[:, matched_spectra][2]

    q1, q2, q3, sigma, mean, outliers = utils.extractDistributionStats(identified_intensities)

    # metric name -> value, in the order in which the metrics are reported
    described = [
        ("Maximum identified precursor intensity", max(identified_intensities)),
        ("Minmum identified precursor intensity", min(identified_intensities)),
        ("Q1, Q2, Q3 of identified precursor intensities", [q1, q2, q3]),
        ("Sigma of identified precursor intensities", sigma),
        ("Mean of identified precursor intensities", mean),
        ("Precursor identified intensity +/-1.5*IQR outlier", outliers),
    ]

    return [
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name=metric_name,
                           value=metric_value)
        for metric_name, metric_value in described
    ]