Exemple #1
0
def describePrecursorIntensity(tandem_spectrum_metrics_MS2:mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describePrecursorIntensity calculates the descriptive statistics metrics for the precursor intensities of tandem spectra.

    The 'precursor_intensity' values of the given proto-metric are converted to a numpy array. Maximum and
    minimum are taken directly from that array; quartiles, sigma, mean and the +/-1.5*IQR outliers are the
    values returned by utils.extractDistributionStats for it.

    Parameters
    ----------
    tandem_spectrum_metrics_MS2 : mzqc.QualityMetric
        Proto-metric of tandem spectra whose value contains 'precursor_intensity'

    Returns
    -------
    List[mzqc.QualityMetric]
        Six metrics in fixed order: maximum, minimum, [Q1, Q2, Q3], sigma, mean, outliers.
        All of them carry the placeholder accession "QC:0000000".
    """
    intensities = np.array(tandem_spectrum_metrics_MS2.value['precursor_intensity'])
    q1, q2, q3, sigma, mean, outliers = utils.extractDistributionStats(intensities)

    # (metric name, metric value) pairs in reporting order.
    # NOTE(review): "Minmum" is misspelled in the emitted name; left untouched
    # here because consumers may match on the exact string.
    named_values = [
        ("Maximum precursor intensity", max(intensities)),
        ("Minmum precursor intensity", min(intensities)),
        ("Q1, Q2, Q3 of precursor intensities", [q1, q2, q3]),
        ("Sigma of precursor intensities", sigma),
        ("Mean of precursor intensities", mean),
        ("Precursor intensity +/-1.5*IQR outlier", outliers),
    ]

    return [
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name=label,
                           value=val)
        for label, val in named_values
    ]
Exemple #2
0
def getPeptideLengthMetrics(
    identification_sequence_metrics: mzqc.QualityMetric
) -> List[mzqc.QualityMetric]:
    """
    getPeptideLengthMetrics calculates the descriptive statistics metrics for identified sequences' length

    From the proto-metrics on identification sequences, the function calculates descriptive statistics metrics for
    the distribution of peptide lengths. Namely, mean, standard deviation, Quartiles, and 1.5*IQR outliers
    (as returned by utils.extractDistributionStats).

    The length of a peptide is the number of ASCII letters that remain after all parenthesised
    modification annotations have been removed. Nested parentheses inside a modification name are
    handled, e.g. '.(iTRAQ4plex)M(Oxidation)C(Carbamidomethyl)HNVNR' counts as 7 and
    'K(Label:13C(6)15N(2))R' counts as 2.

    Parameters
    ----------
    identification_sequence_metrics : mzqc.QualityMetric
        QualityMetric with 'peptide' value, filtered for final outcome

    Returns
    -------
    List[mzqc.QualityMetric]
        Four metrics in fixed order: [Q1, Q2, Q3], sigma, mean, outliers
    """
    metrics: List[mzqc.QualityMetric] = list()

    # Anything that is not an ASCII letter (terminal dots, digits, brackets, ...)
    # does not count towards the peptide length.
    regex_noaa = r'([^A-Za-z])'
    lengths = np.array([
        len(re.sub(regex_noaa, '', _stripModifications(x)))
        for x in identification_sequence_metrics.value['peptide']
    ])

    q1, q2, q3, s, m, ol = utils.extractDistributionStats(lengths)
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Identified peptide lengths Q1, Q2, Q3",
                           value=[q1, q2, q3]))

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Identified peptide lengths sigma",
                           value=s))

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Identified peptide lengths mean",
                           value=m))

    metrics.append(
        mzqc.QualityMetric(
            cvRef="QC",
            accession="QC:0000000",
            name="Identified peptide lengths +/-1.5*IQR outlier",
            value=ol))

    return metrics


def _stripModifications(sequence: str) -> str:
    """Return the sequence with all '(...)' modification annotations removed, nested ones included."""
    # A single substitution pass cannot remove nested groups such as
    # '(Label:13C(6)15N(2))': it would only strip the inner '(6)'/'(2)' and leave
    # the letters of the modification name behind. Removing innermost groups
    # repeatedly until nothing changes peels the nesting from the inside out.
    while True:
        sequence, replaced = re.subn(r'\([^()]*\)', '', sequence)
        if not replaced:
            return sequence
Exemple #3
0
def describeMSdensity(spectrum_acquisition_metrics_MS:mzqc.QualityMetric, start_time: datetime.datetime, ms_level: int) -> List[mzqc.QualityMetric]:
    """
    describeMSdensity calculates the descriptive statistics metrics for spectra's peak density from a given level.

    From the proto-metrics on spectrum acquisition for a given MS level, the function calculates descriptive statistics metrics for
    the distribution of peak density ('peakcount') from all involved mass spectra.
    Namely, mean, standard deviation, Quartiles, and 1.5*IQR outliers (as returned by utils.extractDistributionStats).

    Parameters
    ----------
    spectrum_acquisition_metrics_MS : mzqc.QualityMetric
        Proto-metric containing 'peakcount' values for all involved spectra
    start_time : datetime.datetime
        MS run start time. Currently not used by the calculation; kept so existing callers keep working.
    ms_level : int
        The MS level considered; it is inserted into the metric names

    Returns
    -------
    List[mzqc.QualityMetric]
        Four metrics in fixed order: [Q1, Q2, Q3], sigma, mean, outliers
    """
    # The per-spectrum absolute time stamps (start_time + RT) that used to be
    # built here were never read, so they are no longer computed.
    arr = np.array(spectrum_acquisition_metrics_MS.value['peakcount'])
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(arr)

    # (metric name, metric value) pairs in reporting order
    named_values = [
        (f"Q1, Q2, Q3 of peak density for MS level {ms_level} collection", [q1, q2, q3]),
        (f"Sigma of peak density for MS level {ms_level} collection", s),
        (f"Mean of peak density for MS level {ms_level} collection", m),
        (f"Peak density for MS level {ms_level} collection +/-1.5*IQR outlier", ol),
    ]

    metrics: List[mzqc.QualityMetric] = [
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name=name,
                           value=value)
        for name, value in named_values
    ]

    return metrics
Exemple #4
0
def describeMSCollectionTime(trap_metrics:mzqc.QualityMetric, ms_level: int) -> List[mzqc.QualityMetric]:
    """
    describeMSCollectionTime calculates the descriptive statistics metrics for ion collection times of spectra from a given level.

    From the proto-metrics on ion collection for a given MS level, the function calculates descriptive statistics metrics for
    the distribution of ion collection times from all involved mass spectra.
    Namely, mean, standard deviation, Quartiles, and 1.5*IQR outliers (as returned by utils.extractDistributionStats).

    Parameters
    ----------
    trap_metrics : mzqc.QualityMetric
        The proto-metrics on ion collection times from the respective MS level containing 'traptime' values.
        A plain mapping with a 'traptime' key is accepted as well.
    ms_level : int
        The MS level considered; it is inserted into the metric names

    Returns
    -------
    List[mzqc.QualityMetric]
        Four metrics in fixed order: [Q1, Q2, Q3], sigma, mean, outliers
    """
    metrics: List[mzqc.QualityMetric] = list()

    # The proto-metric keeps its data under `.value`, as in the sibling
    # describe* functions. Objects without a `.value` attribute are used
    # directly so that callers passing the bare table keep working.
    values = getattr(trap_metrics, 'value', trap_metrics)
    arr = np.array(values['traptime'])
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(arr)

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Q1, Q2, Q3 for MS level {ms_level} trap time collection".format(ms_level=ms_level),
                value=[q1, q2, q3])
    )

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Sigma for MS level {ms_level} trap time collection".format(ms_level=ms_level),
                value=s)
    )

    # NOTE(review): the next two names say "frequency" although the values are
    # trap time statistics - looks like a copy-paste leftover. Names are kept
    # unchanged because consumers may match on them; confirm and rename.
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Mean of frequency for MS level {ms_level} collection".format(ms_level=ms_level),
                value=m)
    )

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Frequency for MS level {ms_level} collection +/-1.5*IQR outlier".format(ms_level=ms_level),
                value=ol)
    )

    return metrics
Exemple #5
0
def getEnzymeContaminationMetrics(pep,
                                  pro,
                                  force_enzymes=False
                                  ) -> List[mzqc.QualityMetric]:
    """
    getEnzymeContaminationMetrics calculates enzyme and enzyme contamination metrics from the
    identifications given.

    Every peptide hit (unmodified sequence, flanked by the amino acids before and after it taken
    from the first peptide evidence) is matched against the cleavage pattern of the digestion
    enzyme configured in the search parameters of the first ProteinIdentification. For the first
    hit of each PeptideIdentification the spectrum id, the match status and the number of internal
    (missed) cleavage matches are tabulated; for all further hits the same two values are pooled
    separately. Descriptive statistics of the missed cleavage counts are reported for both groups.

    Parameters
    ----------
    pep : List[oms.PeptideIdentification]
        List of PyOpenMS PeptideIdentification as from reading a common identification file.
        Each element is sorted in place via its sort() method.
    pro : List[oms.ProteinIdentification]
        List of PyOpenMS ProteinIdentification as from reading a common identification file.
        Must contain at least one element; only the first one is consulted.
    force_enzymes : bool, optional
        Intended to force checking the identified peptide sequences against other known
        digestion enzyme patterns. Currently not used by the implementation. By default False

    Returns
    -------
    List[mzqc.QualityMetric]
        In order: (if lower rank hits exist) Q1-Q3, sigma, mean and outliers of their missed
        cleavage counts and their match/semi/none counts; the per-spectrum table of top hits
        ('native_id', 'matched', 'missed'); Q1-Q3, sigma, mean and outliers of the top hits'
        missed cleavage counts.
    """
    metrics: List[mzqc.QualityMetric] = list()

    # Only the enzyme used for the search is checked, so its pattern is compiled once up front.
    enzyme_pattern = re.compile(
        pro[0].getSearchParameters().digestion_enzyme.getRegEx())

    # TODO pyopenms wrappers for DigestionEnzymeDB etc. are needed before the patterns of
    # other enzymes can be checked as well (which is what force_enzymes is meant to enable).

    enzymematch_tab: Dict[str, List[Any]] = defaultdict(list)
    missed_ranks = list()
    matched_ranks = list()
    for pep_index, pepi in enumerate(pep):
        # presumably orders the hits best-first, so that hit 0 is the top identification
        pepi.sort()
        # Fall back to the running index when the identification carries no spectrum reference.
        spec_id = pepi.getMetaValue('spectrum_reference') \
            if pepi.metaValueExists('spectrum_reference') else pep_index
        # A distinct name for the hit position avoids shadowing the outer loop index.
        for rank, hit in enumerate(pepi.getHits()):
            evidence = hit.getPeptideEvidences()[0]
            pepseq = evidence.getAABefore() \
                + hit.getSequence().toUnmodifiedString() \
                + evidence.getAAAfter()

            is_matched, internal_matches = matchEnzyme(enzyme_pattern, pepseq)
            if rank == 0:
                enzymematch_tab['native_id'].append(spec_id)
                enzymematch_tab['matched'].append(is_matched)
                enzymematch_tab['missed'].append(internal_matches)
            else:
                missed_ranks.append(internal_matches)
                matched_ranks.append(is_matched)

    # NOTE(review): "clevage" is misspelled in the emitted metric names below; the names are
    # left unchanged because consumers may match on them.
    if missed_ranks:
        q1, q2, q3, s, m, ol = utils.extractDistributionStats(np.array(missed_ranks))
        lower_rank_stats = (
            ("Q1, Q2, Q3 of missed clevage counts for all lower rank identifications.", [q1, q2, q3]),
            ("Sigma of missed clevage counts for all lower rank identifications.", s),
            ("Mean of missed clevage counts for all lower rank identifications.", m),
            ("Missed clevage count for all lower rank identifications +/-1.5*IQR outlier", ol),
        )
        for name, value in lower_rank_stats:
            metrics.append(
                mzqc.QualityMetric(cvRef="QC",
                                   accession="QC:0000000",
                                   name=name,
                                   value=value))

    if matched_ranks:
        # Count how often each match status occurs; statuses that never occur count as 0.
        # assumes matchEnzyme encodes full match / semi / none as 2 / 1 / 0 - TODO confirm
        mdl: Dict[int, int] = defaultdict(int)
        uniq, counts = np.unique(np.array(matched_ranks), return_counts=True)
        mdl.update(dict(zip(uniq, counts)))
        metrics.append(
            mzqc.QualityMetric(
                cvRef="QC",
                accession="QC:0000000",
                name=
                "Match/semi/none counts for all lower rank identifications.",
                value=[mdl[2], mdl[1], mdl[0]]))

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Missed cleavages",
                           value=enzymematch_tab))

    # NOTE(review): with no identifications at all this passes an empty array to
    # extractDistributionStats - confirm that the helper copes with that.
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(
        np.array(enzymematch_tab['missed']))
    top_hit_stats = (
        ("Q1, Q2, Q3 of missed clevage counts for top identifications.", [q1, q2, q3]),
        ("Sigma of missed clevage counts for top identifications.", s),
        ("Mean of missed clevage counts for top identifications.", m),
        ("Missed clevage count for top identifications +/-1.5*IQR outlier", ol),
    )
    for name, value in top_hit_stats:
        metrics.append(
            mzqc.QualityMetric(cvRef="QC",
                               accession="QC:0000000",
                               name=name,
                               value=value))

    return metrics
Exemple #6
0
def describeIdentifiedPrecursorIntensity(tandem_spectrum_metrics_MS2:mzqc.QualityMetric,
        identification_accuracy_metrics: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describeIdentifiedPrecursorIntensity calculates the descriptive statistics metrics for precursor intensities of identified tandem spectra.

    Tandem spectra count as identified when their 'precursor_mz' value also occurs among the 'MZ' values of the
    identification proto-metric (exact value intersection via numpy.intersect1d). For the precursor intensities of
    these spectra the function reports min and max, mean, standard deviation, Quartiles, and 1.5*IQR outliers.

    Parameters
    ----------
    tandem_spectrum_metrics_MS2 : mzqc.QualityMetric
        The proto-metrics on tandem spectra containing 'RT', 'precursor_mz', 'precursor_intensity', 'surveyscan_intensity_sum', 'surveyscan_intensity_max' values.
    identification_accuracy_metrics : mzqc.QualityMetric
        The proto-metrics on identification accuracies containing 'RT' and 'MZ' values

    Returns
    -------
    List[mzqc.QualityMetric]
        Six metrics in fixed order: maximum, minimum, [Q1, Q2, Q3], sigma, mean, outliers
    """
    # One row per column of the tandem proto-metric:
    # 0 RT, 1 precursor m/z, 2 precursor intensity, 3 survey scan intensity sum, 4 survey scan intensity max
    tandem_columns = ('RT',
                      'precursor_mz',
                      'precursor_intensity',
                      'surveyscan_intensity_sum',
                      'surveyscan_intensity_max')
    np_prec = np.array([tandem_spectrum_metrics_MS2.value[column] for column in tandem_columns])

    # Rows: 0 RT, 1 m/z of the identifications
    id_coord = np.array([identification_accuracy_metrics.value['RT'],
                         identification_accuracy_metrics.value['MZ']])

    # TODO make sure intersection is round-proof (m/z values are compared for exact equality)
    # Only the indices into the tandem m/z row are needed to pick the identified spectra.
    _, identified_idx, _ = np.intersect1d(np_prec[1], id_coord[1],
                                          assume_unique=False, return_indices=True)
    identified = np_prec[:, identified_idx]
    intensities = identified[2]
    q1, q2, q3, sigma, mean, outliers = utils.extractDistributionStats(intensities)

    # (metric name, metric value) pairs in reporting order.
    # NOTE(review): "Minmum" is misspelled in the emitted name; left untouched
    # here because consumers may match on the exact string.
    named_values = [
        ("Maximum identified precursor intensity", max(intensities)),
        ("Minmum identified precursor intensity", min(intensities)),
        ("Q1, Q2, Q3 of identified precursor intensities", [q1, q2, q3]),
        ("Sigma of identified precursor intensities", sigma),
        ("Mean of identified precursor intensities", mean),
        ("Precursor identified intensity +/-1.5*IQR outlier", outliers),
    ]

    return [
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name=label,
                           value=val)
        for label, val in named_values
    ]
Exemple #7
0
def describeErrorRates(identification_accuracy_metrics:mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describeErrorRates calculates the descriptive statistics metrics for mass errors of identified tandem spectra.

    From the proto-metrics on identification accuracy, the function calculates the median and mean of the
    absolute errors, and for the ppm errors the median, interquartile range, Quartiles, standard deviation,
    mean, and 1.5*IQR outliers (the latter five as returned by utils.extractDistributionStats).

    Parameters
    ----------
    identification_accuracy_metrics : mzqc.QualityMetric
        The proto-metrics on identification accuracies containing 'delta_ppm' and 'abs_error' values.

    Returns
    -------
    List[mzqc.QualityMetric]
        Eight metrics in fixed order: MS15A (median abs_error), MS15B (mean abs_error),
        MS15C (median delta_ppm), MS15D (Q3 - Q1 of delta_ppm), delta_ppm [Q1, Q2, Q3],
        sigma, mean, outliers. An empty list (plus a warning) if there are no 'delta_ppm' values.
    """
    metrics: List[mzqc.QualityMetric] = list()

    # The error values live in the metric's `.value` table, so the presence
    # check has to look there rather than at the QualityMetric object itself.
    values = identification_accuracy_metrics.value
    if values is None or 'delta_ppm' not in values:
        warnings.warn("No error values in given annotation, ignoring identification error rate metrics.", Warning)
        return metrics

    abs_error = values['abs_error']
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="MS15A",
                value=np.median(abs_error))
    )
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="MS15B",
                value=np.mean(abs_error))
    )

    arr = np.array(values['delta_ppm'])
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="MS15C",
                value=np.median(arr))
    )

    q1, q2, q3, s, m, ol = utils.extractDistributionStats(arr)

    # MS15D is the interquartile range of the ppm errors
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="MS15D",
                value=q3-q1)
    )

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Delta ppm Q1, Q2, Q3",
                value=[q1,q2,q3])
    )

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Delta ppm sigma",
                value=s)
    )

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Delta ppm mean",
                value=m)
    )

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Delta ppm +/-1.5*IQR outlier",
                value=ol)
    )

    return metrics