def getPeptideLengthMetrics(identification_sequence_metrics: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    getPeptideLengthMetrics calculates the descriptive statistics metrics for identified sequences' length

    From the proto-metrics on identification sequences, every peptide string
    is reduced to its bare residue letters (parenthesised modification
    annotations and any remaining non-letter characters are removed) and the
    resulting lengths are passed to `utils.extractDistributionStats`. The
    six values returned by that helper are reported as quartiles, sigma, mean,
    and 1.5*IQR outliers.

    Parameters
    ----------
    identification_sequence_metrics : mzqc.QualityMetric
        QualityMetric with 'peptide' value, filtered for final outcome

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics
    """
    modification_pattern = r'(\([^\(]*\))'
    non_letter_pattern = r'([^A-Za-z])'

    def _bare_length(peptide):
        # First drop "(...)" modification groups, then whatever is not a letter.
        without_mods = re.sub(modification_pattern, '', peptide)
        return len(re.sub(non_letter_pattern, '', without_mods))

    # TODO test this: '.(iTRAQ4plex)M(Oxidation)C(Carbamidomethyl)HNVNR'
    lengths = np.array([
        _bare_length(sequence)
        for sequence in identification_sequence_metrics.value['peptide']
    ])
    q1, q2, q3, sigma, mean, outliers = utils.extractDistributionStats(lengths)

    return [
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Identified peptide lengths Q1, Q2, Q3",
                           value=[q1, q2, q3]),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Identified peptide lengths sigma",
                           value=sigma),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Identified peptide lengths mean",
                           value=mean),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Identified peptide lengths +/-1.5*IQR outlier",
                           value=outliers),
    ]
def describeMSdensity(spectrum_acquisition_metrics_MS: mzqc.QualityMetric, start_time: datetime.datetime, ms_level: int) -> List[mzqc.QualityMetric]:
    """
    describeMSdensity calculates the descriptive statistics metrics for spectra's peak density from a given level.

    From the proto-metrics on spectrum acquisition for a given MS level, the
    'peakcount' values are passed to `utils.extractDistributionStats` and
    the results are reported as quartiles, sigma, mean, and 1.5*IQR outliers.

    Parameters
    ----------
    spectrum_acquisition_metrics_MS : mzqc.QualityMetric
        Proto-metric whose value contains 'peakcount' for all involved spectra
    start_time : datetime.datetime
        MS run start time. Kept for interface compatibility; the statistics
        do not depend on it.
    ms_level : int
        The MS level, used to label the produced metrics

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics
    """
    # Only the peak counts feed the statistics. No absolute timestamps are
    # derived from start_time/'RT' because nothing here consumes them.
    arr = np.array(spectrum_acquisition_metrics_MS.value['peakcount'])
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(arr)

    metrics: List[mzqc.QualityMetric] = [
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name=f"Q1, Q2, Q3 of peak density for MS level {ms_level} collection",
                           value=[q1, q2, q3]),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name=f"Sigma of peak density for MS level {ms_level} collection",
                           value=s),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name=f"Mean of peak density for MS level {ms_level} collection",
                           value=m),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name=f"Peak density for MS level {ms_level} collection +/-1.5*IQR outlier",
                           value=ol),
    ]
    return metrics
def describeIdentificationScores(identification_scoring_metrics: mzqc.QualityMetric, score_type: str) -> List[mzqc.QualityMetric]:
    """
    describeIdentificationScores calculates the descriptive statistics metrics for the scoring of identified tandem spectra.

    From the proto-metrics on identification scores, the function reports the
    quartiles, standard deviation and mean of the `score_type` values, plus
    the values lying more than 1.5*IQR below Q1 or above Q3.

    Parameters
    ----------
    identification_scoring_metrics : mzqc.QualityMetric
        The proto-metrics on identification scores containing `score_type` values.
    score_type : str
        Key under which the score values are stored in `identification_scoring_metrics`

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics
    """
    scores = np.array(identification_scoring_metrics.value[score_type])
    quartiles = np.quantile(scores, [.25, .5, .75])

    # Outlier fences: 1.5 times the inter-quartile range beyond Q1 and Q3.
    lower_quartile, upper_quartile = quartiles[0], quartiles[2]
    lower_fence = lower_quartile - (1.5 * (upper_quartile - lower_quartile))
    upper_fence = upper_quartile + (1.5 * (upper_quartile - lower_quartile))
    outliers = scores[(scores < lower_fence) | (scores > upper_fence)]

    return [
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="scores Q1, Q2, Q3", value=list(quartiles)),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="score sigma", value=np.std(scores)),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="score mean", value=np.mean(scores)),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="score +/-1.5*IQR outlier", value=outliers),
    ]
def describeMSCollectionTime(trap_metrics: mzqc.QualityMetric, ms_level: int) -> List[mzqc.QualityMetric]:
    """
    describeMSCollectionTime calculates the descriptive statistics metrics for ion collection times of spectra from a given level.

    From the proto-metrics on ion collection for a given MS level, the
    'traptime' values are passed to `utils.extractDistributionStats` and
    the results are reported as quartiles, sigma, mean, and 1.5*IQR outliers.

    Parameters
    ----------
    trap_metrics : mzqc.QualityMetric
        The proto-metrics on ion collection times from the respective MS
        level; its value must contain 'traptime'. A bare mapping with a
        'traptime' key is accepted as well.
    ms_level : int
        The MS level, used to label the produced metrics

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics
    """
    # The data lives in the metric's `value`, as in the sibling describe*
    # functions; fall back to the argument itself when a plain mapping is given.
    values = getattr(trap_metrics, 'value', trap_metrics)
    arr = np.array(values['traptime'])
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(arr)

    # NOTE(review): the last two metric names say "frequency" although the
    # values are trap times. They look copied from another metric, but are
    # left unchanged because consumers may select metrics by name.
    metrics: List[mzqc.QualityMetric] = [
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name=f"Q1, Q2, Q3 for MS level {ms_level} trap time collection",
                           value=[q1, q2, q3]),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name=f"Sigma for MS level {ms_level} trap time collection",
                           value=s),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name=f"Mean of frequency for MS level {ms_level} collection",
                           value=m),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name=f"Frequency for MS level {ms_level} collection +/-1.5*IQR outlier",
                           value=ol),
    ]
    return metrics
def getSamplingRatios(identification_sequence_metrics: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    getSamplingRatios calculates the sampling ratio metric for identified tandem spectra.

    Each distinct 'peptide' sequence is counted (its sampling rate), then
    the number of sequences sharing each sampling rate is counted (the
    frequency). Rates between 1 and the maximum that never occur are listed
    explicitly with a frequency of zero.

    Parameters
    ----------
    identification_sequence_metrics : mzqc.QualityMetric
        The proto-metrics on identified sequences containing 'peptide' (sequence) values.

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics
    """
    # How often was each distinct sequence identified?
    _, times_sampled = np.unique(identification_sequence_metrics.value['peptide'], return_counts=True)
    # How many sequences were identified once, twice, ...?
    observed_rates, observed_counts = np.unique(times_sampled, return_counts=True)

    # Enumerate every rate from 1 up to the maximum; rate r sits at index r-1.
    highest_rate = np.max(observed_rates)
    all_rates = np.arange(1, highest_rate + 1)
    all_counts = np.zeros(highest_rate)
    all_counts[observed_rates - 1] = observed_counts

    sampling_table = {
        'sampling rate': list(all_rates),
        'frequencies': list(all_counts),
    }
    return [
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Sampling frequencies", value=sampling_table)
    ]
def test_NumpyValues(self):
    """Check that a QualityMetric holding a float32 numpy array serialises to the NPQM reference string."""
    metric = qc.QualityMetric()
    metric.accession = "QC:123"
    metric.name = "einszweidrei"
    metric.value = {"np": np.array([1 / 9, 2 / 8, 3 / 7], dtype=np.float32)}

    serialised = qc.JsonSerialisable.ToJson(metric)
    assert serialised == NPQM
def test_DateTime(self):
    """Check that an MzQcFile can be constructed with an ISO-formatted creation date without raising."""
    try:
        file_arguments = dict(
            version="0.1.0",
            creationDate=datetime.now().isoformat(),
            runQualities=[],
            setQualities=[],
            controlledVocabularies=[],
        )
        qc.MzQcFile(**file_arguments)
    except Exception as error:
        # Any exception is turned into a test failure.
        raise AssertionError(f"An unexpected exception {error} raised.")
def mzqc_assembly(rqs, sqs, out):
    """
    Assemble an MzQcFile from run and set qualities.

    The file is stamped with the current time and always references the
    PSI QC and PSI MS controlled vocabularies.

    Parameters
    ----------
    rqs
        Run qualities, passed through as `runQualities`
    sqs
        Set qualities, passed through as `setQualities`
    out
        Accepted but not used by this function

    Returns
    -------
    qc.MzQcFile
        The assembled file object
    """
    # TODO check all the metrics to see which ontologies were used
    vocabularies = [
        qc.ControlledVocabulary(
            ref="QC",
            name="Proteomics Standards Initiative Quality Control Ontology",
            version="0.1.0",
            uri="https://github.com/HUPO-PSI/qcML-development/blob/master/cv/v0_1_0/qc-cv.obo",
        ),
        qc.ControlledVocabulary(
            ref="MS",
            name="Proteomics Standards Initiative Mass Spectrometry Ontology",
            version="4.1.7",
            uri="https://github.com/HUPO-PSI/psi-ms-CV/blob/master/psi-ms.obo",
        ),
    ]
    created = datetime.now().isoformat()
    return qc.MzQcFile(version="0.1.0",
                       creationDate=created,
                       runQualities=rqs,
                       setQualities=sqs,
                       controlledVocabularies=vocabularies)
def describePrecursorIntensity(tandem_spectrum_metrics_MS2: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describePrecursorIntensity calculates the descriptive statistics metrics for precursor intensities of tandem spectra.

    From the proto-metrics on tandem spectra, the function reports maximum
    and minimum of 'precursor_intensity' and passes the values to
    `utils.extractDistributionStats`, whose results are reported as
    quartiles, sigma, mean, and 1.5*IQR outliers.

    Parameters
    ----------
    tandem_spectrum_metrics_MS2 : mzqc.QualityMetric
        Proto-metric of tandem spectra containing values for 'precursor_intensity'

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics
    """
    arr = np.array(tandem_spectrum_metrics_MS2.value['precursor_intensity'])
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(arr)

    # np.max/np.min reduce the array in native code and propagate NaN
    # deterministically. The builtin max/min iterate element by element and
    # give order-dependent results when NaN is present.
    highest = np.max(arr)
    lowest = np.min(arr)

    # NOTE(review): "Minmum" is a typo, but the name is emitted at runtime and
    # may be used by consumers to find the metric. Confirm before renaming.
    metrics: List[mzqc.QualityMetric] = [
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Maximum precursor intensity", value=highest),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Minmum precursor intensity", value=lowest),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Q1, Q2, Q3 of precursor intensities", value=[q1, q2, q3]),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Sigma of precursor intensities", value=s),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Mean of precursor intensities", value=m),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Precursor intensity +/-1.5*IQR outlier", value=ol),
    ]
    return metrics
def getESIstability(ion_intensity_metric: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    getESIstability calculates the count of signal jumps and falls during the course of a mass-spectrometry run's acquisition time.

    Consecutive intensity values are compared pairwise. A jump is counted
    when a value is more than 10 times its predecessor, a fall when it is
    less than a tenth of its predecessor.

    Parameters
    ----------
    ion_intensity_metric : mzqc.QualityMetric
        Proto-metric containing the 'int' values of signal intensity in timely order

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics
    """
    intensities = np.asarray(ion_intensity_metric.value['int'], dtype=float)

    # Fold change of each value relative to the one before it (later / earlier),
    # so a ratio above 10 is an increase and a ratio below 1/10 is a decrease.
    # Zero intensities produce inf/nan ratios; the resulting numpy warnings are
    # silenced. nan never satisfies either comparison, and inf (rise from
    # zero) counts as a jump.
    with np.errstate(divide='ignore', invalid='ignore'):
        folds = intensities[1:] / intensities[:-1]

    jumps = int(np.count_nonzero(folds > 10))
    falls = int(np.count_nonzero(folds < 1 / 10))

    metrics: List[mzqc.QualityMetric] = [
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="signal jump (10x) count", value=jumps),
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="signal fall (10x) count", value=falls),
    ]
    return metrics
def getAnalysedSignalMetrics(tandem_spectrum_metrics_MS2: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    getAnalysedSignalMetrics calculates a metric on the proportion of signal analysed with subsequent tandem spectra.

    The tandem spectra whose 'surveyscan_intensity_max' lies strictly below
    the median of that column are selected; for those, the median of
    'surveyscan_intensity_max' divided by 'precursor_intensity' is reported.

    Parameters
    ----------
    tandem_spectrum_metrics_MS2 : mzqc.QualityMetric
        Proto-metric of tandem spectra containing values for 'RT', 'precursor_mz',
        'precursor_intensity', 'surveyscan_intensity_sum', 'surveyscan_intensity_max'.

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics
    """
    # Row layout of the table; one column per tandem spectrum.
    row_keys = ('RT',
                'precursor_mz',
                'precursor_intensity',
                'surveyscan_intensity_sum',
                'surveyscan_intensity_max')
    PRECURSOR_INTENSITY, SURVEY_MAX = 2, 4
    table = np.array([tandem_spectrum_metrics_MS2.value[key] for key in row_keys])

    # DS-3B reimpl.: median( (surv max / prec int) for bottom 50% of all precursors )
    ascending = table[SURVEY_MAX].argsort()
    table = table[:, ascending]

    # Keep the spectra below the median survey scan maximum
    # (a ratio of 1 means the precursor was sampled at the peak maximum).
    below_median = table[SURVEY_MAX] < np.median(table[SURVEY_MAX])
    lower_half = table[:, below_median]
    median_ratio = np.median(lower_half[SURVEY_MAX] / lower_half[PRECURSOR_INTENSITY])

    return [
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Median ratio of max survey scan intensity over sampled precursor intensity for the bottom (by MS1 max) half of MS2",
                           value=median_ratio)
    ]
def getCoverageRatios(pro_ids: oms.ProteinIdentification,
                      pep_ids: List[oms.PeptideIdentification],
                      fasta: Dict[str, SeqRecord.SeqRecord] = None,
                      fetch=False) -> List[mzqc.QualityMetric]:
    """
    getCoverageRatios calculates the coverage ratios per protein from the identified searchspace.

    Coverage can only be computed for protein hits with a known sequence.
    Hits lacking one are completed from `fasta` if given, otherwise (and
    only if `fetch` is set) through `utils.getUniProtSequences`. Hits that
    remain without a sequence are listed with 'NA' coverage and length.

    Note that the hits of `pro_ids` are replaced by the sequence-bearing
    copies and `pro_ids.computeCoverage` is invoked, i.e. `pro_ids` is
    modified.

    Parameters
    ----------
    pro_ids : oms.ProteinIdentification
        The PyOpenMS ProteinIdentification as from reading a common identification file
    pep_ids : List[oms.PeptideIdentification]
        List of PyOpenMS PeptideIdentification as from reading a common identification file
    fasta : Dict[str, SeqRecord.SeqRecord], optional
        Dictionary of sequences from a fasta file, keyed by accession, by default None
    fetch : bool, optional
        If set true, will attempt to retrieve sequences by accession, is ignored if `fasta` is provided, by default False

    Returns
    -------
    List[mzqc.QualityMetric]
        A single 'Protein coverage' metric whose value is a table with the
        columns 'Accession', 'Coverage', 'Length' and 'TD'
    """
    def _target_or_decoy(accession):
        # TODO figure out decoy string by fasta
        return 'decoy' if 'decoy' in accession.lower() else 'target'

    metrics: List[mzqc.QualityMetric] = list()

    # Work on copies of the hits and fill in sequences from the fasta first.
    hits = list()
    missing_acc = list()
    for p in pro_ids.getHits():
        hit = oms.ProteinHit(p)
        if not hit.getSequence() and fasta:
            record = fasta.get(hit.getAccession())
            if record is not None:
                hit.setSequence(str(record.seq))
        # The copy (not the original hit) tells whether a sequence is still missing.
        if not hit.getSequence():
            missing_acc.append(hit.getAccession())
        hits.append(hit)

    if fetch and not fasta and missing_acc:
        # Index the retrieved records by their full id and, for ids of the
        # form 'db|ACCESSION|name', additionally by the accession part.
        accession_pattern = re.compile(r'\w*\|(\w*)\|\w*')
        uniprot = dict()
        for record in utils.getUniProtSequences(missing_acc):
            uniprot[record.id] = record
            match = accession_pattern.search(record.id)
            if match and match.group(1):
                uniprot.setdefault(match.group(1), record)
        for hit in hits:
            if not hit.getSequence():
                record = uniprot.get(hit.getAccession())
                if record is not None:
                    hit.setSequence(str(record.seq))

    unresolved = [hit.getAccession() for hit in hits if not hit.getSequence()]
    resolved = [hit for hit in hits if hit.getSequence()]
    pro_ids.setHits(resolved)
    pro_ids.computeCoverage(pep_ids)

    coverage_tab: Dict[str, List[Any]] = defaultdict(list)
    for p in pro_ids.getHits():
        coverage_tab['Accession'].append(p.getAccession())
        coverage_tab['Coverage'].append(p.getCoverage())
        coverage_tab['Length'].append(len(p.getSequence()))
        coverage_tab['TD'].append(_target_or_decoy(p.getAccession()))

    # `unresolved` holds plain accession strings, not hit objects.
    for accession in unresolved:
        coverage_tab['Accession'].append(accession)
        coverage_tab['Coverage'].append('NA')
        coverage_tab['Length'].append('NA')
        coverage_tab['TD'].append(_target_or_decoy(accession))

    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Protein coverage", value=coverage_tab))
    return metrics
def describeChargeRatios(identification_scoring_metrics: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describeChargeRatios calculates the charge ratio metrics of identified tandem spectra.

    From the proto-metrics on identification scores, the function counts
    the identifications with charge 1, 2, 3 and 4 and reports the counts of
    charge 1, 3 and 4 relative to charge 2 (IS3A, IS3B, IS3C), as well as
    median (IS3X) and mean (IS3Y) of all charges.

    Parameters
    ----------
    identification_scoring_metrics : mzqc.QualityMetric
        The proto-metrics on identification scores containing 'c' values.
        A bare mapping with a 'c' key is accepted as well.

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics; empty (after a warning) if no 'c' values are present
    """
    metrics: List[mzqc.QualityMetric] = list()

    # The charges live in the metric's `value`; fall back to the argument
    # itself when a plain mapping is given.
    values = getattr(identification_scoring_metrics, 'value', identification_scoring_metrics)
    if values is None or 'c' not in values:
        warnings.warn("No charges in given annotation, ignoring charge ratio metrics.", Warning)
        return metrics

    charges = np.asarray(values['c'])
    c1n, c2n, c3n, c4n = (int(np.count_nonzero(charges == z)) for z in (1, 2, 3, 4))

    # IS3A <- c1n / c2n
    # IS3B <- c3n / c2n
    # IS3C <- c4n / c2n
    # Without any doubly charged identification the ratios are undefined.
    ratios = (("IS3A", c1n), ("IS3B", c3n), ("IS3C", c4n))
    for ratio_name, numerator in ratios:
        metrics.append(
            mzqc.QualityMetric(cvRef="QC", accession="QC:0000000", name=ratio_name,
                               value=(numerator / c2n) if c2n > 0 else np.nan))

    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000", name="IS3X",
                           value=np.median(charges)))
    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000", name="IS3Y",
                           value=np.mean(charges)))
    return metrics
Unit tests for the MZQCFile library """ # String comparison -as in TestSerialisation- needs the 'empty' attributes, too, whereas Object comparison -as in TestDeserialisation- only compares 'non-empty' attributes QM = '{"cvRef": "QC", "accession": "QC:4000053", "name": "RT duration", "description": "", "value": 99, "unit": ""}' CV = '{"ref": "REF", "name": "TEST", "uri": "www.eff.off", "version": ""}' CVT = '{"cvRef": "REF", "accession": "TEST:123", "name": "testname", "description": "", "value": 99, "unit": ""}' ANSO = '{"cvRef": "QC", "accession": "QC:9999999", "name": "bigwhopqc", "description": "", "value": "", "unit": "", "version": "1.2.3", "uri": "file:///dev/null"}' INFI = '{"location": "file:///dev/null", "name": "file.raw", "fileFormat": {"cvRef": "MS", "accession": "MS:1000584", "name": "mzML format"}, "fileProperties": [{"cvRef": "MS", "accession": "MS:1000747", "name": "completion time", "value": "2017-12-08-T15:38:57Z"}]}' META = '{"inputFiles": [{"location": "file:///dev/null", "name": "file.raw", "fileFormat": {"cvRef": "MS", "accession": "MS:1000584", "name": "mzML format"}, "fileProperties": [{"cvRef": "MS", "accession": "MS:1000747", "name": "completion time", "value": "2017-12-08-T15:38:57Z"}]}], "analysisSoftware": [{"cvRef": "QC", "accession": "QC:9999999", "name": "bigwhopqc", "version": "1.2.3", "uri": "file:///dev/null"}]}' RUQU = '{"metadata": {"inputFiles": [{"location": "file:///dev/null", "name": "file.raw", "fileFormat": {"cvRef": "MS", "accession": "MS:1000584", "name": "mzML format"}, "fileProperties": [{"cvRef": "MS", "accession": "MS:1000747", "name": "completion time", "value": "2017-12-08-T15:38:57Z"}]}], "analysisSoftware": [{"cvRef": "QC", "accession": "QC:9999999", "name": "bigwhopqc", "version": "1.2.3", "uri": "file:///dev/null"}]}, "qualityMetrics": [{"cvRef": "QC", "accession": "QC:4000053", "name": "RT duration", "value": 99}]}' SEQU = '{"metadata": {"inputFiles": [{"location": "file:///dev/null", "name": "file.raw", 
"fileFormat": {"cvRef": "MS", "accession": "MS:1000584", "name": "mzML format"}, "fileProperties": [{"cvRef": "MS", "accession": "MS:1000747", "name": "completion time", "value": "2017-12-08-T15:38:57Z"}]}], "analysisSoftware": [{"cvRef": "QC", "accession": "QC:9999999", "name": "bigwhopqc", "version": "1.2.3", "uri": "file:///dev/null"}]}, "qualityMetrics": [{"cvRef": "QC", "accession": "QC:4000053", "name": "RT duration", "value": 99}]}' NPQM = '{"cvRef": "", "accession": "QC:123", "name": "einszweidrei", "description": "", "value": {"np": [0.1111111119389534, 0.25, 0.4285714328289032]}, "unit": ""}' cvt = qc.CvParameter(cvRef="REF", accession="TEST:123", name="testname", value=99) infi = qc.InputFile(name="file.raw", location="file:///dev/null", fileFormat=qc.CvParameter("MS", "MS:1000584", "mzML format"), fileProperties=[ qc.CvParameter(cvRef="MS", accession="MS:1000747", name="completion time", value="2017-12-08-T15:38:57Z") ]) anso = qc.AnalysisSoftware( cvRef="QC", accession="QC:9999999", name="bigwhopqc",
def getIDQuality(exp: oms.MSExperiment, pro_ids: List[oms.ProteinIdentification], pep_ids: List[oms.PeptideIdentification], ms2num: int = 0) -> List[mzqc.QualityMetric]:
    """
    getIDQuality calculates the id-based QualityMetrics from a mass spectrometry peak file and associated identification file.

    Calculated are the id-based QC metrics and proto-metrics necessary to
    calculate more elaborate QC metrics with even more additional data
    (e.g. multiple runs).

    Parameters
    ----------
    exp : oms.MSExperiment
        The mass spectrometry peak file; not read by the current implementation
    pro_ids : List[oms.ProteinIdentification]
        List of PyOpenMS ProteinIdentification as from reading a common identification file.
        The search parameters are taken from the first element.
    pep_ids : List[oms.PeptideIdentification]
        List of PyOpenMS PeptideIdentification as from reading a common identification file.
        The score type is taken from the first element.
    ms2num : int, optional
        The total number of tandem spectra as from the id-free metrics, by default 0.
        If not positive, the identification to tandem spectra ratio is
        skipped with a warning.

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics

    Raises
    ------
    IndexError
        If `pro_ids` or `pep_ids` is empty, since their first element is
        read unconditionally.
    """
    metrics: List[mzqc.QualityMetric] = list()

    params = pro_ids[0].getSearchParameters()
    # var_mods = params.variable_modifications
    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Sequence database name", value=params.db))
    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Sequence database version", value=params.db_version))
    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Sequence database taxonomy", value=params.taxonomy))

    spectrum_count: int = 0
    psm_count: int = 0
    protein_evidence_count: int = 0
    # TODO call mc functions (missed cleavages are not evaluated yet)
    peptides: Set[str] = set()
    proteins: Set[str] = set()

    for pepi in pep_ids:
        if pepi.empty():
            continue
        # TODO if not decoy and not under threshold
        hits = pepi.getHits()
        spectrum_count += 1
        psm_count += len(hits)
        if hits:
            # Only the first hit contributes to the unique sequences.
            peptides.add(hits[0].getSequence().toString())

    for proid in pro_ids:
        protein_hits = proid.getHits()
        protein_evidence_count += len(protein_hits)
        for p in protein_hits:
            proteins.add(p.getAccession())

    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Total number of protein evidences", value=protein_evidence_count))
    # TODO not yet factoring in protein inference, one psm might still account for several evidences
    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Total number of identified proteins", value=len(proteins)))
    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Total number of PSM", value=psm_count))
    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Total number of peptide spectra", value=spectrum_count))
    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Total number identified unique peptide sequences", value=len(peptides)))

    identification_accuracy_metrics: Dict[str, List[Any]] = defaultdict(list)
    identification_scoring_metrics: Dict[str, List[Any]] = defaultdict(list)
    identification_sequence_metrics: Dict[str, List[Any]] = defaultdict(list)
    hydrophobicity_metrics: Dict[str, List[Any]] = defaultdict(list)

    # Resolve the column name for the score: prefer the ontology id of the
    # matching term, fall back to the raw score type if the OBO has no match.
    score_type = pep_ids[0].getScoreType()
    psims = utils.obtainOntology("psi-ms")
    name_indexed = {psims[x].name: psims[x] for x in psims}
    score_indexed = {x.name: x for x in chain(psims['MS:1001143'].subclasses(),
                                              psims['MS:1001153'].subclasses(),
                                              psims['MS:1002347'].subclasses(),
                                              psims['MS:1002363'].subclasses())}
    if score_type in name_indexed:
        if score_type not in score_indexed:
            warnings.warn("Score type does not correspond to a score type in the OBO, proceed at own risk.", Warning)
            score_col_name = name_indexed[score_type].id
        else:
            score_col_name = score_indexed[score_type].id
    else:
        warnings.warn("OBO does not contain any entry matching the identification score, proceed at own risk.", Warning)
        score_col_name = score_type

    for pepi in pep_ids:
        pid = utils.pep_native_id(pepi)
        hits = pepi.getHits()
        if not hits:
            continue
        # TODO apply custom filters and also consider 'pass_threshold'
        tmp = hits[0]
        rt = pepi.getRT()
        mz = pepi.getMZ()
        charge = tmp.getCharge()
        sequence = tmp.getSequence()

        identification_scoring_metrics['RT'].append(rt)
        identification_scoring_metrics['c'].append(charge)
        identification_scoring_metrics[score_col_name].append(tmp.getScore())

        # Theoretical m/z of the hit: neutral mono-isotopic weight plus one
        # proton mass per charge, divided by the charge.
        tw = (sequence.getMonoWeight(0, 0) + charge * oms.Constants.PROTON_MASS_U) / charge
        identification_accuracy_metrics['RT'].append(rt)
        identification_accuracy_metrics['MZ'].append(mz)
        identification_accuracy_metrics['delta_ppm'].append(utils.getMassDifference(tw, mz, True))
        identification_accuracy_metrics['abs_error'].append(utils.getMassDifference(tw, mz, False))

        hydrophobicity_metrics['RT'].append(rt)
        hydrophobicity_metrics['gravy'].append(
            ProtParam.ProteinAnalysis(sequence.toUnmodifiedString()).gravy())

        identification_sequence_metrics['RT'].append(rt)
        identification_sequence_metrics['peptide'].append(sequence.toString().strip())
        identification_sequence_metrics['target'].append(
            tmp.getMetaValue('target_decoy').lower() == 'target')
        identification_sequence_metrics['native_id'].append(pid)

    # TODO variable modifications (var_mods) are not reported as columns yet

    ## Basic id metrics
    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Identification scoring metric values", value=identification_scoring_metrics))
    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Identifications accuracy metric values", value=identification_accuracy_metrics))
    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Hydrophobicity metric values", value=hydrophobicity_metrics))
    metrics.append(
        mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                           name="Identifications sequence metric values", value=identification_sequence_metrics))

    ## simple id metrics
    # The default ms2num of 0 would make the ratio a division by zero.
    if ms2num > 0:
        metrics.append(
            mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                               name="Identification to tandem spectra ratio",
                               value=float(len(pep_ids)) / float(ms2num)))
    else:
        warnings.warn("Number of tandem spectra not given, ignoring identification to tandem spectra ratio.", Warning)

    return metrics
def describeErrorRates(identification_accuracy_metrics: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describeErrorRates calculates the descriptive statistics metrics for the mass errors of identified tandem spectra.

    From the proto-metrics on identification accuracy, the function reports
    median (MS15A) and mean (MS15B) of 'abs_error', median (MS15C) and
    inter-quartile range (MS15D) of 'delta_ppm', and the results of
    `utils.extractDistributionStats` on 'delta_ppm' as quartiles, sigma,
    mean, and 1.5*IQR outliers.

    Parameters
    ----------
    identification_accuracy_metrics : mzqc.QualityMetric
        The proto-metrics on identification accuracies containing 'delta_ppm' and 'abs_error' values.
        A bare mapping with these keys is accepted as well.

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics; empty (after a warning) if the error values are missing
    """
    metrics: List[mzqc.QualityMetric] = list()

    # The error values live in the metric's `value`; fall back to the
    # argument itself when a plain mapping is given.
    values = getattr(identification_accuracy_metrics, 'value', identification_accuracy_metrics)
    if values is None or 'delta_ppm' not in values or 'abs_error' not in values:
        warnings.warn("No error values in given annotation, ignoring identification error rate metrics.", Warning)
        return metrics

    abs_error = np.array(values['abs_error'])
    delta_ppm = np.array(values['delta_ppm'])
    q1, q2, q3, s, m, ol = utils.extractDistributionStats(delta_ppm)

    named_values = (
        ("MS15A", np.median(abs_error)),
        ("MS15B", np.mean(abs_error)),
        ("MS15C", np.median(delta_ppm)),
        ("MS15D", q3 - q1),
        ("Delta ppm Q1, Q2, Q3", [q1, q2, q3]),
        ("Delta ppm sigma", s),
        ("Delta ppm mean", m),
        ("Delta ppm +/-1.5*IQR outlier", ol),
    )
    for metric_name, metric_value in named_values:
        metrics.append(
            mzqc.QualityMetric(cvRef="QC", accession="QC:0000000",
                               name=metric_name, value=metric_value))
    return metrics
def test_SyntaxCheck():
    """Build a small mzQC document in memory and pass its JSON form to the syntactic checker."""
    # Constructed as in the module-level fixtures; not part of the document below.
    unused_parameter = qc.CvParameter(cvRef="REF", accession="TEST:123", name="testname", value=99)

    input_file = qc.InputFile(
        name="file.raw",
        location="file:///dev/null",
        fileFormat=qc.CvParameter("MS", "MS:1000584", "mzML format"),
        fileProperties=[
            qc.CvParameter(cvRef="MS", accession="MS:1000747",
                           name="completion time", value="2017-12-08-T15:38:57Z")
        ])
    # isn't requiring a uri a bit too much?
    software = qc.AnalysisSoftware(cvRef="QC", accession="QC:9999999", name="bigwhopqc",
                                   version="1.2.3", uri="file:///dev/null")
    metadata = qc.MetaDataParameters(inputFiles=[input_file], analysisSoftware=[software])

    rt_duration = qc.QualityMetric(cvRef="QC", accession="QC:4000053",
                                   name="RT duration", value=99)
    max_ms2_frequency = qc.QualityMetric(cvRef="QC", accession="QC:4000061",
                                         name="Maximal MS2 frequency", value=999)
    ms1_rt_quantiles = qc.QualityMetric(cvRef="QC", accession="QC:4000055",
                                        name="MS1 quantiles RT fraction", value=9)

    run_quality = qc.RunQuality(metadata=metadata,
                                qualityMetrics=[rt_duration, max_ms2_frequency])
    set_quality = qc.SetQuality(metadata=metadata, qualityMetrics=[ms1_rt_quantiles])

    vocabularies = [
        qc.ControlledVocabulary(ref="QC", name="QCvocab", uri="www.qc.ml"),
        qc.ControlledVocabulary(ref="REF", name="TEST", uri="www.eff.off"),
    ]
    document = qc.MzQcFile(version="0.0.11",
                           runQualities=[run_quality],
                           setQualities=[set_quality],
                           controlledVocabularies=vocabularies)

    # The checker expects the serialised file wrapped in a top-level "mzQC" object.
    wrapped_json = '{ "mzQC": ' + qc.JsonSerialisable.ToJson(document) + ' }'
    checker = sy.SyntacticCheck()
    checker.validate(wrapped_json)
def getBasicQuality(exp: oms.MSExperiment, verbose: bool=False) -> mzqc.RunQuality:
    """
    getBasicQuality calculates the basic QualityMetrics from a mass spectrometry peak file and creates the related RunQuality object.

    Calculated are basic QC metrics and the proto-metrics necessary to calculate more elaborate QC metrics with
    additional data (e.g. ID). The spectra of `exp` are sorted in place before they are traversed. Each tandem
    spectrum is related to the most recent preceding MS1 (survey) spectrum; a tandem spectrum recorded before
    any survey spectrum gets NaN for the survey-derived values and an empty isolation window.

    Parameters
    ----------
    exp : oms.MSExperiment
        The mass spectrometry peak file to calculate metrics from
    verbose : bool, optional
        switches on verbose logging, by default False (currently not used by this function)

    Returns
    -------
    mzqc.RunQuality
        A RunQuality object containing the list of metrics calculated and metadata collected, ready for integration into a mzQC file object.
    """
    metrics: List[mzqc.QualityMetric] = list()

    # TODO integrate parent (source file) name, location and checksum
    # id: MS:1002846 (Associated raw file URI) N.B. definition is PRIDE specific, a fitting checksum cv is missing

    instrument = exp.getInstrument()
    instr_srl: str = instrument.getMetaValue('instrument serial number') \
        if instrument.metaValueExists('instrument serial number') else 'unknown'  # MS:1000529 in mzML

    input_loc: str = exp.getExperimentalSettings().getLoadedFilePath()
    base_name: str = basename(input_loc)
    chksm: str = utils.sha256fromfile(input_loc)
    cmpltn: str = exp.getDateTime().get()

    meta: mzqc.MetaDataParameters = mzqc.MetaDataParameters(
        inputFiles=[
            mzqc.InputFile(
                name=base_name,
                location=input_loc,
                fileFormat=mzqc.CvParameter("MS", "MS:1000584", "mzML format"),
                fileProperties=[
                    mzqc.CvParameter(cvRef="MS",
                                     accession="MS:1000747",
                                     name="completion time",
                                     value=cmpltn),
                    mzqc.CvParameter(cvRef="MS",
                                     accession="MS:1000569",
                                     name="SHA-256",
                                     value=chksm),
                    mzqc.CvParameter(cvRef="MS",
                                     accession="MS:1000031",
                                     name="instrument model",
                                     value=instrument.getName()),
                    mzqc.CvParameter(cvRef="MS",
                                     accession="MS:1000529",
                                     name="instrument serial number",
                                     value=instr_srl)
                ]
            )
        ],
        analysisSoftware=[
            mzqc.AnalysisSoftware(cvRef="MS",
                                  accession="MS:1000752",
                                  name="TOPP software",
                                  version=oms.__version__,
                                  uri="openms.de")
        ]
    )

    # this is mighty important to sort by RT
    exp.sortSpectra()

    min_mz: float = sys.maxsize
    max_mz: float = 0
    mslevelcounts: Dict[int, int] = defaultdict(int)

    spectrum_acquisition_metrics_MS1: Dict[str, List[Any]] = defaultdict(list)
    spectrum_acquisition_metrics_MS2: Dict[str, List[Any]] = defaultdict(list)
    spectrum_topn: Dict[str, List[Any]] = defaultdict(list)  # reported as is, nothing is collected here yet
    tandem_spectrum_metrics_MS2: Dict[str, List[Any]] = defaultdict(list)
    trap_metrics_MS1: Dict[str, List[Any]] = defaultdict(list)
    trap_metrics_MS2: Dict[str, List[Any]] = defaultdict(list)
    isolation_window_metrics: Dict[str, List[Any]] = defaultdict(list)
    tic_tab: Dict[str, List[Any]] = defaultdict(list)

    # ActivationMethod look-up dict (integer constant -> attribute name)
    ams = {getattr(ActivationMethod, i): i for i in dir(ActivationMethod)
           if type(getattr(ActivationMethod, i)) == int}

    # State of the most recent survey (MS1) scan. It starts out as an "empty" survey scan so that
    # tandem spectra recorded before the first MS1 are processed instead of hitting unbound names.
    last_surveyscan_index: int = 0
    last_surveyscan_intensity = np.nan
    last_surveyscan_max = np.nan
    survey_peaks = np.empty((2, 0))  # row 0: m/z, row 1: intensity

    for spin, spec in enumerate(exp):
        ms_level = spec.getMSLevel()
        mslevelcounts[ms_level] += 1

        iontraptime = utils.getTrapTime(spec)
        mzs, intensities = spec.get_peaks()
        # max() of an empty peak array would raise, so an empty spectrum reports NaN
        intens_max = intensities.max() if intensities.size else np.nan
        intens_sum = intensities.sum()

        if ms_level == 1:
            last_surveyscan_index = spin
            last_surveyscan_intensity = intens_sum
            last_surveyscan_max = intens_max
            # cache the survey peaks once; all following MS2 spectra refer to them
            survey_peaks = np.array((mzs, intensities))

            spectrum_acquisition_metrics_MS1['RT'].append(spec.getRT())
            spectrum_acquisition_metrics_MS1['SN'].append(noiseqc.getSN_medianmethod(spec))
            spectrum_acquisition_metrics_MS1['peakcount'].append(spec.size())
            spectrum_acquisition_metrics_MS1['int'].append(intens_sum.item())  # .item() for dtype to pytype

            trap_metrics_MS1['RT'].append(spec.getRT())
            trap_metrics_MS1['traptime'].append(iontraptime)

            tic_tab['RT'].append(spec.getRT())
            tic_tab['int'].append(intens_sum)

        if ms_level == 2:
            precursor = spec.getPrecursors()[0]
            precursor_mz = precursor.getMZ()

            if precursor_mz < min_mz:
                min_mz = precursor_mz
            if precursor_mz > max_mz:
                max_mz = precursor_mz

            spectrum_acquisition_metrics_MS2['RT'].append(spec.getRT())
            spectrum_acquisition_metrics_MS2['SN'].append(noiseqc.getSN_medianmethod(spec))
            spectrum_acquisition_metrics_MS2['peakcount'].append(spec.size())
            spectrum_acquisition_metrics_MS2['int'].append(intens_sum.item())  # .item() for dtype to pytype
            spectrum_acquisition_metrics_MS2['native_id'].append(utils.spec_native_id(spec))

            # position of this tandem spectrum after its survey scan
            rank = spin - last_surveyscan_index
            spectrum_acquisition_metrics_MS2['rank'].append(rank)

            trap_metrics_MS2['RT'].append(spec.getRT())
            trap_metrics_MS2['traptime'].append(iontraptime)
            trap_metrics_MS2['activation_method'].append(
                ams.get(next(iter(precursor.getActivationMethods()), None), 'unknown'))
            trap_metrics_MS2['activation_energy'].append(
                precursor.getMetaValue('collision energy')
                if precursor.metaValueExists('collision energy') else -1)

            # first survey peak at or above the precursor m/z; none if the index runs off the end
            precursor_index = np.searchsorted(survey_peaks[0], [precursor_mz])[0]
            if precursor_index != survey_peaks.shape[1]:
                precursor_err = precursor_mz - survey_peaks[0, precursor_index]
                precursor_int = survey_peaks[1, precursor_index]
            else:
                precursor_err = np.nan
                precursor_int = np.nan

            tandem_spectrum_metrics_MS2['RT'].append(spec.getRT())
            # TODO different from mzid->mzml getPrecursors[0].getIntensity() ? YES, latter one usually zero
            tandem_spectrum_metrics_MS2['precursor_intensity'].append(precursor_int)
            tandem_spectrum_metrics_MS2['precursor_error'].append(precursor_err)
            tandem_spectrum_metrics_MS2['precursor_mz'].append(precursor_mz)
            tandem_spectrum_metrics_MS2['precursor_c'].append(precursor.getCharge())
            tandem_spectrum_metrics_MS2['surveyscan_intensity_sum'].append(last_surveyscan_intensity)
            tandem_spectrum_metrics_MS2['surveyscan_intensity_max'].append(last_surveyscan_max)

            # https://github.com/OpenMS/OpenMS/blob/d17cc251fd0c4068eb253b03c9fb107897771fdc/src/openms/source/FORMAT/HANDLERS/MzMLHandler.cpp#L1992
            lower_offset = precursor.getIsolationWindowLowerOffset()
            upper_offset = precursor.getIsolationWindowUpperOffset()
            isolation_window_metrics['RT'].append(spec.getRT())
            isolation_window_metrics['isolation_target'].append(precursor_mz)
            isolation_window_metrics['isolation_lower'].append(lower_offset)
            isolation_window_metrics['isolation_upper'].append(upper_offset)
            lower = precursor_mz - lower_offset
            upper = precursor_mz + upper_offset

            # survey peaks (rows of m/z, intensity) falling into the isolation window
            s = survey_peaks.T
            s = s[(s[:, 0] >= lower) & (s[:, 0] <= upper)]
            isolation_window_metrics['peaks_in_window'].append(np.shape(s)[0])

            ranked = s[np.flip(np.argsort(s[:, 1]))]  # window peaks by descending intensity
            if np.shape(s)[0] > 1:
                # NOTE(review): the trailing [0] binds to the denominator, so every value is divided by
                # the second highest intensity and an array is stored. The original comment reads
                # "intensity ratio between top1&2, 2&3, ..." - confirm which of the two is intended.
                isolation_window_metrics['int_ratio_ranked_peaks_in_window'].append(
                    ranked[:-1, 1] / ranked[1:, 1][0])
            else:
                isolation_window_metrics['int_ratio_ranked_peaks_in_window'].append(0)  # bigger is better, though best is 0

            isolation_window_metrics['summed_window_intensity'].append(np.sum(ranked[:, 1]))
            isolation_window_metrics['isolation_target_intensity'].append(precursor.getIntensity())

            # TODO this needs to go outside
            tol = 0.5
            if spec.metaValueExists('filter string'):
                filter_string = spec.getMetaValue('filter string')
                if 'FTMS' in filter_string:
                    tol = 0.05
                elif 'ITMS' in filter_string:
                    tol = 0.5
                elif 'QTOF' in filter_string:  # TOFMS, SQMS, TQMS, SectorMS
                    tol = 0.1

            # ms2 peaks directly from isolation window?
            unfragmented = np.isclose(s[:, 0][:, np.newaxis], mzs, atol=tol).any()
            isolation_window_metrics['peaks_in_window_in_ms2'].append(str(unfragmented))

    ## Spectra detail numbers
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Spectrum acquisition metric values - MS1",
                           value=spectrum_acquisition_metrics_MS1)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Spectrum acquisition metric values - MS2",
                           value=spectrum_acquisition_metrics_MS2)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Spectra topn ranks",
                           value=spectrum_topn)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Tandem spectrum metric values - MS2",
                           value=tandem_spectrum_metrics_MS2)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Trap metric values - MS1",
                           value=trap_metrics_MS1)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Trap metric values - MS2",
                           value=trap_metrics_MS2)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="isolation window metrics",
                           value=isolation_window_metrics)
    )

    ## Spectra numbers
    for level, level_count in mslevelcounts.items():
        metrics.append(
            mzqc.QualityMetric(cvRef="QC",
                               accession="QC:0000000",
                               name="Number of MS{l} spectra".format(l=str(level)),
                               value=level_count)
        )

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Number of chromatograms",
                           value=len(exp.getChromatograms()))
    )

    ## Ranges
    # the m/z range is taken from the precursor m/z of the MS2 spectra
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="MZ aquisition range",
                           value=[min_mz, max_mz])
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="RT aquisition range",
                           value=[exp[0].getRT(), exp[exp.size()-1].getRT()])
    )

    # TIC (summed intensities of the MS1 spectra)
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Total ion current",
                           value=tic_tab)
    )

    # Chrom: only the first total ion current chromatogram found is reported
    chrom_tab: Dict[str, List[Any]] = defaultdict(list)
    chroms = exp.getChromatograms()
    for t in chroms:
        if t.getChromatogramType() == oms.ChromatogramSettings.ChromatogramType.TOTAL_ION_CURRENT_CHROMATOGRAM:
            for chro_peak in t:
                chrom_tab['RT'].append(chro_peak.getRT())
                chrom_tab['int'].append(chro_peak.getIntensity())
            break

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Chromatogram",
                           value=chrom_tab)
    )
    # TODO is there a difference between TIC as defined in MS:1000235 and the chromatogram you get from TRP?? In MZML it says its a MS:1000235 (ion current detected in each of a series of mass spectra) but is it?
    # TODO consider collection of spectrum_native_id

    return mzqc.RunQuality(metadata=meta, qualityMetrics=metrics)
def getEnzymeContaminationMetrics(pep, pro, force_enzymes=False) -> List[mzqc.QualityMetric]:
    """
    getEnzymeContaminationMetrics calculates missed cleavage and enzyme match metrics from the identifications given.

    Every hit's sequence (flanked by the amino acids before and after, as given by its first peptide evidence)
    is matched against the cleavage pattern of the digestion enzyme found in the search parameters of the
    first ProteinIdentification. Values of the top hit per spectrum are tabulated, values of all lower rank
    hits are only summarised. Each PeptideIdentification given gets sorted in place.

    Parameters
    ----------
    pep : List[oms.PeptideIdentification]
        List of PyOpenMS PeptideIdentification as from reading a common identification file
    pro : List[oms.ProteinIdentification]
        List of PyOpenMS ProteinIdentification as from reading a common identification file
    force_enzymes : bool, optional
        Meant to force checking the identified peptide sequences against other known digestion enzyme
        patterns; not used by the current implementation. By default False

    Returns
    -------
    List[mzqc.QualityMetric]
        List of resulting QualityMetrics
    """
    metrics: List[mzqc.QualityMetric] = list()

    # include all psm actually does not make much sense to assess the enzyme efficiency
    search_parameters = pro[0].getSearchParameters()
    gre = {
        search_parameters.digestion_enzyme.getName():
            re.compile(search_parameters.digestion_enzyme.getRegEx())
    }
    enzyme_pattern = next(iter(gre.values()))

    # TODO pyopenms wrappers for DigestionEnzymeDB etc, needed to match against the patterns of
    # other enzymes (what force_enzymes is meant to switch on)

    enzymematch_tab: Dict[str, List[Any]] = defaultdict(list)
    missed_ranks = list()
    matched_ranks = list()

    for pep_index, pepi in enumerate(pep):
        pepi.sort()
        if pepi.metaValueExists('spectrum_reference'):
            spec_id = pepi.getMetaValue('spectrum_reference')
        else:
            spec_id = pep_index

        for hit_rank, hit in enumerate(pepi.getHits()):
            evidence = hit.getPeptideEvidences()[0]
            pepseq = evidence.getAABefore() \
                + hit.getSequence().toUnmodifiedString() \
                + evidence.getAAAfter()

            is_matched, internal_matches = matchEnzyme(enzyme_pattern, pepseq)

            if hit_rank:
                # every hit but the first one of a spectrum
                missed_ranks.append(internal_matches)
                matched_ranks.append(is_matched)
            else:
                enzymematch_tab['native_id'].append(spec_id)
                enzymematch_tab['matched'].append(is_matched)
                enzymematch_tab['missed'].append(internal_matches)

    if missed_ranks:
        q1, q2, q3, s, m, ol = utils.extractDistributionStats(np.array(missed_ranks))
        lower_rank_stats = [
            ("Q1, Q2, Q3 of missed clevage counts for all lower rank identifications.", [q1, q2, q3]),
            ("Sigma of missed clevage counts for all lower rank identifications.", s),
            ("Mean of missed clevage counts for all lower rank identifications.", m),
            ("Missed clevage count for all lower rank identifications +/-1.5*IQR outlier", ol),
        ]
        for metric_name, metric_value in lower_rank_stats:
            metrics.append(
                mzqc.QualityMetric(cvRef="QC",
                                   accession="QC:0000000",
                                   name=metric_name,
                                   value=metric_value))

    if matched_ranks:
        # occurrences per match value; values that did not occur count as 0
        mdl: Dict[int, int] = defaultdict(int)
        uniq, counts = np.unique(np.array(matched_ranks), return_counts=True)
        mdl.update(zip(uniq, counts))
        metrics.append(
            mzqc.QualityMetric(cvRef="QC",
                               accession="QC:0000000",
                               name="Match/semi/none counts for all lower rank identifications.",
                               value=[mdl[2], mdl[1], mdl[0]]))

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Missed cleavages",
                           value=enzymematch_tab))

    q1, q2, q3, s, m, ol = utils.extractDistributionStats(np.array(enzymematch_tab['missed']))
    top_stats = [
        ("Q1, Q2, Q3 of missed clevage counts for top identifications.", [q1, q2, q3]),
        ("Sigma of missed clevage counts for top identifications.", s),
        ("Mean of missed clevage counts for top identifications.", m),
        ("Missed clevage count for top identifications +/-1.5*IQR outlier", ol),
    ]
    for metric_name, metric_value in top_stats:
        metrics.append(
            mzqc.QualityMetric(cvRef="QC",
                               accession="QC:0000000",
                               name=metric_name,
                               value=metric_value))

    return metrics
def getSNMetrics(spectrum_acquisition_metrics_MS: mzqc.QualityMetric, ms_level: int) -> List[mzqc.QualityMetric]:
    """
    getSNMetrics derives the S/N distribution QC metrics from the spectrum acquisition proto-metric of one MS level

    The 'SN' values of the given proto-metric are summarised as Quartiles, standard deviation, mean,
    and the values lying more than 1.5*IQR outside of the Q1 to Q3 range.

    Parameters
    ----------
    spectrum_acquisition_metrics_MS : mzqc.QualityMetric
        QualityMetric object with the spectrum acquisition metrics, of which the 'SN' values are read
    ms_level : int
        The MS level to which the given spectrum acquisition metrics belong to, used for the metric names

    Returns
    -------
    List[mzqc.QualityMetric]
        A list of new QualityMetric objects for mzQC deposition
    """
    sn_values = np.array(spectrum_acquisition_metrics_MS.value['SN'])
    quartiles = np.quantile(sn_values, [.25, .5, .75])

    # outlier fences 1.5*IQR below Q1 and above Q3
    first_quartile, third_quartile = quartiles[0], quartiles[2]
    low_out = first_quartile - (1.5 * (third_quartile - first_quartile))
    high_out = third_quartile + (1.5 * (third_quartile - first_quartile))
    is_outlier = (sn_values < low_out) | (sn_values > high_out)

    named_values = [
        ("Signal-to-noise ratio Q1, Q2, Q3 for MS level {ms_level} collection", list(quartiles)),
        ("Signal-to-noise ratio sigma for MS level {ms_level} collection", np.std(sn_values)),
        ("Signal-to-noise ratio mean for MS level {ms_level} collection", np.mean(sn_values)),
        ("Signal-to-noise ratio +/-1.5*IQR outlier for MS level {ms_level} collection",
         np.extract(is_outlier, sn_values)),
    ]

    metrics: List[mzqc.QualityMetric] = [
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name=name_template.format(ms_level=ms_level),
                           value=metric_value)
        for name_template, metric_value in named_values
    ]
    return metrics
def _identifiedFraction(identified: int, total: int) -> float:
    """Return identified/total, or NaN if there is nothing to take a fraction of."""
    return identified / total if total else np.nan


def getIdentifiedSignalMetrics(tandem_spectrum_metrics_MS2: mzqc.QualityMetric,
                               spectrum_acquisition_metrics_MS1: mzqc.QualityMetric,
                               identification_accuracy_metrics: mzqc.QualityMetric,
                               tic_table: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    getIdentifiedSignalMetrics calculate metrics on the proportions of recorded signal identified.

    The metrics calculated include the median ratio of max survey scan intensity over sampled precursor intensity
    for peptides identified, the fractions of identified MS2 in the Quartile ranges of the max survey scan
    intensity, median SN for MS1 spectra in RT range in which the first half of peptides are identified, and
    median TIC value of RT range in which half of peptides are identified. Tandem spectra count as identified
    if their precursor m/z equals the m/z of an identification.

    Parameters
    ----------
    tandem_spectrum_metrics_MS2 : mzqc.QualityMetric
        The proto-metrics on tandem spectra containing 'RT', 'precursor_mz', 'precursor_intensity', 'surveyscan_intensity_sum', 'surveyscan_intensity_max' values.
    spectrum_acquisition_metrics_MS1 : mzqc.QualityMetric
        The proto-metrics on MS1 spectra containing 'RT' and 'SN' values
    identification_accuracy_metrics : mzqc.QualityMetric
        The proto-metrics on identification accuracies containing 'RT' and 'MZ' values
    tic_table : mzqc.QualityMetric
        The proto-metrics on total ion current intensities containing 'RT' and 'int'

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics; empty (after a warning) if no tandem spectrum could be matched to an identification.
    """
    import warnings

    metrics: List[mzqc.QualityMetric] = list()

    # rows: 0 RT, 1 precursor m/z, 2 precursor intensity, 3 survey scan intensity sum, 4 survey scan intensity max
    np_prec = np.array([tandem_spectrum_metrics_MS2.value['RT'],
                        tandem_spectrum_metrics_MS2.value['precursor_mz'],
                        tandem_spectrum_metrics_MS2.value['precursor_intensity'],
                        tandem_spectrum_metrics_MS2.value['surveyscan_intensity_sum'],
                        tandem_spectrum_metrics_MS2.value['surveyscan_intensity_max']])

    id_coord = np.array([identification_accuracy_metrics.value['RT'], identification_accuracy_metrics.value['MZ']])
    # TODO make sure intersection is round-proof
    intersected = np.intersect1d(np_prec[1], id_coord[1], assume_unique=False, return_indices=True)
    np_id = np_prec[:, intersected[1]]  # the tandem spectra with an identification

    if np_id.shape[1] == 0:
        warnings.warn("No tandem spectra matching the identifications, ignoring identified signal metrics.", Warning)
        return metrics

    # DS-3A reimpl.: median( (surv max / prec int) for all ident. prec )
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Median ratio of max survey scan intensity over sampled precursor intensity for peptides identified",
                   value=np.median(np_id[4] / np_id[2]))
    )

    # MS1-3A reimpl.: Ratio of 95th over 5th percentile MS1 maximum intensity values for identified peptides (approximates dynamic range of signal)
    p05, p95 = np.quantile(np_id[4], [.05, .95])
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Ratio of 95th over 5th percentile of precursor intensity for identified peptides",
                   value=p95 / p05)
    )

    # Quartiles by MS1 maximum intensity
    # Fraction of identified MS2 Spectra within
    #   MS2-4A : 0 and Q1
    #   MS2-4B : Q1 and Q2
    #   MS2-4C : Q2 and Q3
    #   MS2-4D : above Q3
    q1, q2, q3 = np.quantile(np_prec[4], [.25, .5, .75])

    tandem_upto_q1 = np.count_nonzero(np_prec[4] < q1)
    id_upto_q1 = np.count_nonzero(np_id[4] < q1)

    tandem_between_q1q2 = np.count_nonzero((q1 < np_prec[4]) & (np_prec[4] < q2))
    id_between_q1q2 = np.count_nonzero((q1 < np_id[4]) & (np_id[4] < q2))

    tandem_between_q2q3 = np.count_nonzero((q2 < np_prec[4]) & (np_prec[4] < q3))
    id_between_q2q3 = np.count_nonzero((q2 < np_id[4]) & (np_id[4] < q3))

    tandem_above_q3 = np.count_nonzero(q3 < np_prec[4])
    id_above_q3 = np.count_nonzero(q3 < np_id[4])

    # A fraction of identified spectra is identified over all tandem spectra of the range;
    # ranges without any tandem spectrum yield NaN.
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Fraction of identified MS2 below Q1 of precursor intensity.",
                   value=_identifiedFraction(id_upto_q1, tandem_upto_q1))
    )
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Fraction of identified MS2 between Q1 and Q2 of precursor intensity.",
                   value=_identifiedFraction(id_between_q1q2, tandem_between_q1q2))
    )
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Fraction of identified MS2 between Q2 and Q3 of precursor intensity.",
                   value=_identifiedFraction(id_between_q2q3, tandem_between_q2q3))
    )
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Fraction of identified MS2 above Q3 of precursor intensity.",
                   value=_identifiedFraction(id_above_q3, tandem_above_q3))
    )

    # MS1-3B reimpl.: Median maximum MS1 value for identified peptides
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Median precursor intensity of identified MS2",
                   value=np.median(np_id[4]))
    )

    # MS1-2A Median SN for MS1 spectra in RT range in which half (which half???) of peptides are identified
    np_id = np_id[:, np_id[0].argsort()]  # from here on sorted by RT
    msn = np.array([spectrum_acquisition_metrics_MS1.value['RT'],
                    spectrum_acquisition_metrics_MS1.value['SN']])
    median_id_rt = np.quantile(np_id[0], [.5])[0]

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Median SN for MS1 spectra in RT range in which the first half of peptides are identified",
                   value=np.median(msn[:, msn[0] < median_id_rt][1]))
    )

    # the smallest rt range which contains half of all identified Spectra
    # (at least one spectrum, a single identification would otherwise give an empty range to search)
    half_id_size = max(int(np.round(np_id.shape[1] / 2)), 1)
    # the last (half_id_size) many - the first (half_id_size) many
    all_diff = np_id[:, -1 * half_id_size:][0] - np_id[:, :half_id_size][0]
    min_start_index = np.argmin(all_diff)
    min_stop_index = min_start_index + half_id_size - 1
    densest_start_rt = np_id[0, min_start_index]
    densest_stop_rt = np_id[0, min_stop_index]

    densest = np.median(msn[:, (densest_start_rt < msn[0]) & (msn[0] < densest_stop_rt)][1])
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Median SN for MS1 spectra in densest RT range in which any half of peptides are identified",
                   value=densest)
    )

    # is median tic value really meaningful? My guess is ratio of tic sum of RT half identified and rest is a better indicator (>1: most signal is in the most exlained region)
    # Median TIC value of RT range in which half of peptides are identified
    np_tic = np.array([tic_table.value['RT'],
                       tic_table.value['int']])
    densest_id_tic = np.median(np_tic[:, (densest_start_rt < np_tic[0]) & (np_tic[0] < densest_stop_rt)][1])

    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Median TIC value of RT range in which half of peptides are identified",
                   value=np.median(np_tic[:, np_tic[0] < median_id_rt][1]))
    )
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Median TIC value of densest RT range in which any half of peptides are identified",
                   value=densest_id_tic)
    )

    return metrics
def getMQMetrics(target_raw: str, params: pandas.DataFrame, evidence: pandas.DataFrame, ms2num: int = 0) -> List[mzqc.QualityMetric]:
    """
    getMQMetrics calculates id based QC metrics from MaxQuant results as close as possible to the way they are calculated from regular id files.

    For a given raw file (name), the respective results are extracted from dataframes derived off the parameters and evidence files from a
    MaxQuant result (of potentially multiple raw files combined analysis). As many metrics similar or equal to those dependent of regular id files
    are calculated. Only evidence rows of the raw file that carry an MS/MS scan number are considered.

    Parameters
    ----------
    target_raw : str
        The name of the raw file (as per MaxQuant usage without file type extension)
    params : pandas.DataFrame
        Dataframe with data from the parameters result file as produced by MaxQuant and stratified column names
    evidence : pandas.DataFrame
        Dataframe with data from the evidence result file as produced by MaxQuant and stratified column names
    ms2num : int, optional
        The total number of tandem spectra as from the id-free metrics, by default 0. Without a positive
        number, the "Identification to tandem spectra ratio" metric is left out (with a warning).

    Returns
    -------
    List[mzqc.QualityMetric]
        A list of QualityMetrics close to what is calculated from a regular id-based QC calculation;
        empty (after a warning) if the evidence does not contain the raw file.
    """
    if target_raw not in evidence['raw file'].unique():
        warnings.warn("Raw file not found in the given evidence, ignoring MaxQuant metrics.", Warning)
        return list()

    mq_metrics: List[mzqc.QualityMetric] = list()

    # https://stackoverflow.com/questions/17071871/how-to-select-rows-from-a-dataframe-based-on-column-values
    target_mq = evidence.loc[(evidence['raw file'] == target_raw) & (evidence['ms/ms scan number'].notnull())]

    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Sequence database name",
                           value=params.loc['fasta file']['value']))

    proteins = len(target_mq['leading proteins'].unique())
    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Total number of identified proteins",
                           value=proteins))

    # "Total number of PSM" is not available from the MaxQuant evidence

    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Total number of identified peptide spectra",
                           value=len(target_mq)))

    peptides = len(target_mq['sequence'].unique())
    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Total number identified unique peptide sequences",
                           value=peptides))

    # The ontology is only consulted to warn about a score type it does not know;
    # the score values themselves are keyed by the score type name below.
    score_type = "Andromeda:score"
    psims = utils.obtainOntology("psi-ms")

    name_indexed = {psims[x].name: psims[x] for x in psims}
    score_indexed = {
        x.name: x
        for x in chain(psims['MS:1001143'].subclasses(),
                       psims['MS:1001153'].subclasses(),
                       psims['MS:1002347'].subclasses(),
                       psims['MS:1002363'].subclasses())
    }

    if score_type not in name_indexed:
        warnings.warn(
            "OBO does not contain any entry matching the identification score, proceed at own risk.",
            Warning)
    elif score_type not in score_indexed:
        warnings.warn(
            "Score type does not correspond to a score type in the OBO, proceed at own risk.",
            Warning)

    identification_scoring_metrics = target_mq[['retention time', 'charge', 'score']].rename(
        columns={
            'retention time': 'RT',
            'charge': 'c',
            'score': score_type
        }).to_dict(orient='list')
    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Identification scoring metric values",
                           value=identification_scoring_metrics))

    # TODO comparison column with qccalculator dppm values
    # TODO RT/native id?
    identification_accuracy_metrics = target_mq[['ms/ms m/z', 'mass error [ppm]', 'uncalibrated mass error [da]']]\
        .rename(columns={'ms/ms m/z': 'MZ', 'mass error [ppm]': 'delta_ppm', 'uncalibrated mass error [da]': 'abs_error'})
    identification_accuracy_metrics['abs_error'] = identification_accuracy_metrics['abs_error'].abs()
    identification_accuracy_metrics = identification_accuracy_metrics.to_dict(orient='list')
    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Identifications accuracy metric values",
                           value=identification_accuracy_metrics))

    hydrophobicity_metrics = target_mq[['retention time', 'sequence']].rename(
        columns={
            'retention time': 'RT',
            'sequence': 'peptide'
        })
    hydrophobicity_metrics['gravy'] = hydrophobicity_metrics['peptide'].apply(
        lambda x: ProtParam.ProteinAnalysis(x).gravy())
    hydrophobicity_metrics = hydrophobicity_metrics[['RT', 'gravy']].to_dict(orient='list')
    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Hydrophobicity metric values",
                           value=hydrophobicity_metrics))

    # TODO target/decoy info available??
    identification_sequence_metrics = target_mq[['sequence', 'retention time', 'ms/ms scan number']].rename(
        columns={
            'sequence': 'peptide',
            'retention time': 'RT',
            'ms/ms scan number': 'native_id'
        }).to_dict(orient='list')
    mq_metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                           accession="QC:0000000",
                           name="Identifications sequence metric values",
                           value=identification_sequence_metrics))

    ## simple id metrics
    # The default ms2num of 0 means the number of tandem spectra is not known; no ratio can be given then.
    if ms2num > 0:
        mq_metrics.append(
            mzqc.QualityMetric(cvRef="QC",
                               accession="QC:0000000",
                               name="Identification to tandem spectra ratio",
                               value=float(len(target_mq)) / float(ms2num)))
    else:
        warnings.warn(
            "No number of tandem spectra given, ignoring identification to tandem spectra ratio metric.",
            Warning)

    return mq_metrics
def describeIdentifiedPrecursorIntensity(tandem_spectrum_metrics_MS2: mzqc.QualityMetric,
                                         identification_accuracy_metrics: mzqc.QualityMetric) -> List[mzqc.QualityMetric]:
    """
    describeIdentifiedPrecursorIntensity calculates the descriptive statistics metrics for precursor intensities of identified tandem spectra.

    From the proto-metrics on identification accuracies and tandem spectra, the function calculates descriptive statistics metrics on the
    precursor intensities from all identified tandem spectra. Namely, min and max, mean, standard deviation, Quartiles, and 1.5*IQR outliers.
    Tandem spectra count as identified if their precursor m/z equals the m/z of an identification.

    Parameters
    ----------
    tandem_spectrum_metrics_MS2 : mzqc.QualityMetric
        The proto-metrics on tandem spectra containing 'RT', 'precursor_mz', 'precursor_intensity', 'surveyscan_intensity_sum', 'surveyscan_intensity_max' values.
    identification_accuracy_metrics : mzqc.QualityMetric
        The proto-metrics on identification accuracies containing 'RT' and 'MZ' values

    Returns
    -------
    List[mzqc.QualityMetric]
        The list of metrics; empty (after a warning) if no tandem spectrum could be matched to an identification.
    """
    import warnings

    metrics: List[mzqc.QualityMetric] = list()

    # rows: 0 RT, 1 precursor m/z, 2 precursor intensity, 3 survey scan intensity sum, 4 survey scan intensity max
    np_prec = np.array([tandem_spectrum_metrics_MS2.value['RT'],
                        tandem_spectrum_metrics_MS2.value['precursor_mz'],
                        tandem_spectrum_metrics_MS2.value['precursor_intensity'],
                        tandem_spectrum_metrics_MS2.value['surveyscan_intensity_sum'],
                        tandem_spectrum_metrics_MS2.value['surveyscan_intensity_max']])

    id_coord = np.array([identification_accuracy_metrics.value['RT'], identification_accuracy_metrics.value['MZ']])
    # TODO make sure intersection is round-proof
    intersected = np.intersect1d(np_prec[1], id_coord[1], assume_unique=False, return_indices=True)
    np_id = np_prec[:, intersected[1]]  # the tandem spectra with an identification

    arr = np_id[2]
    if arr.size == 0:
        warnings.warn("No tandem spectra matching the identifications, ignoring identified precursor intensity metrics.", Warning)
        return metrics

    q1, q2, q3, s, m, ol = utils.extractDistributionStats(arr)

    # The builtin max/min give results depending on the value order as soon as a NaN is involved,
    # so the extremes are taken over the values that are not NaN.
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Maximum identified precursor intensity",
                   value=np.nanmax(arr))
    )
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Minmum identified precursor intensity",
                   value=np.nanmin(arr))
    )
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Q1, Q2, Q3 of identified precursor intensities",
                   value=[q1, q2, q3])
    )
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Sigma of identified precursor intensities",
                   value=s)
    )
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Mean of identified precursor intensities",
                   value=m)
    )
    metrics.append(mzqc.QualityMetric(cvRef="QC",
                   accession="QC:0000000",
                   name="Precursor identified intensity +/-1.5*IQR outlier",
                   value=ol)
    )

    return metrics