def read_ms1_experiment(filepath):
    """Load a file and return a new experiment holding only its MS level 1 spectra.

    Arguments:
        filepath: path of the file to load, as ``str`` or ``bytes``.

    Returns:
        A fresh ``MSExperiment`` to which every spectrum whose
        ``getMSLevel()`` equals 1 has been added, in iteration order.
    """
    # `loadExperiment()` wants bytes, so encode anything that is not bytes yet
    if isinstance(filepath, bytes):
        path_as_bytes = filepath
    else:
        path_as_bytes = filepath.encode()

    loaded = MSExperiment()
    FileHandler().loadExperiment(path_as_bytes, loaded)

    ms1_only = MSExperiment()
    for candidate in loaded:
        if candidate.getMSLevel() != 1:
            continue
        ms1_only.addSpectrum(candidate)
    return ms1_only
Beispiel #2
0
 def _read_mzml(self, mzml_filename):
     """Load an mzML file and return its spectra converted by ``_build_spectrum``.

     Spectra for which ``self._build_spectrum`` returns ``None`` are dropped.
     Every kept spectrum gets a ``'filename'`` entry (base name of
     ``mzml_filename``) merged into its ``meta`` mapping.
     """
     from pyopenms import MzMLFile, MSExperiment
     loaded = MSExperiment()
     MzMLFile().load(mzml_filename, loaded)
     converted_spectra = []
     for raw_spectrum in loaded:
         converted = self._build_spectrum(raw_spectrum)
         if converted is None:
             continue
         file_info = {'filename': basename(mzml_filename)}
         converted.meta.update(file_info)
         converted_spectra.append(converted)
     return converted_spectra
Beispiel #3
0
def read_sample(input_file, expt_='ecoli', mode='pos'):
    """Build a ``Sample`` from the MS1 spectra of an mzML file.

    Arguments:
        input_file: path of the mzML file to load.
        expt_: first argument handed to ``Sample``.
        mode: second argument handed to ``Sample``.

    Returns:
        The ``Sample`` with ``scans``, ``retention_index`` and
        ``retention_time`` filled in. Scan numbers are positions in the whole
        file (all MS levels are counted), but only MS level 1 spectra are kept.
    """
    sample = Sample(expt_, mode, input_file)
    experiment = MSExperiment()
    MzMLFile().load(input_file, experiment)
    print(input_file, experiment.getNrSpectra())

    ms1_scans = []
    for position, spectrum in enumerate(experiment):
        # only MS1 is handled here; other levels still advance `position`
        if spectrum.getMSLevel() != 1:
            continue
        retention = spectrum.getRT()
        mz_values, raw_intensities = spectrum.get_peaks()
        # ints and tuples keep the stored scans compact
        int_intensities = tuple(int(value) for value in raw_intensities)
        # Scan fields: 'scan_number', 'retention_time', 'mz_values', 'intensity_values'
        ms1_scans.append(
            Scan(position, retention, tuple(mz_values), int_intensities))

    sample.scans = ms1_scans
    sample.retention_index = tuple(scan.scan_number for scan in ms1_scans)
    sample.retention_time = tuple(scan.retention_time for scan in ms1_scans)
    return sample
Beispiel #4
0
def readms(file_path):
    """
    Read mzXML, mzML and mzData files.

    Only MS level 1 spectra are used. Peaks with zero intensity are dropped
    and the remaining peaks of each spectrum are ordered by decreasing
    intensity.

    Arguments:
        file_path: string
            path to the dataset locally; the loader is chosen from the file
            extension (compared case-insensitively)

    Returns:
        Tuple (m/z, intensity, retention time, mean interval of retention time).
        m/z and intensity are lists holding one Numpy array per MS1 spectrum,
        retention time is a list with one entry per MS1 spectrum, and the mean
        interval is 0.0 when fewer than two MS1 spectra were found.

    Raises:
        ValueError: if the extension is not .mzXML, .mzML or .mzData.

    Examples:
        >>> from hpic.fileio import readms
        >>> ms,intensity,rt,rt_mean_interval = readms("MM14_20um.mzxml")
    """
    ms_format = os.path.splitext(file_path)[1].lower()
    # Pick the loader before allocating anything so that an unsupported file
    # fails fast with a specific exception type.
    if ms_format == '.mzxml':
        loader = MzXMLFile()
    elif ms_format == '.mzml':
        loader = MzMLFile()
    elif ms_format == '.mzdata':
        loader = MzDataFile()
    else:
        raise ValueError('ERROR: %s is wrong format' % file_path)

    msdata = MSExperiment()
    # str() keeps path-like objects working, as the old '%s' formatting did
    loader.load(str(file_path), msdata)

    m_s = []
    intensity = []
    r_t = []
    for spectrum in msdata:
        if spectrum.getMSLevel() != 1:
            continue
        r_t.append(spectrum.getRT())
        p_ms = []
        p_intensity = []
        for peak in spectrum:
            peak_intensity = peak.getIntensity()
            if peak_intensity != 0:
                p_ms.append(peak.getMZ())
                p_intensity.append(peak_intensity)
        # argsort of the negated values yields a descending-intensity order
        ms_index = np.argsort(np.negative(p_intensity))
        m_s.append(np.array(p_ms)[ms_index])
        intensity.append(np.array(p_intensity)[ms_index])

    rt1 = np.array(r_t)
    if rt1.shape[0] > 1:
        rt_mean_interval = np.mean(np.diff(rt1))
    else:
        rt_mean_interval = 0.0
    return m_s, intensity, r_t, rt_mean_interval
Beispiel #5
0
def getBasicQuality(exp: oms.MSExperiment, verbose: bool=False) -> mzqc.RunQuality:
    """
    getBasicQuality calculates the basic QualityMetrics from a mass spectrometry peak file and creates the related RunQuality object.

    Calculated basic QC metrics and proto-metrics necessary to calculate more elaborate QC metrics with additional data (e.g. ID).

    Parameters
    ----------
    exp : oms.MSExperiment
        The mass spectrometry peak file to calculate metrics from
    verbose : bool, optional
        switches on verbose logging, by default False
        (NOTE(review): not referenced anywhere in this function body)

    Returns
    -------
    mzqc.RunQuality
        A RunQuality object containing the list of metrics calculated and metadata collected, ready for integration into a mzQC file object.

    Notes
    -----
    ``exp.sortSpectra()`` is called on the passed-in experiment before the
    metrics are collected (presumably an in-place sort - confirm against the
    pyopenms documentation if callers rely on the original order).
    All metrics are emitted with the placeholder accession ``QC:0000000``.
    """
    metrics: List[mzqc.QualityMetric] = list()
    # Parent (source) file details. NOTE(review): these three locals are not
    # used further down in this function - see the TODO in the metadata block.
    if exp.getExperimentalSettings().getSourceFiles():
        parent_base_name: str = basename(exp.getExperimentalSettings().getSourceFiles()[0].getNameOfFile())
        parent_chksm: str = exp.getExperimentalSettings().getSourceFiles()[0].getChecksum()
        parent_chksm_type: str = exp.getExperimentalSettings().getSourceFiles()[0].getChecksumType()

    # Falls back to 'unknown' when the instrument carries no serial number meta value
    instr_srl: str = exp.getInstrument().getMetaValue('instrument serial number') \
        if exp.getInstrument().metaValueExists('instrument serial number') else 'unknown'  # MS:1000529 in mzML

    input_loc: str = exp.getExperimentalSettings().getLoadedFilePath()
    base_name: str = basename(input_loc)
    # Checksum is computed from the file at the loaded path, not taken from the file's metadata
    chksm: str = utils.sha256fromfile(exp.getExperimentalSettings().getLoadedFilePath())
    cmpltn: str = exp.getDateTime().get()
    # strt:datetime.datetime = datetime.datetime.strptime(cmpltn, '%Y-%m-%d %H:%M:%S') - datetime.timedelta(seconds=exp.getChromatograms()[0][exp.getChromatograms()[0].size()-1].getRT()*60)

    # Metadata: one input file (always declared as mzML format) plus the analysis software
    meta: mzqc.MetaDataParameters = mzqc.MetaDataParameters(
        inputFiles=[
            mzqc.InputFile(name=base_name,location=input_loc,
                        fileFormat=mzqc.CvParameter("MS", "MS:1000584", "mzML format"),
                        fileProperties=[
                            mzqc.CvParameter(cvRef="MS",
                                accession="MS:1000747",
                                name="completion time",
                                value=cmpltn
                            ),
                            mzqc.CvParameter(cvRef="MS",
                                accession="MS:1000569",
                                name="SHA-256",
                                value=chksm
                            ),
                            mzqc.CvParameter(cvRef="MS",
                                accession="MS:1000031",
                                name="instrument model",
                                value=exp.getInstrument().getName()
                            ),
                            mzqc.CvParameter(cvRef="MS",
                                accession="MS:1000529",
                                name="instrument serial number",
                                value=instr_srl
                            )
                            # TODO integrate parent location and checksum
                            # id: MS:1002846 (Associated raw file URI) N.B. definition is PRIDE specific
                            # fitting checksum cv missing
                        ]
            )
        ],
        analysisSoftware=[
            mzqc.AnalysisSoftware(cvRef="MS", accession="MS:1000752", name="TOPP software", version=oms.__version__, uri="openms.de")
        ]
    )

    # this is mighty important to sort by RT
    # (the rank and survey-scan bookkeeping below relies on each MS2 following its MS1)
    exp.sortSpectra()

    # Precursor m/z range over all MS2 spectra.
    # NOTE(review): min_mz stays at sys.maxsize when there is no MS2 spectrum.
    min_mz: float = sys.maxsize
    max_mz: float = 0
    mslevelcounts: Dict[int,int] = defaultdict(int)

    # Column-oriented tables: each key is a column name, each list holds one value per spectrum
    spectrum_acquisition_metrics_MS1: Dict[str,List[Any]] = defaultdict(list)
    spectrum_acquisition_metrics_MS2: Dict[str,List[Any]] = defaultdict(list)
    # NOTE(review): spectrum_topn is never filled in this function; it is reported empty below
    spectrum_topn: Dict[str,List[Any]] = defaultdict(list)
    tandem_spectrum_metrics_MS2: Dict[str,List[Any]] = defaultdict(list)
    trap_metrics_MS1: Dict[str,List[Any]] = defaultdict(list)
    trap_metrics_MS2: Dict[str,List[Any]] = defaultdict(list)
    isolation_window_metrics: Dict[str,List[Any]] = defaultdict(list)
    tic_tab: Dict[str,List[Any]] = defaultdict(list)

    # ActivationMethod look-up dict
    # (maps every int-valued attribute of ActivationMethod back to its attribute name)
    ams = {getattr(ActivationMethod,i): i for i in dir(ActivationMethod) if type(getattr(ActivationMethod,i))==int }

    # NOTE(review): `np.float` was removed in NumPy 1.24. Local annotations are
    # not evaluated at runtime, but type checkers will reject this one.
    intens_sum: np.float = 0
    last_surveyscan_index:int = 0
    for spin, spec in enumerate(exp):
        mslevelcounts[spec.getMSLevel()] += 1

        iontraptime = utils.getTrapTime(spec)
        # NOTE(review): max()/min() presumably raise on a spectrum without
        # peaks (NumPy reductions of empty arrays do) - confirm and guard.
        # intens_min is not used afterwards.
        intens_max = spec.get_peaks()[1].max()
        intens_min = spec.get_peaks()[1].min()
        intens_sum = spec.get_peaks()[1].sum()

        if spec.getMSLevel() == 1:
            # Remember the most recent survey scan for the MS2 spectra that follow it
            last_surveyscan_index = spin
            last_surveyscan_intensity = intens_sum
            last_surveyscan_max = intens_max

            spectrum_acquisition_metrics_MS1['RT'].append(spec.getRT())
            spectrum_acquisition_metrics_MS1['SN'].append(noiseqc.getSN_medianmethod(spec))
            spectrum_acquisition_metrics_MS1['peakcount'].append(spec.size())
            spectrum_acquisition_metrics_MS1['int'].append(intens_sum.item())  # .item() for dtype to pytype

            trap_metrics_MS1['RT'].append(spec.getRT())
            trap_metrics_MS1['traptime'].append(iontraptime)

            tic_tab['RT'].append(spec.getRT())
            # NOTE(review): appended without .item(), unlike the 'int' columns above
            tic_tab['int'].append(intens_sum)

        if (spec.getMSLevel() == 2):
            # NOTE(review): everything below indexes getPrecursors()[0] and so
            # assumes every MS2 spectrum has at least one precursor.
            if (spec.getPrecursors()[0].getMZ() < min_mz):
                min_mz = spec.getPrecursors()[0].getMZ()
            if (spec.getPrecursors()[0].getMZ() > max_mz):
                max_mz = spec.getPrecursors()[0].getMZ()

            spectrum_acquisition_metrics_MS2['RT'].append(spec.getRT())
            spectrum_acquisition_metrics_MS2['SN'].append(noiseqc.getSN_medianmethod(spec))
            spectrum_acquisition_metrics_MS2['peakcount'].append(spec.size())
            spectrum_acquisition_metrics_MS2['int'].append(intens_sum.item())  # .item() for dtype to pytype
            spectrum_acquisition_metrics_MS2['native_id'].append(utils.spec_native_id(spec))

            # Rank = number of spectra since the last MS1 spectrum
            rank = spin - last_surveyscan_index
            spectrum_acquisition_metrics_MS2['rank'].append(rank)

            trap_metrics_MS2['RT'].append(spec.getRT())
            trap_metrics_MS2['traptime'].append(iontraptime)
            # First activation method of the first precursor, 'unknown' if none is listed or it is not in `ams`
            trap_metrics_MS2['activation_method'].append(ams.get(next(iter(spec.getPrecursors()[0].getActivationMethods()), None),'unknown'))
            # -1 marks a missing collision energy
            trap_metrics_MS2['activation_energy'].append(spec.getPrecursors()[0].getMetaValue('collision energy') if \
                spec.getPrecursors()[0].metaValueExists('collision energy') else -1)

            # Locate the precursor in the last survey scan: searchsorted gives the
            # insertion position, i.e. the first survey peak with m/z >= precursor m/z
            # (assumes the survey scan's m/z array is sorted ascending - TODO confirm).
            # A position equal to the peak count means no such peak exists.
            precursor_index = np.searchsorted(exp[last_surveyscan_index].get_peaks()[0], [exp[spin].getPrecursors()[0].getMZ()])[0]
            if precursor_index != np.array(exp[last_surveyscan_index].get_peaks()).shape[1]:
                precursor_err = spec.getPrecursors()[0].getMZ() - np.array(exp[last_surveyscan_index].get_peaks())[:,precursor_index][0]
                precursor_int = np.array(exp[last_surveyscan_index].get_peaks())[:,precursor_index][1]
            else:
                precursor_err = np.nan
                precursor_int = np.nan

            tandem_spectrum_metrics_MS2['RT'].append(spec.getRT())
            tandem_spectrum_metrics_MS2['precursor_intensity'].append(precursor_int)  # TODO different from mzid->mzml getPrecursors[0].getIntensity() ? YES, latter one usually zero
            tandem_spectrum_metrics_MS2['precursor_error'].append(precursor_err)
            tandem_spectrum_metrics_MS2['precursor_mz'].append(spec.getPrecursors()[0].getMZ())
            tandem_spectrum_metrics_MS2['precursor_c'].append(spec.getPrecursors()[0].getCharge())

            # NOTE(review): last_surveyscan_intensity / last_surveyscan_max are only
            # assigned in the MS1 branch; an MS2 spectrum before any MS1 spectrum
            # raises UnboundLocalError here.
            tandem_spectrum_metrics_MS2['surveyscan_intensity_sum'].append(last_surveyscan_intensity)
            tandem_spectrum_metrics_MS2['surveyscan_intensity_max'].append(last_surveyscan_max)

            isolation_window_metrics['RT'].append(spec.getRT())
            isolation_window_metrics['isolation_target'].append(spec.getPrecursors()[0].getMZ())  # https://github.com/OpenMS/OpenMS/blob/d17cc251fd0c4068eb253b03c9fb107897771fdc/src/openms/source/FORMAT/HANDLERS/MzMLHandler.cpp#L1992
            isolation_window_metrics['isolation_lower'].append(spec.getPrecursors()[0].getIsolationWindowLowerOffset())
            isolation_window_metrics['isolation_upper'].append(spec.getPrecursors()[0].getIsolationWindowUpperOffset())
            # Absolute m/z bounds of the isolation window
            lower = spec.getPrecursors()[0].getMZ() - spec.getPrecursors()[0].getIsolationWindowLowerOffset()
            upper = spec.getPrecursors()[0].getMZ() + spec.getPrecursors()[0].getIsolationWindowUpperOffset()

            # (m/z, intensity) rows of the last survey scan, restricted to the isolation window.
            # NOTE(review): for a survey scan without peaks `s` has shape (1, 0)
            # and s[:, 0] raises IndexError.
            s = np.array([(i.getMZ(),i.getIntensity()) for i in exp[last_surveyscan_index]], ndmin = 2)
            s = s[np.where(np.logical_and(s[:, 0]>=lower, s[:, 0]<=upper))[0]]
            isolation_window_metrics['peaks_in_window'].append(np.shape(s)[0])

            # Row indices of `s` ordered by decreasing intensity
            int_sort_desc = np.flip(np.argsort(s[:,1]))
            if np.shape(s)[0] > 1:
                # NOTE(review): indexing binds tighter than `/`, so this divides all
                # but the weakest intensity by the single second-highest intensity
                # and appends an array - not pairwise ratios as the trailing
                # comment suggests. Confirm the intended formula.
                isolation_window_metrics['int_ratio_ranked_peaks_in_window'].append(
                    s[int_sort_desc][:-1,1]/s[int_sort_desc][1:,1][0])  # intensity ratio between top1&2, 2&3, ...
            else:
                isolation_window_metrics['int_ratio_ranked_peaks_in_window'].append(0)  # bigger is better, though best is 0

            isolation_window_metrics['summed_window_intensity'].append(np.sum(s[int_sort_desc][:,1]))
            isolation_window_metrics['isolation_target_intensity'].append(spec.getPrecursors()[0].getIntensity())

            # TODO this needs to go outside
            # Absolute m/z tolerance chosen from the analyzer named in the filter string (default 0.5)
            tol = 0.5
            if spec.metaValueExists('filter string'):
                if 'FTMS' in spec.getMetaValue('filter string'):
                    tol = 0.05
                elif 'ITMS' in spec.getMetaValue('filter string'):
                    tol = 0.5
                elif 'QTOF' in spec.getMetaValue('filter string'):  #TOFMS, SQMS, TQMS, SectorMS
                    tol = 0.1

            # ms2 peaks directly from isolation window?
            # True when any survey peak in the window has an MS2 peak within `tol`; stored as a string
            unfragmented = np.any([np.isclose(i[0],[x.getMZ() for x in spec], atol=tol) for i in s])
            isolation_window_metrics['peaks_in_window_in_ms2'].append(str(unfragmented))

    ## Spectra detail numbers
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Spectrum acquisition metric values - MS1",
                value=spectrum_acquisition_metrics_MS1)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Spectrum acquisition metric values - MS2",
                value=spectrum_acquisition_metrics_MS2)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Spectra topn ranks",
                value=spectrum_topn)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Tandem spectrum metric values - MS2",
                value=tandem_spectrum_metrics_MS2)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Trap metric values - MS1",
                value=trap_metrics_MS1)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Trap metric values - MS2",
                value=trap_metrics_MS2)
    )
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="isolation window metrics",
                value=isolation_window_metrics)
    )

    ## Spectra numbers
    # One metric per MS level that occurred in the run
    for levels in mslevelcounts.keys():
        metrics.append(
            mzqc.QualityMetric(cvRef="QC",
                    accession="QC:0000000",
                    name="Number of MS{l} spectra".format(l=str(levels)),
                    value=mslevelcounts[levels])
        )

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Number of chromatograms",
                value=len(exp.getChromatograms()))
    )

    ## Ranges
    # NOTE(review): despite the metric name, this is the range of MS2 precursor m/z values
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="MZ aquisition range",
                value=[min_mz,max_mz])
    )

    # First and last spectrum after the RT sort above.
    # NOTE(review): indexes exp[0], so presumably fails on an experiment without spectra.
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="RT aquisition range",
                value=[exp[0].getRT(),exp[exp.size()-1].getRT()])
    )

    # TIC
    # (summed MS1 intensities collected in the spectrum loop)
    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Total ion current",
                value=tic_tab)
    )

    # Chrom
    # Copy the first total-ion-current chromatogram stored in the file; the
    # `break` sits inside the `if`, so later TIC chromatograms are ignored.
    chrom_tab: Dict[str,List[Any]] = defaultdict(list)
    chroms = exp.getChromatograms()
    for t in chroms:
      if t.getChromatogramType() == oms.ChromatogramSettings.ChromatogramType.TOTAL_ION_CURRENT_CHROMATOGRAM:
        for chro_peak in t:
            chrom_tab['RT'].append(chro_peak.getRT())
            chrom_tab['int'].append(chro_peak.getIntensity())
        break

    metrics.append(
        mzqc.QualityMetric(cvRef="QC",
                accession="QC:0000000",
                name="Chromatogram",
                value=chrom_tab)
    )
    # TODO is there a difference between TIC as defined in MS:1000235 and the chromatogram you get from TRP?? In MZML it says its a MS:1000235 (ion current detected in each of a series of mass spectra) but is it?
    # TODO consider collection of spectrum_native_id
    return mzqc.RunQuality(metadata=meta, qualityMetrics=metrics)
Beispiel #6
0
    def test001(self):
        """Load a short MS2 data file and exercise the Spectrum / PeakMap API on it."""
        experiment = MSExperiment()
        data_file = "SHORT_MS2_FILE.mzData"
        test_dir = os.path.dirname(os.path.abspath(__file__))
        data_path = os.path.join(test_dir, "data", data_file)
        FileHandler().loadExperiment(data_path, experiment)
        assert experiment.size() > 0

        # turn the first spectrum into an MS2 spectrum with a synthetic precursor
        precursor = Precursor()
        precursor.setMZ(1.0)
        precursor.setIntensity(100)
        first_raw = experiment[0]
        first_raw.setPrecursors([precursor])
        first_raw.setMSLevel(2)
        converted = Spectrum.fromMSSpectrum(first_raw)
        polarity_settings = InstrumentSettings()
        polarity_settings.setPolarity(IonSource.Polarity.POSITIVE)
        first_raw.setInstrumentSettings(polarity_settings)

        self.compare_specs(converted, first_raw)

        # conversion must survive a round trip
        round_tripped = Spectrum.fromMSSpectrum(converted.toMSSpectrum())
        self.compare_specs(round_tripped, first_raw)

        peakmap = PeakMap.fromMSExperiment(experiment)
        assert os.path.basename(peakmap.meta["source"]) == data_file

        # level 1 peaks, with explicit and with defaulted rt limits
        rt_low, rt_high = peakmap.rtRange(None)
        peaks_full = peakmap.msNPeaks(1, rt_low, rt_high)
        assert peaks_full.shape == (1797, 2), peaks_full.shape

        peaks_upper_only = peakmap.msNPeaks(1, rtmax=rt_high)
        assert np.all(peaks_full == peaks_upper_only)

        peaks_lower_only = peakmap.msNPeaks(1, rtmin=0)
        assert np.all(peaks_full == peaks_lower_only)

        head_spec = peakmap.spectra[0]
        assert len(list(head_spec)) == len(head_spec)  # calls iter

        rts = peakmap.allRts()
        assert rts[0] <= peakmap.rtRange()[0]
        assert rts[-1] >= peakmap.rtRange()[1]
        assert len(rts) == 41, len(rts)

        # spectra selected by ms level, explicit upper level
        only_l1 = peakmap.levelNSpecs(1, 1)
        only_l2 = peakmap.levelNSpecs(2, 2)
        l1_and_l2 = peakmap.levelNSpecs(1, 2)
        assert len(only_l1) > 0
        assert len(only_l2) > 0
        assert len(only_l1) + len(only_l2) == len(l1_and_l2) == len(peakmap)
        assert only_l1[0].msLevel == 1
        assert only_l2[0].msLevel == 2

        # use default arg: nmax = nmin if not provided:
        only_l1 = peakmap.levelNSpecs(1)
        only_l2 = peakmap.levelNSpecs(2)
        assert len(only_l1) > 0
        assert len(only_l2) > 0
        assert len(only_l1) + len(only_l2) == len(l1_and_l2) == len(peakmap)
        assert only_l1[0].msLevel == 1
        assert only_l2[0].msLevel == 2

        level_one_rts = peakmap.levelOneRts()
        assert len(level_one_rts) == len(only_l1)

        self.compare_exp(peakmap, experiment, data_file)
        rebuilt = PeakMap.fromMSExperiment(peakmap.toMSExperiment())
        self.compare_exp(rebuilt, experiment, data_file)

        # cutting just inside the rt limits drops one spectrum at each end
        trimmed = peakmap.extract(rtmin=rt_low + .000001)
        assert len(trimmed) == len(peakmap) - 1
        trimmed = trimmed.extract(rtmax=rt_high - 0.000001)
        assert len(trimmed) == len(peakmap) - 2

        mz_low, mz_high = peakmap.mzRange(2)

        assert mz_low < 250
        assert mz_high > 860

        mz_low, mz_high = peakmap.mzRange(1)

        assert mz_low >= 700
        assert mz_high <= 1050

        mz_cut = peakmap.extract(rt_low + 0.00001, mzmin=300)

        cut_low, cut_high = mz_cut.mzRange()
        assert cut_low >= 300
        assert cut_high == mz_high

        mz_cut = peakmap.extract(rtmin=rt_low + 0.000001, mzmin=300, mzmax=1000)
        cut_low, cut_high = mz_cut.mzRange()
        assert cut_low >= 300
        assert cut_high <= 1000

        # peaksInRange without any limit is expected to fail
        with pytest.raises(Exception):
            peakmap.spectra[0].peaksInRange()

        below_max = peakmap.spectra[0].peaksInRange(mzmax=10000)
        above_min = peakmap.spectra[0].peaksInRange(mzmin=0)
        assert np.all(below_max == above_min)

        all_specs = list(peakmap.spectra)
        in_wide_range = peakmap.specsInRange(0, 99999)
        up_to_first = peakmap.specsInRange(0, all_specs[0].rt)
        from_last = peakmap.specsInRange(all_specs[-1].rt, 999999)

        assert all_specs == in_wide_range
        assert up_to_first == [all_specs[0]]
        assert from_last == [all_specs[-1]]

        peakmap.spectra[0].polarity = "+"
        peakmap.spectra[1].polarity = "-"

        peakmap = PeakMap(peakmap.spectra)
        representing_mz = peakmap.representingMzPeak(0, 99999, 0, 99999)
        assert abs(representing_mz - 831.86538) < 0.0001
Beispiel #7
0
    def test001(self):
        """Load a short MS2 data file and exercise the Spectrum / PeakMap API on it."""
        experiment = MSExperiment()
        data_file = "SHORT_MS2_FILE.mzData"
        # path is relative to the current working directory
        FileHandler().loadExperiment(os.path.join("data", data_file), experiment)
        assert experiment.size() > 0

        # turn the first spectrum into an MS2 spectrum with a synthetic precursor
        precursor = Precursor()
        precursor.setMZ(1.0)
        precursor.setIntensity(100)
        first_raw = experiment[0]
        first_raw.setPrecursors([precursor])
        first_raw.setMSLevel(2)
        converted = Spectrum.fromMSSpectrum(first_raw)
        polarity_settings = InstrumentSettings()
        polarity_settings.setPolarity(IonSource.Polarity.POSITIVE)
        first_raw.setInstrumentSettings(polarity_settings)

        self.compare_specs(converted, first_raw)

        # conversion must survive a round trip
        round_tripped = Spectrum.fromMSSpectrum(converted.toMSSpectrum())
        self.compare_specs(round_tripped, first_raw)

        peakmap = PeakMap.fromMSExperiment(experiment)
        assert os.path.basename(peakmap.meta["source"]) == data_file

        # level 1 peaks, with explicit and with defaulted rt limits
        rt_low, rt_high = peakmap.rtRange()
        peaks_full = peakmap.msNPeaks(1, rt_low, rt_high)
        assert peaks_full.shape == (1797, 2), peaks_full.shape

        peaks_upper_only = peakmap.msNPeaks(1, rtmax=rt_high)
        assert np.all(peaks_full == peaks_upper_only)

        peaks_lower_only = peakmap.msNPeaks(1, rtmin=0)
        assert np.all(peaks_full == peaks_lower_only)

        head_spec = peakmap.spectra[0]
        assert len(list(head_spec)) == len(head_spec)  # calls iter

        rts = peakmap.allRts()
        assert (rts[0], rts[-1]) == peakmap.rtRange()
        assert len(rts) == 41, len(rts)

        # spectra selected by ms level, explicit upper level
        only_l1 = peakmap.levelNSpecs(1, 1)
        only_l2 = peakmap.levelNSpecs(2, 2)
        l1_and_l2 = peakmap.levelNSpecs(1, 2)
        assert len(only_l1) > 0
        assert len(only_l2) > 0
        assert len(only_l1) + len(only_l2) == len(l1_and_l2) == len(peakmap)
        assert only_l1[0].msLevel == 1
        assert only_l2[0].msLevel == 2

        # use default arg: nmax = nmin if not provided:
        only_l1 = peakmap.levelNSpecs(1)
        only_l2 = peakmap.levelNSpecs(2)
        assert len(only_l1) > 0
        assert len(only_l2) > 0
        assert len(only_l1) + len(only_l2) == len(l1_and_l2) == len(peakmap)
        assert only_l1[0].msLevel == 1
        assert only_l2[0].msLevel == 2

        level_one_rts = peakmap.levelOneRts()
        assert len(level_one_rts) == len(only_l1)

        self.compare_exp(peakmap, experiment, data_file)
        rebuilt = PeakMap.fromMSExperiment(peakmap.toMSExperiment())
        self.compare_exp(rebuilt, experiment, data_file)

        # cutting just inside the rt limits drops one spectrum at each end
        trimmed = peakmap.extract(rtmin=rt_low + .000001)
        assert len(trimmed) == len(peakmap) - 1
        trimmed = trimmed.extract(rtmax=rt_high - 0.000001)
        assert len(trimmed) == len(peakmap) - 2

        mz_low, mz_high = peakmap.mzRange()

        assert mz_low < 250
        assert mz_high > 1049

        mz_cut = peakmap.extract(rt_low + 0.00001, mzmin=300)

        cut_low, cut_high = mz_cut.mzRange()
        assert cut_low >= 300
        assert cut_high == mz_high

        mz_cut = peakmap.extract(rtmin=rt_low + 0.000001, mzmin=300, mzmax=1000)
        cut_low, cut_high = mz_cut.mzRange()
        assert cut_low >= 300
        assert cut_high <= 1000

        # peaksInRange without any limit is handed to the exception helper
        exceptionwrapper(lambda: peakmap.spectra[0].peaksInRange())

        below_max = peakmap.spectra[0].peaksInRange(mzmax=10000)
        above_min = peakmap.spectra[0].peaksInRange(mzmin=0)
        assert np.all(below_max == above_min)

        all_specs = peakmap.spectra[:]
        in_wide_range = peakmap.specsInRange(0, 99999)
        up_to_first = peakmap.specsInRange(0, all_specs[0].rt)
        from_last = peakmap.specsInRange(all_specs[-1].rt, 999999)

        assert all_specs == in_wide_range
        assert up_to_first == [all_specs[0]]
        assert from_last == [all_specs[-1]]

        peakmap.spectra[0].polarity = "+"
        peakmap.spectra[1].polarity = "-"

        peakmap = PeakMap(peakmap.spectra)
        representing_mz = peakmap.representingMzPeak(0, 99999, 0, 99999)
        assert abs(representing_mz - 831.86538) < 0.00001
Beispiel #8
0
def write_mzml(file_name, spectra, title_prefix='Spectrum ', output_dir='./'):
    """Write spectra as MS2 scans to an mzML file and patch the resulting XML.

    The spectra are first stored with ``MzMLFile().store``; the file is then
    re-parsed with ElementTree to set ids, add a source file entry, an
    instrument configuration, per-spectrum scan windows, a centroid marker and
    a total-ion-current chromatogram, and is finally written back in place.

    Arguments:
        file_name: output file name; '.mzML' is appended when '.mzml' does
            not occur in it (case-insensitive).
        spectra: iterable of dicts. Keys read here: 'spectrum' (m/z values),
            'precursor_mass', and optionally 'abundance' (defaults to 500 per
            peak), 'name' (defaults to ``title_prefix`` + index) and
            'precursor_charge' (defaults to 2).
        title_prefix: prefix for generated spectrum ids.
        output_dir: directory the file is written to.

    Returns:
        The path of the written file.
    """
    # Local import: the module-level imports are not guaranteed to include os.
    import os.path

    if '.mzml' not in file_name.lower():
        file_name += '.mzML'

    # join() also works when output_dir has no trailing separator
    output_file = os.path.join(output_dir, file_name)

    exp = MSExperiment()
    ids = {}
    tic = []
    for sp_count, spectrum in enumerate(spectra):
        spec = MSSpectrum()
        spec.setMSLevel(2)
        ids[sp_count] = (spectrum['name'] if 'name' in spectrum
                         else title_prefix + str(sp_count))
        if 'abundance' not in spectrum:
            print('oops')
            abundances = [500 for _ in spectrum['spectrum']]
        else:
            abundances = spectrum['abundance']
        tic.append(sum(abundances))
        spec.set_peaks([spectrum['spectrum'], abundances])
        prec = Precursor()
        # use the provided precursor charge, falling back to 2
        prec.setCharge(spectrum['precursor_charge']
                       if 'precursor_charge' in spectrum else 2)
        prec.setMZ(spectrum['precursor_mass'])
        spec.setPrecursors([prec])
        spec.sortByPosition()
        exp.addSpectrum(spec)

    MzMLFile().store(output_file, exp)

    # load it back and fix it
    ns = '{http://psi.hupo.org/ms/mzml}'
    tree = ET.parse(output_file)
    mzml = tree.find(ns + 'mzML')
    # add an id to the mzml
    mzml.set('id', 'testSpectraFileFixed')
    mzml.set('xmlns', "http://psi.hupo.org/ms/mzml")
    mzml.set('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance')
    # remove some info from the mzml header
    del mzml.attrib['accession']

    # add file description stuff to the fileDescription tag
    fileDescription = mzml.find(ns + 'fileDescription')
    sourceFileElement = ET.fromstring(
        '<sourceFileList count="1"> <sourceFile id="NOD2_E3.mzXML" name="NOD2_E3.mzXML" location="file:///C:/Users/zachmcgrath/Downloads"> <cvParam cvRef="MS" accession="MS:1000776" name="scan number only nativeID format" value=""/> <cvParam cvRef="MS" accession="MS:1000566" name="ISB mzXML format" value=""/> <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="89ecb0dd31ca3a2fdf5ef2c4f5341f6e5e9f06f0"/> </sourceFile> </sourceFileList>'
    )
    fileDescription.append(sourceFileElement)

    instrumentConfigurationList = mzml.find(ns + 'instrumentConfigurationList')
    instrumentConfiguration = ET.fromstring(
        ' <instrumentConfiguration id="IC1"> <componentList count="3"> <source order="1"> <userParam name="msIonisation" value="HPLC-Chip/MS"/>  </source> <analyzer order="1"> <userParam name="msMassAnalyzer" value="Q-TOF"/>  </analyzer> <detector order="1"> <userParam name="msDetector" value="ADC"/> </detector> </componentList> </instrumentConfiguration>'
    )
    # NOTE(review): the previous `for child in ...: del child` loop only
    # unbound the loop variable and removed nothing, so the configurations
    # written by MzMLFile have always been kept. That behavior is preserved;
    # actually removing them could orphan references elsewhere in the file.
    instrumentConfigurationList.append(instrumentConfiguration)

    run = mzml.find(ns + 'run')
    spectrumList = run.find(ns + 'spectrumList')
    print(len(spectrumList))
    # Iterate over a snapshot: removing from the live element while iterating
    # skips the following spectrum and shifts `i` away from the `ids` index.
    for i, spectrumElement in enumerate(list(spectrumList)):
        # set the id
        spectrumElement.set('id', ids[i])

        pl = spectrumElement.find(ns + 'precursorList')
        bdal = spectrumElement.find(ns + 'binaryDataArrayList')
        if bdal is None or pl is None:
            print(spectrumElement)
            spectrumList.remove(spectrumElement)
            # nothing more to patch on a spectrum that is no longer in the file
            continue

        # add the range. Not good for MSConvert for whatever reason
        scanListEl = spectrumElement.find(ns + 'scanList')
        # scanWindowList belongs to <scan> itself, not to <scan>'s first child
        scanEl = scanListEl.find(ns + 'scan')
        scanWindowListElement = ET.fromstring(
            '<scanWindowList count="1"> <scanWindow> <cvParam cvRef="MS" accession="MS:1000501" value="0" name="scan window lower limit" unitAccession="MS:1000040" unitName="m/z" unitCvRef="MS" /> <cvParam cvRef="MS" accession="MS:1000500" value="10000" name="scan window upper limit" unitAccession="MS:1000040" unitName="m/z" unitCvRef="MS" /> </scanWindow></scanWindowList>'
        )
        scanEl.append(scanWindowListElement)

        # <cvParam cvRef="MS" accession="MS:1000127" name="centroid spectrum" value=""/>
        centroidElement = ET.Element(ns + 'cvParam',
                                     attrib={
                                         'cvRef': 'MS',
                                         'name': 'centroid spectrum',
                                         'accession': 'MS:1000127',
                                         'value': ''
                                     })
        spectrumElement.append(centroidElement)

    # TIC chromatogram: x axis is the 1-based spectrum index, y axis the summed
    # abundance. Values are packed as little-endian doubles because the arrays
    # are declared "64-bit float", and the base64 output is decoded to text so
    # that no bytes repr (b'...') ends up inside <binary>.
    n_points = len(tic)

    def _encode(values):
        packed = struct.pack(f'<{n_points}d', *values)
        return base64.b64encode(packed).decode('ascii')

    timearray = _encode(range(1, n_points + 1))
    ticarray = _encode(tic)

    chromatogramListElmentString = f' <chromatogramList count="1" defaultDataProcessingRef="mzLibProcessing"> <chromatogram id="TIC" index="0" defaultArrayLength="{n_points}" dataProcessingRef="mzLibProcessing"> <cvParam cvRef="MS" accession="MS:1000235" value="" name="total ion current chromatogram" /> <binaryDataArrayList count="2"> <binaryDataArray encodedLength="{len(timearray)}"> <cvParam cvRef="MS" accession="MS:1000523" value="" name="64-bit float" /> <cvParam cvRef="MS" accession="MS:1000576" value="" name="no compression" /> <cvParam cvRef="MS" accession="MS:1000595" value="" name="time array" unitAccession="UO:0000031" unitName="Minutes" unitCvRef="UO" /> <binary>{timearray}</binary> </binaryDataArray> <binaryDataArray encodedLength="{len(ticarray)}"> <cvParam cvRef="MS" accession="MS:1000523" value="" name="64-bit float" /> <cvParam cvRef="MS" accession="MS:1000576" value="" name="no compression" /> <cvParam cvRef="MS" accession="MS:1000515" value="" name="intensity array" unitAccession="MS:1000131" unitName="number of counts" unitCvRef="MS" /> <binary>{ticarray}</binary> </binaryDataArray> </binaryDataArrayList> </chromatogram> </chromatogramList>'
    chromatogramListElment = ET.fromstring(chromatogramListElmentString)
    run.append(chromatogramListElment)

    tree.write(output_file, encoding='utf-8', xml_declaration=True)

    return output_file
Beispiel #9
0
def create(in_file="",
           in_featureXML="",
           spec_out="spectrum.png",
           label_out="labels.png",
           feat_out="features.png",
           hdf5_out="data.h5",
           width=1024,
           height=1024,
           rt_step_size=None,
           mz_step_size=None,
           save_png=False):
    """Create HDF5 datasets (and optionally .png images) for spectra and labels.

    Spectra are read from an mzML file whereas labels and bounding boxes are
    read from a featureXML file. Labels image is not generated by OpenMS's
    ImageCreator.

    Three datasets are written to ``hdf5_out``: ``/data`` (interpolated
    spectrum grid, carrying the ``rt_min``, ``rt_max``, ``rt_step_size``,
    ``mz_min``, ``mz_max`` and ``mz_step_size`` attributes), ``/label`` (dot
    labels) and ``/feature`` (bounding-box vectors).

    Arguments:
        in_file: file name of input spectrum file (.mzML)
        in_featureXML: file name of input features file (.featureXML)
        spec_out: name for output spectrum image
        label_out: name for output label image
        feat_out: name for output features image
        hdf5_out: name for HDF5 file
        width: Number of pixels in m/z dimension
                if 0, width defined w.r.t. mz_step_size (default: 1024, min: 0)
        height: Number of pixels in r/t dimension
                if 0, height defined w.r.t rt_step_size (default: 1024, min: 0)
        rt_step_size: step_size for discrete r/t values,
                      used only if height is 0 (default 1); for any other
                      height it is recomputed from the r/t range
        mz_step_size: step_size for discrete m/z values,
                      used only if width is 0 (default 0.1); for any other
                      width it is recomputed from the m/z range
        save_png: if False, png images will not be saved

    Raises:
        AttributeError: if an input file name is empty, an output name has
            the wrong extension, or width/height is negative.
    """
    if in_file == "":
        raise AttributeError('Input mzML file not specified')

    if in_featureXML == "":
        raise AttributeError('Input featureXML file not specified.')

    # Image outputs are checked in the same order as before so that the first
    # offending argument reported stays the same.
    for arg_name, arg_value in (('spec_out', spec_out),
                                ('feat_out', feat_out),
                                ('label_out', label_out)):
        if not arg_value.endswith('.png'):
            raise AttributeError(arg_name + ' should have .png format')

    if not hdf5_out.endswith('.h5'):
        raise AttributeError('hdf5_out should have .h5 format')

    if width < 0:
        raise AttributeError('width should be >= 0')

    if height < 0:
        raise AttributeError('height should be >= 0')

    # NOTE: every print below takes a single pre-formatted string so that the
    # emitted text is the same under Python 2 and Python 3.

    # load mzML file
    print("Reading {} ...".format(in_file))
    exp = MSExperiment()
    MzMLFile().load(in_file, exp)
    print("Number of Spectra = " + str(exp.size()))

    # load featureXML
    print("Reading {} ...".format(in_featureXML))
    feat_map = FeatureMap()
    FeatureXMLFile().load(in_featureXML, feat_map)
    print("number of peptide features = " + str(feat_map.size()))

    # create Spectrum Dictionary and store ranges:
    exp.updateRanges(1)
    max_rt, max_mz = exp.getMax()
    min_rt, min_mz = exp.getMin()
    spectrum_dict = _create_spec_dict(exp)

    # create Features Dictionary and store ranges
    feature_dict = _create_feat_dict(feat_map)
    # The feature ranges are read but do not influence the grid: its extent
    # is taken from the spectrum ranges alone.
    feat_min_rt, feat_min_mz = feat_map.getMin()
    feat_max_rt, feat_max_mz = feat_map.getMax()
    feat_rt_range = (feat_min_rt, feat_max_rt)
    feat_mz_range = (feat_min_mz, feat_max_mz)

    # set height according to rt range
    if height == 0:
        if rt_step_size is None:
            rt_step_size = 1
        height = int((max_rt - min_rt) / rt_step_size)
    else:
        rt_step_size = float(max_rt - min_rt) / height

    # set width according to mz range
    if width == 0:
        if mz_step_size is None:
            mz_step_size = 0.1
        width = int((max_mz - min_mz) / mz_step_size)
    else:
        mz_step_size = float(max_mz - min_mz) / width

    # Extract and grid peak data from MSExperiment
    # Perform Bilinear Interpolation for spectra
    bilip = _interpolation(exp)

    # array for storing specta values using Bilinear Interpolation
    spec_arr = _create_data_array(min_rt, max_rt, min_mz, max_mz, height,
                                  width)
    spec_arr = _fill_spec_array(spec_arr, bilip, spectrum_dict)
    # flip the array along rows so that the image resembles TOPPView plot
    spec_arr[1:, :] = np.flip(spec_arr[1:, :], 0)
    if save_png is True:
        print("Saving spectra image {} ...".format(spec_out))
        spec_arr = _save_spectrum_image(spec_arr, spec_out)

    print("Saving '/data' group in  {} ...".format(hdf5_out))
    # Mode 'a' is what h5py used when no mode was given before version 3;
    # newer releases default to read-only, which cannot create datasets.
    # The context manager closes the file even if a later step raises.
    with h5py.File(hdf5_out, 'a') as outfile:
        # Row 0 and column 0 of the working arrays are left out of every
        # stored dataset.
        data = outfile.create_dataset("/data",
                                      spec_arr[1:, 1:].shape,
                                      chunks=True,
                                      compression="gzip",
                                      dtype=np.float32,
                                      data=spec_arr[1:, 1:])
        spec_arr = None  # free memory used by spec_arr

        data.attrs['rt_min'] = min_rt
        data.attrs['rt_max'] = max_rt
        data.attrs['rt_step_size'] = rt_step_size
        data.attrs['mz_min'] = min_mz
        data.attrs['mz_max'] = max_mz
        data.attrs['mz_step_size'] = mz_step_size

        # labels array
        dot_arr = _create_data_array(min_rt, max_rt, min_mz, max_mz, height,
                                     width)
        dot_arr = _insert_dot_values(feature_dict, dot_arr)

        if save_png is True:
            # array for storing feature values for png image;
            # bbox rectangle array contains dot values too
            bbox_arr = np.copy(dot_arr)
            bbox_arr = _insert_rect_values(feature_dict, bbox_arr)
            bbox_arr[1:, :] = np.flip(bbox_arr[1:, :], 0)
            print("Saving features image {} ...".format(feat_out))
            bbox_arr = _save_features_image(bbox_arr, feat_out)
            bbox_arr = None

        # array for storing feature as vectors; must be built before dot_arr
        # is flipped because it uses dot_arr for index lookup
        bbox_vec = _create_bbox_vectors(feature_dict, dot_arr, mz_step_size,
                                        rt_step_size)

        dot_arr[1:, :] = np.flip(dot_arr[1:, :], 0)
        if save_png is True:
            print("Saving labels image {} ...".format(label_out))
            dot_arr = _save_labels_image(dot_arr, label_out)

        print("Saving '/label' group in  {} ...".format(hdf5_out))
        outfile.create_dataset("/label",
                               dot_arr[1:, 1:].shape,
                               chunks=True,
                               compression="gzip",
                               dtype=np.float32,
                               data=dot_arr[1:, 1:])
        dot_arr = None

        bbox_vec = np.flip(bbox_vec, axis=1)
        print("Saving '/feature' group in {} ...".format(hdf5_out))
        outfile.create_dataset("/feature",
                               bbox_vec.shape,
                               chunks=True,
                               compression="gzip",
                               dtype=np.float32,
                               data=bbox_vec)
        bbox_vec = None
Beispiel #10
0
    def store(self, ofname: str, peakMap: pyopenms.MSExperiment):
        """Write the MS level 2 spectra of peakMap to the text file ofname.

        The file starts with a block of header lines, followed by one record
        per MS2 spectrum: a scan line, info lines, a charge line and one
        ``m/z intensity`` line per peak. Header, info and charge lines are
        formatted by ``self._writeValue``; scan identifiers are derived from
        each spectrum's native ID via ``self._getScan``.

        Arguments:
            ofname: path of the file to write (overwritten if it exists).
            peakMap: experiment to export. It is sorted in place (and its
                ranges updated) when ``isSorted()`` reports it unsorted.

        Notes:
            The first and last spectrum are read for the header, so an empty
            peakMap is not supported, and every MS2 spectrum is expected to
            carry at least one precursor (only the first one is used).
        """
        #sort peakMap if necessary
        if not peakMap.isSorted():
            peakMap.sortSpectra()
            peakMap.updateRanges()

        firstSpectrum = peakMap.getSpectrum(0)
        firstScan = self._getScan(firstSpectrum.getNativeID().decode('utf-8'))
        typeCode = firstSpectrum.getType()
        if typeCode == 1:
            dataType = 'Centroid'
        elif typeCode == 2:
            dataType = 'Profile'
        else:
            dataType = 'Unknown'
        lastScan = self._getScan(
            peakMap.getSpectrum(peakMap.getNrSpectra() -
                                1).getNativeID().decode('utf-8'))
        precursorFile = basename(ofname).replace('ms2', 'ms1')

        # header entries, in the order they appear in the file
        headerValues = [
            ('CreationDate',
             datetime.datetime.now().strftime('%m/%d/%Y %I:%M:%S %p')),
            ('Extractor', 'msConvert'),
            ('ExtractorVersion', '0.1'),
            ('Comments', 'msConvert was written by Aaron Maurais, 2019'),
            ('ExtractorOptions', 'MS2'),
            ('AcquisitionMethod', 'Data-Dependent'),
            ('InstrumentType', 'Unknown'),
            ('ScanType', 'MS2'),
            ('DataType', dataType),
            ('FirstScan', firstScan),
            ('LastScan', lastScan),
        ]

        # The context manager guarantees the file is flushed and closed, also
        # when a spectrum fails to serialise part-way through.
        with open(ofname, 'w') as outF:
            #print header
            for key, value in headerValues:
                outF.write(self._writeValue(MS2File._h_tag, key, value))

            #iterate through spectra
            preScan = 'Unknown'
            for scan in peakMap.getSpectra():
                msLevel = scan.getMSLevel()
                if msLevel == 1:
                    # remember the most recent MS1 scan; it is reported as the
                    # precursor scan of the MS2 spectra that follow it
                    preScan = self._getScan(scan.getNativeID().decode('utf-8'))
                elif msLevel == 2:
                    #write header info
                    curScan = self._getScan(scan.getNativeID().decode('utf-8'))
                    precursors = scan.getPrecursors()
                    preCharge = int(precursors[0].getCharge())
                    preMZ = precursors[0].getMZ()

                    #print scan line
                    outF.write('{}\t{}\t{}\t{}\n'.format(MS2File._s_tag,
                                                         curScan.zfill(6),
                                                         curScan.zfill(6),
                                                         preMZ))

                    ameth = ' '.join([
                        MS2File._activationMethods[x]
                        for x in list(precursors[0].getActivationMethods())
                    ])
                    if not ameth:
                        ameth = 'Unknown'

                    #print scan info
                    infoValues = [
                        ('RetTime', scan.getRT()),
                        ('PrecursorInt', precursors[0].getIntensity()),
                        ('IonInjectionTime', 'Unknown'),
                        ('ActivationType', ameth),
                        ('PrecursorFile', precursorFile),
                        ('PrecursorScan', preScan),
                        ('InstrumentType', 'Unknown'),
                    ]
                    for key, value in infoValues:
                        outF.write(
                            self._writeValue(MS2File._i_tag, key, value))

                    #write z line
                    #after charge, the M+H m/z for the ion is listed, so calculate that here
                    outF.write(
                        self._writeValue(
                            MS2File._z_tag, preCharge,
                            (float(preMZ) * preCharge) -
                            (preCharge * MS2File._H_mass) + MS2File._H_mass))

                    #write ions
                    for ion in scan:
                        outF.write('{0:.4f} {1:.1f}\n'.format(
                            round(ion.getMZ(), 4),
                            round(ion.getIntensity(), 1)))
from pyopenms import MSExperiment, MzXMLFile
import pandas as pd

# Locations of the full-size source recordings (absolute, machine-specific).
labview_file = "/home/james/Downloads/20200124_TP_2.csv"
mzXML_file = "/home/james/Downloads/20200124_17645.mzXML"

# ---------------------------------------------------------------------------
# Step 1: write a truncated mzXML file that is small enough for the repo
# ---------------------------------------------------------------------------
n_spectra_to_keep = 20

exp = MSExperiment()
MzXMLFile().load(mzXML_file, exp)
print(f"Full size of experiment {len(exp.getSpectra())}")

# Keep only the leading spectra, in their original order; a shorter
# experiment is kept whole.
spec = exp.getSpectra()[:n_spectra_to_keep]
exp.setSpectra(spec)

print(f"New size of experiment {len(exp.getSpectra())}")
MzXMLFile().store("test.mzXML", exp)

# ---------------------------------------------------------------------------
# Step 2: read the LabView CSV and shift its time column to start at zero
# ---------------------------------------------------------------------------
cols = ["time", "b", "temp", "d", "e", "f", "g", "h"]
df = pd.read_csv(labview_file, names=cols)
time_origin = df["time"][0]
df["time"] -= time_origin
Beispiel #12
0
    def load(self, ifname: str, peakMap: pyopenms.MSExperiment):
        """Read ``BEGIN IONS`` / ``END IONS`` peak lists from ifname into peakMap.

        Each ``BEGIN IONS`` section becomes one ``pyopenms.MSSpectrum`` with MS
        level 2 and a single ``pyopenms.Precursor``. Inside a section, lines
        of the form ``KEY=value`` (upper-case key) are interpreted as follows;
        any other key is ignored:

            TITLE: comma-separated elements; an element starting with
                ``scan=`` or ``scan_`` followed by digits sets the native ID
                to ``scan=<digits>``.
            PEPMASS: precursor m/z, optionally followed by its intensity.
            CHARGE: digits with an optional trailing ``+`` or ``-``; the sign
                is not stored.
            RTINSECONDS: stored as the spectrum retention time.

        The first line starting with a digit begins the peak list, which runs
        up to ``END IONS``; each peak line holds m/z and intensity separated
        by whitespace. Blank lines are skipped, and a section that reaches
        ``END IONS`` without any peak line yields an empty spectrum.

        Arguments:
            ifname: path of the text file to read.
            peakMap: experiment whose spectra are replaced by the parsed ones.

        Raises:
            ValueError: if a number cannot be parsed or CHARGE is malformed.
            AssertionError: if PEPMASS has more than two values or a peak line
                does not have exactly two values.
        """
        # Read everything up front; the context manager closes the handle.
        with open(ifname, 'r') as inF:
            lines = inF.read().splitlines()

        curLine = 0
        nLines = len(lines)

        #generate spectrum list
        spectraList = list()
        while curLine < nLines:
            if lines[curLine] == 'BEGIN IONS':
                spectrum = pyopenms.MSSpectrum()
                spectrum.setMSLevel(2)
                precursor = pyopenms.Precursor()

                curLine += 1
                while curLine < nLines:
                    line = lines[curLine]

                    if line == 'END IONS':
                        # section closed before (or without) a peak list
                        break

                    if line and line[0].isalpha():
                        match = re.search(r'^([A-Z]+)=(.+)$', line)
                        # alphabetic lines that are not KEY=value are ignored
                        key = match.group(1) if match else None

                        if key == 'TITLE':
                            for s in match.group(2).split(','):
                                scanMatch = re.search(r'^scan[_=]([0-9]+)', s)
                                if scanMatch:
                                    spectrum.setNativeID('scan={}'.format(
                                        scanMatch.group(1)))

                        elif key == 'PEPMASS':
                            # split() tolerates tabs and repeated spaces
                            preMZ = [float(x) for x in match.group(2).split()]
                            assert (len(preMZ) <= 2)
                            precursor.setMZ(preMZ[0])
                            if len(preMZ) > 1:
                                precursor.setIntensity(preMZ[1])

                        elif key == 'CHARGE':
                            chargeMatch = re.search(r'^([0-9]+)[+-]?$',
                                                    match.group(2))
                            if chargeMatch is None:
                                raise ValueError(
                                    'Unsupported CHARGE value: {!r}'.format(
                                        match.group(2)))
                            precursor.setCharge(int(chargeMatch.group(1)))

                        elif key == 'RTINSECONDS':
                            spectrum.setRT(float(match.group(2)))

                    elif line and line[0].isnumeric():
                        # peak list: consume everything up to END IONS
                        while curLine < nLines and lines[curLine] != 'END IONS':
                            fields = lines[curLine].split()
                            if fields:  # skip blank lines between peaks
                                ion = [float(x) for x in fields]
                                assert (len(ion) == 2)
                                ion_temp = pyopenms.Peak1D()
                                ion_temp.setMZ(ion[0])
                                ion_temp.setIntensity(ion[1])
                                spectrum.push_back(ion_temp)
                            curLine += 1

                        break

                    curLine += 1
                spectrum.setPrecursors([precursor])
                spectraList.append(spectrum)

            curLine += 1

        peakMap.setSpectra(spectraList)
Beispiel #13
0
def combine_experiments(exp1: ms.MSExperiment, exp2: ms.MSExperiment) -> None:
    """Append every spectrum of exp2 to exp1, in index order.

    exp1 is modified in place; exp2 is only read. The number of spectra to
    copy is determined once, before the first spectrum is appended.
    """
    source_indices = range(exp2.getNrSpectra())
    # map() fetches each spectrum lazily, right before it is appended.
    for spectrum in map(exp2.getSpectrum, source_indices):
        exp1.addSpectrum(spectrum)
Beispiel #14
0
def find_spectral_groups(input_file,
                         MS1_round_precision=2,
                         min_match_peaks=1,
                         ms2_tol=0.1,
                         score_thresh=0.8,
                         verbose=False):
    """Group the non-MS1 spectra of an mzML file into clusters.

    Spectra are first bucketed by their precursor m/z rounded to
    ``MS1_round_precision`` decimals. Within a bucket, each spectrum is scored
    against every existing cluster with ``fast_cosine``; it joins the
    best-scoring cluster when that score reaches ``score_thresh`` and starts
    a new cluster otherwise. Spectra are processed in file order.

    Arguments:
        input_file: path of the mzML file to read.
        MS1_round_precision: number of decimals used when rounding the
            precursor m/z to form the bucket key.
        min_match_peaks: passed through to ``fast_cosine``.
        ms2_tol: passed through to ``fast_cosine``.
        score_thresh: minimum score required to join an existing cluster.
        verbose: if True, print a progress line after every 100 clustered
            spectra.

    Returns:
        dict mapping rounded precursor m/z to the list of ``Cluster`` objects
        created for that bucket.

    Notes:
        Only the first precursor of a spectrum is used; a non-MS1 spectrum
        without precursors raises IndexError.
    """
    from pyopenms import MSExperiment, MzMLFile
    from mnet import Spectrum, Cluster
    from scoring_functions import fast_cosine

    print("Finding groups of spectra in {}".format(input_file))
    exp = MSExperiment()
    MzMLFile().load(input_file, exp)

    # dictionary, key is precursor mz, value is a list of clusters
    clusters = {}

    # loop over spectra
    cl_id = 0
    n_spec = 0
    biggest_cluster_size = 0
    for ind, spectrum in enumerate(exp):
        if spectrum.getMSLevel() == 1:
            continue

        raw_precursormz = spectrum.getPrecursors()[0].getMZ()
        precursormz = round(raw_precursormz, MS1_round_precision)
        bucket = clusters.setdefault(precursormz, [])

        # make a Spectrum object; the peaks are materialised as a list so
        # they can be traversed more than once
        peak_data = spectrum.get_peaks()
        peaks = list(zip(peak_data[0], peak_data[1]))
        s = Spectrum(peaks,
                     input_file,
                     ind,
                     None,
                     raw_precursormz,
                     raw_precursormz,
                     rt=spectrum.getRT())

        # compute the similarity between the spectrum and each existing
        # cluster of this bucket (none for a fresh bucket)
        best_score = -1
        best_pos = None
        for pos, cl in enumerate(bucket):
            sc, _ = fast_cosine(s, cl, ms2_tol, min_match_peaks)
            if sc > best_score:
                best_score = sc
                best_pos = pos

        if best_pos is not None and best_score >= score_thresh:
            # join the best-scoring cluster (not merely the last one scored)
            best_cluster = bucket[best_pos]
            best_cluster.add_spectrum(s)
            if best_cluster.n_spectra > biggest_cluster_size:
                biggest_cluster_size = best_cluster.n_spectra
        else:
            # no cluster is similar enough (or none exists yet): start one
            new_cluster = Cluster(s, cl_id)
            cl_id += 1
            bucket.append(new_cluster)

        n_spec += 1
        # reported once per 100 clustered spectra; MS1 spectra never reach
        # this point, so a milestone is not printed repeatedly
        if verbose and n_spec % 100 == 0:
            print(
                "{} Masses, {} clusters, biggest cluster = {} spectra".format(
                    len(clusters), cl_id, biggest_cluster_size))
    return clusters