def _make_iter(self):
    """Open the mzML file and set up a lazy iterator over processed scans.

    Stores the reader returned by ``mzml.read(self.file_path)`` on
    ``self._reader`` and a generator on ``self._iter`` that passes every scan
    through ``process_mzml_scan`` with the instance's smoothing/baseline
    settings. Nothing is read from the reader until ``self._iter`` is consumed.
    """
    self._reader = mzml.read(self.file_path)
    self._iter = (
        process_mzml_scan(
            raw_scan,
            savgol_window_length=self.savgol_window_length,
            remove_baseline=self.remove_baseline,
        )
        for raw_scan in self._reader
    )
Esempio n. 2
0
    def __compute_mz_axis(cls, filename):
        """
        Internal helper function used to compute one common m/z axis for a file.

        All spectra are scanned once to find the global minimum m/z, the global
        maximum m/z and the smallest spacing between neighbouring m/z values.
        The axis is the regular grid ``np.arange(mz_min, mz_max, mzdiff)``.

        :param filename: path of the mzML file to read.

        :returns: 1D numpy array holding the m/z axis. As usual for
            ``np.arange`` the stop value (the global maximum) is excluded.
            The array is empty when the file contains no usable spectrum.
        """
        ## TODO completely refactor this to make it smartly handle profile or centroid datasets
        ## TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
        ## TODO: profile datasets should work as is
        ## TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
        mzdiff = 10000000.0
        mz_min = 10000000.0
        mz_max = 0.0
        # Single pass: the running extrema are all that is needed, so the
        # individual arrays do not have to be kept in memory.
        with mzml.read(filename) as reader:
            for spectrum in reader:
                mz = np.asarray(spectrum['m/z array'])
                if mz.size == 0:
                    # Empty spectrum: min()/max() would raise ValueError.
                    continue
                if mz.size > 1:
                    # np.diff of a single point is empty and has no minimum.
                    mzdiff = min(mzdiff, np.diff(mz).min())
                mz_min = min(mz_min, mz.min())
                mz_max = max(mz_max, mz.max())

        return np.arange(start=mz_min, stop=mz_max, step=mzdiff)
def mzml_reader(msconvert_file):
    """Collect all MS1 scans of an mzML file into a pandas DataFrame.

    One row is produced per scan whose ``'ms level'`` equals 1. Columns are
    ``ind`` (scan index), ``mslev`` (always 1), ``bpmz`` / ``bpint`` (base peak
    m/z and intensity), ``starttime`` (scan start time of the first scan
    entry), and ``mzarray`` / ``intarray`` (peak arrays converted to lists).
    """
    column_names = [
        'ind', 'mslev', 'bpmz', 'bpint', 'starttime', 'mzarray', 'intarray'
    ]

    ms1_records = []
    with mzml.read(msconvert_file) as reader:
        for scan in reader:
            if scan['ms level'] != 1:
                continue
            ms1_records.append((
                scan['index'],
                scan['base peak m/z'],
                scan['base peak intensity'],
                scan['m/z array'],
                scan['intensity array'],
                scan['scanList']['scan'][0]['scan start time'],
            ))

    # Arrays are converted to plain lists only after the file has been read.
    mz_lists = [record[3].tolist() for record in ms1_records]
    intensity_lists = [record[4].tolist() for record in ms1_records]

    rows = [
        (record[0], 1, record[1], record[2], record[5], mz_list, int_list)
        for record, mz_list, int_list in zip(ms1_records, mz_lists,
                                             intensity_lists)
    ]
    return pd.DataFrame(rows, columns=column_names)
def ingest_mzML(input_file):
    """Read precursor information of all MS2 scans into a pandas DataFrame.

    The mzML file is located next to ``input_file``: everything after the
    first dot of the *file name* is replaced by ``.mzML`` (so ``run.pep.xml``
    maps to ``run.mzML``). Dots in the directory part are left alone.

    Parameters
    ----------
    input_file : str
        Path whose base name (up to the first dot) names the mzML file.

    Returns
    -------
    pandas.DataFrame
        One row per MS2 scan with columns ``scan`` (number parsed from the
        text after the last ``=`` of the spectrum id), ``z`` (precursor charge
        state), ``m/z`` (selected ion m/z) and ``RT`` (scan start time).
    """
    import os  # local import: the file-level import block is not visible here

    # Split only the file name on its first dot; splitting the whole path
    # breaks on inputs such as "./data/run.raw" or "dir.v2/run.raw".
    directory, file_name = os.path.split(input_file)
    mzml_path = os.path.join(directory, file_name.split(".", 1)[0] + ".mzML")

    parsed_scans = []
    with mzml.read(mzml_path, iterative=True) as mzml_reader:
        for each_scan in tqdm(mzml_reader):
            if each_scan['ms level'] != 2:
                continue
            selected_ion = each_scan['precursorList']['precursor'][0][
                'selectedIonList']['selectedIon'][0]
            parsed_scans.append({
                # The scan number comes from the id string rather than the
                # index, because truncated files make index + 1 unreliable.
                'scan': int(each_scan['id'].rsplit("=", 1)[1]),
                'z': selected_ion['charge state'],
                'm/z': selected_ion['selected ion m/z'],
                'RT': each_scan['scanList']['scan'][0]['scan start time'],
            })
    print("done reading from with file {0}".format(input_file))
    return pandas.DataFrame(parsed_scans)
Esempio n. 5
0
def split_mzml(mzml_file):
    """
    function to split a mzML file into dict of MS2_Spectra objects (can be written to mgf format)
    by fragmentation method

    The fragmentation method is parsed from the ``filter string`` of each MS2
    scan (``@<method><energy>`` optionally followed by a second
    ``@<method><energy>``). Scans whose filter string cannot be parsed are
    printed and skipped. Scans whose method is none of etd/cid/hcd are dropped.

    Parameters:
    -----------------------------------------
    mzml_file: str,
            path to mzML file

    Return: dict {fragMethod: list(MS2_spectrum)}; only methods with at least
            one spectrum are included

    """
    ordered_ms2_spectra = {
        "CID": [],
        "HCD": [],
        "ETD": [],
        "ETciD": [],
        "EThcD": []
    }
    # [A-Za-z] instead of [A-z]: the latter also matches "[", "\", "]", "^",
    # "_" and "`", which lie between "Z" and "a" in ASCII.
    frag_pattern = re.compile(
        r"@([A-Za-z]+)([0-9.]+)@?([A-Za-z]+)?([0-9.]+)?")
    title_prefix = os.path.split(mzml_file)[1].split('mzML')[0]

    with mzml.read(mzml_file) as mzml_reader:
        for spectrum in mzml_reader:
            if spectrum['ms level'] != 2:
                continue

            scan = spectrum['scanList']['scan'][0]
            filter_string = scan['filter string']
            match = frag_pattern.search(filter_string)
            if match is None:
                # Report and skip. Continuing would classify this spectrum
                # with the previous scan's groups, or fail with NameError
                # when this is the first scan.
                print(filter_string)
                continue
            groups = match.groups()

            title = title_prefix + spectrum['id']
            # presumably converts minutes to seconds - TODO confirm the unit
            rt = scan['scan start time'] * 60
            precursor = spectrum['precursorList']['precursor'][0][
                'selectedIonList']['selectedIon'][0]
            pre_mz = precursor['selected ion m/z']
            pre_int = precursor.get('peak intensity', 0)
            pre_z = precursor['charge state']
            # list(...) so the peaks can be traversed more than once on
            # Python 3, where zip() returns a one-shot iterator.
            peaks = list(
                zip(spectrum['m/z array'], spectrum['intensity array']))

            ms2class_spectrum = MS2_spectrum(title, rt, pre_mz, pre_int, pre_z,
                                             peaks)

            if "etd" in groups:
                if "cid" in groups:
                    ordered_ms2_spectra['ETciD'].append(ms2class_spectrum)
                elif "hcd" in groups:
                    ordered_ms2_spectra['EThcD'].append(ms2class_spectrum)
                else:
                    ordered_ms2_spectra['ETD'].append(ms2class_spectrum)
            elif "cid" in groups:
                ordered_ms2_spectra['CID'].append(ms2class_spectrum)
            elif "hcd" in groups:
                ordered_ms2_spectra['HCD'].append(ms2class_spectrum)

    return {k: v for k, v in ordered_ms2_spectra.items() if v}
Esempio n. 6
0
def generate_cihcd_spectra(mzml_file):
    """
    Build one MS2_spectrum object for every MS3 scan of an mzML file.

    The precursor m/z is taken from the scan's ``filter string``: the first
    number that is directly followed by ``@`` (i.e. the MS2 precursor).
    Precursor intensity and charge are not available and are set to -1.

    Parameters:
    -----------------------------------------
    mzml_file: str,
            path to mzML file

    Return: list of ProteoFileReader.MS2_spectrum, in file order

    Raises: ValueError if a filter string does not start with "FT" or "IT"
            or contains no "<number>@" precursor entry
    """
    file_stem = os.path.split(mzml_file)[1].split('.mzML')[0]
    cihcd_spectra = []

    with mzml.read(mzml_file) as mzml_reader:
        for spectrum in mzml_reader:
            if spectrum['ms level'] != 3:
                continue

            scan = spectrum['scanList']['scan'][0]
            filter_str = scan['filter string']
            detector_match = re.search(r"^(FT|IT)", filter_str)
            precursor_mz_groups = re.findall(r"([0-9.]+)@", filter_str)
            if detector_match is None or not precursor_mz_groups:
                # ValueError instead of the Python-2-only StandardError. On
                # Python 2 it is a StandardError subclass, so existing
                # handlers keep working.
                raise ValueError(
                    "filter string parse error: %s" % filter_str)

            ms2_id = spectrum['precursorList']['precursor'][0]['spectrumRef']

            title = file_stem + " " + spectrum['id'] + " ms2_scanId=" + ms2_id
            # presumably converts minutes to seconds - TODO confirm the unit
            rt = scan['scan start time'] * 60

            # take ms2 precursor as precursor
            # NOTE(review): this is the matched text (a str), not a float.
            pre_mz = precursor_mz_groups[0]
            pre_int = -1
            pre_z = -1
            # list(...) so the peaks survive repeated traversal on Python 3.
            peaks = list(
                zip(spectrum['m/z array'], spectrum['intensity array']))

            cihcd_spectra.append(
                ProteoFileReader.MS2_spectrum(title, rt, pre_mz, pre_int,
                                              pre_z, peaks))

    return cihcd_spectra
Esempio n. 7
0
    def spectrum_iter(self):
        """
        Generator function that yields a position and associated spectrum for a selected datacube type.

        The dataset selection is validated before the file is opened, and the
        reader is closed when the generator is exhausted or closed.

        :yield: (xidx, yidx) a tuple of ints representing x and y position in the image
        :yield: yi,          a numpy 1D-array of floats containing spectral intensities at the given position \
                             and for the selected datacube type

        :raises ValueError: if no dataset has been selected (raised on the first ``next()``).
        :raises KeyError: if a spectrum has no ``'intensity array'``.
        """
        if self.select_dataset is None:
            raise ValueError('Select a dataset to continue!')
        # NOTE(review): the selection is only validated; the axis used below
        # is always self.mz_all[0] - confirm this is intended.
        with mzml.read(self.basename) as reader:
            for idx, spectrum in enumerate(reader):
                mz = self.mz_all[0]
                x = spectrum['m/z array']
                try:
                    y = spectrum['intensity array']
                except KeyError:
                    raise KeyError('Key "intensity array" not found in this mzml file')

                # Interpolate the data onto the new axes in profiles mode;
                # values outside the measured range become 0.
                # Re-histogramming for centroided data is not implemented here.
                yi = np.interp(mz, x, y, 0, 0)
                xidx = np.nonzero(self.x_pos == self.coordinates[idx, 0])[0][0]
                yidx = np.nonzero(self.y_pos == self.coordinates[idx, 1])[0][0]

                yield (xidx, yidx), yi
Esempio n. 8
0
    def spectrum_iter(self):
        """
        Generator function that yields a position and associated spectrum for a selected datacube type.

        The dataset selection is validated before the file is opened, and the
        reader is closed when the generator is exhausted or closed.

        :yield: (xidx, yidx) a tuple of ints representing x and y position in the image
        :yield: yi,          a numpy 1D-array of floats containing spectral intensities at the given position \
                             and for the selected datacube type

        :raises ValueError: if no dataset has been selected (raised on the first ``next()``).
        :raises KeyError: if a spectrum has no ``'intensity array'``.
        """
        if self.select_dataset is None:
            raise ValueError('Select a dataset to continue!')
        # NOTE(review): the selection is only validated; the axis used below
        # is always self.mz_all[0] - confirm this is intended.
        with mzml.read(self.basename) as reader:
            for idx, spectrum in enumerate(reader):
                mz = self.mz_all[0]
                x = spectrum['m/z array']
                try:
                    y = spectrum['intensity array']
                except KeyError:
                    raise KeyError(
                        'Key "intensity array" not found in this mzml file')

                # Interpolate the data onto the new axes in profiles mode;
                # values outside the measured range become 0.
                # Re-histogramming for centroided data is not implemented here.
                yi = np.interp(mz, x, y, 0, 0)
                xidx = np.nonzero(self.x_pos == self.coordinates[idx, 0])[0][0]
                yidx = np.nonzero(self.y_pos == self.coordinates[idx, 1])[0][0]

                yield (xidx, yidx), yi
Esempio n. 9
0
def read_mzml(PATH, scanlist, event_scan, fname, output, soutput, newscanno,
              spec_outfile):
    """Export selected scans of an mzML file as MGF entries plus an index table.

    For every spectrum whose position in the file is a key of ``scanlist``,
    one tab-separated line is appended to ``output`` and one
    ``BEGIN IONS ... END IONS`` record is written to ``soutput``. Failures on
    a single scan are reported with a printed message and the loop goes on.

    :param PATH: path of the mzML file; nothing is done if it is not a readable file
    :param scanlist: container of the 0-based scan positions to export
    :param event_scan: nested mapping ``event_scan[fname][scanindex]`` providing
        the ``'pev'``, ``'pseq'`` and ``'etype'`` values written to ``output``
    :param fname: key into ``event_scan`` for this file
    :param output: list receiving the table lines
    :param soutput: writable file-like object receiving the MGF text
    :param newscanno: running scan number to start from
    :param spec_outfile: name written into the table line
    :return: the running scan number after processing
    """
    if not (os.path.isfile(PATH) and os.access(PATH, os.R_OK)):
        return newscanno

    with mzml.read(PATH) as reader:
        for scanindex, spectrum in enumerate(reader):
            # "in" replaces dict.has_key(), which does not exist on Python 3.
            if scanindex not in scanlist:
                continue
            try:
                event = event_scan[fname][scanindex]
                pev = event['pev']
                output.append("%s\t%d\t%s\t%s\t%s\n" %
                              (pev, newscanno, spec_outfile,
                               event['pseq'], event['etype']))
                # NOTE(review): the counter is advanced before the MGF title
                # is written, so the title carries the table's number + 1.
                # Kept as is - confirm this is intended.
                newscanno += 1
                selected_ion = spectrum['precursorList']['precursor'][0][
                    'selectedIonList']['selectedIon'][0]
                charge = int(selected_ion['charge state'])
                mz = selected_ion['selected ion m/z']
                soutput.write(
                    "BEGIN IONS\nTITLE=controllerType=0 controllerNumber=1 scan=%d\nCHARGE=%d+\nPEPMASS=%s\n"
                    % (newscanno, charge, mz))
                for x, y in zip(spectrum['m/z array'],
                                spectrum['intensity array']):
                    soutput.write("%s %s\n" % (x, y))
                soutput.write("END IONS\n\n")
            except Exception:
                # Best effort per scan. "Exception" rather than a bare except
                # so KeyboardInterrupt/SystemExit are not swallowed.
                print("Error reading mzML file")
    return newscanno
Esempio n. 10
0
 def __compute_mz_axis(cls, filename):
     """
     Internal helper function used to compute one common m/z axis for a file.

     All spectra are scanned once to find the global minimum m/z, the global
     maximum m/z and the smallest spacing between neighbouring m/z values.
     The axis is the regular grid ``np.arange(mz_min, mz_max, mzdiff)``.

     :param filename: path of the mzML file to read.

     :returns: 1D numpy array holding the m/z axis. As usual for
         ``np.arange`` the stop value (the global maximum) is excluded.
         The array is empty when the file contains no usable spectrum.
     """
     ## TODO completely refactor this to make it smartly handle profile or centroid datasets
     ## TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
     ## TODO: profile datasets should work as is
     ## TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
     mzdiff = 10000000.0
     mz_min = 10000000.0
     mz_max = 0.0
     # Single pass: the running extrema are all that is needed, so the
     # individual arrays do not have to be kept in memory.
     with mzml.read(filename) as reader:
         for spectrum in reader:
             mz = np.asarray(spectrum['m/z array'])
             if mz.size == 0:
                 # Empty spectrum: min()/max() would raise ValueError.
                 continue
             if mz.size > 1:
                 # np.diff of a single point is empty and has no minimum.
                 mzdiff = min(mzdiff, np.diff(mz).min())
             mz_min = min(mz_min, mz.min())
             mz_max = max(mz_max, mz.max())

     return np.arange(start=mz_min, stop=mz_max, step=mzdiff)
Esempio n. 11
0
    def __compute_coordinates(self, filename, num_scans):
        """
        Internal helper function used to compute the coordinates for each scan.

        The file is read as plain text. Every line that contains
        ``location="`` and a ``_<digits>x_<digits>y_`` token contributes one
        (x, y) pair, in file order.

        :param filename: path of the mzML file to scan.
        :param num_scans: number of rows of the returned array.

        :returns: 2D numpy integer array of shape (numScans,2) indicating for each scan its x and y coordinate.
            Rows for which no location line was found stay at (0, 0).

        :raises IndexError: if the file contains more matching lines than ``num_scans``.
        """
        # No mzml reader is needed here: the coordinates come from the raw
        # text, so the file is not additionally opened through pyteomics.
        coord_pattern = re.compile(r'_[0-9]+x_[0-9]+y_')
        coords = np.zeros(shape=(num_scans, 2), dtype='uint32')
        spectrumid = 0
        with open(filename, 'r') as origin_file:
            for line in origin_file:
                if 'location="' not in line:
                    continue
                m = coord_pattern.search(line)
                if m is None:
                    continue
                # "_12x_34y_" -> ["12x", "34y"] -> drop the axis letter
                x_token, y_token = m.group().strip('_').split('_')
                coords[spectrumid, 0] = int(x_token[:-1])
                coords[spectrumid, 1] = int(y_token[:-1])
                spectrumid += 1
        return coords
Esempio n. 12
0
def parse_mzml(mzml_path, pickle_data=None, logfile=None):
    """
    retrieves all scans from a portion of an mzml file and generates arrays of mz versus intensity at each RT

    :param mzml_path: string pointing to the .mzml file to extra scans from
    :param pickle_data: string pointing to a pickle file to save the extracted spectra into. Default is not saved
    :param logfile: path to file to log

    :return: a dictionary with keys retention_times, and ms1_scans. retention_times points to a numpy array of
    each retention time. ms1_scans points to a list of pandas dataframes with columns mz_data and intensity_data.
    The index of this list corresponds to the position in the retention_times numpy array.

    :raises AssertionError: if the file contains a scan whose ms level is not 1
    """
    scan_list = []
    rt_list = []
    with mzml.read(mzml_path) as mz_reader:
        log('reading mzml file ' + mzml_path + '...', logfile)
        for scan in mz_reader:
            if scan['ms level'] != 1:
                # Raised explicitly so the check is not stripped under
                # "python -O"; the exception type stays AssertionError for
                # existing callers.
                raise AssertionError(
                    'Your mzml file contains non-MS1 scans. '
                    'When converting your mzml file, only include MS1 scans. See the pysodist '
                    'docs for how to appropriately convert to .mzml files using msconvert.')
            scan_list.append(
                pd.DataFrame({'mz_data': scan['m/z array'],
                              'intensity_data': scan['intensity array']}))
            rt_list.append(
                float(scan['scanList']['scan'][0]['scan start time']))

    parsed_mz_file = {'retention_times': np.array(rt_list), 'ms1_scans': scan_list}
    if pickle_data is not None:
        # Context manager so the pickle file is flushed and closed right away.
        with open(pickle_data, 'wb') as pickle_file:
            pickle.dump(parsed_mz_file, pickle_file)
    return parsed_mz_file
Esempio n. 13
0
def extract_mid_from_file(mzml_path, mz_windows, rt_window, ppm):
    """
    Pulls out MID info from mzML file, looking only in given mz_windows and rt_window.
    :param mzml_path:    str    path to mzml_file for extraction
    :param mz_windows:   array  (span+1-by-2) matrix with columns mz_min and mz_max for rows M0, M1 ...
    :param rt_window:    array  (1-by-2) array containing rt_min and rt_max in minutes for peak of interest
    :param ppm:          float  parts-per-million mass accuracy (not used by this function)
    :return out:         dict w/ keys:
                                           m, np.array of ints, the number of heavy neutrons
                                           mean_mz, the intensity-weighted m/z of the m peak
                                               (0 for a peak with no data points or zero total intensity)
                                           total_i, the total intensity of isotopologue m
    """
    # identify rt window
    rt_min, rt_max = tuple(rt_window)

    n_rows = mz_windows.shape[0]
    m_span = range(n_rows)

    # raw data points collected per isotopologue
    mzs = [[] for _ in m_span]
    intensities = [[] for _ in m_span]

    with mzml.read(mzml_path) as reader:
        # loop through scans, keeping data points in mz_windows and rt_window only
        for spec in reader:
            try:
                rt = spec['scanList']['scan'][0]['scan start time']
            except (KeyError, IndexError):
                continue
            if not rt_min <= rt <= rt_max:
                continue

            these_mzs = spec['m/z array']
            these_intensities = spec['intensity array']

            # Each window row maps to a [start, stop) slice of the scan.
            # searchsorted assumes the scan's m/z array is sorted ascending.
            index_mat = np.searchsorted(these_mzs, mz_windows)
            for m, (start, stop) in enumerate(
                    zip(index_mat[:, 0], index_mat[:, 1])):
                # if scan has no mz values of interest, skip it
                if start != stop:
                    mzs[m].extend(these_mzs[start:stop])
                    intensities[m].extend(these_intensities[start:stop])

    # np.average raises ZeroDivisionError when the weights sum to zero (which
    # includes an empty window). Handle that per isotopologue, so one empty
    # window no longer wipes out the means of all the others.
    mean_values = []
    for m in m_span:
        try:
            mean_values.append(np.average(mzs[m], weights=intensities[m]))
        except ZeroDivisionError:
            mean_values.append(0)
    mean_mz = np.asarray(mean_values)
    total_i = np.asarray([np.sum(intensities[m]) for m in m_span])

    return ({
        'm': np.asarray(m_span),
        'mean_mz': mean_mz,
        'total_i': total_i
    })
Esempio n. 14
0
    def __compute_mz_axis(cls, filename, mzml_filetype, scan_types, resolution):
        """
        Internal helper function used to compute the mz axis of each scantype

        :param filename: path of the mzML file to read.
        :param mzml_filetype: one of the values of ``cls.available_mzml_types``.
        :param scan_types: list of filter strings; the position of a filter string
            in this list is the position of its axis in the result.
        :param resolution: divisor used to size the logarithmic axis of
            spectra without a ``'profile spectrum'`` entry (thermo files only).

        :returns: a list of numpy arrays, one per entry of ``scan_types``.

        :raises ValueError: if ``mzml_filetype`` is neither the thermo nor the bruker type,
            or (from ``list.index``) if a spectrum's filter string is not in ``scan_types``.
        """
        ## TODO completely refactor this to make it smartly handle profile or centroid datasets
        ## TODO: centroid datasets should take in a user parameter "Resolution" and resample data at that resolution
        ## TODO: profile datasets should work as is
        ## TODO: checks for profile data vs. centroid data on the variation in length of ['m/z array']
        with mzml.read(filename) as reader:

            if mzml_filetype == cls.available_mzml_types['thermo']:

                mz_axes = [np.array([]) for _ in scan_types]
                for spectrum in reader:
                    scan = spectrum['scanList']['scan'][0]
                    scantype_idx = scan_types.index(scan['filter string'])
                    mz = spectrum['m/z array']
                    try:
                        len_axes = len(mz_axes[scantype_idx])
                    except TypeError:
                        len_axes = 1
                    # Only a spectrum with more points than the current axis
                    # of its scan type replaces that axis.
                    if len(mz) <= len_axes:
                        continue

                    scan_window = scan['scanWindowList']['scanWindow'][0]
                    mzmin = scan_window['scan window lower limit']
                    mzmax = scan_window['scan window upper limit']
                    # "in" replaces dict.has_key(), which does not exist on
                    # Python 3.
                    if 'profile spectrum' in spectrum:
                        # Regular grid at the finest spacing seen in this
                        # spectrum, with the upper limit appended because
                        # np.arange excludes the stop value.
                        mzdiff = np.diff(mz).min()
                        axis = np.arange(start=mzmin, stop=mzmax, step=mzdiff)
                        mz_axes[scantype_idx] = np.append(arr=axis, values=mzmax)
                    else:
                        # np.logspace needs an integer number of samples;
                        # np.ceil returns a float.
                        num_points = int(
                            np.ceil(1e6 * np.log(mzmax/mzmin)/resolution))
                        mz_axes[scantype_idx] = np.logspace(
                            np.log10(mzmin), np.log10(mzmax), num_points)

                return mz_axes

            # assume bruker instruments have constant m/z axis from scan to scan
            elif mzml_filetype == cls.available_mzml_types['bruker']:

                mz_axes = [np.array([]) for _ in scan_types]

                for spectrum in reader:
                    scanfilt = spectrum['scanList']['scan'][0]['filter string']
                    scantype_idx = scan_types.index(scanfilt)
                    # grossly inefficient reassignment of m/z array at each scan
                    mz_axes[scantype_idx] = spectrum['m/z array']

                return mz_axes

            else:
                raise ValueError('Unknown mzml format')
Esempio n. 15
0
def parse_mzml_file(filepath, mz_resolution=1):
    """Bin all spectra of an mzML file onto one evenly spaced m/z axis.

    The axis is built from the scan window limits of the first spectrum,
    with the upper limit moved up so the range is a whole multiple of
    ``mz_resolution``. Peaks of one spectrum that fall into the same bin are
    summed.

    :param filepath: path of the mzML file.
    :param mz_resolution: spacing of the m/z axis (default 1).

    :return: tuple ``(mat, times, mz_array)``. ``mat`` has one row per spectrum
        and one column per axis point, ``times`` holds each scan start time
        (0 where the key is missing) and ``mz_array`` is the axis. All three
        are ``None`` when the file yields no spectrum.
    """

    # `read` and `iterfind` are defined outside this block - presumably the
    # pyteomics.mzml functions; verify against the file's imports.
    mzml_file = read(filepath)
    # The spectrumList element is read only for its 'count' attribute, which
    # sizes the output arrays up front.
    spectrumList_dict = iterfind(filepath,
                                 'indexedmzML/mzML/run/spectrumList',
                                 read_schema=True,
                                 recursive=False).__next__()
    n_spectra = int(spectrumList_dict['count'])  # number of all spectra

    mzmin, mzmax = None, None
    mz_array = None
    mat = None
    times = None

    for i, sp in enumerate(mzml_file):

        # Lazy initialisation from the first spectrum's scan window.
        if mat is None:
            mzmin = float(sp['scanList']['scan'][0]['scanWindowList']
                          ['scanWindow'][0]['scan window lower limit'])
            mzmax = float(sp['scanList']['scan'][0]['scanWindowList']
                          ['scanWindow'][0]['scan window upper limit'])

            mzmax = mzmax + mz_resolution - (
                mzmax -
                mzmin) % mz_resolution  # recalculate the maximum m/z value

            mz_array = np.linspace(
                mzmin, mzmax,
                int((mzmax - mzmin) / mz_resolution) +
                1)  # make sure the axis points are evenly spaced
            mat = np.zeros((n_spectra, mz_array.shape[0]))
            times = np.zeros(n_spectra)

        # Map every peak to a column index.
        # NOTE(review): the mapping scales m/z by (number of columns / mzmax)
        # and does not subtract mzmin - confirm this is the intended binning.
        indexes = (sp['m/z array'] * mz_array.shape[0] / mzmax).astype(int)

        # find duplicated and integrate (just sum) them
        indexes, indices, counts = np.unique(indexes,
                                             return_index=True,
                                             return_counts=True)
        intensities = sp['intensity array']
        # Fancy indexing yields a new array holding the intensity of the
        # first peak of every bin.
        integrated_intensities = intensities[indices]

        for j in range(indexes.shape[0]):
            if counts[j] < 2:
                continue

            # Sums a contiguous slice starting at the first occurrence;
            # assumes peaks sharing a bin are adjacent (m/z sorted) - TODO confirm.
            idx = indices[j]
            integrated_intensities[j] = intensities[idx:idx + counts[j]].sum()

        try:
            times[i] = float(sp['scanList']['scan'][0]['scan start time'])
        except KeyError:
            # No start time recorded: keep the 0 from np.zeros.
            pass
        mat[i, indexes] = integrated_intensities

    return mat, times, mz_array
def plot_spectra(mzml_id, peptide, scan_id, mzml_dir, spec_pic_dir, psm_id):
    """Render one annotated MS/MS spectrum of an mzML file to an SVG image.

    The file ``<mzml_id>.mzML`` is searched for below ``mzml_dir`` with the
    external ``find`` command. The scan with index ``scan_id - 1`` is
    filtered, scaled, annotated with a/b/y fragments of ``peptide`` and saved
    as ``<spec_pic_dir>/<file stem>_<psm_id>.svg``.

    :param mzml_id: base name (without extension) of the mzML file
    :param peptide: peptide sequence used for the fragment annotation
    :param scan_id: 1-based scan number; compared to the 0-based scan index
    :param mzml_dir: directory searched (recursively) for the file
    :param spec_pic_dir: directory the image is written to
    :param psm_id: identifier used in the image file name
    :return: None. Prints "no precursor list" or "Scan not found" when the
        scan cannot be plotted.
    :raises IOError: if no matching mzML file is found
    :raises subprocess.CalledProcessError: if ``find`` exits with an error
    """
    # Argument list without a shell: directory and id are passed verbatim,
    # so spaces or shell metacharacters in them cannot alter the command.
    find_output = subprocess.check_output(
        ["find", mzml_dir, "-name", "{}.mzML".format(mzml_id)])
    matches = find_output.decode().splitlines()
    if not matches:
        raise IOError("no file named {}.mzML found below {}".format(
            mzml_id, mzml_dir))
    # If several files match, the first one reported by find is used.
    mzml_file = matches[0]

    wanted_index = int(scan_id) - 1
    with mzml.read(mzml_file) as reader:
        for scan in reader:
            if scan["index"] != wanted_index:
                continue
            if "precursorList" not in scan:
                print("no precursor list")
                return
            mz = scan['m/z array']
            intensity = scan['intensity array']
            identifier = scan['index']
            # presumably converts minutes to seconds - TODO confirm the unit
            retention_time = float(
                scan['scanList']['scan'][0]["scan start time"]) * 60.0
            selected_ion = scan["precursorList"]["precursor"][0][
                "selectedIonList"]["selectedIon"][0]
            precursor_mz = selected_ion["selected ion m/z"]
            precursor_charge = int(selected_ion["charge state"])
            spec = spectrum.MsmsSpectrum(identifier,
                                         precursor_mz,
                                         precursor_charge,
                                         mz,
                                         intensity,
                                         retention_time=retention_time,
                                         peptide=peptide)
            # Fixed processing settings.
            min_mz, max_mz = 100, 1400
            fragment_tol_mass, fragment_tol_mode = 10, 'ppm'
            min_intensity, max_num_peaks = 0.05, 150
            scaling = 'root'
            ion_types = 'aby'
            spec = spec.set_mz_range(min_mz, max_mz)
            spec = spec.remove_precursor_peak(fragment_tol_mass,
                                              fragment_tol_mode)
            spec = spec.filter_intensity(min_intensity, max_num_peaks)
            spec = spec.scale_intensity(scaling)
            spec = spec.annotate_peptide_fragments(fragment_tol_mass,
                                                   fragment_tol_mode,
                                                   ion_types)
            plt.figure()
            plot.spectrum(spec, grid=False)
            # The image is named after the file actually found.
            mzml_id = os.path.splitext(os.path.split(mzml_file)[1])[0]
            plt.savefig("{}/{}_{}.svg".format(spec_pic_dir, mzml_id, psm_id),
                        bbox_inches='tight')
            plt.close()
            print("print")
            return
        else:
            # Reached only when the loop ends without finding the scan.
            print("Scan not found")
Esempio n. 17
0
 def __compute_filetype(cls, filename):
     """
     Internal helper function used to compute the filetype.

     Only the first spectrum of the file is inspected: a ``'spotID'`` entry
     selects the thermo type, otherwise an ``'id'`` entry selects the bruker
     type.

     :param filename: path of the mzML file.

     :returns: the matching value of ``cls.available_mzml_types``; the
         ``'unknown'`` value if neither key is present or the file contains
         no spectrum.
     """
     # Context manager so the reader is closed after the first spectrum;
     # the default avoids a StopIteration on a file without spectra.
     with mzml.read(filename) as reader:
         spectrum = next(reader, None)
     if spectrum is None:
         return cls.available_mzml_types['unknown']
     if 'spotID' in spectrum:
         return cls.available_mzml_types['thermo']
     elif 'id' in spectrum:
         return cls.available_mzml_types['bruker']
     else:
         return cls.available_mzml_types['unknown']
Esempio n. 18
0
 def __compute_filetype(cls, filename):
     """
     Internal helper function used to compute the filetype.

     Only the first spectrum of the file is inspected: a ``'spotID'`` entry
     selects the thermo type, otherwise an ``'id'`` entry selects the bruker
     type.

     :param filename: path of the mzML file.

     :returns: the matching value of ``cls.available_mzml_types``; the
         ``'unknown'`` value if neither key is present or the file contains
         no spectrum.
     """
     # Context manager so the reader is closed after the first spectrum;
     # the default avoids a StopIteration on a file without spectra.
     with mzml.read(filename) as reader:
         spectrum = next(reader, None)
     if spectrum is None:
         return cls.available_mzml_types['unknown']
     if 'spotID' in spectrum:
         return cls.available_mzml_types['thermo']
     elif 'id' in spectrum:
         return cls.available_mzml_types['bruker']
     else:
         return cls.available_mzml_types['unknown']
def set_retention_times(file: str):
    """Map each MS2 scan number to its shifted retention time.

    For every scan with ``"ms level"`` 2 the scan start time is corrected
    with ``CON.RETENTION_SHIFT_INTERCEPT`` and ``CON.RETENTION_SHIFT_SLOPE``
    and then multiplied by ``CON.MINUTES_TO_SECONDS``.

    :param file: path of the mzML file.
    :return: dict keyed by ``scan["index"] + 1`` with the converted time.
    """
    times_by_scan_number = {}
    with mzml.read(file) as reader:
        for spectrum in reader:
            if spectrum["ms level"] != 2:
                continue
            start_time = float(
                spectrum["scanList"]["scan"][0]["scan start time"])
            shifted = (start_time - CON.RETENTION_SHIFT_INTERCEPT
                       ) / CON.RETENTION_SHIFT_SLOPE
            times_by_scan_number[spectrum["index"] + 1] = (
                shifted * CON.MINUTES_TO_SECONDS)
    return times_by_scan_number
 def read_mzml(self, file: str):
     """Feed every MS1 scan of an mzML file to ``self.process_scan``.

     The file is read twice: a first pass counts the MS1 scans so progress
     can be printed, the second pass processes them. Afterwards
     ``self.set_tuple_dictionary`` is called with the sliding window built
     from the last MS1 retention time (``None`` if there was no MS1 scan).

     :param file: path of the mzML file.
     """
     print()
     print("(initializing)")
     # Pass 1: count the MS1 scans for the progress display.
     with mzml.read(file) as reader:
         total = sum(1 for spectrum in reader if spectrum["ms level"] == 1)

     # Pass 2: process the MS1 scans.
     processed = 0
     retention_time = None
     with mzml.read(file) as reader:
         for spectrum in reader:
             if spectrum["ms level"] != 1:
                 continue
             start_time = spectrum["scanList"]["scan"][0]["scan start time"]
             retention_time = start_time * CON.MINUTES_TO_SECONDS
             processed += 1
             # Report the first, every 200th and the last scan.
             if processed == 1 or processed == total or processed % 200 == 0:
                 print(processed, "/", total, "scans")
             self.process_scan(spectrum)
     self.set_tuple_dictionary(self.sliding_window(retention_time))
Esempio n. 21
0
    def __read_all(self):
        """
        Internal helper function used to read all data. The
        function directly modifies the self.data entry.  Data is now a list of datacubes

        One zero-filled cube is allocated per scan type. The file is then read
        once per scan type, and every spectrum whose filter string equals that
        scan type is resampled onto the scan type's m/z axis (interpolation
        for profiled scan types, histogramming otherwise) and stored at its
        x/y position.

        :raises KeyError: if a matching spectrum has no ``'intensity array'``.
        """
        self.data = [
            np.zeros(shape=self.shape_all_data[scan_idx], dtype=self.data_type)
            for scan_idx, _ in enumerate(self.scan_types)
        ]

        for scan_idx, scantype in enumerate(self.scan_types):
            mz_axis = self.mz_all[scan_idx]
            if self.scan_profiled[scan_idx]:
                bin_edges = None
            else:
                # Centroided data: use the axis values as left bin edges and
                # add one closing edge at the mean spacing.
                shift = np.diff(mz_axis).mean()
                bin_edges = np.append(mz_axis, mz_axis[-1] + shift)

            with mzml.read(self.basename) as reader:
                # NOTE(review): spectrumid counts every spectrum in the file,
                # not only those of this scan type, and is used to index
                # self.coordinates - confirm this is intended.
                for spectrumid, spectrum in enumerate(reader):
                    if spectrum['scanList']['scan'][0][
                            'filter string'] == scantype:
                        x = spectrum['m/z array']
                        # A missing key propagates as KeyError naming the key.
                        y = spectrum['intensity array']
                        if bin_edges is None:
                            # Re-interpolate the data in profiled mode
                            yi = np.interp(mz_axis, x, y, 0, 0)
                        else:
                            # Re-histogram the data in centroided mode
                            yi, _ = np.histogram(x, bins=bin_edges, weights=y)
                        xidx = np.nonzero(
                            self.x_pos == self.coordinates[spectrumid, 0])[0]
                        yidx = np.nonzero(
                            self.y_pos == self.coordinates[spectrumid, 1])[0]
                        try:
                            self.data[scan_idx][xidx, yidx, :] = yi
                        except Exception:
                            # Best effort: a spectrum that cannot be stored is
                            # logged and skipped. "Exception" rather than a
                            # bare except so KeyboardInterrupt still works.
                            log_helper.debug(__name__, spectrumid, scan_idx,
                                             scantype, mz_axis.shape)
                    # TODO Note if the data is expected to be of float precision then self.data_type needs to be set accordingly
                    if spectrumid % 1000 == 0:
                        log_helper.info(
                            __name__,
                            'Processed data for %s spectra to datacube for scan type %s'
                            % (spectrumid, scantype))
Esempio n. 22
0
def load_from_mzml(filename: str,
                   ms_level: int = 2,
                   metadata_harmonization: bool = True
                   ) -> Generator[Spectrum, None, None]:
    """Yield one ~matchms.Spectrum per spectrum of the requested MS level.

    Records are read with ``mzml.read(filename, dtype=dict)``. A record is
    skipped when it has no ``"ms level"`` entry, when its level differs from
    ``ms_level``, or when its m/z array is empty. Peaks are returned sorted
    by ascending m/z. For more extensive parsing options consider using the
    pyteomics or pymzml packages directly.

    Example:

    .. code-block:: python

        from matchms.importing import load_from_mzml

        file_mzml = "testdata.mzml"
        spectrums = list(load_from_mzml(file_mzml))

    Parameters
    ----------
    filename:
        Filename for mzml file to import.
    ms_level:
        Specify which ms level to import. Default is 2.
    metadata_harmonization : bool, optional
        Set to False if metadata harmonization to default keys is not desired.
        The default is True.
    """
    for entry in mzml.read(filename, dtype=dict):
        # Only records that declare the requested MS level are converted.
        if "ms level" not in entry or entry["ms level"] != ms_level:
            continue

        metadata = parse_mzml_mzxml_metadata(entry)
        mz = numpy.asarray(entry["m/z array"], dtype="float")
        intensities = numpy.asarray(entry["intensity array"], dtype="float")

        if mz.shape[0] == 0:
            continue

        # Reorder both arrays together when m/z is not already ascending.
        already_sorted = numpy.all(mz[:-1] <= mz[1:])
        if not already_sorted:
            order = numpy.argsort(mz)
            mz, intensities = mz[order], intensities[order]

        yield Spectrum(mz=mz,
                       intensities=intensities,
                       metadata=metadata,
                       metadata_harmonization=metadata_harmonization)
Esempio n. 23
0
def _rescale_scan(mz, intensity, scalar):
    """Drop points with intensity <= 0, divide m/z by *scalar*, truncate to int.

    Returns ``(x, y, ave_noise)``; ``ave_noise`` is the average of the kept
    intensities whose integer part is at most 10.
    """
    kept = [(m / scalar, i) for m, i in zip(mz, intensity) if i > 0]
    x = [int(point[0]) for point in kept]
    y = [int(point[1]) for point in kept]
    ave_noise = np.average(
        [float(point[1]) for point in kept if int(point[1]) <= 10])
    return x, y, ave_noise


def mZML_reader(filetoopen):
    """Load the scan with id ``'1'`` from an mzML file and rescale it.

    The m/z step ("Xscalar") is not stored in the data, so it is guessed as
    the smallest non-zero gap between consecutive m/z values, rounded to two
    decimals; a guess above 1 is replaced by 0.2. The scan is then rescaled
    with that value, and if rescaling raises, it is redone with a scalar of 1
    so the file can still be opened.

    Side effects: writes "Loading file..." to the status bar through ``app``,
    sets the global ``progress`` to 50 and registers ``updatprogress``.

    :param filetoopen: Path of the mzML file to read.
    :returns: Tuple ``(X, Y, ave_noise, Xscalar, filetoopen)``.
    :raises ValueError: If the file has no scan with id ``'1'``, or if that
        scan has no two consecutive m/z values that differ.
    """
    app.queueFunction(app.setStatusbarWidth, len("Loading file..."), field=2)
    app.queueFunction(app.setStatusbar, "Loading file...", 2)

    # Load only the first scan; close the reader as soon as it is found.
    raw_mz = raw_intensity = None
    with mzml.read(filetoopen) as data:
        for scan in data:
            if scan['id'] == '1':
                raw_mz = scan['m/z array']
                raw_intensity = scan['intensity array']
                break
    if raw_mz is None:
        raise ValueError("No scan with id '1' found in %s" % filetoopen)

    # No Xscalar is encoded in this data and values have been filtered out,
    # so it has to be guessed from the smallest non-zero m/z gap.
    gaps = [b - a for a, b in zip(raw_mz[:-1], raw_mz[1:]) if b - a != 0.0]
    Xscalar = np.around(min(gaps), 2)
    if Xscalar > 1:
        Xscalar = 0.2  # fix if the estimator is way off

    try:
        X, Y, ave_noise = _rescale_scan(raw_mz, raw_intensity, Xscalar)
    except Exception:
        # Best effort: e.g. a scalar that rounded to 0 makes the integer
        # conversion fail, so fall back to a scalar of 1.
        Xscalar = 1
        X, Y, ave_noise = _rescale_scan(raw_mz, raw_intensity, Xscalar)

    global progress
    progress = 50
    app.registerEvent(updatprogress)
    return (X, Y, ave_noise, Xscalar, filetoopen)
Esempio n. 24
0
    def get_data(self, ms_level):
        """Return the spectra of one MS level, intensity-filtered and m/z-sorted.

        Every spectrum read from ``self.input_mzml_path`` whose ``'ms level'``
        equals ``ms_level`` is modified in place: peaks with intensity below
        ``self.min_intensity`` are removed, then the remaining peaks are
        ordered by ascending m/z. The ion-mobility array, when present, is
        filtered and reordered together with the other two arrays.

        :param ms_level: MS level to keep.
        :returns: List of the modified spectrum dictionaries.
        """
        mobility_key = 'mean inverse reduced ion mobility array'
        selected = []
        for z in mzml.read(self.input_mzml_path):
            if z['ms level'] != ms_level:
                continue

            # Arrays that must stay aligned peak-by-peak.
            aligned_keys = ['intensity array', 'm/z array']
            if mobility_key in z:
                aligned_keys.append(mobility_key)

            keep = z['intensity array'] >= self.min_intensity
            for key in aligned_keys:
                z[key] = z[key][keep]

            order = np.argsort(z['m/z array'])
            for key in aligned_keys:
                z[key] = z[key][order]

            selected.append(z)
        return selected
Esempio n. 25
0
def main():
    """Read every spectrum of an mzML file and report count and timing.

    The file is taken from the ``--mzml_file`` command-line option. The
    structure of the first spectrum is printed with ``auxiliary.print_tree``,
    a progress line is printed every 1000 spectra, and a summary with the
    total count, elapsed time and throughput is printed at the end. Problems
    with the argument are reported on stdout and the function returns early.
    """
    cli = argparse.ArgumentParser(
        description='Creates an index for an MSP spectral library file')
    cli.add_argument('--mzml_file', action='store',
                     help='Name of the mzML file to read')
    cli.add_argument('--version', action='version', version='%(prog)s 0.5')
    params = cli.parse_args()

    # Guard clauses: the option must be present and point to a regular file.
    if not params.mzml_file:
        print(
            'ERROR: Parameter --mzml_file must be provided. See --help for more information'
        )
        return
    if not os.path.isfile(params.mzml_file):
        print(f"ERROR: File '{params.mzml_file}' not found or not a file")
        return

    # Read spectra from the file
    started = timeit.default_timer()
    stats = {'counter': 0, 'ms1spectra': 0, 'ms2spectra': 0}
    with mzml.read(params.mzml_file) as reader:
        for position, spectrum in enumerate(reader):
            if position == 0:
                auxiliary.print_tree(spectrum)

            # Update counters and print progress
            stats['counter'] = position + 1
            if stats['counter'] % 1000 == 0:
                print(f"  {stats['counter']}")

    # Print final timing information
    elapsed = timeit.default_timer() - started
    print(f"INFO: Read {stats['counter']} spectra from {params.mzml_file}")
    print(f"INFO: Elapsed time: {elapsed}")
    print(f"INFO: Processed {stats['counter']/elapsed} spectra per second")
Esempio n. 26
0
    def __compute_scan_types_and_indices(self, filename=None):
        """
        Internal helper function used to compute a list of unique scan types in the mzml file.

        The scan type of a spectrum is the ``'filter string'`` of its first scan
        entry. Spectra without that entry are reported via ``log_helper.debug``
        and contribute no index.

        :param filename: Path of the mzML file to read.

        :returns: Tuple ``(scantypes, scan_indices, scan_profiled)``:
            the unique filter strings in order of first appearance, for each
            spectrum the index of its scan type in ``scantypes``, and for each
            scan type whether its first spectrum had a ``'profile spectrum'`` key.
        """
        scantypes = []
        scan_indices = []
        scan_profiled = []
        with mzml.read(filename) as reader:
            for idx, spectrum in enumerate(reader):
                try:
                    scanfilter = spectrum['scanList']['scan'][0]['filter string']
                except (KeyError, IndexError, TypeError):
                    log_helper.debug(__name__, idx)
                    continue
                if scanfilter not in scantypes:
                    scantypes.append(scanfilter)
                    # `in` replaces dict.has_key(), which no longer exists in
                    # Python 3 and previously failed inside the bare except.
                    scan_profiled.append('profile spectrum' in spectrum)
                scan_indices.append(scantypes.index(scanfilter))

        assert len(scan_indices) == self.num_scans
        return scantypes, scan_indices, scan_profiled
Esempio n. 27
0
 def load_data(self, path_to_file, min_peak_th=10, data_type='ups1'):
     """Load experimental spectra from a *.mzML file into ``self.spectrum_collection``.

     One ``Spectrum`` record is built per spectrum in the file. The scan id,
     m/z array, intensity array, charge and precursor mass are obtained by
     evaluating the expression strings stored in ``mzml_params[data_type]``.
     Records with fewer than ``min_peak_th`` intensity values are discarded.
     ``self.set_spectrum_idx()`` is called once all spectra are processed.

     :param path_to_file: Path of the mzML file to read.
     :param min_peak_th: Minimum length of a record's intensity array for it to be kept.
     :param data_type: Key into ``mzml_params`` selecting the set of expressions.
     """
     self.spectrum_collection = []
     # NOTE(review): the eval() calls below run in this function's scope, so the
     # expressions presumably refer to the loop variables `spectrum` and
     # `spectrum_id` by name -- confirm against mzml_params before renaming them.
     # eval() executes arbitrary code; mzml_params must never come from untrusted input.
     with mzml.read(path_to_file, dtype=dict) as spectra:
         for spectrum_id, spectrum in enumerate(spectra):
             spectrum_record = Spectrum(
                 path_to_file,  # path to file
                 eval(mzml_params[data_type]['scan_id']),  # scan id
                 eval(mzml_params[data_type]['mz_array']),  # mz array
                 eval(mzml_params[data_type]
                      ['intensity_array']),  # intensity array
                 eval(mzml_params[data_type]['charge']),  # charge
                 eval(mzml_params[data_type]
                      ['precursor_mass']),  # precursor mass
                 self.max_peak,
                 self.remove_precursor_peak,
                 self.remove_precursor_tolerance)
             # Keep only records with enough peaks
             if len(spectrum_record.intensity_array) >= min_peak_th:
                 self.spectrum_collection.append(spectrum_record)
     self.set_spectrum_idx()
Esempio n. 28
0
    def is_valid_dataset(cls, name):
        """Check whether the given file or directory points to readable mzML data.

           A directory is valid when ``cls.get_files_from_dir`` finds at least
           one file in it. A file is valid when it can be opened with
           ``mzml.read`` and iterated to the end without an error.

           :param name: Name of the dir or file.
           :type name: String

           :returns: Boolean indicating whether the given file or folder is valid.
        """
        if os.path.isdir(name):
            # If we point to a directory, check if the dir contains an mzML file
            return len(cls.get_files_from_dir(name)) > 0
        try:
            # Try to open the file and iterate over it; the reader is closed
            # again whether or not the iteration succeeds.
            with mzml.read(name) as reader:
                for _ in reader:
                    pass
        except Exception:
            # Any read/parse failure means "not valid"; KeyboardInterrupt and
            # SystemExit are deliberately allowed to propagate.
            return False
        return True
Esempio n. 29
0
    def __compute_scan_types_and_indices(self, filename=None):
        """
        Internal helper function used to compute a list of unique scan types in the mzml file.

        The scan type of a spectrum is the ``'filter string'`` of its first scan
        entry. Spectra without that entry are reported via ``log_helper.debug``
        and contribute no index.

        :param filename: Path of the mzML file to read.

        :returns: Tuple ``(scantypes, scan_indices, scan_profiled)``:
            the unique filter strings in order of first appearance, for each
            spectrum the index of its scan type in ``scantypes``, and for each
            scan type whether its first spectrum had a ``'profile spectrum'`` key.
        """
        scantypes = []
        scan_indices = []
        scan_profiled = []
        with mzml.read(filename) as reader:
            for idx, spectrum in enumerate(reader):
                try:
                    scanfilter = spectrum['scanList']['scan'][0]['filter string']
                except (KeyError, IndexError, TypeError):
                    log_helper.debug(__name__, idx)
                    continue
                if scanfilter not in scantypes:
                    scantypes.append(scanfilter)
                    # `in` replaces dict.has_key(), which no longer exists in
                    # Python 3 and previously failed inside the bare except.
                    scan_profiled.append('profile spectrum' in spectrum)
                scan_indices.append(scantypes.index(scanfilter))

        assert len(scan_indices) == self.num_scans
        return scantypes, scan_indices, scan_profiled
Esempio n. 30
0
    def __compute_coordinates(self, filename, num_scans):
        """
        Internal helper function used to compute the coordinates for each scan.

        The file is scanned as plain text. Every line that contains
        ``location="`` and a token of the form ``_NNx_NNy_`` contributes one
        (x, y) pair; pairs are stored in the order the lines appear. Rows for
        which no such line is found stay zero.

        :param filename: Path of the file to scan.
        :param num_scans: Number of rows to allocate in the result.

        :returns: 2D numpy integer array of shape (numScans,2) indicating for each scan its x and y coordinate
        """
        coord_pattern = re.compile(r'_([0-9]+)x_([0-9]+)y_')
        coords = np.zeros(shape=(num_scans, 2), dtype='uint32')
        spectrumid = 0
        # The mzML reader that used to be opened here was never used (nor
        # closed); the coordinates come from the raw text only.
        with open(filename, 'r') as origin_file:
            for line in origin_file:
                if 'location="' not in line:
                    continue
                match = coord_pattern.search(line)
                if match is None:
                    continue
                coords[spectrumid, 0] = int(match.group(1))
                coords[spectrumid, 1] = int(match.group(2))
                spectrumid += 1
        return coords
Esempio n. 31
0
    def __read_all(self):
        """
        Internal helper function used to read all spectra into one datacube.

        Allocates a zero array of shape ``self.shape``, interpolates every
        spectrum of ``self.basename`` onto an m/z axis taken from
        ``self.mz_all`` and stores it at the position given by
        ``self.coordinates``. The array is returned; ``self.data`` is not
        assigned here.

        :returns: The filled numpy array.
        """

        # NOTE(review): an earlier per-scan-type variant (filtering on the
        # 'filter string', re-histogramming centroided data, looking positions
        # up through self.x_pos / self.y_pos, progress logging) is disabled in
        # this version.
        data = np.zeros(self.shape)
        reader = mzml.read(self.basename)
        spectrumid = 0

        for spectrum in reader:
            x = spectrum['m/z array']
            y = spectrum['intensity array']
            # NOTE(review): `scan_idx` is not a parameter or local of this
            # method; unless it exists at module level this line raises
            # NameError. The intended index into self.mz_all must be confirmed.
            yi = np.interp(self.mz_all[scan_idx], x, y, 0, 0)  # Re-interpolate the data onto the m/z axis; values outside the spectrum's range become 0
            # The raw coordinate values are used directly as array indices
            data[self.coordinates[spectrumid, 0], self.coordinates[spectrumid, 1], :] = yi
            # TODO Note if the data is expected to be of float precision then self.data_type needs to be set accordingly
            spectrumid += 1
        return data
Esempio n. 32
0
    def __compute_coordinates(self):
        """
        Internal helper function used to compute the coordinates for each scan.

        For the 'thermo' type the pair is parsed from the last comma-separated
        field of each spectrum's ``'spotID'``, split on ``'x'``. For the
        'bruker' type it is taken from the third and fourth integer found in
        the part of the spectrum ``'id'`` after the first ``'_x002f_'``. For
        any other type the array is returned filled with zeros.

        :returns: 2D numpy integer array of shape (numScans,2) indicating for each scan its x and y coordinate
        """
        coords = np.zeros(shape=(self.num_scans, 2), dtype='uint32')
        with mzml.read(self.basename) as reader:
            if self.mzml_type == self.available_mzml_types['thermo']:
                for spectrumid, spectrum in enumerate(reader):
                    xy_text = spectrum['spotID'].split(',')[-1].split('x')
                    # A list is required: numpy cannot assign from the lazy
                    # iterator that map() returns in Python 3.
                    coords[spectrumid, :] = [int(value) for value in xy_text]
            elif self.mzml_type == self.available_mzml_types['bruker']:
                for spectrumid, spectrum in enumerate(reader):
                    spotdesc = spectrum['id'].split('_x002f_')[1]
                    numbers = re.findall(r'\d+', spotdesc)
                    coords[spectrumid, 0] = int(numbers[2])
                    coords[spectrumid, 1] = int(numbers[3])
        return coords
Esempio n. 33
0
    def is_valid_dataset(cls, name):
        """Check whether the given file or directory points to readable mzML data.

           A directory is valid when ``cls.get_files_from_dir`` finds at least
           one file in it. A file is valid when it can be opened with
           ``mzml.read`` and iterated to the end without an error.

           :param name: Name of the dir or file.
           :type name: String

           :returns: Boolean indicating whether the given file or folder is valid.
        """
        if os.path.isdir(name):
            # If we point to a directory, check if the dir contains an mzML file
            return len(cls.get_files_from_dir(name)) > 0
        try:
            # Try to open the file and iterate over it; the reader is closed
            # again whether or not the iteration succeeds.
            with mzml.read(name) as reader:
                for _ in reader:
                    pass
        except Exception:
            # Any read/parse failure means "not valid"; KeyboardInterrupt and
            # SystemExit are deliberately allowed to propagate.
            return False
        return True
Esempio n. 34
0
def average_ms1(input_filename,
                output_filename=None,
                bin_width=1.0,
                format="csv"):
    """Pool the peaks of all non-MS2 spectra of a file into one binned vector.

    The reader is chosen from the file extension (``.mzXML`` or ``.mzML``,
    compared case-insensitively). Peaks of every spectrum whose MS level is
    not 2 are collected and passed to ``vectorize_peaks(peaks, 2000,
    bin_width)``. A spectrum that declares no MS level is treated as non-MS2.

    :param input_filename: Path of the mzML / mzXML file.
    :param output_filename: If not None, the vector is additionally appended
        to this file as CSV.
    :param bin_width: Bin width forwarded to ``vectorize_peaks``.
    :param format: Currently unused; the output is always written as CSV.
    :returns: The value returned by ``vectorize_peaks``.
    :raises ValueError: If the file extension is neither mzXML nor mzML.
    """
    extension = os.path.splitext(input_filename)[1].lower()
    if extension == ".mzxml":
        spectra = mzxml.read(input_filename, read_schema=True)
    elif extension == ".mzml":
        spectra = mzml.read(input_filename, read_schema=True)
    else:
        raise ValueError(
            "Unsupported file extension %r; expected .mzXML or .mzML"
            % extension)

    peaks_list = []
    with spectra:
        for element in spectra:
            # mzML records use "ms level", mzXML records use "msLevel"; look
            # the level up per record so a previous value is never reused.
            mslevel = element.get("ms level", element.get("msLevel"))
            if mslevel == 2:
                continue
            peaks_list.extend(
                zip(element['m/z array'], element['intensity array']))

    numpy_vector = vectorize_peaks(peaks_list, 2000, bin_width)

    if output_filename is not None:
        dt = pd.DataFrame(data=numpy_vector)
        dt.to_csv(output_filename, mode='a', index=True)

    return numpy_vector
Esempio n. 35
0
    def __compute_coordinates(self):
        """
        Internal helper function used to compute the coordinates for each scan.

        For the 'thermo' type the pair is parsed from the last comma-separated
        field of each spectrum's ``'spotID'``, split on ``'x'``. For the
        'bruker' type it is taken from the third and fourth integer found in
        the part of the spectrum ``'id'`` after the first ``'_x002f_'``. For
        any other type the array is returned filled with zeros.

        :returns: 2D numpy integer array of shape (numScans,2) indicating for each scan its x and y coordinate
        """
        coords = np.zeros(shape=(self.num_scans, 2), dtype='uint32')
        with mzml.read(self.basename) as reader:
            if self.mzml_type == self.available_mzml_types['thermo']:
                for spectrumid, spectrum in enumerate(reader):
                    xy_text = spectrum['spotID'].split(',')[-1].split('x')
                    # A list is required: numpy cannot assign from the lazy
                    # iterator that map() returns in Python 3.
                    coords[spectrumid, :] = [int(value) for value in xy_text]
            elif self.mzml_type == self.available_mzml_types['bruker']:
                for spectrumid, spectrum in enumerate(reader):
                    spotdesc = spectrum['id'].split('_x002f_')[1]
                    numbers = re.findall(r'\d+', spotdesc)
                    coords[spectrumid, 0] = int(numbers[2])
                    coords[spectrumid, 1] = int(numbers[3])
        return coords
Esempio n. 36
0
 def __compute_num_scans(filename=None):
     """
     Internal helper function used to compute the number of scans in the mzml file.

     The file is iterated once from start to end and the reader is closed
     afterwards.

     :param filename: Path of the mzML file.

     :returns: Number of spectra yielded by the reader.
     """
     with mzml.read(filename) as reader:
         return sum(1 for _ in reader)
Esempio n. 37
0
def ingest_mzML(input_filename):
    """Ingest an mzML file and return a dataframe with one row per spectrum.

    For every spectrum the following fields are copied when present, in this
    column order: "highest observed m/z", "ms level", "total ion current",
    "lowest observed m/z", then "File", "controllerType", "controllerNumber"
    and "scan" (all four parsed from "spectrum title"), then
    "base peak intensity" and "base peak m/z".

    A spectrum title is expected to look like::

        exp1720-04-ds259269.3.3. File:"exp1720-04-ds259269.raw", NativeID:"controllerType=0 controllerNumber=1 scan=3"

    Rows are built per field name, so a spectrum that lacks a field gets a
    missing value in that column instead of shifting its other values. A
    field first seen in a later spectrum is added as a new last column.

    :param input_filename: Path of the mzML file.
    :returns: pandas DataFrame with the columns described above.
    """
    # (field name, converter) pairs copied before / after the title fields.
    leading_fields = (
        ("highest observed m/z", float),
        ("ms level", int),
        ("total ion current", float),
        ("lowest observed m/z", float),
    )
    trailing_fields = (
        ("base peak intensity", float),
        ("base peak m/z", float),
    )
    title_field = "spectrum title"

    columns = []  # column names in order of first appearance
    records = []
    with mzml.read(input_filename) as reader:
        for item in reader:
            record = {}
            for col, convert in leading_fields:
                if col in item:
                    record[col] = convert(item[col])
            if title_field in item:
                title = item[title_field]
                record["File"] = str(
                    title.split("File:\"")[1].split("\",")[0])
                record["controllerType"] = int(
                    title.split("controllerType=")[1].split(" ")[0])
                record["controllerNumber"] = int(
                    title.split("controllerNumber=")[1].split(" ")[0])
                record["scan"] = int(
                    title.split("scan=")[1].split("\"")[0])
            for col, convert in trailing_fields:
                if col in item:
                    record[col] = convert(item[col])

            for col in record:
                if col not in columns:
                    columns.append(col)
            records.append(record)

    return pd.DataFrame(records, columns=columns)
def pick_peaks(mzml_file):
    """Run ``process_mzml_scan`` on the first scan of an mzML file.

    :param mzml_file: Path of the mzML file.
    :returns: Result of ``process_mzml_scan`` called with
        ``savgol_window_length=7``.
    :raises StopIteration: If the file contains no scan.
    """
    # The built-in next() replaces the Python-2 style .next() method, and the
    # context manager closes the file once the first scan has been read.
    with mzml.read(mzml_file) as reader:
        first_scan = next(iter(reader))
    return process_mzml_scan(first_scan, savgol_window_length=7)
def load_mzml_file(filename, drop_ms1=False):
    """Convert the MS1 and MS2 spectra of an mzML file into ``Spectrum`` objects.

    The scan number is parsed from the ``scan=`` token of the spectrum id
    (-1 when absent). MS1 spectra are skipped when ``drop_ms1`` is set. For
    MS2 spectra the first precursor supplies collision energy, precursor m/z
    and, when available, precursor intensity and charge (0 otherwise); the
    fragmentation method is "HCD" when the activation lists beam-type CID and
    "NO_FRAG" otherwise. Spectra of other MS levels are ignored.

    :param filename: Path of the mzML file.
    :param drop_ms1: When False (default) MS1 spectra are included.
    :returns: List of ``Spectrum`` objects in file order.
    """
    output_ms1 = []
    output_ms2 = []

    for spectrum in pyteomicsmzml.read(filename):
        ms_level = spectrum["ms level"]
        index = int(spectrum["index"])

        # [m/z, intensity] pairs as plain floats
        peaks = [
            [float(mz), float(spectrum["intensity array"][position])]
            for position, mz in enumerate(spectrum["m/z array"])
        ]

        # Determining scan: the last "scan=" token of the id wins
        scan = -1
        for token in spectrum["id"].split(" "):
            if "scan=" in token:
                scan = int(token.replace("scan=", ""))

        if ms_level == 1 and drop_ms1 == False:
            output_ms1.append(
                Spectrum(filename, scan, index, peaks, 0, 0, ms_level))

        if ms_level == 2:
            precursor = spectrum["precursorList"]["precursor"][0]
            activation = precursor["activation"]
            collision_energy = float(activation["collision energy"])

            first_ion = precursor["selectedIonList"]["selectedIon"][0]
            precursor_mz = float(first_ion["selected ion m/z"])

            # Intensity and charge are optional; the catch-all handlers are
            # kept as they were so every failure still falls back to 0.
            try:
                precursor_intensity = float(first_ion["peak intensity"])
            except:
                precursor_intensity = 0

            try:
                precursor_charge = int(first_ion["charge state"])
            except:
                precursor_charge = 0

            totIonCurrent = float(spectrum["total ion current"])

            fragmentation_method = "NO_FRAG"
            try:
                for activation_key in activation:
                    if activation_key == "beam-type collision-induced dissociation":
                        fragmentation_method = "HCD"
            except:
                fragmentation_method = "NO_FRAG"

            # NOTE: MS2 spectra go into output_ms1 as well, which keeps the
            # returned list in file order; output_ms2 stays empty.
            output_ms1.append(
                Spectrum(
                    filename,
                    scan,
                    index,
                    peaks,
                    precursor_mz,
                    precursor_charge,
                    ms_level,
                    collision_energy=collision_energy,
                    fragmentation_method=fragmentation_method,
                    precursor_intensity=precursor_intensity,
                    totIonCurrent=totIonCurrent
                ))

    return output_ms1 + output_ms2
Esempio n. 40
0
    def size(cls, name, max_num_reads=1000):
        """
        Classmethod used to check the estimated size for the given file/folder.
        For mzml this is an estimate of the final size of the full 3D datacube.

        The number of scans is taken from the ``spectrumList count="..."``
        attribute when it appears in the first 120 lines of the file.
        Otherwise it is estimated from the file size and the file offsets
        observed while reading up to ``max_num_reads`` spectra.

        :param name: Name of the dir or file.
        :type name: unicode
        :param max_num_reads: The maximum number of spectrum reads to be performed to estimate the file size
        :type max_num_reads: int

        :returns: ``num_scans * len(mz axis)`` as computed below (a number of
            values; no item size is applied here), or None if no file is found.
        """
        basename = None
        if os.path.isdir(name):
            # If we point to a directory, check if the dir contains an mzML file
            filelist = cls.get_files_from_dir(name)
            if len(filelist) > 0:
                basename = filelist[0]
        else:
            basename = name
        if basename is None:
            return None

        num_scans = -1
        # Try to compute the number of scans by looking at the spectrumList
        # count entry in the file. The header is read directly instead of
        # through a `head | grep` shell pipeline, so file names containing
        # quotes or shell characters are handled and no POSIX tools are needed.
        marker = b'spectrumList count='
        try:
            with open(basename, 'rb') as header:
                for line_number, line in enumerate(header):
                    if line_number >= 120:
                        break
                    if marker in line:
                        size_text = line.split(marker)[1].split(b'"')[1]
                        if size_text.isdigit():
                            num_scans = int(size_text)
                        break
        except (IOError, OSError, IndexError):
            # Best effort only: fall through to the estimate below
            pass

        if num_scans < 0:
            # Estimate the number of scans by reading the first spectra
            prev_tell = 0
            sizes = []
            with mzml.read(basename) as reader:
                for index, _ in enumerate(reader):
                    if index >= max_num_reads:
                        break
                    current_tell = reader.file.file.tell()
                    sizes.append(current_tell - prev_tell)
                    prev_tell = current_tell
            npsizes = np.asarray(sizes)
            filesize = os.stat(basename).st_size
            # NOTE(review): this is half the range of the observed sizes, not
            # a mean or mid-range, and is 0 when all sizes are equal --
            # confirm the intended estimator.
            scansize = (npsizes.max() - npsizes.min()) / 2.
            num_scans = int(filesize / scansize)
        mz_axis_len = cls.__compute_mz_axis(
            filename=basename,
            mzml_filetype=cls.__compute_filetype(filename=basename),
            scan_types=cls.__compute_scan_types(
                filename=basename)).shape[0]
        return num_scans * mz_axis_len
Esempio n. 41
0
 def yield_spectrum(mzml_path):
     """Lazily yield every spectrum read from the mzML file at ``mzml_path``.

     The reader is closed once iteration finishes or the generator is closed.
     """
     with mzml.read(mzml_path) as reader:
         for spectrum in reader:
             yield spectrum
Esempio n. 42
0
    def size(cls, name, max_num_reads=1000):
        """
        Classmethod used to check the estimated size for the given file/folder.
        For mzml this is an estimate of the final size of the full 3D datacube.

        The number of scans is taken from the ``spectrumList count="..."``
        attribute when it appears in the first 120 lines of the file.
        Otherwise it is estimated from the file size and the file offsets
        observed while reading up to ``max_num_reads`` spectra.

        :param name: Name of the dir or file.
        :type name: unicode
        :param max_num_reads: The maximum number of spectrum reads to be performed to estimate the file size
        :type max_num_reads: int

        :returns: ``num_scans * len(mz axis)`` as computed below (a number of
            values; no item size is applied here), or None if no file is found.
        """
        basename = None
        if os.path.isdir(name):
            # If we point to a directory, check if the dir contains an mzML file
            filelist = cls.get_files_from_dir(name)
            if len(filelist) > 0:
                basename = filelist[0]
        else:
            basename = name
        if basename is None:
            return None

        num_scans = -1
        # Try to compute the number of scans by looking at the spectrumList
        # count entry in the file. The header is read directly instead of
        # through a `head | grep` shell pipeline, so file names containing
        # quotes or shell characters are handled and no POSIX tools are needed.
        marker = b'spectrumList count='
        try:
            with open(basename, 'rb') as header:
                for line_number, line in enumerate(header):
                    if line_number >= 120:
                        break
                    if marker in line:
                        size_text = line.split(marker)[1].split(b'"')[1]
                        if size_text.isdigit():
                            num_scans = int(size_text)
                        break
        except (IOError, OSError, IndexError):
            # Best effort only: fall through to the estimate below
            pass

        if num_scans < 0:
            # Estimate the number of scans by reading the first spectra
            prev_tell = 0
            sizes = []
            with mzml.read(basename) as reader:
                for index, _ in enumerate(reader):
                    if index >= max_num_reads:
                        break
                    current_tell = reader.file.file.tell()
                    sizes.append(current_tell - prev_tell)
                    prev_tell = current_tell
            npsizes = np.asarray(sizes)
            filesize = os.stat(basename).st_size
            # NOTE(review): this is half the range of the observed sizes, not
            # a mean or mid-range, and is 0 when all sizes are equal --
            # confirm the intended estimator.
            scansize = (npsizes.max() - npsizes.min()) / 2.
            num_scans = int(filesize/scansize)
        mz_axis_len = cls.__compute_mz_axis(filename=basename,
                                            mzml_filetype=cls.__compute_filetype(filename=basename),
                                            scan_types=cls.__compute_scan_types(filename=basename)).shape[0]
        return num_scans*mz_axis_len
Esempio n. 43
0
 def __compute_num_scans(filename=None):
     """
     Internal helper function used to compute the number of scans in the mzml file.

     The file is iterated once from start to end and the reader is closed
     afterwards.

     :param filename: Path of the mzML file.

     :returns: Number of spectra yielded by the reader.
     """
     with mzml.read(filename) as reader:
         return sum(1 for _ in reader)
def _read_optional(mapping, key, cast, default=0):
    """Return ``cast(mapping[key])``, or *default* if the entry is missing or not convertible."""
    try:
        return cast(mapping[key])
    except (KeyError, TypeError, ValueError):
        return default


def load_mzml_file(filename, drop_ms1=False):
    """
    Load the MS1 and MS2 spectra of an mzML file as ``Spectrum`` objects.

    Spectra are returned in the order in which the reader yields them.
    MS1 spectra are created with a precursor m/z and charge of 0. MS2 spectra
    take their precursor information from the first precursor and its first
    selected ion; a missing peak intensity, charge state or total ion current
    falls back to 0. Spectra with any other MS level are skipped.

    :param filename: Path handed to ``pyteomicsmzml.read``; it is also stored
        in every ``Spectrum`` that is created.
    :param drop_ms1: When true, MS1 spectra are left out of the result.

    :returns: List of ``Spectrum`` objects.

    :raises: Lookups of mandatory entries (``ms level``, ``index``, ``id``,
        the peak arrays and, for MS2, the precursor list, collision energy
        and selected ion m/z) are not guarded, so their errors propagate
        (a ``KeyError`` for dict-like spectra).
    """
    spectra = []

    # The context manager closes the reader (and its file handle) even if
    # parsing one of the spectra raises.
    with pyteomicsmzml.read(filename) as reader:
        for spectrum in reader:
            ms_level = spectrum["ms level"]
            index = int(spectrum["index"])
            peaks = [
                [float(mz), float(intensity)]
                for mz, intensity in zip(spectrum["m/z array"],
                                         spectrum["intensity array"])
            ]

            # The scan number is taken from a "scan=<n>" token of the
            # space-separated id; -1 marks ids without such a token. If
            # several tokens match, the last one wins.
            scan = -1
            for token in spectrum["id"].split(" "):
                if "scan=" in token:
                    scan = int(token.replace("scan=", ""))

            if ms_level == 1:
                if not drop_ms1:
                    spectra.append(
                        Spectrum(filename, scan, index, peaks, 0, 0, ms_level))
            elif ms_level == 2:
                precursor = spectrum["precursorList"]["precursor"][0]
                activation = precursor["activation"]
                collision_energy = float(activation["collision energy"])

                selected_ion = precursor["selectedIonList"]["selectedIon"][0]
                precursor_mz = float(selected_ion["selected ion m/z"])
                precursor_intensity = _read_optional(
                    selected_ion, "peak intensity", float)
                precursor_charge = _read_optional(
                    selected_ion, "charge state", int)
                totIonCurrent = _read_optional(
                    spectrum, "total ion current", float)

                # Only this one activation term is mapped to a method name;
                # everything else is reported as "NO_FRAG".
                if "beam-type collision-induced dissociation" in activation:
                    fragmentation_method = "HCD"
                else:
                    fragmentation_method = "NO_FRAG"

                spectra.append(
                    Spectrum(
                        filename,
                        scan,
                        index,
                        peaks,
                        precursor_mz,
                        precursor_charge,
                        ms_level,
                        collision_energy=collision_energy,
                        fragmentation_method=fragmentation_method,
                        precursor_intensity=precursor_intensity,
                        totIonCurrent=totIonCurrent
                    ))

    return spectra
Esempio n. 45
0
    4.69519356e+03, 1.55343822e+04, 5.45621612e+03, 5.53939031e+03,
    9.49732490e+03, 8.05000735e+03, 2.65457068e+03, 1.36766228e+04,
    2.69348480e+03, 6.71802368e+03, 4.46828571e+02, 1.39065143e+04,
    4.29267365e+03, 2.73782365e+03, 1.35373492e+03, 1.17601397e+03
]

# Third positional argument of write_spectrum below.
# NOTE(review): presumably one charge per peak of mz_array — confirm.
charge_array = [
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -3, -2, -2, -3,
    -2, -2, -2, -3, -3, -2, -2, -3, -3, -2, -3, -3, -2, -3, -3, -3, -3, -3, -5,
    -4, -3, -6
]

# Write a single MS level 1, negative-polarity spectrum to `path`.
f = writer.MzMLWriter(open(path, 'wb'))

with f:
    f.controlled_vocabularies()
    with f.element('run'):
        f.write_spectrum(mz_array,
                         intensity_array,
                         charge_array,
                         id='scanId=1',
                         params=[{
                             "name": "ms level",
                             "value": 1
                         }],
                         polarity='negative scan')

# Round-trip check: read the first spectrum back and compare its m/z values
# with the ones that were written. The reader is used as a context manager so
# its file handle is closed once the spectrum has been fetched.
with mzml.read(path) as reader:
    spec = next(reader)
assert (all(np.abs(spec['m/z array'] - mz_array) < 1e-4))