Example #1
0
 def data(self):
     """Parse the delimited text file at ``self.filename``.

     The first line is a header: every field after the first is converted
     to a float and used as the ion list.  Each later line is parsed as a
     row of numbers; column 0 is passed as the second ``Chromatogram``
     argument and the remaining columns as the data array.

     Returns:
         The parsed ``Chromatogram``, or an empty ``Chromatogram()`` when
         the file cannot be opened or parsed.
     """
     delim = ','
     try:  # TODO: better, smarter error checking than this
         with open(self.filename, 'r') as f:
             lns = f.readlines()
         # the file is already closed here; only parsing remains
         ions = [float(i) for i in lns[0].split(delim)[1:]]
         data = np.array(
             [np.fromstring(ln, sep=delim) for ln in lns[1:]])
         return Chromatogram(data[:, 1:], data[:, 0], ions)
     except Exception:
         # Best-effort reader: any read/parse failure yields an empty
         # result, but KeyboardInterrupt/SystemExit are not swallowed.
         return Chromatogram()
Example #2
0
    def data(self):
        """Read the index file and its companion ``.DAT`` file.

        ``self.filename`` is the index file; the data file has the same
        base name with a ``.DAT`` extension.  The data file is first scanned
        for every distinct ion value (every second little-endian uint16,
        starting at byte offset 2).  Each 22-byte index record then gives
        the offset (field 0) and byte length (field 1) of one scan in the
        data file and the scan time (field 4).

        Returns:
            A ``Chromatogram`` with one row per scan and one column per
            distinct ion, in sorted ion order.
        """
        dat_name = op.splitext(self.filename)[0] + '.DAT'
        with open(self.filename, 'rb') as fidx, \
                open(dat_name, 'rb') as fdat:
            # pass 1: collect every ion present anywhere in the data file
            ions = set()
            while True:
                fdat.seek(fdat.tell() + 2)
                try:
                    ions.add(struct.unpack('<H', fdat.read(2))[0])
                except struct.error:
                    # short read: end of file reached
                    break
            ions = sorted(ions)
            # ion -> column index, replaces a linear search per point
            col = {ion: n for n, ion in enumerate(ions)}

            # pass 2: walk the index records and load each scan
            data = []
            tme = []
            while True:
                try:
                    idx = struct.unpack('<IHHffhhh', fidx.read(22))
                    fdat.seek(idx[0])
                    d = struct.unpack('<' + (idx[1] // 4) * 'HH',
                                      fdat.read(idx[1]))
                except struct.error:
                    # end of the index file, or a truncated scan
                    break
                new_line = np.zeros(len(ions))
                # each pair is (abundance, ion)
                for i, v in zip(d[1::2], d[0::2]):
                    new_line[col[i]] = v
                # only record the time once the scan was fully parsed, so
                # times and data always have the same number of rows
                tme.append(idx[4])
                data.append(new_line)
        return Chromatogram(np.array(data), np.array(tme), ions)
Example #3
0
def read_amdis_list(db, filename):
    """Attach peaks from a tab-separated AMDIS report to files in ``db``.

    The first line of ``filename`` holds the column names; the columns
    ``FileName``, ``Name``, ``RT`` and ``Area`` are read from every
    following line.  The last ``cmp_lvl`` path components of ``FileName``
    (extension removed) are matched as a substring against the ``rawdata``
    path of each ``'file'`` child of ``db``; rows without a match are
    skipped.  Matched rows become ``Peak`` objects that are appended to the
    matching entry's ``children`` inside a ``with db:`` block.

    Args:
        db: object providing ``children_of_type('file')`` and usable as a
            context manager.
        filename: path of the AMDIS report.

    Raises:
        ValueError: if a required column is missing from the header.
    """

    def get_val(fields, cols, key):
        return fields[cols.index(key)]

    cmp_lvl = 2  # number of directory levels to compare
    # TODO: does this work for agilent files only?
    mapping = defaultdict(list)
    with open(filename, 'r') as f:
        # strip the line ending so the last column keeps its real name
        cols = f.readline().rstrip('\r\n').split('\t')
        for line in f:
            if not line.strip():
                # blank line: nothing to import
                continue
            fields = line.rstrip('\r\n').split('\t')
            raw_name = get_val(fields, cols, 'FileName')
            fn = op.splitext('/'.join(raw_name.split('\\')[-cmp_lvl:]))[0]
            # find if filtered filename overlaps with anything in the db
            for dt in db.children_of_type('file'):
                if fn in '/'.join(dt.rawdata.split(op.sep)):
                    break
            else:
                continue
            info = {
                'name': get_val(fields, cols, 'Name'),
                'p-s-time': get_val(fields, cols, 'RT'),
                'p-s-area': get_val(fields, cols, 'Area'),
            }
            # placeholder trace: the report carries no peak shape
            ts = Chromatogram(np.array([np.nan]), np.array([np.nan]), [''])
            mapping[dt].append(Peak(info, ts))
    with db:
        for dt in mapping:
            dt.children += mapping[dt]
Example #4
0
    def data(self):
        """Read the ``.sd`` header / ``.sp`` data file pair.

        Both paths are built by dropping the last three characters of
        ``self.filename`` and appending the extension.  The header file
        holds the scan count at offset 0x50 and one 80-byte record per scan
        starting at 0xA4; field 1 of a record is the scan time, field 5 the
        offset of the scan in the data file (16 more bytes are skipped
        there) and field 7 the number of points.  The ion axis is built
        from fields 8 (start) and 3 (step) of the first record only.

        Returns:
            A ``Chromatogram`` of shape (scans, points).  A file with zero
            scans yields empty arrays instead of raising.
        """
        base = self.filename[:-3]
        with open(base + '.sd', 'rb') as fhead, \
                open(base + '.sp', 'rb') as fdata:
            fhead.seek(0x50)
            # explicit little-endian like every other read in this method
            nscans = struct.unpack('<Q', fhead.read(8))[0]
            fhead.seek(0xA4)

            ions = []
            times = np.zeros(nscans)
            data = np.zeros((nscans, 0))
            for scn in range(nscans):
                t = struct.unpack('<IdddIQIIdddd', fhead.read(80))

                npts = t[7]
                if scn == 0:
                    # TODO: this will fail if the wavelengths collected
                    # change through the run.
                    data = np.zeros((nscans, npts))
                    ions = [t[8] + x * t[3] for x in range(npts)]

                times[scn] = t[1]

                fdata.seek(t[5] + 16)
                # TODO: use np.fromfile?
                data[scn] = struct.unpack('<' + npts * 'd',
                                          fdata.read(npts * 8))
        # both files are closed by the ``with`` block before returning
        return Chromatogram(data, times, ions)
Example #5
0
    def data(self):
        """Combine every ``.CH`` trace next to ``self.filename``.

        Because the spectra are stored in several files in the same
        directory, each file whose name ends in ``.CH`` (case-insensitive)
        is read with ``self._read_ind_file`` and becomes one column.  Files
        for which that helper returns a ``None`` wavelength are skipped.
        The time axis and y units are taken from the header of the first
        usable file.

        Returns:
            A ``Chromatogram`` with one column per trace, or an empty
            ``Chromatogram()`` when no usable trace is found.
        """
        ions = []
        dtraces = []
        times = None
        yunits = None
        # a bare file name has an empty dirname; list the current directory
        foldname = os.path.dirname(self.filename) or os.curdir
        for name in os.listdir(foldname):
            path = os.path.join(foldname, name)
            if path[-3:].upper() != '.CH':
                continue
            wv, dtrace = self._read_ind_file(path)
            if wv is None:
                continue

            # generate the time points if this is the first trace
            if not ions:
                with open(path, 'rb') as f:
                    f.seek(0x244)
                    # length-prefixed string: one size byte, then the text
                    ulen = struct.unpack('>B', f.read(1))[0]
                    yunits = f.read(ulen).decode()
                    f.seek(0x11A)
                    # presumably milliseconds -> minutes; TODO confirm
                    st_t = struct.unpack('>i', f.read(4))[0] / 60000.
                    en_t = struct.unpack('>i', f.read(4))[0] / 60000.
                times = np.linspace(st_t, en_t, len(dtrace))

            # add the wavelength and trace into the appropriate places
            ions.append(float(wv))
            dtraces.append(dtrace)

        if not dtraces:
            # nothing readable in the folder
            return Chromatogram()
        data = np.array(dtraces).transpose()
        return Chromatogram(data, times, ions, yunits=yunits)
Example #6
0
    def data(self):
        """Read the raw scan table that follows ``CRawDataScanStorage``.

        The file is searched for the first occurrence of the marker.  The
        scan count is a little-endian uint16 located 62 bytes after the
        marker; 35 bytes later the records start, each one a float32 time
        followed by one float64 per ion.

        Returns:
            A ``Chromatogram`` with the times divided by 60, or ``None``
            when the marker is not present in the file.
        """
        marker = b'CRawDataScanStorage'
        # TODO: this shouldn't be hardcoded
        ions = [44, 45, 46]
        ni = len(ions)
        rec = struct.Struct('<f' + ni * 'd')

        with open(self.filename, 'rb') as f:
            # one in-memory search instead of a seek/read pair per byte
            start = f.read().find(marker)
            if start < 0:
                return

            f.seek(start + len(marker) + 62)
            nscans = struct.unpack('<H', f.read(2))[0]

            f.seek(f.tell() + 35)
            rows = [rec.unpack(f.read(rec.size)) for _ in range(nscans)]

        # reshape keeps the array 2-D even when there are no scans
        data = np.array(rows, dtype=float).reshape(-1, ni + 1)
        data[:, 0] /= 60.  # convert time to minutes
        return Chromatogram(data[:, 1:], data[:, 0], ions)
Example #7
0
    def data(self):
        """Read all scans into a sparse ``Chromatogram``.

        Layout as read here (native ``struct`` formats): two ints, the
        second being the scan count; one double per scan for the times; a
        4-byte field that is skipped; then, per scan, an int point count,
        that many floats of ion values, a 4-byte field that is skipped, and
        that many floats of abundances.

        Ion values are truncated with ``int`` and every distinct value gets
        its own column, allocated scan by scan as new values appear.

        Returns:
            A ``Chromatogram`` wrapping a ``scipy.sparse.csr_matrix`` of
            shape (scans, ions), or ``None`` when the file has no scans (an
            empty ``Trace`` is then assigned to ``self.data``).
        """
        # convenience function for reading in data
        def rd(f, st):
            return struct.unpack(st, f.read(struct.calcsize(st)))

        with open(self.filename, 'rb') as f:
            nscans = rd(f, 'ii')[1]
            if nscans == 0:
                self.data = Trace(np.array([]), np.array([]), [])
                return
            # presumably seconds -> minutes; TODO confirm
            times = np.array(rd(f, nscans * 'd')) / 60.0
            f.seek(f.tell() + 4)  # number of scans again

            # row pointer array of the CSR matrix
            indptr = np.empty(nscans + 1, dtype=int)
            indptr[0] = 0

            # first pass: figure out the total number of points
            dpos = f.tell()
            tot_pts = 0
            for scn in range(nscans):
                npts = rd(f, 'i')[0]
                # skip npts floats, one int and npts floats
                f.seek(f.tell() + 8 * npts + 4)
                tot_pts += npts
                indptr[scn + 1] = tot_pts
            f.seek(dpos)

            ions = []
            i_lkup = {}  # ion value -> column index
            idxs = np.empty(tot_pts, dtype=int)
            vals = np.empty(tot_pts, dtype=float)

            # second pass: load ions and abundances of every scan
            for scn in range(nscans):
                npts = rd(f, 'i')[0]
                rd_ions = [int(i) for i in rd(f, npts * 'f')]
                f.seek(f.tell() + 4)  # number of points again
                abun = rd(f, npts * 'f')

                # give each ion not seen before the next free column
                nions = set([i for i in rd_ions if i not in i_lkup])
                for ion in nions:
                    i_lkup[ion] = len(ions)
                    ions.append(ion)

                idxs[indptr[scn]:indptr[scn + 1]] = \
                    [i_lkup[i] for i in rd_ions]
                vals[indptr[scn]:indptr[scn + 1]] = abun

        data = scipy.sparse.csr_matrix((vals, idxs, indptr),
                                       shape=(nscans, len(ions)),
                                       dtype=float)
        return Chromatogram(data, times, ions)
Example #8
0
    def data(self):
        """Read the scan records that follow the ``HapsScan`` marker.

        ``find_offset`` locates the marker; the length of the data section
        is read 20 bytes before it and the first record starts 56 bytes
        after it.  Every record has a 16-byte header carrying the time, the
        number of abundance values and a segment id, followed by that many
        float32 abundances.  Whenever the segment id changes, the next list
        of m/z values is pulled from ``self._ions(f)`` and merged into the
        overall m/z list.

        Returns:
            A ``Chromatogram`` of shape (scans, mzs), or ``None`` when the
            marker cannot be found.
        """
        # TODO: handle skip mass ranges
        marker = 4 * b'\xff' + 'HapsScan'.encode('ascii')
        scan_times, rows, mzs = [], [], []

        with open(self.filename, 'rb') as f:
            # locate the data section
            doff = find_offset(f, marker)
            if doff is None:
                return
            f.seek(doff - 20)
            data_end = doff + struct.unpack('<I', f.read(4))[0] + 55

            f.seek(doff + 56)
            last_seg = None
            mz_reader = self._ions(f)
            while f.tell() <= data_end:
                # record info looks like a standard format
                header = struct.unpack('<IiHHHH', f.read(16))
                t, recs, seg = header[1], header[3], header[5]
                if seg != last_seg:
                    # new segment: fetch its m/z list and extend ours
                    try:
                        seg_mzs = next(mz_reader)
                    except StopIteration:
                        break
                    mzs += set(seg_mzs).difference(mzs)
                    mz_idx = [mzs.index(i) for i in seg_mzs]
                last_seg = seg

                scan_times.append(t)

                # spread this scan's abundances over the known m/z columns
                cur = struct.unpack('<' + 'f' * recs, f.read(4 * recs))
                row = np.zeros(len(mzs))
                row[mz_idx] = cur
                rows.append(row)

        # convert the time from milliseconds to minutes
        times = np.array(scan_times, dtype=float) / 60000
        # earlier rows can be shorter than the final m/z list; pad with 0
        data = np.zeros((len(times), len(mzs)))
        for i, r in enumerate(rows):
            data[i, 0:len(r)] = r
        return Chromatogram(data, times, mzs)
Example #9
0
    def data(self):
        """Read the delta-encoded spectra into a dense ``Chromatogram``.

        The y units are a length-prefixed ASCII string at 0x146 and the
        scan count a big-endian int at 0x116.  Scan records start at 0x202;
        each begins with its own length, the time, and the start / end /
        step of the wavelength range (stored multiplied by 20).  Values
        follow as int16 deltas, where -32768 announces an absolute int32
        value; all values are divided by 2000.

        Returns:
            A ``Chromatogram`` of shape (scans, wavelengths) whose columns
            are in order of first appearance of each wavelength.
        """
        # TODO: the chromatograms this generates are not exactly the
        # same as the ones in the *.CH files. Maybe they need to be 0'd?
        with open(self.filename, 'rb') as f:
            f.seek(0x146)
            ulen = struct.unpack('>B', f.read(1))[0]
            yunits = f.read(ulen).decode('ascii').strip()

            f.seek(0x116)
            nscans = struct.unpack('>i', f.read(4))[0]

            times = np.zeros(nscans)
            scans = []
            ions = []
            ion_col = {}  # wavelength -> column index
            npos = 0x202
            for i in range(nscans):
                f.seek(npos)
                npos += struct.unpack('<H', f.read(2))[0]
                # presumably milliseconds -> minutes; TODO confirm
                times[i] = struct.unpack('<L', f.read(4))[0] / 60000.
                nm_srt, nm_end, nm_stp = [
                    x / 20. for x in struct.unpack('<HHH', f.read(6))]
                f.read(8)
                s = {}
                v = struct.unpack('<h', f.read(2))[0] / 2000.
                s[nm_srt] = v
                for wv in np.arange(nm_srt, nm_end, nm_stp):
                    ov = struct.unpack('<h', f.read(2))[0]
                    if ov == -32768:
                        # escape marker: an absolute 32-bit value follows
                        v = struct.unpack('<i', f.read(4))[0] / 2000.
                    else:
                        # otherwise a delta from the previous value
                        v += ov / 2000.
                    s[wv] = v
                    if wv not in ion_col:
                        ion_col[wv] = len(ions)
                        ions.append(wv)
                scans.append(s)

        ndata = np.zeros((nscans, len(ions)))
        for i, d in enumerate(scans):
            for ion, abn in d.items():
                ndata[i, ion_col[ion]] = abn
        return Chromatogram(ndata, times, ions, yunits=yunits)
Example #10
0
    def data(self):
        """Read the evaluated GC trace stored after ``CEvalGCData``.

        The trace name is a UTF-16-LE string located after the ``CRawData``
        marker and selects the list of ions.  The data block after
        ``CEvalGCData`` starts with its byte length, followed by records of
        one float32 time and one float64 per ion.

        Returns:
            A ``Chromatogram`` with the times divided by 60.
        """
        with open(self.filename, 'rb') as f:
            # NOTE(review): the result of find_offset is used without a
            # check; if it can return None for a missing marker this
            # raises TypeError -- confirm against its definition.
            f.seek(find_offset(f, b'CRawData') + 9)
            # one length byte counting UTF-16 code units, then the text
            strlen = 2 * struct.unpack('<B', f.read(1))[0]
            tname = f.read(strlen).decode('utf_16_le')
            ions = {
                'CO2': [44, 45, 46],
                'CO': [28, 29, 30],
                # TODO: check this is in the right order
                'SO2,SO-SO2 Ext,SO': [48, 49, 50, 64, 65, 66],
                # TODO: should save the tname somewhere for future reference
            }.get(tname, [1, 2, 3])

            f.seek(find_offset(f, b'CEvalGCData') + 4)

            # bytes until the end converted to # of records
            rec_size = 4 + 8 * len(ions)
            nscans = struct.unpack('<I', f.read(4))[0] // rec_size

            dtype = np.dtype([('index', '<f4'),
                              ('values', '<f8', len(ions))])
            data = np.fromfile(f, dtype=dtype, count=nscans)

        # convert time to minutes
        data['index'] /= 60.
        return Chromatogram(data['values'], data['index'], ions)
Example #11
0
    def data(self):
        """Read all scans into a sparse ``Chromatogram``.

        The scan count is a big-endian uint16 in the header, and the start
        of the scan data is derived from the uint16 at 0x10A.  Every scan
        starts with its length in 16-bit words, followed by the time and,
        after 12 more bytes, pairs of (ion, packed abundance) uint16
        values.  A first pass counts the points per scan; a second pass
        loads them.

        Packed abundances are decoded as ``(v & 16383) * 8 ** (v >> 14)``
        and ion values are divided by 20.

        Returns:
            A ``Chromatogram`` wrapping a ``scipy.sparse.csr_matrix`` of
            shape (scans, ions).
        """
        with open(self.filename, 'rb') as f:
            # get number of scans to read in
            # note that GC and LC chemstation store this in slightly
            # different places
            f.seek(0x5)
            # NOTE(review): the file is opened in binary mode, so this
            # compares bytes with str and is never true; the 0x118 branch
            # is always taken.  Kept as-is because changing it would alter
            # which offset is read -- confirm the intended check.
            if f.read(4) == 'GC':
                f.seek(0x142)
            else:
                f.seek(0x118)
            nscans = struct.unpack('>H', f.read(2))[0]

            f.seek(0x10A)
            f.seek(2 * struct.unpack('>H', f.read(2))[0] - 2)
            dstart = f.tell()

            # determine total number of measurements in file
            tot_pts = 0
            rowst = np.empty(nscans + 1, dtype=int)
            rowst[0] = 0

            for scn in range(nscans):
                # get the position of the next scan
                npos = f.tell() + 2 * struct.unpack('>H', f.read(2))[0]

                # keep a running total of how many measurements; integer
                # division keeps tot_pts usable as an array size
                tot_pts += (npos - f.tell() - 26) // 4
                rowst[scn + 1] = tot_pts

                # move forward
                f.seek(npos)

            # go back to the beginning and load all the other data
            f.seek(dstart)

            ions = []
            i_lkup = {}  # raw ion value -> column index
            cols = np.empty(tot_pts, dtype=int)
            vals = np.empty(tot_pts, dtype=np.int32)
            times = np.empty(nscans)

            for scn in range(nscans):
                npos = f.tell() + 2 * struct.unpack('>H', f.read(2))[0]

                # presumably milliseconds -> minutes; TODO confirm
                times[scn] = struct.unpack('>I', f.read(4))[0] / 60000.

                f.seek(f.tell() + 12)
                npts = int(rowst[scn + 1] - rowst[scn])
                mzs = struct.unpack('>' + npts * 'HH', f.read(npts * 4))

                # give each ion not seen before the next free column
                nions = set(mzs[0::2]).difference(i_lkup)
                i_lkup.update(
                    {ion: i + len(ions) for i, ion in enumerate(nions)})
                ions += nions

                cols[rowst[scn]:rowst[scn + 1]] = \
                    [i_lkup[i] for i in mzs[0::2]]
                vals[rowst[scn]:rowst[scn + 1]] = mzs[1::2]
                f.seek(npos)

        # low 14 bits are the mantissa, top 2 bits a power-of-8 exponent
        vals = ((vals & 16383) * 8 ** (vals >> 14)).astype(float)
        data = scipy.sparse.csr_matrix((vals, cols, rowst),
                                       shape=(nscans, len(ions)), dtype=float)
        ions = np.array(ions) / 20.
        return Chromatogram(data, times, ions)
Example #12
0
    def old_data(self):
        """Read all scans into a dense ``Chromatogram`` (older reader).

        A first pass over the scans records each scan's time, data offset
        and point count and collects every ion value; a second pass fills a
        dense (scans, ions) array.  Cells for ions that are absent from a
        scan are 0.

        Packed abundances are decoded as ``(v & 16383) * 8 ** (v >> 14)``
        and ion values are divided by 20.

        Returns:
            A ``Chromatogram`` with columns in ascending ion order.
        """
        with open(self.filename, 'rb') as f:
            # get number of scans to read in
            # note that GC and LC chemstation store this in slightly
            # different places
            f.seek(0x5)
            # NOTE(review): the file is opened in binary mode, so this
            # compares bytes with str and is never true; the 0x118 branch
            # is always taken.  Kept as-is -- confirm the intended check.
            if f.read(4) == 'GC':
                f.seek(0x142)
            else:
                f.seek(0x118)
            nscans = struct.unpack('>H', f.read(2))[0]

            # find the starting location of the data
            f.seek(0x10A)
            f.seek(2 * struct.unpack('>H', f.read(2))[0] - 2)

            # make a list of all of the ions and also read in times
            ions = set()
            times = np.empty(nscans)
            scan_locs = np.empty(nscans, dtype=int)
            scan_pts = np.empty(nscans, dtype=int)
            for scn in range(nscans):
                npos = f.tell() + 2 * struct.unpack('>H', f.read(2))[0]

                # presumably milliseconds -> minutes; TODO confirm
                times[scn] = struct.unpack('>I', f.read(4))[0] / 60000.

                f.seek(f.tell() + 6)
                npts = struct.unpack('>H', f.read(2))[0]

                # jump to the data and save relevant parameters for later
                f.seek(f.tell() + 4)
                scan_locs[scn] = f.tell()
                scan_pts[scn] = npts

                # read through the Python file object so its position and
                # buffer stay consistent
                pairs = np.frombuffer(f.read(npts * 4), dtype='>H')
                ions.update(pairs[0::2])
                f.seek(npos)

            ions = np.array(sorted(ions))
            # zeros, not empty: ions missing from a scan must read as 0
            data = np.zeros((len(times), len(ions)), dtype=float)
            for scn in range(nscans):
                f.seek(int(scan_locs[scn]))
                mzs = np.frombuffer(f.read(int(scan_pts[scn]) * 4),
                                    dtype='>H')
                if len(mzs) == 0:
                    continue
                ilocs = np.searchsorted(ions, mzs[0::2])
                # widen first: the decoded value does not fit in 16 bits
                raw = mzs[1::2].astype(np.int64)
                data[scn][ilocs] = (raw & 16383) * 8 ** (raw >> 14)

        # a new float array; in-place division fails on an integer array
        return Chromatogram(data, times, ions / 20.)
Example #13
0
def read_peaks(db, filename, ftype='isodat'):
    """Import a peak table and attach the peaks to files in ``db``.

    Two layouts are handled: tab-separated ``'amdis'`` reports and
    comma-separated ``'isodat'`` exports.  Any line starting with
    ``filename`` plus the delimiter (case-insensitive), and the first
    non-blank line, is taken as a header; column names are matched in
    lower case.  Each data row is matched to a ``'file'`` child of ``db``
    by looking for its file name inside the entry's ``rawdata`` path; rows
    without a match are skipped.

    For isodat rows a gaussian peak shape is generated for ions 44, 45 and
    46, and afterwards the d13C value of every peak is corrected with a
    linear fit through the reference gas peaks of its file.

    Args:
        db: object providing ``children_of_type('file')`` and usable as a
            context manager.
        filename: path of the peak table.
        ftype: ``'isodat'``, ``'amdis'``, or ``None`` to guess from the
            first line of the file.

    Raises:
        ValueError: if ``ftype`` is not a known type or a required column
            is missing.
        KeyError: for isodat input, if a file has peaks but none of them is
            marked as the main reference peak (name ending in ``*``).
    """
    if ftype is None:
        with open(filename, 'r') as f:
            header = f.readline()
        if 'd 13C/12C[per mil]vs. VPDB' in header:
            ftype = 'isodat'
        else:
            ftype = 'amdis'

    if ftype == 'amdis':
        delim = '\t'
        cvtr = {'name': 'name',
                'p-s-time': 'rt',
                'p-s-area': 'area'}
    elif ftype == 'isodat':
        delim = ','
        cvtr = {'name': 'peak nr.',
                'p-s-time': 'rt[s]',
                'p-s-area': 'area all[vs]',
                'p-s-width': 'width[s]',
                'p-s-d13c': 'd 13c/12c[per mil]vs. vpdb',
                'p-s-d18o': 'd 18o/16o[per mil]vs. vsmow'}
    else:
        raise ValueError('unknown peak file type: %r' % (ftype,))

    headers = None
    mapping = defaultdict(list)
    ref_pk_info = {}

    def get_val(fields, cols, key):
        return fields[cols.index(key)]

    with open(filename, 'r') as f:
        for line in f:
            if not line.strip():
                # blank line: nothing to import
                continue
            if headers is None or re.match('filename' + delim, line, re.I):
                # split on the file's own delimiter and drop the line
                # ending so the last column keeps its real name
                headers = line.rstrip('\r\n').lower().split(delim)
                continue
            fields = line.rstrip('\r\n').split(delim)
            fn = get_val(fields, headers, 'filename')
            if ftype == 'amdis':
                # AMDIS has '.FIN' suffixes and other stuff, so
                # munge Filename to get it into right format
                cmp_lvl = 2
                fn = op.splitext('/'.join(fn.split('\\')[-cmp_lvl:]))[0]
            # find if filtered filename overlaps with anything in the db
            for dt in db.children_of_type('file'):
                if fn in '/'.join(dt.rawdata.split(op.sep)):
                    break
            else:
                continue

            # load all the predefined fields
            info = {}
            for k in cvtr:
                info[k] = get_val(fields, headers, cvtr[k])

            # create peak shapes for plotting
            if ftype == 'isodat':
                rt = float(info['p-s-time']) / 60.
                width = float(info['p-s-width']) / 60.
                t = np.linspace(rt - width, rt + width)
                data = []
                for ion in ['44', '45', '46']:
                    area = float(get_val(fields, headers,
                                         'rarea ' + ion + '[mvs]')) / 60.
                    height = float(get_val(fields, headers,
                                           'ampl. ' + ion + '[mv]'))
                    # save the height at 44 into the info for linearity
                    if ion == '44':
                        info['p-s-ampl44'] = height
                    # 0.5 is an empirical factor to make things look better
                    data.append(gaussian(t, x=rt, w=0.5 * area / height,
                                         h=height))
                # save info if this is the main ref gas peak
                if info['name'].endswith('*'):
                    ref_pk_info[dt] = info
                ts = Chromatogram(np.array(data).T, t, [44, 45, 46])
            else:
                # placeholder trace: no peak shape is available
                ts = Chromatogram(np.array([np.nan]), np.array([np.nan]), [''])
            mapping[dt].append(Peak(info, ts))

    # do drift correction
    if ftype == 'isodat':
        def errfunc(p, x, y):
            # residual of the linear model p[0] + p[1] * x against y
            return p[0] + p[1] * x - y

        for dt in mapping:
            ref = ref_pk_info[dt]
            hgt44 = ref['p-s-ampl44']
            d18o = float(ref['p-s-d18o'])
            d13c = float(ref['p-s-d13c'])
            # if the d18o and height are similar, it's a ref peak
            ref_pks = [pk for pk in mapping[dt]
                       if abs(pk.info['p-s-ampl44'] - hgt44) < 10. and
                       abs(float(pk.info['p-s-d18o']) - d18o) < 2.]

            # get out the dd13C values and times for the ref gas peaks
            d13cs = [float(pk.info['p-s-d13c']) for pk in ref_pks]
            dd13cs = np.array(d13cs) - d13c
            rts = [float(pk.info['p-s-time']) for pk in ref_pks]

            # try to fit a linear model through all of them
            # NOTE(review): the starting guess uses the absolute d13C of
            # the first ref peak although the fit targets the differences;
            # it is also the fallback when the fit fails -- confirm intent.
            p0 = [d13cs[0], 0]
            try:
                p, succ = leastsq(errfunc, p0, args=(np.array(rts), dd13cs))
            except Exception:
                p = p0
            # apply the linear model to get the dd13C linearity correction
            # for a given time and add it to the value of this peak
            for pk in mapping[dt]:
                pk.info['p-s-d13c'] = str(-errfunc(p,
                                                   float(pk.info['p-s-time']),
                                                   float(pk.info['p-s-d13c'])))

    # save everything
    with db:
        for dt in mapping:
            dt.children += mapping[dt]
Example #14
0
    def data(self):
        """Read the delta-encoded spectra into a dense ``Chromatogram``.

        The y units are read with ``string_read`` at 0xC15 and the scan
        count is a big-endian int at 0x116.  Scan records start at 0x1002;
        each begins with its own length, the time, and the start / end /
        step of the wavelength range (stored multiplied by 20).  A first
        pass collects the times and every wavelength in the file; a second
        pass decodes the values, stored as int16 deltas where -32768
        announces an absolute int32 value.

        Returns:
            A ``Chromatogram`` of shape (scans, wavelengths) with values
            divided by 2000 and times divided by 60000.  Cells for
            wavelengths that a scan does not cover are 0.
        """
        with open(self.filename, 'rb') as f:
            f.seek(0xC15)
            yunits = string_read(f)

            f.seek(0x116)
            nscans = struct.unpack('>i', f.read(4))[0]

            # get all wavelengths and times
            wvs = set()
            times = np.empty(nscans)
            npos = 0x1002
            for i in range(nscans):
                f.seek(npos)
                npos += struct.unpack('<H', f.read(2))[0]
                times[i] = struct.unpack('<L', f.read(4))[0]
                nm_srt, nm_end, nm_stp = struct.unpack('<HHH', f.read(6))
                n_wvs = np.arange(nm_srt, nm_end, nm_stp) / 20.
                wvs.update(set(n_wvs).difference(wvs))
            wvs = list(wvs)
            # wavelength -> column index, replaces a linear search per point
            col = {wv: n for n, wv in enumerate(wvs)}

            # zeros, not empty: uncovered wavelengths must read as 0
            ndata = np.zeros((nscans, len(wvs)), dtype="<i4")
            npos = 0x1002

            # try to speed up by preloading the function
            unpack = struct.unpack
            seek = f.seek
            read = f.read
            tell = f.tell

            # TODO: decode each scan with numpy instead of value by value
            for i in range(nscans):
                seek(npos)
                npos += unpack('<H', read(2))[0]
                seek(tell() + 4)  # skip time
                nm_srt, nm_end, nm_stp = unpack('<HHH', read(6))
                seek(tell() + 8)

                v = 0
                for wv in np.arange(nm_srt, nm_end, nm_stp) / 20.:
                    ov = unpack('<h', read(2))[0]
                    if ov == -32768:
                        # escape marker: an absolute 32-bit value follows
                        v = unpack('<i', read(4))[0]
                    else:
                        # otherwise a delta from the previous value
                        v += ov
                    ndata[i, col[wv]] = v

        return Chromatogram(ndata / 2000., times / 60000., wvs, yunits=yunits)
Example #15
0
 def data(self):
     """Return the stored data, or an empty Chromatogram if none is set."""
     return Chromatogram() if self._data is None else self._data
Example #16
0
 def wrap_func(df, *args):
     """Apply ``f`` to ``df.values`` and wrap the result in a Chromatogram.

     ``df.index`` and ``df.columns`` are passed through unchanged as the
     second and third constructor arguments.
     """
     # TODO: should vectorize to apply over all columns?
     result = f(df.values, *args)
     return Chromatogram(result, df.index, df.columns)