def data(self):
    """Read a comma-delimited text file into a Chromatogram.

    The first line is a header; every field after the first is parsed as
    a float and used as an ion label. Each following line is a row of
    numbers whose first column becomes the time axis and whose remaining
    columns become the data values.

    Returns:
        A Chromatogram built from the file, or an empty ``Chromatogram()``
        if the file cannot be opened or parsed.
    """
    delim = ','
    # TODO: better, smarter error checking than this
    try:
        with open(self.filename, 'r') as f:
            lns = f.readlines()
        ions = [float(i) for i in lns[0].split(delim)[1:]]
        data = np.array([np.fromstring(ln, sep=delim) for ln in lns[1:]])
        return Chromatogram(data[:, 1:], data[:, 0], ions)
    except Exception:
        # Deliberately best-effort: any read/parse failure yields an empty
        # result. Unlike a bare ``except:`` this lets KeyboardInterrupt and
        # SystemExit propagate.
        return Chromatogram()
def data(self):
    """Read scans from an index file and its companion ``.DAT`` file.

    ``self.filename`` is the index file; the data file has the same base
    name with a ``.DAT`` extension. Each 22-byte index record supplies the
    byte offset of a scan in the data file (field 0), the scan's length in
    bytes (field 1) and its time (field 4). Scan data are pairs of
    little-endian unsigned shorts, read as (value, ion).

    Returns:
        Chromatogram with one row per scan and one column per distinct
        ion, columns sorted by ion.
    """
    dat_name = op.splitext(self.filename)[0] + '.DAT'
    with open(self.filename, 'rb') as fidx, open(dat_name, 'rb') as fdat:
        # First pass: walk the data file in 4-byte steps, collecting the
        # second short of every pair as an ion label.
        ions = set()
        while True:
            fdat.seek(fdat.tell() + 2)
            try:
                ions.add(struct.unpack('<H', fdat.read(2))[0])
            except struct.error:
                # short read: end of file
                break
        ions = sorted(ions)
        # ion -> column lookup so each point is placed in O(1)
        col_of = {ion: col for col, ion in enumerate(ions)}

        data = []
        tme = []
        while True:
            try:
                idx = struct.unpack('<IHHffhhh', fidx.read(22))
                fdat.seek(idx[0])
                d = struct.unpack('<' + (idx[1] // 4) * 'HH',
                                  fdat.read(idx[1]))
            except struct.error:
                # end of the index, or a truncated scan in the data file
                break
            new_line = np.zeros(len(ions))
            for i, v in zip(d[1::2], d[0::2]):
                new_line[col_of[i]] = v
            # Record the time only once the scan itself was read, so the
            # time axis and the data rows always have the same length.
            tme.append(idx[4])
            data.append(new_line)
    return Chromatogram(np.array(data), np.array(tme), ions)
def read_amdis_list(db, filename):
    """Attach peaks listed in a tab-delimited report to files in ``db``.

    The first line of ``filename`` names the columns; the columns
    ``FileName``, ``Name``, ``RT`` and ``Area`` are used. For each row the
    last ``cmp_lvl`` backslash-separated components of ``FileName`` (minus
    extension) are compared against the ``rawdata`` path of every
    ``db.children_of_type('file')`` entry; rows with no match are skipped.

    Args:
        db: object providing ``children_of_type('file')`` and usable as a
            context manager while children are appended.
        filename: path of the report to read.
    """
    def get_val(fields, cols, key):
        return fields[cols.index(key)]

    cmp_lvl = 2  # number of directory levels to compare
    # TODO: does this work for agilent files only?
    mapping = defaultdict(list)
    with open(filename, 'r') as f:
        # Strip the line ending so the last column name (and the last
        # value on each row) does not carry a trailing newline.
        cols = f.readline().rstrip('\r\n').split('\t')
        for line in f:
            if not line.strip():
                # blank line: nothing to parse
                continue
            fields = line.rstrip('\r\n').split('\t')
            raw_name = get_val(fields, cols, 'FileName')
            fn = op.splitext('/'.join(raw_name.split('\\')[-cmp_lvl:]))[0]
            # find if filtered filename overlaps with anything in the db
            for dt in db.children_of_type('file'):
                if fn in '/'.join(dt.rawdata.split(op.sep)):
                    break
            else:
                continue
            info = {}
            info['name'] = get_val(fields, cols, 'Name')
            info['p-s-time'] = get_val(fields, cols, 'RT')
            info['p-s-area'] = get_val(fields, cols, 'Area')
            # placeholder trace: the report carries no peak shape
            ts = Chromatogram(np.array([np.nan]), np.array([np.nan]), [''])
            mapping[dt] += [Peak(info, ts)]
    with db:
        for dt in mapping:
            dt.children += mapping[dt]
def data(self):
    """Read scans described by a ``.sd`` header file from a ``.sp`` file.

    Both paths are built by dropping the last three characters of
    ``self.filename`` and appending ``.sd`` / ``.sp``. The scan count is
    read at 0x50 of the header file and 80-byte scan records start at
    0xA4. From each record the code uses field 1 as the time, field 5 as
    the offset into the data file, field 7 as the number of points, and
    fields 8 and 3 as the start and step of the column labels.

    Returns:
        Chromatogram of shape (nscans, npts); empty arrays if the header
        reports zero scans.
    """
    base = self.filename[:-3]
    # ``with`` guarantees both handles are closed; previously the close()
    # calls sat after the return statement and never ran.
    with open(base + '.sd', 'rb') as fhead, \
            open(base + '.sp', 'rb') as fdata:
        fhead.seek(0x50)
        # explicit little-endian, matching every other field read here
        nscans = struct.unpack('<Q', fhead.read(8))[0]
        fhead.seek(0xA4)

        ions = []
        # defaults so a file with zero scans still yields a valid result
        data = np.zeros((nscans, 0))
        times = np.zeros(nscans)
        for scn in range(nscans):
            t = struct.unpack('<IdddIQIIdddd', fhead.read(80))
            npts = t[7]
            if scn == 0:
                # TODO: this will fail if the wavelengths collected
                # change through the run.
                data = np.zeros((nscans, npts))
                ions = [t[8] + x * t[3] for x in range(npts)]
            times[scn] = t[1]
            fdata.seek(t[5] + 16)
            # TODO: use np.fromfile?
            data[scn] = struct.unpack('<' + npts * 'd',
                                      fdata.read(npts * 8))
    return Chromatogram(data, times, ions)
def data(self):
    """Combine every ``.CH`` trace in this file's folder into one result.

    Because the spectra are stored in several files in the same
    directory, each file whose name ends in ``.CH`` (case-insensitive) is
    passed to ``self._read_ind_file``, which returns a (wavelength, trace)
    pair; files for which the wavelength is None are skipped. The time
    axis and the y units are taken from the first usable file.

    Returns:
        Chromatogram with one column per trace, or an empty
        ``Chromatogram()`` if the folder holds no usable trace.
    """
    ions = []
    dtraces = []
    times = None
    yunits = None
    # dirname() is '' for a bare filename, which os.listdir() rejects
    foldname = os.path.dirname(self.filename) or os.curdir
    for entry in os.listdir(foldname):
        path = os.path.join(foldname, entry)
        if path[-3:].upper() != '.CH':
            continue
        wv, dtrace = self._read_ind_file(path)
        if wv is None:
            continue
        # generate the time points if this is the first trace
        if not ions:
            with open(path, 'rb') as f:
                f.seek(0x244)
                # length-prefixed string: one length byte, then the text
                yunits = f.read(struct.unpack('>B', f.read(1))[0]).decode()
                f.seek(0x11A)
                # stored values are divided by 60000 to give start/end times
                st_t = struct.unpack('>i', f.read(4))[0] / 60000.
                en_t = struct.unpack('>i', f.read(4))[0] / 60000.
            times = np.linspace(st_t, en_t, len(dtrace))
        # add the wavelength and trace into the appropriate places
        ions.append(float(wv))
        dtraces.append(dtrace)

    if not dtraces:
        # nothing readable: avoid referencing an undefined time axis
        return Chromatogram()
    data = np.array(dtraces).transpose()
    return Chromatogram(data, times, ions, yunits=yunits)
def data(self):
    """Read the scan table that follows a ``CRawDataScanStorage`` marker.

    The file is scanned byte by byte for the marker. The scan count is a
    2-byte value 62 bytes past the marker and the records start 35 bytes
    after that. Each record is a 4-byte float time followed by one 8-byte
    double per ion; the time column is divided by 60.

    Returns:
        Chromatogram of the scans, or None if the marker is not found.
    """
    marker = b'CRawDataScanStorage'
    with open(self.filename, 'rb') as f:
        # Slide a 19-byte window over the file one byte at a time.
        f.seek(19)
        while True:
            f.seek(f.tell() - 19)
            if f.read(19) == marker:
                break
            if f.read(1) == b'':
                # reached end of file without finding the marker
                return
        f.seek(f.tell() + 62)
        # explicit little-endian, matching the record format below
        nscans = struct.unpack('<H', f.read(2))[0]

        # TODO: this shouldn't be hardcoded
        ions = [44, 45, 46]
        ni = len(ions)

        f.seek(f.tell() + 35)
        rows = [struct.unpack('<f' + ni * 'd', f.read(4 + ni * 8))
                for _ in range(nscans)]
    # reshape keeps the array 2-D even when there are no scans
    data = np.array(rows).reshape(-1, ni + 1)
    data[:, 0] /= 60.  # convert time to minutes
    return Chromatogram(data[:, 1:], data[:, 0], ions)
def data(self):
    """Read per-scan (ion, abundance) lists into a sparse Chromatogram.

    Layout as consumed here: two ints (the second is the scan count),
    ``nscans`` doubles of time, a repeated scan count, then for each scan
    a point count, that many float ions, the point count again and that
    many float abundances. Ions are truncated to ints and each distinct
    value gets its own column. Times are divided by 60.

    Returns:
        Chromatogram wrapping a ``scipy.sparse.csr_matrix`` of shape
        (nscans, number of distinct ions); an empty ``Chromatogram()`` if
        the file reports zero scans.
    """
    # convenience function for reading in data
    def rd(f, st):
        return struct.unpack(st, f.read(struct.calcsize(st)))

    with open(self.filename, 'rb') as f:
        nscans = rd(f, 'ii')[1]
        if nscans == 0:
            # return an empty result like the other readers, instead of
            # assigning to self.data and returning None
            return Chromatogram()
        times = np.array(rd(f, nscans * 'd')) / 60.0
        f.seek(f.tell() + 4)  # number of scans again

        # set up the array of row pointers
        indptr = np.empty(nscans + 1, dtype=int)
        indptr[0] = 0

        # first pass: figure out the total number of points
        dpos = f.tell()
        tot_pts = 0
        for scn in range(nscans):
            npts = rd(f, 'i')[0]
            # skip npts float ions, the repeated count, npts abundances
            f.seek(f.tell() + 8 * npts + 4)
            tot_pts += npts
            indptr[scn + 1] = tot_pts
        f.seek(dpos)

        # second pass: read the values and assign column indices
        ions = []
        i_lkup = {}
        idxs = np.empty(tot_pts, dtype=int)
        vals = np.empty(tot_pts, dtype=float)
        for scn in range(nscans):
            npts = rd(f, 'i')[0]
            rd_ions = rd(f, npts * 'f')
            f.seek(f.tell() + 4)  # number of points again
            abun = rd(f, npts * 'f')

            int_ions = [int(i) for i in rd_ions]
            nions = set(i for i in int_ions if i not in i_lkup)
            # new ions take the next free column numbers
            i_lkup.update(
                dict((ion, i + len(ions)) for i, ion in enumerate(nions)))
            ions += nions

            idxs[indptr[scn]:indptr[scn + 1]] = \
                [i_lkup[i] for i in int_ions]
            vals[indptr[scn]:indptr[scn + 1]] = abun

    data = scipy.sparse.csr_matrix((vals, idxs, indptr),
                                   shape=(nscans, len(ions)), dtype=float)
    return Chromatogram(data, times, ions)
def data(self):
    """Read the scan records that follow the ``HapsScan`` marker.

    Each 16-byte record header gives a time, a record count and a segment
    number. Whenever the segment number changes, the next list of m/z
    values is pulled from ``self._ions(f)`` and any values not seen before
    are appended to the column list. Times are divided by 60000.

    Returns:
        Chromatogram with one row per record, or None if the marker is
        not found.
    """
    # TODO: handle skip mass ranges
    scan_times = []
    scan_rows = []
    mzs = []
    with open(self.filename, 'rb') as f:
        # read in the time segments/mz ranges for the run
        # read in the data itself
        marker_off = find_offset(f, 4 * b'\xff' + 'HapsScan'.encode('ascii'))
        if marker_off is None:
            return

        f.seek(marker_off - 20)
        block_len = struct.unpack('<I', f.read(4))[0]
        data_end = marker_off + block_len + 55
        f.seek(marker_off + 56)

        active_seg = None
        segment_mzs = self._ions(f)
        while f.tell() <= data_end:
            # record info looks like a standard format
            header = struct.unpack('<IiHHHH', f.read(16))
            t, recs, seg = header[1], header[3], header[5]

            if active_seg != seg:
                # if we've switched segments, update the list of mzs
                try:
                    cur_mzs = next(segment_mzs)
                except StopIteration:
                    break
                mzs.extend(set(cur_mzs).difference(mzs))
                mz_idx = [mzs.index(mz) for mz in cur_mzs]
                active_seg = seg

            # just add the new time in
            scan_times.append(t)

            # read the list of abundances
            cur_abns = struct.unpack('<' + 'f' * recs, f.read(4 * recs))

            # spread the abundances over whatever mzs we currently have
            row = np.zeros(len(mzs))
            row[mz_idx] = cur_abns
            scan_rows.append(row)

    # convert the time from milliseconds to minutes
    times = np.array(scan_times, dtype=float) / 60000

    # earlier rows may be shorter than the final list of mzs, so copy each
    # one into the left-hand side of a zero-filled array
    data = np.zeros((len(times), len(mzs)))
    for row_no, row in enumerate(scan_rows):
        data[row_no, 0:len(row)] = row
    return Chromatogram(data, times, mzs)
def data(self):
    """Read delta-encoded spectra into a dense Chromatogram.

    The y units are a length-prefixed ASCII string at 0x146 and the scan
    count is at 0x116. Scan records start at 0x202; each begins with its
    own length, followed by a time (divided by 60000) and the start, end
    and step of the wavelength range (each divided by 20). Values are
    stored as 2-byte deltas, with -32768 flagging that a 4-byte absolute
    value follows; all values are divided by 2000.

    Returns:
        Chromatogram with one column per distinct wavelength, in order of
        first appearance.
    """
    # TODO: the chromatograms this generates are not exactly the
    # same as the ones in the *.CH files. Maybe they need to be 0'd?
    with open(self.filename, 'rb') as f:
        f.seek(0x146)
        yunits = f.read(
            struct.unpack('>B', f.read(1))[0]).decode('ascii').strip()
        f.seek(0x116)
        nscans = struct.unpack('>i', f.read(4))[0]

        times = np.zeros(nscans)
        data = []
        ions = []
        col_of = {}  # wavelength -> column, avoids repeated list scans
        npos = 0x202
        for i in range(nscans):
            f.seek(npos)
            # the record starts with its own length: find the next one
            npos += struct.unpack('<H', f.read(2))[0]
            times[i] = struct.unpack('<L', f.read(4))[0] / 60000.
            nm_srt = struct.unpack('<H', f.read(2))[0] / 20.
            nm_end = struct.unpack('<H', f.read(2))[0] / 20.
            nm_stp = struct.unpack('<H', f.read(2))[0] / 20.
            f.read(8)
            s = {}
            v = struct.unpack('<h', f.read(2))[0] / 2000.
            s[nm_srt] = v
            for wv in np.arange(nm_srt, nm_end, nm_stp):
                ov = struct.unpack('<h', f.read(2))[0]
                if ov == -32768:
                    # escape marker: an absolute 4-byte value follows
                    v = struct.unpack('<i', f.read(4))[0] / 2000.
                else:
                    v += ov / 2000.
                s[wv] = v
                if wv not in col_of:
                    col_of[wv] = len(ions)
                    ions.append(wv)
            data.append(s)

    ndata = np.zeros((nscans, len(ions)))
    for i, d in enumerate(data):
        for ion, abn in d.items():
            ndata[i, col_of[ion]] = abn
    return Chromatogram(ndata, times, ions, yunits=yunits)
def data(self):
    """Read the record table stored after the ``CEvalGCData`` marker.

    A UTF-16 trace name stored after the ``CRawData`` marker selects the
    ion labels. The record count is derived from a byte length stored
    after ``CEvalGCData``; each record is a 4-byte float time followed by
    one 8-byte double per ion. Times are divided by 60.

    Returns:
        Chromatogram of the records, or None if either marker cannot be
        located.
    """
    with open(self.filename, 'rb') as f:
        raw_off = find_offset(f, b'CRawData')
        if raw_off is None:
            # marker missing: nothing to read
            return
        f.seek(raw_off + 9)
        # one length byte (in characters), then UTF-16-LE text
        strlen = 2 * struct.unpack('<B', f.read(1))[0]
        tname = f.read(strlen).decode('utf_16_le')
        if tname == 'CO2':
            ions = [44, 45, 46]
        elif tname == 'CO':
            ions = [28, 29, 30]
        elif tname == 'SO2,SO-SO2 Ext,SO':
            # TODO: check this is in the right order
            ions = [48, 49, 50, 64, 65, 66]
        else:
            # TODO: should save the tname somewhere for future reference
            ions = [1, 2, 3]

        eval_off = find_offset(f, b'CEvalGCData')
        if eval_off is None:
            return
        f.seek(eval_off + 4)

        # bytes until the end converted to # of records
        nscans = int(struct.unpack('<I', f.read(4))[0] /
                     (4.0 + len(ions) * 8.0))

        dtype = np.dtype([('index', '<f4'), ('values', '<f8', len(ions))])
        data = np.fromfile(f, dtype=dtype, count=nscans)

    # convert time to minutes
    data['index'] /= 60.
    return Chromatogram(data['values'], data['index'], ions)
def data(self):
    """Read all scans into a sparse Chromatogram.

    The file is walked twice: once to count the measurements in each scan
    (so the CSR arrays can be preallocated) and once to read the times
    and the (m/z, abundance) pairs. Abundances are packed as a 14-bit
    mantissa with a 2-bit base-8 exponent; m/z values are divided by 20.

    Returns:
        Chromatogram wrapping a ``scipy.sparse.csr_matrix`` of shape
        (nscans, number of distinct m/z values).
    """
    with open(self.filename, 'rb') as f:
        # get number of scans to read in
        # note that GC and LC chemstation store this in slightly different
        # places
        f.seek(0x5)
        # NOTE(review): read(4) returns four bytes, which never compare
        # equal to the two-character str 'GC', so the 0x118 branch is in
        # practice always taken. Left unchanged because the intended
        # marker cannot be confirmed from this code alone.
        if f.read(4) == 'GC':
            f.seek(0x142)
        else:
            f.seek(0x118)
        nscans = struct.unpack('>H', f.read(2))[0]

        # the data start offset is stored in 2-byte words, 1-based
        f.seek(0x10A)
        f.seek(2 * struct.unpack('>H', f.read(2))[0] - 2)
        dstart = f.tell()

        # determine total number of measurements in file
        tot_pts = 0
        rowst = np.empty(nscans + 1, dtype=int)
        rowst[0] = 0
        for scn in range(nscans):
            # get the position of the next scan
            npos = f.tell() + 2 * struct.unpack('>H', f.read(2))[0]
            # keep a running total of how many measurements; floor
            # division keeps this an int on Python 3 as well
            tot_pts += (npos - f.tell() - 26) // 4
            rowst[scn + 1] = tot_pts
            # move forward
            f.seek(npos)

        # go back to the beginning and load all the other data
        f.seek(dstart)

        ions = []
        i_lkup = {}
        cols = np.empty(tot_pts, dtype=int)
        vals = np.empty(tot_pts, dtype=np.int32)

        times = np.empty(nscans)
        for scn in range(nscans):
            npos = f.tell() + 2 * struct.unpack('>H', f.read(2))[0]
            # the sampling rate is evidently 60 kHz on all Agilent's MS's
            times[scn] = struct.unpack('>I', f.read(4))[0] / 60000.
            f.seek(f.tell() + 12)
            npts = int(rowst[scn + 1] - rowst[scn])
            mzs = struct.unpack('>' + npts * 'HH', f.read(npts * 4))
            # there's some bug in the numpy implementation that makes
            # np.fromfile fail after the first time, hence struct here

            # m/z values not seen before get the next free columns
            nions = set(mzs[0::2]).difference(i_lkup)
            i_lkup.update(
                {ion: i + len(ions) for i, ion in enumerate(nions)})
            ions += nions

            cols[rowst[scn]:rowst[scn + 1]] = [i_lkup[i] for i in mzs[0::2]]
            vals[rowst[scn]:rowst[scn + 1]] = mzs[1::2]
            f.seek(npos)

    # low 14 bits are the mantissa, top 2 bits a power-of-8 exponent
    vals = ((vals & 16383) * 8 ** (vals >> 14)).astype(float)
    data = scipy.sparse.csr_matrix((vals, cols, rowst),
                                   shape=(nscans, len(ions)), dtype=float)
    ions = np.array(ions) / 20.
    return Chromatogram(data, times, ions)
def old_data(self):
    """Read all scans into a dense Chromatogram (older implementation).

    A first pass records each scan's time, data position and point count
    and collects every m/z value; a second pass decodes the abundances
    into a dense (nscans, n_ions) array. Abundances are packed as a
    14-bit mantissa with a 2-bit base-8 exponent; m/z values are divided
    by 20.

    Returns:
        Chromatogram with columns sorted by m/z; cells with no
        measurement are zero.
    """
    with open(self.filename, 'rb') as f:
        # get number of scans to read in
        # note that GC and LC chemstation store this in slightly different
        # places
        f.seek(0x5)
        # NOTE(review): read(4) returns four bytes, which never compare
        # equal to the two-character str 'GC', so the 0x118 branch is in
        # practice always taken. Left unchanged because the intended
        # marker cannot be confirmed from this code alone.
        if f.read(4) == 'GC':
            f.seek(0x142)
        else:
            f.seek(0x118)
        nscans = struct.unpack('>H', f.read(2))[0]

        # find the starting location of the data
        f.seek(0x10A)
        f.seek(2 * struct.unpack('>H', f.read(2))[0] - 2)

        # make a list of all of the ions and also read in times
        ions = set()
        times = np.empty(nscans)
        scan_locs = np.empty(nscans, dtype=int)
        scan_pts = np.empty(nscans, dtype=int)
        for scn in range(nscans):
            npos = f.tell() + 2 * struct.unpack('>H', f.read(2))[0]

            # the sampling rate is evidently 60 kHz on all Agilent's MS's
            times[scn] = struct.unpack('>I', f.read(4))[0] / 60000.

            f.seek(f.tell() + 6)
            npts = struct.unpack('>H', f.read(2))[0]

            # jump to the data and save relevant parameters for later
            f.seek(f.tell() + 4)
            scan_locs[scn] = f.tell()
            scan_pts[scn] = npts

            # every other value is an m/z; the rest are abundances
            nions = np.fromfile(f, dtype='>H', count=npts * 2)[0::2]
            ions.update(nions)
            f.seek(npos)

        ions = np.array(sorted(ions))
        # zeros, not empty: cells without a measurement must not hold
        # whatever happened to be in memory
        data = np.zeros((len(times), len(ions)), dtype=float)
        for scn in range(nscans):
            f.seek(int(scan_locs[scn]))
            mzs = np.fromfile(f, dtype='>H', count=int(scan_pts[scn]) * 2)
            if len(mzs) == 0:
                continue
            ilocs = np.searchsorted(ions, mzs[0::2])
            # widen before decoding: mantissa * 8**exponent does not fit
            # in the 16-bit type the values are read as
            packed = mzs[1::2].astype(np.int64)
            data[scn][ilocs] = (packed & 16383) * 8 ** (packed >> 14)

    # out-of-place division: the ion array has an integer dtype
    ions = ions / 20.
    return Chromatogram(data, times, ions)
def read_peaks(db, filename, ftype='isodat'):
    """Import a peak table and attach the peaks to matching files in ``db``.

    Args:
        db: object providing ``children_of_type('file')`` and usable as a
            context manager while children are appended.
        filename: path of the delimited peak table.
        ftype: ``'isodat'`` (comma-delimited), ``'amdis'`` (tab-delimited)
            or None to pick one from the first line of the file.

    Rows are matched to db files by checking whether the row's filename
    occurs in the file's ``rawdata`` path; unmatched rows are skipped. For
    ``'isodat'`` a gaussian peak shape is generated for ions 44/45/46 and
    d13C values are then corrected by a linear fit through the reference
    gas peaks of each file.

    Raises:
        ValueError: if ``ftype`` is not one of the supported values.
    """
    if ftype is None:
        with open(filename, 'r') as f:
            header = f.readline()
        if 'd 13C/12C[per mil]vs. VPDB' in header:
            ftype = 'isodat'
        else:
            ftype = 'amdis'

    # cvtr maps the stored info key to the (lower-cased) column name
    if ftype == 'amdis':
        delim = '\t'
        cvtr = {'name': 'name', 'p-s-time': 'rt', 'p-s-area': 'area'}
    elif ftype == 'isodat':
        delim = ','
        cvtr = {'name': 'peak nr.', 'p-s-time': 'rt[s]',
                'p-s-area': 'area all[vs]', 'p-s-width': 'width[s]',
                'p-s-d13c': 'd 13c/12c[per mil]vs. vpdb',
                'p-s-d18o': 'd 18o/16o[per mil]vs. vsmow'}
    else:
        raise ValueError('unknown peak file type: %r' % (ftype,))

    headers = None
    mapping = defaultdict(list)
    ref_pk_info = {}

    def get_val(fields, cols, key):
        return fields[cols.index(key)]

    with open(filename, 'r') as f:
        for line in f:
            # drop the line ending so the last column is usable
            line = line.rstrip('\r\n')
            if bool(re.match('filename' + delim, line, re.I)) \
                    or headers is None:
                # split on the format's own delimiter, not always ','
                headers = line.lower().split(delim)
                continue
            if not line.strip():
                # blank line: nothing to parse
                continue
            fields = line.split(delim)
            fn = get_val(fields, headers, 'filename')
            if ftype == 'amdis':
                # AMDIS has '.FIN' suffixes and other stuff, so
                # munge Filename to get it into right format
                cmp_lvl = 2
                fn = op.splitext('/'.join(fn.split('\\')[-cmp_lvl:]))[0]
            # find if filtered filename overlaps with anything in the db
            for dt in db.children_of_type('file'):
                if fn in '/'.join(dt.rawdata.split(op.sep)):
                    break
            else:
                continue

            # load all the predefined fields
            info = {}
            for k in cvtr:
                info[k] = get_val(fields, headers, cvtr[k])

            # create peak shapes for plotting
            if ftype == 'isodat':
                rt = float(info['p-s-time']) / 60.
                width = float(info['p-s-width']) / 60.
                t = np.linspace(rt - width, rt + width)
                data = []
                for ion in ['44', '45', '46']:
                    area = float(get_val(fields, headers,
                                         'rarea ' + ion + '[mvs]')) / 60.
                    height = float(get_val(fields, headers,
                                           'ampl. ' + ion + '[mv]'))
                    # save the height at 44 into the info for linearity
                    if ion == '44':
                        info['p-s-ampl44'] = height
                    # 0.5 is an empirical number to make things look better
                    data.append(gaussian(t, x=rt, w=0.5 * area / height,
                                         h=height))
                # save info if this is the main ref gas peak
                if info['name'].endswith('*'):
                    ref_pk_info[dt] = info
                ts = Chromatogram(np.array(data).T, t, [44, 45, 46])
            else:
                ts = Chromatogram(np.array([np.nan]), np.array([np.nan]),
                                  [''])
            mapping[dt] += [Peak(info, ts)]

    def errfunc(p, x, y):
        # residual of the linear model p[0] + p[1] * x against y
        return p[0] + p[1] * x - y

    # do drift correction
    if ftype == 'isodat':
        for dt in mapping:
            ref_info = ref_pk_info.get(dt)
            if ref_info is None:
                # no reference gas peak was marked for this file, so
                # there is nothing to correct against
                continue
            hgt44 = ref_info['p-s-ampl44']
            d18o = float(ref_info['p-s-d18o'])
            d13c = float(ref_info['p-s-d13c'])

            # if the d18o and height are similar, it's a ref peak
            ref_pks = [pk for pk in mapping[dt]
                       if abs(pk.info['p-s-ampl44'] - hgt44) < 10. and
                       abs(float(pk.info['p-s-d18o']) - d18o) < 2.]
            if not ref_pks:
                continue

            # get out the dd13C values and times for the ref gas peaks
            d13cs = [float(pk.info['p-s-d13c']) for pk in ref_pks]
            dd13cs = np.array(d13cs) - d13c
            rts = [float(pk.info['p-s-time']) for pk in ref_pks]

            # try to fit a linear model through all of them
            p0 = [d13cs[0], 0]
            try:
                p, succ = leastsq(errfunc, p0, args=(np.array(rts), dd13cs))
            except Exception:
                # fall back to the initial guess if the fit fails
                p = p0

            # apply the linear model to get the dd13C linearity correction
            # for a given time and add it to the value of this peak
            for pk in mapping[dt]:
                pk.info['p-s-d13c'] = str(-errfunc(
                    p, float(pk.info['p-s-time']),
                    float(pk.info['p-s-d13c'])))

    # save everything
    with db:
        for dt in mapping:
            dt.children += mapping[dt]
def data(self):
    """Read delta-encoded spectra into a dense Chromatogram.

    The y units come from ``string_read`` at 0xC15 and the scan count is
    at 0x116. Scan records start at 0x1002; each begins with its own
    length, followed by a time and the start, end and step of the
    wavelength range (each divided by 20). Values are stored as 2-byte
    deltas, with -32768 flagging that a 4-byte absolute value follows.
    Values are divided by 2000 and times by 60000 on return.

    Returns:
        Chromatogram with one column per distinct wavelength, in
        ascending order; cells with no measurement are zero.
    """
    with open(self.filename, 'rb') as f:
        f.seek(0xC15)
        yunits = string_read(f)

        f.seek(0x116)
        nscans = struct.unpack('>i', f.read(4))[0]

        # first pass: get all wavelengths and times
        wvs = set()
        times = np.empty(nscans)
        npos = 0x1002
        for i in range(nscans):
            f.seek(npos)
            # the record starts with its own length: find the next one
            npos += struct.unpack('<H', f.read(2))[0]
            times[i] = struct.unpack('<L', f.read(4))[0]
            nm_srt, nm_end, nm_stp = struct.unpack('<HHH', f.read(6))
            wvs.update(np.arange(nm_srt, nm_end, nm_stp) / 20.)
        # sorted so the column order is deterministic rather than
        # whatever order the set happens to iterate in
        wvs = sorted(wvs)
        # wavelength -> column lookup; avoids a list scan per data point
        col_of = {wv: col for col, wv in enumerate(wvs)}

        # zeros, not empty: a wavelength absent from a scan must not
        # expose uninitialised memory
        ndata = np.zeros((nscans, len(wvs)), dtype="<i4")

        # second pass: decode the values
        unpack = struct.unpack
        read = f.read
        npos = 0x1002
        for i in range(nscans):
            f.seek(npos)
            npos += unpack('<H', read(2))[0]
            f.seek(f.tell() + 4)  # skip time
            nm_srt, nm_end, nm_stp = unpack('<HHH', read(6))
            f.seek(f.tell() + 8)

            v = 0
            for wv in np.arange(nm_srt, nm_end, nm_stp) / 20.:
                ov = unpack('<h', read(2))[0]
                if ov == -32768:
                    # escape marker: an absolute 4-byte value follows
                    v = unpack('<i', read(4))[0]
                else:
                    v += ov
                ndata[i, col_of[wv]] = v

    return Chromatogram(ndata / 2000., times / 60000., wvs, yunits=yunits)
def data(self):
    """Return the stored ``_data``, or an empty Chromatogram if it is None."""
    stored = self._data
    if stored is None:
        return Chromatogram()
    return stored
def wrap_func(df, *args):
    # Apply the wrapped function ``f`` to the raw values of ``df`` and
    # re-wrap the result using the index and columns of ``df``.
    # TODO: should vectorize to apply over all columns?
    result = f(df.values, *args)
    return Chromatogram(result, df.index, df.columns)