def test_compress():
    """
    Check that ``Trace.compress`` output, once base64-encoded,
    matches a known reference string.
    """
    values = np.array([10, 20, 30, 40, 50])
    times = np.array([1, 2, 3, 4, 5])
    trace = Trace(values, times, name='X')

    # reference payload; split in two only to keep the lines short
    expected = (
        'eJxjZWBgEAHiaKUIpVgGhgZ7INsBiIC4'
        'AYgXADEIqEBpEyhtB6VdoLSnAwAZfgbw'
    )
    encoded = base64.b64encode(trace.compress()).decode('ascii')
    assert encoded == expected
def setUp(self):
    """
    Build the 'box' fixture peak from a single component and store
    it on ``self.peak``.
    """
    # box peak: a signal of ten ones indexed 0..9
    signal = Trace(np.ones(10), np.arange(10), name=1)
    base = Trace([0, 9], [0, 0], name=1)
    component = PeakComponent({}, signal, base)
    self.peak = Peak('box', components=component)
def test_add():
    """
    Adding two traces built on the same index must sum the values
    element-wise and leave the index unchanged.
    """
    index = np.array([1, 2, 3, 4, 5])
    first = Trace(np.array([10, 20, 30, 40, 50]), index, name='X')
    second = Trace(np.array([11, 21, 31, 41, 51]), index, name='X')

    total = first + second

    expected_values = np.array([21, 41, 61, 81, 101])
    expected_index = np.array([1, 2, 3, 4, 5])
    assert np.equal(total.values, expected_values).all()
    assert np.equal(total.index, expected_index).all()
def setUp(self):
    """
    Build the 'gaussian' fixture peak from a single component and
    store it on ``self.peak``.
    """
    # gaussian peak: ten symmetric samples, indexed 0..9
    heights = [
        0.043, 0.067, 0.094, 0.117, 0.131,
        0.131, 0.117, 0.094, 0.067, 0.043,
    ]
    signal = Trace(heights, np.arange(10), name=1)
    base = Trace([0, 9], [0, 0], name=1)
    component = PeakComponent({}, signal, base)
    self.peak = Peak('gaussian', components=component)
def trace(self, name='', tol=0.5, twin=None):
    """
    Look up a trace by name and return it.

    Parameters:
        name: trace name. Numbers (Python or numpy scalars) are
            converted with ``str``; anything else is lower-cased.
        tol: tolerance forwarded to ``self.data.trace`` for 2d lookups.
        twin: time window forwarded unchanged to whichever method
            ends up producing the trace.

    Returns:
        ``self.total_trace(twin)`` for 'tic', 'x' or '';
        ``self._trace(name, twin)`` if the name is in ``self.traces``;
        ``self.data.trace(name, tol, twin)`` if the file has a 2d
        trace (an entry of ``self.traces`` starting with '#');
        otherwise an empty ``Trace``.
    """
    # np.integer/np.floating cover every numpy scalar width; numpy
    # integers are not ``int`` subclasses and have no ``.lower()``
    if isinstance(name, (int, float, np.integer, np.floating)):
        name = str(name)
    else:
        name = name.lower()

    # name of the 2d trace, if it exists ('' when there is none)
    t2d = next((t[1:] for t in self.traces if t.startswith('#')), '')
    # clip out the starting 2d-trace name (e.g. 'ms') if present
    if t2d and name.startswith(t2d):
        name = name[len(t2d):]

    # this is the only string we handle; all others handled in subclasses
    if name in ['tic', 'x', '']:
        return self.total_trace(twin)
    if name in self.traces:
        return self._trace(name, twin)
    if t2d != '':
        # this file contains 2d data; find the trace in that
        return self.data.trace(name, tol, twin)
    return Trace()
def mrm_trace(self, parent=None, daughter=None, tol=0.5, twin=None):
    """
    Collect the 'TIC' value of every scan whose 'MzOfInterest'
    matches ``parent`` and whose 'BasePeakMZ' matches ``daughter``
    (each within ``tol``; a ``None`` filter matches everything).

    Scans before ``twin[0]`` are skipped and iteration stops at the
    first scan after ``twin[1]``. Returns a ``Trace`` named
    '<parent>→<daughter>'.
    """
    # TODO: should override `trace` and then call parent's `trace` method
    # if name is not an mrm trace
    if twin is None:
        twin = (-np.inf, np.inf)

    fields = [
        'ScanTime', 'SpectrumOffset', 'ByteCount', 'PointCount', 'MinX',
        'MaxX', 'BasePeakMZ', 'MzOfInterest', 'TIC'
    ]
    scan_times, intensities = [], []
    for scan in self._scan_iter(fields):
        # only the time, the two m/z values and the TIC are used
        t, _off, _bc, _pc, _minx, _maxx, d_mz, p_mz, z = scan
        if t < twin[0]:
            continue
        elif t > twin[1]:
            break
        if parent is not None and np.abs(parent - p_mz) > tol:
            continue
        if daughter is not None and np.abs(daughter - d_mz) > tol:
            continue
        scan_times.append(t)
        intensities.append(z)

    label = str(parent) + '→' + str(daughter)
    return Trace(np.array(intensities), np.array(scan_times), name=label)
def series_from_str(val, times, name=''):
    """
    Turn a string into a ``Trace`` sampled at ``times``.

    * 'x1:y1,x2:y2,...' is linearly interpolated onto ``times``
      (entries with a non-numeric key are ignored, except 'S', whose
      value is used to the left of the first point);
    * a single number gives a constant series;
    * anything else gives a series of NaN.
    """
    # TODO: generate this without needing the times? just the time length

    # we can store time-series data as a list of timepoints
    # in certain info fields and query it here
    def is_num(x):
        # a value counts as numeric if float() accepts it
        try:
            float(x)
        except ValueError:
            return False
        return True

    if ',' in val:
        # map each 'key:value' entry into a dictionary
        tpts = dict(tpt.split(':') for tpt in val.split(','))
        # keep only the numeric keys, then sort the points by time
        numeric_keys = [k for k in tpts if is_num(k)]
        x = np.array([float(k) for k in numeric_keys])
        y = np.array([float(tpts[k]) for k in numeric_keys])
        order = np.argsort(x)
        # a "S"tart value, when defined, fills everything before x[0]
        left = float(tpts['S']) if 'S' in tpts else None
        d = np.interp(times, x[order], y[order], left=left)
    elif is_num(val):
        d = np.full(times.shape, float(val))
    else:
        d = np.full(times.shape, np.nan)
    return Trace(d, times, name=name)
def basemz(df):
    """
    The mz of the most abundant ion.
    """
    # column position of the largest value in each row
    strongest = df.values.argmax(axis=1)
    mzs = np.array(df.columns)
    return Trace(mzs[strongest], df.index, name='basemz')
def total_trace(self, twin=None):
    """
    Read one (time, total) pair per scan from the binary file at
    ``self.filename`` and return them as a ``Trace`` named 'TIC'.

    Times are the stored big-endian unsigned ints divided by 60000.
    ``twin`` is accepted but currently ignored. The file is always
    closed, even if parsing fails part-way.
    """
    # TODO: use twin?
    with open(self.filename, 'rb') as f:
        # get number of scans to read in
        f.seek(0x5)
        # NOTE(review): a 4-byte binary read is compared with the
        # 2-character 'GC', which looks like it can never match (so
        # 0x118 is always used); confirm the intended header check
        if f.read(4) == 'GC':
            f.seek(0x142)
        else:
            f.seek(0x118)
        nscans = struct.unpack('>H', f.read(2))[0]

        # find the starting location of the data
        f.seek(0x10A)
        f.seek(2 * struct.unpack('>H', f.read(2))[0] - 2)

        tme = np.zeros(nscans)
        tic = np.zeros(nscans)
        for i in range(nscans):
            # each record starts with its own length in 2-byte words
            npos = f.tell() + 2 * struct.unpack('>H', f.read(2))[0]
            tme[i] = struct.unpack('>I', f.read(4))[0] / 60000.
            # the total is stored in the last four bytes of the record
            f.seek(npos - 4)
            tic[i] = struct.unpack('>I', f.read(4))[0]
            f.seek(npos)
    return Trace(tic, tme, name='TIC')
def total_trace(self, twin=None):
    """
    Decode the delta-compressed signal stored from offset 0x400 of
    ``self.filename`` and return it as a ``Trace`` named 'TIC'.

    The values are returned as a single-column 2d array; times start
    at the value read from 0x11A (divided by 60000) and advance by a
    fixed step of 0.2/60. ``twin`` is accepted but not used. The file
    is always closed, even if parsing fails part-way.
    """
    with open(self.filename, 'rb') as f:
        f.seek(0x11A)
        start_time = struct.unpack('>f', f.read(4))[0] / 60000.
        # end_time = struct.unpack('>f', f.read(4))[0] / 60000.
        # end_time 0x11E '>i'

        # FIXME: why is there this del_ab code here?
        # f.seek(0x284)
        # del_ab = struct.unpack('>d', f.read(8))[0]
        data = []

        f.seek(0x400)
        delt = 0
        while True:
            try:
                inp = struct.unpack('>h', f.read(2))[0]
            except struct.error:
                # ran out of bytes: end of data
                break

            if inp == 32767:
                # marker: an absolute value follows and the running
                # delta is reset
                inp = struct.unpack('>i', f.read(4))[0]
                inp2 = struct.unpack('>H', f.read(2))[0]
                delt = 0
                data.append(inp * 65534 + inp2)
            else:
                # NOTE(review): raises IndexError if the stream does
                # not start with an absolute value
                delt += inp
                data.append(data[-1] + delt)

    # TODO: the 0.2/60.0 step should be obtained from the file???
    times = np.array(start_time + np.arange(len(data)) * (0.2 / 60.0))
    # times = np.linspace(start_time, end_time, data.shape[0])
    return Trace(np.array([data]).T, times, name='TIC')
def fft(ts):
    """
    Perform a fast-fourier transform on a Trace

    The result holds the magnitude of the transform divided by the
    number of samples, indexed by frequency; both are shifted so the
    zero frequency sits in the middle. The sample spacing is taken
    from the first two index values.
    """
    n_samples = len(ts.values)
    spacing = ts.index[1] - ts.index[0]
    spectrum = np.fft.fftshift(np.fft.fft(ts.values))
    amplitude = np.abs(spectrum) / n_samples
    freqs = np.fft.fftshift(np.fft.fftfreq(len(amplitude), d=spacing))
    return Trace(amplitude, freqs)
def _trace(self, name, twin):
    """
    Read the trace called ``name`` from the REG file at
    ``self.filename`` and return it restricted to ``twin``.

    The record title is looked up in ``self._tr_names``. If the
    parsed result has no 'Trace' entry an empty ``Trace`` is
    returned. The file is closed once parsing is finished.
    """
    # TODO: read info from new style REG files
    # NOTE(review): assumes read_multireg_file reads everything it
    # needs before returning -- confirm
    with open(self.filename, 'rb') as f:
        df = read_multireg_file(f, title=self._tr_names[name])
    if 'Trace' not in df:
        return Trace([], [])
    ts = df['Trace']
    ts.name = name
    return ts.twin(twin)
def data(self):
    """
    Read every scan from ``self.filename`` and return them as a
    ``Chromatogram`` backed by a sparse CSR matrix (one row per scan,
    one column per distinct integer-truncated ion).

    Scan times are divided by 60. The file is always closed, even if
    parsing fails part-way.
    """
    # convenience function for reading in data
    def rd(f, st):
        return struct.unpack(st, f.read(struct.calcsize(st)))

    with open(self.filename, 'rb') as f:
        nscans = rd(f, 'ii')[1]
        if nscans == 0:
            # NOTE(review): this branch assigns to self.data and
            # returns None instead of returning the empty object;
            # kept as-is, confirm what callers expect
            self.data = Trace(np.array([]), np.array([]), [])
            return
        times = np.array(rd(f, nscans * 'd')) / 60.0
        f.seek(f.tell() + 4)  # number of scans again

        # set up the array of column indices
        indptr = np.empty(nscans + 1, dtype=int)
        indptr[0] = 0

        # first pass: figure out the total number of points
        dpos = f.tell()
        tot_pts = 0
        for scn in range(nscans):
            npts = rd(f, 'i')[0]
            # skip npts floats, one int and npts more floats
            f.seek(f.tell() + 8 * npts + 4)
            tot_pts += npts
            indptr[scn + 1] = tot_pts
        f.seek(dpos)

        # second pass: read the ions and abundances themselves
        ions = []
        i_lkup = {}
        idxs = np.empty(tot_pts, dtype=int)
        vals = np.empty(tot_pts, dtype=float)
        for scn in range(nscans):
            npts = rd(f, 'i')[0]
            rd_ions = rd(f, npts * 'f')
            f.seek(f.tell() + 4)  # number of points again
            abun = rd(f, npts * 'f')

            # give every ion not seen before the next free column
            new_ions = {int(i) for i in rd_ions if int(i) not in i_lkup}
            for ion in new_ions:
                i_lkup[ion] = len(ions)
                ions.append(ion)

            start, stop = indptr[scn], indptr[scn + 1]
            idxs[start:stop] = [i_lkup[int(i)] for i in rd_ions]
            vals[start:stop] = abun

    data = scipy.sparse.csr_matrix((vals, idxs, indptr),
                                   shape=(nscans, len(ions)), dtype=float)
    return Chromatogram(data, times, ions)
def mzminus(df, minus=0, noise=10000):
    """
    The abundances of ions which are minus below the molecular ion.

    The molecular ion of each row is the largest column label whose
    value exceeds ``noise``. Values in columns within 1 of
    (molecular ion - ``minus``) are summed per row and returned as a
    ``Trace`` named 'm-<minus>'.
    """
    mol_ions = ((df.values > noise) * df.columns).max(axis=1) - minus
    # a target below zero is clamped to zero (the previous test,
    # ``np.abs(mol_ions) < 0``, could never be true)
    mol_ions[mol_ions < 0] = 0

    # boolean mask of the columns lying within 1 of each row's target
    all_mzs = np.ones(df.shape) * df.columns
    targets = mol_ions[np.newaxis].T * np.ones(df.shape)
    near_target = np.abs(all_mzs - targets) < 1

    d = (df.values * near_target).sum(axis=1)
    return Trace(d, df.index, name='m-' + str(minus))
def total_trace(self, twin=None):
    """
    Build the 'TIC' ``Trace`` from the 'totIonCurrent' and
    'retentionTime' attributes of every scan element in the XML file
    at ``self.filename``. ``twin`` is not used yet.
    """
    # TODO: use twin
    # TODO: only get the scans with totIonCurrent; if none found
    # calculate from the data
    root = Et.parse(self.filename).getroot()
    scans = root.findall('*//m:scan', namespaces=self.ns)

    currents = [float(scan.get('totIonCurrent')) for scan in scans]
    minutes = [t_to_min(scan.get('retentionTime')) for scan in scans]
    return Trace(np.array(currents), np.array(minutes), name='TIC')
def loads(ast_str):
    """
    Create a Trace from a suitably compressed string.

    After zlib decompression the payload is: two little-endian
    unsigned 32-bit lengths (name bytes, index bytes), the UTF-8
    name, the index as raw float64 and then the values as raw
    float64.
    """
    data = zlib.decompress(ast_str)
    li, lt = struct.unpack('<LL', data[:8])
    name_end = 8 + li
    index_end = name_end + lt

    n = data[8:name_end].decode('utf-8')
    # np.fromstring is deprecated for binary input; frombuffer gives a
    # read-only view, so copy to keep the arrays writable as before
    t = np.frombuffer(data[name_end:index_end]).copy()
    d = np.frombuffer(data[index_end:]).copy()
    return Trace(d, t, name=n)
def total_trace(self, twin=None):
    """
    Read the total ion chromatogram from the XML file at
    ``self.filename``.

    Looks for the parent of the cvParam with accession MS:1000235;
    if there is none, returns None. Otherwise decodes the arrays
    tagged MS:1000595 (index) and MS:1000515 (values) with
    ``self.read_binary`` and returns them as a ``Trace`` named 'tic'.
    ``twin`` is not used.
    """
    root = Et.parse(self.filename).getroot()

    # get it from the chromatogram list
    chrom = root.find('.//m:cvParam[@accession="MS:1000235"]/..',
                      namespaces=self.ns)
    if chrom is None:
        return None

    index_query = './/m:cvParam[@accession="MS:1000595"]/..'
    index = self.read_binary(chrom.find(index_query, namespaces=self.ns))
    values_query = './/m:cvParam[@accession="MS:1000515"]/..'
    values = self.read_binary(chrom.find(values_query, namespaces=self.ns))
    return Trace(values, index, name='tic')
def total_trace(self, twin=None):
    """
    Collect the ('ScanTime', 'TIC') pairs yielded by
    ``self._scan_iter`` into a ``Trace`` named 'TIC'.

    Scans before ``twin[0]`` are skipped and iteration stops at the
    first scan after ``twin[1]``; with no window every scan is kept.
    """
    if twin is None:
        twin = (-np.inf, np.inf)

    points = []
    for t, z in self._scan_iter(['ScanTime', 'TIC']):
        if t < twin[0]:
            continue
        if t > twin[1]:
            break
        points.append((t, z))

    scan_times = [t for t, _ in points]
    totals = [z for _, z in points]
    return Trace(np.array(totals), np.array(scan_times), name='TIC')
def total_trace(self, twin=None):
    """
    Sum the abundances (``abn``) of each scan from ``self.scans``
    into a ``Trace`` named 'tic'; the scan time is ``float(name)``
    of the scan.

    Scans before ``twin[0]`` are skipped and iteration stops at the
    first scan after ``twin[1]``.
    """
    if twin is None:
        twin = (-np.inf, np.inf)

    points = []
    for scan in self.scans(twin):
        t = float(scan.name)
        if t < twin[0]:
            continue
        if t > twin[1]:
            break
        points.append((t, sum(scan.abn)))

    times = [t for t, _ in points]
    y = [total for _, total in points]
    return Trace(y, times, name='tic')
def generate_chromatogram(n=5, twin=None):
    """
    Generates a trace with n gaussian peaks distributed through it.

    Parameters:
        n: number of peaks.
        twin: (start, end) of the time axis; defaults to (0, 60).

    Returns a ``Trace`` of 300 evenly spaced points: the sum of the
    peaks plus normally distributed noise (scale 0.01). Peak widths
    and heights are drawn uniformly from [0.2, 1.0).
    """
    if twin is None:
        twin = (0, 60)
    t = np.linspace(twin[0], twin[1], 300)

    # spread the peaks over the whole window, not just [0, twin[1])
    peak_locs = twin[0] + (twin[1] - twin[0]) * np.random.random(n)
    peak_ws = 0.2 + 0.8 * np.random.random(n)
    peak_hs = 0.2 + 0.8 * np.random.random(n)

    y = np.zeros(len(t))
    for peak_loc, peak_w, peak_h in zip(peak_locs, peak_ws, peak_hs):
        y += gaussian(t, x=peak_loc, w=peak_w, h=peak_h)
    y += np.random.normal(scale=0.01, size=len(t))
    return Trace(y, t, ['X'])
def total_trace(self, twin=None):
    """
    Read the signal stored as little-endian float64 from offset
    0x1800 of ``self.filename`` and return it as a ``Trace`` named
    'TIC'.

    Times are spaced evenly between the start and end values read
    from 0x11A (each divided by 60000). ``twin`` is not used. The
    file is closed when reading is done (it was previously left
    open).
    """
    with open(self.filename, 'rb') as f:
        f.seek(0x11A)
        start_time = struct.unpack('>f', f.read(4))[0] / 60000.
        end_time = struct.unpack('>f', f.read(4))[0] / 60000.

        # TODO: figure out if this exists and where?
        # FID signal seems like 10x higher than it should be?
        # f.seek(0x284)
        # del_ab = 0.1  # struct.unpack('>d', f.read(8))[0]
        # data = []

        f.seek(0x1800)
        data = np.fromfile(f, '<f8')

    times = np.linspace(start_time, end_time, data.shape[0])
    return Trace(data, times, name='TIC')
def trace(self, name='', tol=0.5, twin=None):
    """
    Extract a single-ion trace from the scans.

    Parameters:
        name: 'tic', 'x' or '' for the total trace; otherwise the
            target m/z as a number or numeric string.
        tol: abundances whose m/z is strictly within ``tol`` of the
            target are summed for each scan.
        twin: (start, end) time window; defaults to everything.

    Returns ``self.total_trace(twin)`` for the total-trace names, an
    empty ``Trace`` if ``name`` cannot be read as a number, and
    otherwise a ``Trace`` of per-scan sums labelled with ``name``.
    """
    if twin is None:
        twin = (-np.inf, np.inf)
    if name in {'tic', 'x', ''}:
        return self.total_trace(twin)

    # callers may hand the m/z over as a string (e.g. '44'), which
    # cannot be subtracted from the scan m/z values directly
    try:
        mz = float(name)
    except (TypeError, ValueError):
        return Trace()

    times, y = [], []
    for s in self.scans(twin):
        t = float(s.name)
        if t < twin[0]:
            continue
        if t > twin[1]:
            break
        times.append(t)
        # TODO: this can be vectorized with numpy?
        y.append(sum(j for i, j in zip(s.x, s.abn) if np.abs(i - mz) < tol))
    return Trace(y, times, name=name)
def read_mh_trace(filename, trace_name):
    """
    Read one instrument trace from a pair of files: the index file
    ``filename`` and the data file with the same name but its last
    three characters replaced by '.cg'.

    Parameters:
        filename: path of the index file.
        trace_name: one of 'pres', 'flow', 'slvb' or 'temp'.

    Returns a ``Trace`` named ``trace_name``, or None if the index
    file lists no trace with the matching title. Values in 'bar' are
    multiplied by 0.1 (to MPa).

    Raises KeyError for an unknown ``trace_name``.
    """
    ttab = {
        'pres': 'Pressure',
        'flow': 'Flow',
        'slvb': '%B',
        'temp': 'Temperature of Left Heat Exchanger'
    }
    wanted = ttab[trace_name]

    with open(filename, 'rb') as f, \
            open(filename[:-3] + '.cg', 'rb') as fdat:
        # convenience function for reading in data
        def rd(st):
            return struct.unpack(st, f.read(struct.calcsize(st)))

        f.seek(0x4c)
        num_traces = rd('<I')[0]
        for _ in range(num_traces):
            cloc = f.tell()
            f.seek(cloc + 2)
            sl = rd('<B')[0]
            # struct 's' fields are bytes; decode before comparing to str
            cur_trace_name = rd('<' + str(sl) + 's')[0].decode('iso8859')
            if wanted == cur_trace_name:
                f.seek(f.tell() + 4)
                foff = rd('<Q')[0]
                npts = rd('<I')[0] + 2  # +2 for the extra time info
                fdat.seek(foff)
                pts = struct.unpack('<' + npts * 'd', fdat.read(8 * npts))

                # TODO: pts[0] is not the true offset?
                t = pts[0] + pts[1] * np.arange(npts - 2)
                d = np.array(pts[2:])

                # get the units
                f.seek(f.tell() + 40)
                sl = rd('<B')[0]
                y_units = rd('<' + str(sl) + 's')[0].decode('iso8859')
                if y_units == 'bar':
                    d *= 0.1  # convert to MPa for metricness
                elif y_units == '':
                    pass  # TODO: ul/min to ml/min
                return Trace(d, t, name=trace_name)

            # jump to the next fixed-size (87 byte) index record
            f.seek(cloc + 87)
    return None
def generate_gaussian():
    """
    Return a ``Trace`` holding one gaussian (x=30, w=2, h=1)
    evaluated at 300 evenly spaced points from 0 to 60.
    """
    times = np.linspace(0, 60, 300)
    signal = gaussian(times, x=30, w=2, h=1)
    return Trace(signal, times, ['X'])
def read_reg_file(f, foff=0x2D):
    """
    Given a file handle for an old-style Agilent *.REG file, this
    will parse that file into a dictionary of key/value pairs
    (including any tables that are in the *.REG file, which will
    be parsed into lists of lists).

    Parameters:
        f: binary file handle; it is repositioned with ``seek``.
        foff: offset of the record table.

    Returns an empty dict if the byte at 0x19 is not b'A'. Otherwise
    the records are walked twice: the first pass collects referenced
    strings and arrays into ``names`` (keyed by the record's last
    header field), the second builds the result from the property,
    table and trace records, resolving references through ``names``.
    """
    # convenience function for reading in data
    def rd(st):
        return struct.unpack(st, f.read(struct.calcsize(st)))

    f.seek(0x19)
    if f.read(1) != b'A':
        # raise TypeError("Version of REG file is too new.")
        return {}

    f.seek(foff)
    nrecs = rd('<I')[0]  # TODO: should be '<H'
    # one 20-byte header per record; r[1] is the record type, r[2] the
    # payload length and r[4] the id other records refer to
    rec_tab = [rd('<HHIII') for n in range(nrecs)]

    # first pass: gather everything that can be referenced
    names = {}
    f.seek(foff + 20 * nrecs + 4)
    for r in rec_tab:
        d = f.read(r[2])
        if r[1] == 1539:  # '0306'
            # this is part of the linked list too, but contains a
            # reference to a table
            cd = struct.unpack('<HIII21sI', d)
            names[cd[5]] = cd[4].decode('iso8859').strip('\x00')
            # except:
            #     pass
        elif r[1] == 32769 or r[1] == 32771:  # b'0180' or b'0380'
            names[r[4]] = d[:-1].decode('iso8859')
        elif r[1] == 32774:  # b'0680'
            # this is a string that is referenced elsewhere (in a table)
            names[r[4]] = d[2:-1].decode('iso8859')
        elif r[1] == 32770:  # b'0280'
            # this is just a flattened numeric array
            names[r[4]] = np.frombuffer(d, dtype=np.uint32, offset=4)

    # second pass: rewind to the first payload and build the result
    data = {}
    f.seek(foff + 20 * nrecs + 4)
    for r in rec_tab:
        d = f.read(r[2])
        if r[1] == 1538:  # '0206'
            # this is part of a linked list
            if len(d) == 43:
                cd = struct.unpack('<HIII21sd', d)
                data[cd[4].decode('iso8859').strip('\x00')] = cd[5]
            else:
                pass
        elif r[1] == 1537:  # b'0106'
            # name of property
            n = d[14:30].split(b'\x00')[0].decode('iso8859')
            # with value from names
            data[n] = names.get(struct.unpack('<I', d[35:39])[0], '')
        elif r[1] == 1793:  # b'0107'
            # this is a table of values
            nrow = struct.unpack('<H', d[4:6])[0]
            ncol = struct.unpack('<H', d[16:18])[0]
            if ncol != 0:
                # 30-byte column descriptors start at byte 20
                cols = [
                    struct.unpack('<16sHHHHHI', d[20 + 30 * i:50 + 30 * i])
                    for i in range(ncol)
                ]
                colnames = [
                    c[0].split(b'\x00')[0].decode('iso8859') for c in cols
                ]
                # TODO: type 2 is not a constant size? 31, 17
                # column type code -> struct format; unknown codes are
                # read as a byte string of the declared length
                rty2sty = {
                    1: 'H',
                    3: 'I',
                    4: 'f',
                    5: 'H',
                    7: 'H',
                    8: 'd',
                    11: 'H',
                    12: 'H',
                    13: 'I',
                    14: 'I',
                    16: 'H'
                }
                coltype = '<' + ''.join(
                    [rty2sty.get(c[3], str(c[2]) + 's') for c in cols])
                lencol = struct.calcsize(coltype)
                tab = []
                # rows are sliced from the end of the payload, leaving
                # the final row-sized chunk unread
                for i in reversed(range(2, nrow + 2)):
                    rawrow = struct.unpack(coltype,
                                           d[-i * lencol:(1 - i) * lencol])
                    row = []
                    for j, p in enumerate(rawrow):
                        # type 3 cells are references into names
                        if cols[j][3] == 3:
                            row.append(names.get(p, str(p)))
                        else:
                            row.append(p)
                    tab.append(row)
                data[names[r[4]]] = [colnames, tab]
        elif r[1] == 1281 or r[1] == 1283:  # b'0105' or b'0305'
            fm = '<HHBIIhIdII12shIddQQB8sII12shIddQQB8s'
            m = struct.unpack(fm, d)
            # NOTE: this rebinds the record count; safe only because
            # it is not needed again after the seek above
            nrecs = m[4]  # number of points in table

            # x_units = names.get(m[8], '')
            x_arr = m[14] * names.get(m[9], np.arange(nrecs - 1))
            # NOTE(review): names.get(m[20]) is None if the array is
            # missing, which would make this multiplication fail
            y_arr = m[25] * names.get(m[20])
            y_units = names.get(m[19], '')
            if y_units == 'bar':
                y_arr *= 0.1  # convert to MPa
            # TODO: what to call this?
            data['Trace'] = Trace(y_arr, x_arr, name='')
        # elif r[1] == 1025:  # b'0104'
        #     # lots of zeros? maybe one or two numbers?
        #     # only found in REG entries that have long 0280 records
        #     fm = '<HQQQIHHHHIIHB'
        #     m = struct.unpack(fm, d)
        #     print(m)
        #     #print(r[1], len(d), binascii.hexlify(d))
        #     pass
        # elif r[1] == 512:  # b'0002'
        #     # either points to two null pointers or two other pointers
        #     # (indicates start of linked list?)
        #     print(r[1], len(d), binascii.hexlify(d))
        # elif r[1] == 769 or r[1] == 772:  # b'0103' or b'0403'
        #     # points to 2nd, 3rd & 4th records (two 0002 records and a 0180)
        #     b = binascii.hexlify
        #     print(b(d[10:14]), b(d[14:18]), b(d[18:22]))
    return data
def trace_resolver(istr, analyses, twin=None):
    """
    Resolve one trace token against the data file of the matching
    analysis.

    ``token_source`` splits ``istr`` into a token and a source; with
    no source an empty ``Trace`` is returned. The data file comes
    from the first analysis listing that source. 'm_<n>', 'molmz'
    and 'basemz' are computed from the data file's 2d data;
    'coda', 'rnie', 'wmsm', 'r45std' and 'r46std' are not implemented
    and give None; anything else goes to the data file's ``trace``
    with a tolerance parsed from 'lo:hi' or 'mz±tol' (default 0.5).
    """
    def sources_of(analysis):
        # trace names of an analysis with the '#'/'*' markers removed
        return [s.lstrip('#*') for s in analysis.trace.split(',')]

    avail_sources = [s for a in analyses for s in sources_of(a)]
    istr, source = token_source(istr, avail_sources)
    if source is None:
        return Trace()

    df = next((a.datafile for a in analyses if source in sources_of(a)),
              None)

    if istr in {'coda', 'rnie', 'wmsm'}:
        # TODO: allow more complicated options to turn
        # Chromatograms into plotable Traces

        # coda
        # Windig W: The use of the Durbin-Watson criterion for
        # noise and background reduction of complex liquid
        # chromatography/mass spectrometry data and a new algorithm
        # to determine sample differences. Chemometrics and
        # Intelligent Laboratory Systems. 2005, 77:206-214.

        # rnie
        # Yunfei L, Qu H, and Cheng Y: A entropy-based method
        # for noise reduction of LC-MS data. Analytica Chimica
        # Acta 612.1 (2008)

        # wmsm
        # Fleming C et al. Windowed mass selection method:
        # a new data processing algorithm for LC-MS data.
        # Journal of Chromatography A 849.1 (1999) 71-85.
        return None

    if istr.startswith('m_'):
        m = 0.0 if istr == 'm_' else float(istr.split('_')[1])
        return mzminus(df.data, m)

    if istr == 'molmz':
        return molmz(df.data)

    if istr == 'basemz':
        return basemz(df.data)

    if istr in {'r45std', 'r46std'}:
        # TODO: calculate isotopic data -> needs integrated peak
        # references of associated peaks in order to make these
        # calculations. An earlier draft fitted a straight line (least
        # squares) through area(45 or 46) / area(44) of the peaks typed
        # 'Isotope Standard' against their times.
        return None

    # interpret tolerances
    if ':' in istr:
        # 'lo:hi' -> centre of the range, half its width as tolerance
        st, en = (float(part) for part in istr.split(':')[:2])
        tol = 0.5 * (en - st)
        target = 0.5 * (en + st)
    elif u'±' in istr:
        parts = istr.split(u'±')
        tol = float(parts[1])
        target = float(parts[0])
    else:
        tol = 0.5
        target = istr
    return df.trace(target, tol, twin=twin)
def molmz(df, noise=10000):
    """
    The mz of the molecular ion.

    For each row this is the largest column label whose value
    exceeds ``noise``.
    """
    above_noise = df.values > noise
    heaviest = (above_noise * df.columns).max(axis=1)
    return Trace(heaviest, df.index, name='molmz')
def parse_ion_string(istr, analyses, twin=None):
    """
    Recursive string parser that handles "ion" strings.

    Parameters:
        istr: the expression; may contain a leading sign, one level
            of parentheses, the operators / * + - ^, shortcuts from
            ``SHORTCUTS`` and constants written as '!<number>',
            '!pi' or '!e'.
        analyses, twin: passed through to ``trace_resolver``.

    Returns a ``Trace`` (empty for a blank string or one made only
    of '+' signs), or a float for a constant.

    Raises Exception if no rule applies.
    """
    if istr.strip() == '':
        return Trace()

    # remove (unnecessary?) pluses from the front
    # TODO: plus should be abs?
    istr = istr.lstrip('+')
    if not istr:
        # the string held nothing but plus signs
        return Trace()

    # invert it if preceded by a minus sign
    if istr[0] == '-':
        return -parse_ion_string(istr[1:], analyses, twin)

    # this is a function or parenthesized expression
    if is_parans_exp(istr):
        # NOTE: unbalanced parentheses are not detected; without a ')'
        # the argument slice below simply drops the last character
        fxn = istr.split('(')[0]
        args = istr[istr.find('(') + 1:istr.find(')')].split(',')
        if fxn == '':
            # strip out the parentheses and continue
            istr = args[0]
        else:
            ts = parse_ion_string(args[0], analyses, twin)
            # FIXME: the function itself is not applied yet
            return ts
            # return fxn_resolver(ts, fxn, *args[1:])

    # all the complicated math is gone, so simple lookup
    if set(istr).isdisjoint('+-/*()'):
        if istr in SHORTCUTS:
            # allow some shortcuts to pull out common ions
            return parse_ion_string(SHORTCUTS[istr], analyses, twin)
        # startswith (rather than istr[0]) also copes with the empty
        # string left behind by '()'
        if istr.startswith('!') and \
                all(i in '0123456789.' for i in istr[1:]):
            # TODO: should this handle negative numbers?
            return float(istr[1:])
        if istr == '!pi':
            return np.pi
        if istr == '!e':
            return np.e
        return trace_resolver(istr, analyses, twin)

    # go through and handle operators
    for token in '/*+-^':
        ts = tokenize(istr, token)
        if len(ts) == 1:
            continue
        s = parse_ion_string(ts[0], analyses, twin)
        for t in ts[1:]:
            operand = parse_ion_string(t, analyses, twin)
            if token == '/':
                s /= operand
            elif token == '*':
                s *= operand
            elif token == '+':
                s += operand
            elif token == '-':
                s -= operand
            elif token == '^':
                s **= operand
        return s
    raise Exception('Parser hit a point it shouldn\'t have!')
def ratio_series(ts, pks, r2, r1):
    """
    Evaluate the function returned by ``ratio_f(pks, r2, r1)`` at
    every index value of ``ts`` and return the result as a ``Trace``
    on the same index, named '<r2>/<r1>' with one decimal each.
    """
    ratio_at = ratio_f(pks, r2, r1)
    values = ratio_at(ts.index)
    label = '{:.1f}/{:.1f}'.format(r2, r1)
    return Trace(values, ts.index, name=label)