def filter_fraction(pm: PeakMatrix, fraction_threshold: float, within_classes: bool = False,
                    class_tag_type: Any = None, flag_name: str = 'fraction_flag'):
    """
    PeakMatrix fraction filter.

    :param pm: the target peak matrix
    :param fraction_threshold: threshold of the sample fractions
    :param within_classes: whether to calculate the fraction array within each class. Default = False
    :param class_tag_type: tag type to unmask samples within the same class (e.g. "classLabel"). Default = None
    :param flag_name: name of the new flag. Default = 'fraction_flag'
    :rtype: PeakMatrix object

    This filter will calculate the fraction array over all samples or within each class (based on class_tag_type).
    The peaks with a fraction value smaller than the threshold will be unflagged.

    """
    if not within_classes:
        pm.add_flag(flag_name, pm.fraction >= fraction_threshold)
    else:
        if class_tag_type is None:
            raise KeyError('must provide class tag type for within-classes filtering')
        if not all([t.has_tag_type(class_tag_type) for t in pm.peaklist_tags]):
            raise AttributeError('not all tags have tag type [%s]' % class_tag_type)
        flg = np.zeros(pm.shape[1])
        for tag in pm.tags_of(class_tag_type):
            with unmask_peakmatrix(pm, tag) as m:
                flg = np.logical_or(flg, (m.fraction >= fraction_threshold))
        pm.add_flag(flag_name, flg)
    return pm
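# Usage sketch (not part of the library): `pm` is assumed to be a PeakMatrix
# built elsewhere (e.g. by align_peaks) whose samples carry a "classLabel" tag
# type; the threshold values are illustrative.
def _example_filter_fraction(pm: PeakMatrix) -> PeakMatrix:
    # keep peaks detected in at least half of all samples ...
    pm = filter_fraction(pm, fraction_threshold=0.5)
    # ... or keep a peak if it reaches 50% presence within any single class
    return filter_fraction(pm, 0.5, within_classes=True, class_tag_type='classLabel')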
def filter_rsd(pm: PeakMatrix, rsd_threshold: Union[int, float], qc_tag: Any,
               on_attr: str = 'intensity', flag_name: str = 'rsd_flag'):
    """
    PeakMatrix RSD filter.

    :param pm: the target peak matrix
    :param rsd_threshold: threshold of the RSD of the QC samples
    :param qc_tag: tag (label) to unmask QC samples
    :param on_attr: calculate RSD on the given attribute. Default = "intensity"
    :param flag_name: name of the new flag. Default = 'rsd_flag'
    :rtype: PeakMatrix object

    This filter will calculate the RSD values of the QC samples. A peak with a QC RSD value larger than the
    threshold will be unflagged.

    """
    rsd_values = pm.rsd(qc_tag, on_attr=on_attr)
    if np.any(np.isnan(rsd_values)):
        logging.warning('NaN found in QC RSD values, filter might not work properly')
    pm.add_flag(flag_name, [not (np.isnan(v) or v > rsd_threshold) for v in rsd_values])
    return pm
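# Usage sketch (not part of the library): assumes QC samples are tagged "qc";
# the 30% RSD cut-off is illustrative.
def _example_filter_rsd(pm: PeakMatrix) -> PeakMatrix:
    # unflag peaks whose RSD across the QC samples exceeds 30%
    return filter_rsd(pm, rsd_threshold=30, qc_tag='qc', on_attr='intensity')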
def load_peak_matrix_from_txt(filename: str, delimiter: str = '\t', samples_in_rows: bool = True,
                              comprehensive: str = 'auto'):
    """
    Loads a peak matrix from a plain text file.

    :param filename: path to an existing text-based peak matrix file
    :param delimiter: delimiter of the text lines. Default = '\t', i.e., TSV format
    :param samples_in_rows: whether or not the samples are stored in rows. Default = True
    :param comprehensive: whether the input is a 'comprehensive' or 'simple' version of the matrix. Default = 'auto', i.e., auto detect
    :rtype: PeakMatrix object

    """
    if not os.path.isfile(filename):
        raise IOError('plain text file [%s] does not exist' % filename)
    with open(filename, 'r') as f:
        rlns = [x for x in f.readlines() if x != '']
    dlns = [list(map(str.strip, x.split(delimiter))) for x in rlns]
    if any([len(x) != len(dlns[0]) for x in dlns[1:]]):
        raise IOError('data matrix size does not match')

    if samples_in_rows:
        dlns = list(zip(*dlns))
    if comprehensive == 'auto':
        comprehensive = ('flags' in dlns[0])
    rdlns = list(zip(*dlns))
    rsdrow = list(filter(lambda x: x[1][0] == 'rsd_all', enumerate(rdlns)))[0][0]

    def _parseflags():
        fgs = []
        for l, ln in enumerate(rdlns[rsdrow + 1:]):
            if ln[0] == 'flags':
                break
            fgs += [(ln[0], list(map(eval, [x for x in ln[1:] if x != ''])))]
        return fgs

    flgs = _parseflags() if comprehensive else []

    # must refactor if PeakMatrix.to_str changed
    pcol = rsdrow + len(flgs) + 2 if comprehensive else 1
    pids = dlns[0][pcol:]

    def _parsetags(tgs):
        l = 0
        for l, ln in enumerate(dlns[2:]):  # line 1 = missing
            if not ln[0].startswith('tags_'):
                break
            tn, tv = ln[0][5:], ln[pcol:]
            tl = [x for x in enumerate(_evalv(tv)) if x[1] != '']
            for i, v in tl:
                if tn == 'untyped':
                    tgs[i].add_tag(v)
                else:
                    tgs[i].add_tag(v, tn)
        return l, tgs

    tnum, tags = 0, [PeakList_Tags() for _ in pids]
    if comprehensive:
        tnum, tags = _parsetags(tags)

    rlns = list(zip(*dlns[2 + tnum:]))
    mz = np.array([rlns[0]] * len(pids), dtype=float)
    ints = np.array(rlns[pcol:], dtype=float)

    pm = PeakMatrix(pids, tags, [('mz', mz), ('intensity', ints)])
    for fn, fv in flgs:
        pm.add_flag(fn, fv, flagged_only=False)
    return pm
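# Usage sketch (not part of the library): the file path is illustrative; the
# file is expected in the layout written by PeakMatrix.to_str (see the pcol
# comment above).
def _example_load_txt() -> PeakMatrix:
    return load_peak_matrix_from_txt('peak_matrix.txt', delimiter='\t',
                                     samples_in_rows=True, comprehensive='auto')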
def load_peak_matrix_from_hdf5(filename: str, compatibility_mode: bool = False):
    """
    Loads a peak matrix from a HDF5 file.

    :param filename: path to an existing HDF5 file
    :param compatibility_mode: whether to load the file using the old (h5py-based) format. Default = False
    :rtype: PeakMatrix object

    """
    if not os.path.isfile(filename):
        raise IOError('HDF5 database [%s] does not exist' % filename)
    if not h5py.is_hdf5(filename):
        raise IOError('input file [%s] is not a valid HDF5 database' % filename)
    if compatibility_mode:
        logging.warning('DeprecationWarning: loading HDF5 file in the old format')

    f = h5py.File(filename, 'r') if compatibility_mode else ptb.open_file(filename, mode='r')

    def _old_loadpm():
        dset = f['mz']
        if _convByteStr(dset.attrs.get('class', '')) != 'PeakMatrix':
            raise IOError('input database is not a valid PeakMatrix')
        attl = dset.attrs['attributes'].astype(str)
        pids = dset.attrs['peaklist_ids'].astype(str)
        mask = dset.attrs['mask']

        tatt = sorted([x for x in dset.attrs.keys() if x.startswith('peaklist_tags_')], key=lambda x: int(x[14:]))
        ptgs = [PeakList_Tags(*[Tag(_eval(v), None if t == 'None' else t)
                                for t, v in map(lambda x: x.astype(str), tags)])
                for tags in [dset.attrs[x] for x in tatt]]

        flgs = [(fn, dset.attrs[fn]) for fn in dset.attrs['flag_names'].astype(str)]
        alst = [(attr, np.array(f[attr]).astype(f[attr].attrs['dtype'])) for attr in attl]
        return pids, ptgs, alst, mask, flgs

    def _loadpm():
        dset = f.root.mz
        if dset.attrs.data_class != 'PeakMatrix':
            raise IOError('input database is not a valid PeakMatrix')
        attl = dset.attrs.attributes
        pids = dset.attrs.peaklist_ids
        mask = dset.attrs.mask

        tatt = sorted([x for x in dset.attrs._f_list('user') if x.startswith('peaklist_tags_')],
                      key=lambda x: int(x[14:]))
        ptgs = [PeakList_Tags(*[Tag(_eval(v), None if t == 'None' else t)
                                for t, v in map(lambda x: x.astype(str), tags)])
                for tags in [dset.attrs[x] for x in tatt]]

        flgs = [(flg, dset.attrs[flg]) for flg in dset.attrs.flag_names]
        alst = [(attr, f.root[attr].read().astype(f.root[attr].attrs.dtype)) for attr in attl]
        return pids, ptgs, alst, mask, flgs

    res = (_old_loadpm if compatibility_mode else _loadpm)()
    f.close()

    pm = PeakMatrix(*res[:3])
    pm.mask = res[3]
    for fn, fv in res[4]:
        pm.add_flag(fn, fv, flagged_only=False)
    return pm
def load_peak_matrix_from_hdf5(filename):
    """
    Loads a peak matrix from a HDF5 file.

    :param filename: path to an existing HDF5 file
    :rtype: PeakMatrix object

    """
    if not os.path.isfile(filename):
        raise IOError('HDF5 database [%s] does not exist' % filename)
    if not h5py.is_hdf5(filename):
        raise IOError('input file [%s] is not a valid HDF5 database' % filename)
    f = h5py.File(filename, 'r')

    if 'mz' not in f:
        raise IOError('input database missing crucial attribute [mz]')
    dset = f['mz']
    if dset.attrs.get('class', '') != 'PeakMatrix':
        raise IOError('input database is not a valid PeakMatrix')

    attl = dset.attrs['attributes']
    pids = dset.attrs['peaklist_ids']
    mask = dset.attrs['mask']

    tatt = sorted(filter(lambda x: x.startswith('peaklist_tags_'), dset.attrs.keys()), key=lambda x: int(x[14:]))
    ptgs = [PeakList_Tags(*[Tag(_eval(v), None if t == 'None' else t) for t, v in tags])
            for tags in map(lambda x: dset.attrs[x], tatt)]

    # flag values may be stored in a packed layout; unpack via the matching helper
    flgs = [(fn, dset.attrs[fn]) for fn in dset.attrs['flag_names']]
    flgs = [(fn, _unpackBool(fv) if fv.dtype.kind == 'u' and np.all(fv[:len(_BOOL_HEADERS)] == _BOOL_HEADERS) else
             _unpackMeta(fv) if fv.dtype.kind == 'S' and fv[-1] == '\xFF' else
             fv) for fn, fv in flgs]

    alst = [(attr, np.array(f[attr]).astype(f[attr].attrs['dtype'])) for attr in attl]

    pm = PeakMatrix(pids, ptgs, alst)
    pm.mask = mask
    for fn, fv in flgs:
        pm.add_flag(fn, fv, flagged_only=False)
    return pm
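# Usage sketch (not part of the library): the path is illustrative. The
# PyTables-based loader takes compatibility_mode for legacy files; the
# h5py-based variant above reads the old layout directly.
def _example_load_hdf5() -> PeakMatrix:
    return load_peak_matrix_from_hdf5('peak_matrix.hdf5')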
def filter_blank_peaks(pm: PeakMatrix, blank_tag: Any, fraction_threshold: Union[int, float] = 1,
                       fold_threshold: Union[int, float] = 1, method: str = 'mean', rm_blanks: bool = True,
                       flag_name: str = 'blank_flag'):
    """
    PeakMatrix blank filter.

    :param pm: the target peak matrix
    :param blank_tag: tag (label) to mask blank samples, e.g. Tag("blank", "classLabel")
    :param fraction_threshold: threshold of the sample fractions. Default = 1
    :param fold_threshold: threshold of the blank sample intensity folds. Default = 1
    :param method: method to calculate the blank sample intensity array. Valid values include 'mean', 'median', and 'max'. Default = 'mean'
    :param rm_blanks: whether to remove (not mask) blank samples after filtering. Default = True
    :param flag_name: name of the new flag. Default = 'blank_flag'
    :rtype: PeakMatrix object

    This filter will calculate the intensity array of the blanks using "method", and compare it with the
    intensities of the other samples. A peak is unflagged if the blank intensity is positive and fewer than a
    fraction_threshold proportion of its sample intensities reach the blank intensity x fold_threshold.

    """
    if not any([blank_tag in x for x in pm.peaklist_tags]):
        raise ValueError('blank tag [%s] does not exist' % blank_tag)
    if method not in ('mean', 'median', 'max'):
        raise ValueError('filter method must be mean, median or max')

    with unmask_peakmatrix(pm, blank_tag) as m:
        ints = m.intensity_matrix[0] if m.shape[0] == 1 else \
               np.max(m.intensity_matrix, axis=0) if method == 'max' else \
               np.array([getattr(np, method)(x) for x in m.intensity_matrix.T])
        ints *= fold_threshold

    with mask_peakmatrix(pm, blank_tag) as m:
        failed_int = np.sum(m.intensity_matrix >= ints, axis=0) < (fraction_threshold * m.shape[0])
        m.add_flag(flag_name, ~((ints > 0) & failed_int))

    if rm_blanks:
        pm = pm.remove_samples(np.where([x.has_tag(blank_tag) for x in pm.peaklist_tags])[0])
    return pm
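# Usage sketch (not part of the library): assumes blank samples carry
# Tag('blank', 'classLabel'); the 10-fold cut-off is illustrative.
def _example_filter_blanks(pm: PeakMatrix) -> PeakMatrix:
    # unflag peaks whose sample intensities never reach 10x the mean blank
    # signal, then drop the blank samples themselves
    return filter_blank_peaks(pm, blank_tag=Tag('blank', 'classLabel'),
                              fold_threshold=10, method='mean', rm_blanks=True)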
def _createPeakMatrix():
    pids, tags = list(zip(*[
        ('sample_1_1', PeakList_Tags('sample', treatment='compound_1', time_point='1hr', plate=1, order=1)),
        ('sample_1_2', PeakList_Tags('sample', treatment='compound_1', time_point='6hr', plate=1, order=2)),
        ('QC_1', PeakList_Tags('qc', plate=1, order=3)),
        ('sample_2_1', PeakList_Tags('sample', treatment='compound_2', time_point='1hr', plate=2, order=1)),
        ('sample_2_2', PeakList_Tags('sample', treatment='compound_2', time_point='6hr', plate=2, order=2)),
        ('QC_2', PeakList_Tags('qc', plate=2, order=3)),
    ]))

    mzs = np.tile(np.arange(0, 1000, step=100, dtype=float) + 1, (6, 1))
    ints = np.arange(60, dtype=float).reshape((6, 10)) / 20.
    ics = np.array([[2] * 10] * 6)

    # simulate missing values
    for m in (mzs, ints, ics):
        np.fill_diagonal(m, 0)
        m[:, 2] = 0

    return PeakMatrix(pids, tags, [('mz', mzs), ('intensity', ints), ('intra_count', ics)])
def _createPeakMatrix():
    pids, tags = zip(*[
        ('sample_1_1', PeakList_Tags('sample', treatment='compound_1', time_point='1hr', plate=1, order=1)),
        ('sample_1_2', PeakList_Tags('sample', treatment='compound_1', time_point='6hr', plate=1, order=2)),
        ('QC_1', PeakList_Tags('qc', plate=1, order=3)),
        ('Blank_1', PeakList_Tags('blank', plate=1, order=4)),
        ('sample_2_1', PeakList_Tags('sample', treatment='compound_2', time_point='1hr', plate=2, order=1)),
        ('sample_2_2', PeakList_Tags('sample', treatment='compound_2', time_point='6hr', plate=2, order=2)),
        ('QC_2', PeakList_Tags('qc', plate=2, order=3)),
        ('Blank_2', PeakList_Tags('blank', plate=2, order=4)),
    ])

    mzs = np.tile(np.arange(0, 1000, step=100, dtype=float), (8, 1))
    ints = np.arange(80, dtype=float).reshape((8, 10)) / 20.
    ics = np.array([[1, 2] * 5] * 8)

    return PeakMatrix(pids, tags, (('mz', mzs), ('intensity', ints), ('intra_count', ics)))
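# Usage sketch (not part of the library): an illustrative pipeline tying the
# fixture above to the filters; all thresholds are arbitrary.
def _example_fixture_pipeline() -> PeakMatrix:
    pm = _createPeakMatrix()
    pm = filter_fraction(pm, fraction_threshold=0.5)
    pm = filter_blank_peaks(pm, blank_tag='blank', fold_threshold=2, rm_blanks=True)
    return filter_rsd(pm, rsd_threshold=30, qc_tag='qc')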
def align_peaks(peaks: Sequence[PeakList], ppm: float = 2.0, block_size: int = 5000, fixed_block: bool = True,
                edge_extend: Union[int, float] = 10, ncpus: Union[int, None] = None):
    """
    Cluster and align peaklists into a peak matrix.

    :param peaks: list of peaklists for alignment
    :param ppm: the hierarchical clustering cutting height, i.e., the ppm range for each aligned mz value. Default = 2.0
    :param block_size: number of peaks in each centre clustering block. This can be an exact or approximate number, depending on the fixed_block parameter. Default = 5000
    :param fixed_block: whether the blocks contain a fixed number of peaks. Default = True
    :param edge_extend: ppm range for the edge blocks. Default = 10
    :param ncpus: number of CPUs for parallel clustering. Default = None, indicating using as many as possible
    :rtype: PeakMatrix object

    .. figure:: images/alignment.png
        :align: center

    This function uses hierarchical clustering to align the mz values of the input peaklists. The alignment "width"
    is decided by the ppm parameter. Because of the large number of peaks, this function splits them into blocks of
    fixed or approximate length, and clusters them in parallel on multiple CPUs. The edge blocks are clustered
    first, to prevent the same peak from being split into two adjacent centre blocks; the size of the edge blocks
    is decided by edge_extend. The clustering of the centre blocks is conducted afterwards.

    After merging the clustering results, all the attributes (mz, intensity, snr, etc.) are aligned into matrices
    accordingly. If multiple peaks from the same sample are clustered into one mz value, their attributes are
    averaged (for real-valued attributes, e.g. mz and intensity) or concatenated (for string, unicode, or bool
    attributes). The flag attributes are ignored. The number of these overlapping peaks is recorded in a new
    intra_count attribute matrix.

    """
    # remove empty peaklists
    emlst = np.array([x.size == 0 for x in peaks])
    if np.sum(emlst) > 0:
        logging.warning('dropping empty peaklist(s) [%s]' %
                        str.join(',', map(str, [p.ID for e, p in zip(emlst, peaks) if e])))
        peaks = [p for e, p in zip(emlst, peaks) if not e]
    if len(peaks) == 0:
        raise ValueError('all input peaklists for alignment are empty')

    # obtain attrs
    attrs = peaks[0].attributes
    if attrs[:2] != ('mz', 'intensity'):
        raise AttributeError('PANIC: peak attributes in wrong order')
    if not all([attrs == x.attributes for x in peaks]):
        raise ValueError('peak attributes not the same')
    if 'intra_count' in attrs:
        raise AttributeError('reserved attribute name [intra_count] already exists')
    attrs = [x for x in attrs if x not in peaks[0].flag_attributes]  # flags should be excluded

    # single peaklist
    if len(peaks) == 1:
        attrlst = [(a, peaks[0][a].reshape((1, -1))) for a in attrs] + \
                  [('intra_count', np.ones((1, peaks[0].size)))]
        return PeakMatrix([peaks[0].ID], [peaks[0].tags], attrlst)

    # flatten
    f_pids = np.hstack([[p.ID] * p.size for p in peaks])
    f_attrs = [np.hstack([p[attr] for p in peaks]) for attr in attrs]

    sortids = np.argsort(f_attrs[0])  # attrs[0] -> mz values
    s_pids = f_pids[sortids]
    s_attrs = [x[sortids] for x in f_attrs]

    # cluster
    clusters = _cluster_peaks_map(s_attrs[0], ppm, block_size, fixed_block, edge_extend, ncpus)
    cids = _cluster_peaks_reduce(clusters)

    # align
    a_pids, a_attrms = _align_peaks(cids, s_pids, *s_attrs)
    attrs += ('intra_count',)  # for cM

    # sort by original pid
    pids = f_pids[sorted(np.unique(f_pids, return_index=True)[1])]
    pdct = dict((i, mi) for mi, i in enumerate(a_pids))
    porder = [pdct[i] for i in pids]
    o_attrms = [x[porder] if x is not None else None for x in a_attrms]

    return PeakMatrix(pids, [p.tags for p in peaks], [x for x in zip(attrs, o_attrms) if x[1] is not None])