def fromsarray(cls, array: np.ndarray, dtype: Optional[Union[str, type, np.ndarray.dtype]] = None, headerpos: Optional[Union[Sequence[int], np.ndarray]] = None) -> Table:
    """Reconstruct a Table from a 2-D string array carrying a '#<::<dtype>::>' header cell.

    Parameters:
        array: 2-D string ndarray holding the serialised table.
        dtype: value dtype of the table; parsed from the header cell when omitted.
        headerpos: (row, col) of the header cell; when omitted the top-left
            100 x 100 corner is scanned for exactly one header.

    Returns:
        The reconstructed Table.

    Raises:
        ValueError: no header found, multiple headers found, or dtype unparseable.
    """
    # header marker, e.g. '#<::f8::>'; the group captures the numpy dtype string
    _r = re.compile('#<::([<>|]?[biufcmMOSUV]\\d*)::>')
    # first regex match within a cell, or '' when the cell is not a header
    _findt = lambda x: (lambda v: v[0] if len(v) > 0 else '')(_r.findall(x))
    if missing(headerpos):
        # scan only the top-left corner; headers beyond it are not supported
        mtab = np.vectorize(_findt)(array[:100,:100])
        dpos = np.c_[np.where(mtab != '')]
        if dpos.shape[0] >= 2: raise ValueError('string array has multiple headers')
        if dpos.shape[0] == 0: raise ValueError('string array has no header in the first 100 rows / cols')
        headerpos = dpos[0]
    rids, cids = headerpos
    if missing(dtype):
        dtype = _findt(array[rids,cids])
        if dtype == '': raise ValueError('unknown array data type')
    # row index is stored in the columns left of the header (transposed back here),
    # column index in the rows above it; absent when the header sits on an edge
    ridx = StructuredArray.fromsarray(array[rids:,:cids].T) if cids > 0 else None
    cidx = StructuredArray.fromsarray(array[:rids,cids:]) if rids > 0 else None
    rnam = array[rids+1:,cids]
    # drop auto-generated placeholder names of the form '[0]', '[1]', ...
    if np.all(rnam == smap(range(rnam.shape[0]), lambda x: f'[{x}]')): rnam = None
    cnam = array[rids,cids+1:]
    if np.all(cnam == smap(range(cnam.shape[0]), lambda x: f'[{x}]')): cnam = None
    # data block lies below and right of the header cell
    dmtx = array[rids+1:,cids+1:]
    return Table(dmtx, dtype = dtype, rownames = rnam, colnames = cnam, rowindex = ridx, colindex = cidx)
def _fmt(mtx, rnam, cnam, ridx, cidx):
    """Render matrix, names and indices as a list of delimited text lines for display.

    Parameters:
        mtx: 2-D value matrix to print.
        rnam / cnam: row / column names; auto-filled with '[i]' placeholders when missing.
        ridx / cidx: row / column StructuredArray indices, or missing.

    Returns:
        List of strings, one per output line, joined with the module-level `delimiter`.

    NOTE(review): `strinkcols` / `strinkrows` are module-level globals — presumably
    the shrink thresholds beyond which output is elided; confirm at definition site.
    """
    nr, nc = mtx.shape
    if missing(rnam): rnam = smap(range(nr), lambda x: f'[{x}]')
    if missing(cnam): cnam = smap(range(nc), lambda x: f'[{x}]')
    # _sln: stringify a line, eliding the middle (head `hd`, tail `tl`, filler `rp`) when `sr` is set
    _sln = lambda x,sr,hd,tl,rp: (smap(x[:hd],str) + [rp] + smap(x[tl:],str)) if sr else smap(x, str)
    _scol = lambda x: _sln(x, nc > strinkcols, 3, -1, ' ... ')
    _srow = lambda x: _sln(x, nr > strinkrows, 5, -3, '')
    # header line (column names) followed by the (possibly row-elided) data lines
    slns = [_scol(cnam)] + \
           ([_scol(ln) for ln in mtx] if nr <= strinkrows else
            ([_scol(ln) for ln in mtx[:5]] + [_scol([' ... ... '] + [''] * (nc-1))] + [_scol(ln) for ln in mtx[-3:]]))
    # prepend the row-name column ('#' marks the header row)
    slns = [['#'] + slns[0]] + [[n] + ln for n,ln in zip(_srow(rnam), slns[1:])]
    nri = ridx.size if available(ridx) else 0
    nci = cidx.size if available(cidx) else 0
    if nci > 0:
        # one extra line per column-index field, labelled '<name>'
        slns = [[f'<{k}>'] + _scol(cidx[k]) for k in cidx.names] + slns
    if nri > 0:
        # one extra leading column per row-index field; blank cells pad the cidx lines
        sidx = [[''] * nci + [f'<{k}>'] + _srow(ridx[k]) for k in ridx.names]
        slns = [list(ix) + ln for ix,ln in zip(zip(*sidx), slns)]
    def _sfmt(lns, pos):
        # right-align the cells in column slice `pos` to a common width
        # (elision fillers are excluded from the width computation)
        size = max(collapse(smap(lns, lambda x: smap(x[pos], lambda v: len(v) if v not in (' ... ', ' ... ... ') else 0)))) + 1
        for ln in lns:
            ln[pos] = smap(ln[pos], lambda x: '{0:>{1}s}'.format(x, size) if x != ' ... ' else x)
        return lns
    # align: row-index columns, then the row-name column, then the data columns
    if nri > 0: slns = _sfmt(slns, slice(None,nri))
    slns = _sfmt(slns, slice(nri,nri+1))
    slns = _sfmt(slns, slice(nri+1,None))
    return smap(slns, lambda ln: paste(ln, sep = delimiter))
def asMatrix(val: Iterable[Iterable], nrow: Optional[int] = None, ncol: Optional[int] = None, rownames: Optional[Iterable] = None, colnames: Optional[Iterable] = None) -> robj.Matrix:
    """Convert a 2-D iterable / ndarray into an R matrix, optionally naming its dims.

    When neither nrow nor ncol is given, both are taken from the array shape.
    Row / column names, when provided, are attached as R character vectors.
    """
    # coerce anything that is not already a 2-D ndarray
    is2d = isinstance(val, np.ndarray) and val.ndim == 2
    if not is2d:
        val = np.asarray(smap(val, ll))
    if missing(nrow) and missing(ncol):
        nrow, ncol = val.shape
    mat = robj.r.matrix(val, nrow=nrow, ncol=ncol)
    _rstrs = lambda seq: robj.StrVector(np.asarray(ll(seq), dtype=str))
    if available(rownames):
        mat.rownames = _rstrs(rownames)
    if available(colnames):
        mat.colnames = _rstrs(colnames)
    return mat
def execute(self, query: str) -> SQLiteWrapper:
    """Run a single SQL statement on the open connection.

    Failures of the statement itself are logged as warnings rather than raised;
    a missing connection, however, is an IOError.

    Returns self to allow call chaining.
    """
    if missing(self._dbconn):
        raise IOError('database not connected')
    try:
        self._dbconn.execute(query)
    except Exception as err:
        # best-effort execution: report and continue
        logging.warning('sqlite execution failed: %s', str(err))
    return self
def apply(func: str, *args: Any, **kwargs: Any) -> Any:
    """Call the R function named `func`, mapping missing arguments to R NULL.

    Both positional and keyword arguments that satisfy `missing` are replaced
    by robj.NULL before the call.
    """
    rfunc = getattr(robj.r, func)
    fixed = pickmap(args, missing, robj.NULL)
    named = {}
    for key, val in kwargs.items():
        named[key] = robj.NULL if missing(val) else val
    return rfunc(*fixed, **named)
def put(self, pos: Indices2D, value: Any, axis: Optional[int] = 0, inline: bool = False) -> StructuredArray:
    """Assign `value` at position `pos`, either replacing a whole named array
    (string `pos`) or writing into slices of one or more existing arrays.

    Parameters:
        pos: array name, or a 1-D/2-D index parsed by `_parseids`.
        value: data to store; normalised through `_parsevals` first.
        axis: axis interpretation for non-string `pos`.
        inline: mutate self when True, otherwise work on a copy.

    Returns:
        The modified StructuredArray (self or the copy).

    Raises:
        ValueError: shape / size mismatches as detailed below.
    """
    narr = self if inline else self.copy()
    vals = self._parsevals(value)
    if isstring(pos):
        # whole-array assignment under a (possibly new) name
        if not isinstance(vals, np.ndarray): raise ValueError('input array not in 1-dimensional')
        if missing(narr._length): narr._length = vals.shape[0]  # first array fixes the common length
        elif narr._length != vals.shape[0]: raise ValueError('input array size not match')
        narr._arrs[pos] = vals.copy()
    else:
        sids, aids = self._parseids(pos, axis=axis)
        if not isinstance(vals, list):
            # one value (scalar or array) broadcast into every selected array
            for k in sids: narr._arrs[k][aids] = vals
        else:
            if len(sids) != len(vals): raise ValueError('input names and values size not match')
            # NOTE: loop variable deliberately rebinds `vals` per item; zip captured the
            # original list before the loop started, so iteration is unaffected
            for k, vals in zip(sids, vals): narr._arrs[k][aids] = vals
    return narr
def _exec(self, pms): params, stdin, timeout = pms # for multiproc exlst = [self._bin] + ([] if missing(params) else smap( params, lambda x: str(x).strip())) if self._shell: exlst = paste(smap(exlst, lambda x: x.replace(' ', r'\ ')), sep=' ') procs = Popen(exlst, stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=self._shell) try: rvals = procs.communicate(input=stdin, timeout=timeout) rstrs = smap( rvals, lambda x: '' if x is None else x.decode('utf-8').strip()) rcode = procs.returncode except TimeoutExpired: procs.kill() rstrs = ['subprocess terminated as timeout expired', ''] rcode = 124 prstr = paste(rstrs, sep=' | ') if rcode in self._ncode: logging.log((logging.DEBUG if self._mute else logging.INFO), prstr) else: raise RuntimeError(f'execution failed [{rcode}]:\n{prstr}') return rcode, rstrs
def delete(self, pos: Indices2D, axis: Optional[int] = 0, inline: bool = False) -> StructuredArray:
    """Delete named arrays and/or element positions from the structured array.

    Four cases, depending on which of the two parsed index sets is a full slice:
    both full -> clear everything; only names full -> delete element positions
    from every array; only elements full -> drop whole named arrays; neither
    full -> unsupported (cannot delete a ragged portion).

    Parameters:
        pos: array name, or 1-D/2-D index parsed by `_parseids`.
        axis: axis interpretation for non-string `pos`.
        inline: mutate self when True, otherwise work on a copy.

    Returns:
        The modified StructuredArray (self or the copy).

    Raises:
        IndexError: when asked to delete a partial (non-rectangular) portion.
    """
    narr = self if inline else self.copy()
    if isstring(pos):
        # name alone: drop that whole array
        del narr._arrs[pos]
        return narr
    sids, aids = self._parseids(pos, axis=axis, mapslice=False)
    slic = isinstance(sids, slice) and sids == slice(None)
    alic = isinstance(aids, slice) and aids == slice(None) and (missing(axis) or axis == 0)
    if slic and alic:
        # everything selected: reset to an empty container
        narr._arrs = OrderedDict()
        narr._length = None
    elif slic and not alic:
        # delete positions from every array
        if listable(aids) and len(aids) == 1 and aids[0] < 0: aids = aids[
            0] # fix the issue that currently negative indices are ignored by np.delete
        for k, v in narr._arrs.items(): narr._arrs[k] = np.delete(v, aids)
        # recompute the shared length from the first remaining array
        narr._length = len(narr._arrs[l(narr._arrs.keys())[0]])
    elif not slic and alic:
        # delete whole arrays by name; map positional / slice ids to names first
        if isinstance(sids, slice) or sids.dtype.kind not in ('S', 'U'): sids = narr.names[sids]
        for k in sids: del narr._arrs[k]
    else:
        raise IndexError('unable to delete portion of the array')
    return narr
def __init__(self, items: Optional[Union[Iterable, Mapping, np.ndarray, StructuredArray]] = None, **kwargs: Iterable):
    """Build a StructuredArray from another StructuredArray (shallow copy of its
    arrays dict), a structured numpy array, a mapping, an iterable of (name,
    values) pairs, or keyword arguments when `items` is missing.

    Raises:
        TypeError: when `items` is none of the supported kinds.
    """
    if isinstance(items, StructuredArray):
        # fast path: share nothing but copy the dict container and length
        self._arrs, self._length = items._arrs.copy(), items._length
        return
    # normalise every supported input kind to an iterable of (name, values);
    # None marks an unsupported type
    vals = [(k, items[k]) for k in items.dtype.names] if isinstance(items, np.ndarray) and available(items.dtype.names) else \
           items.items() if ismapping(items) else \
           items if iterable(items) else \
           kwargs.items() if missing(items) else None
    if missing(vals): raise TypeError('unknow data type')
    self._arrs = OrderedDict()
    self._length = None
    # item assignment funnels through __setitem__, which validates sizes
    for k, v in vals: self[k] = v
def close(self, commit: bool = True) -> SQLiteWrapper:
    """Close the database connection, committing first by default.

    Closing an already-closed connection is not an error — it only logs a
    warning. Returns self to allow call chaining.
    """
    if missing(self._dbconn):
        logging.warning(
            'connection to database [%s] already closed, ignore',
            fileTitle(self._dbfile))
        return self
    if commit:
        self._dbconn.commit()
    self._dbconn.close()
    self._dbconn = None
    return self
def insert(self, pos: Indices, value: Union[str, Iterable[str]], inline: bool = False) -> NamedIndex:
    """Insert name(s) at position `pos`; with no position, delegate to append.

    Parameters:
        pos: insertion position(s), resolved through `_parseids`.
        value: name or names to insert, normalised through `_parsevals`.
        inline: mutate self when True, otherwise work on a copy.

    Returns:
        The modified NamedIndex (self or the copy).
    """
    if missing(pos):
        # no anchor position given — equivalent to appending at the end
        return self.append(value, inline)
    where = self._parseids(pos)
    items = self._parsevals(value)
    target = self if inline else self.copy()
    target._names = np.insert(target._names, where, items)
    target._reindex()  # name -> position lookup must be rebuilt after the shift
    return target
def __init__(self, binary: Union[str, Path], shell: bool = False, normcodes: Union[int, Iterable[int]] = 0, mute: bool = False):
    """Wrap an external executable.

    Parameters:
        binary: name or path of the executable; resolved via `which`.
        shell: run through the shell (command assembled as one string).
        normcodes: exit code(s) regarded as success.
        mute: log successful runs at DEBUG instead of INFO.

    Raises:
        RuntimeError: when the executable cannot be located.
    """
    self._bin = self.which(binary)
    if missing(self._bin):
        raise RuntimeError(f'binary executable [{binary}] not reachable')
    self._shell = shell
    # normalise accepted exit codes to a sequence
    if isinstance(normcodes, int):
        self._ncode = (normcodes, )
    else:
        self._ncode = ll(normcodes)
    self._mute = mute
def mapexec(self, params: Optional[Iterable[Sequence]] = None, stdin: Optional[Iterable[Union[bytes, str]]] = None, timeout: Optional[int] = None, nthreads: Optional[int] = None, nprocs: Optional[int] = None) -> List[Tuple[int, List[str]]]:
    """Run the wrapped binary once per parameter set / stdin payload, optionally in parallel.

    Parameters:
        params: iterable of parameter sequences, one per run (optional).
        stdin: iterable of stdin payloads, one per run (optional); when both
            params and stdin are given their lengths must match.
        timeout: per-run timeout in seconds, forwarded to `_exec`.
        nthreads / nprocs: parallelise with a thread pool or a process pool;
            process pool wins when both are given, otherwise serial mapping.

    Returns:
        List of (returncode, [stdout, stderr]) tuples, one per run.

    Raises:
        RuntimeError: when params and stdin sizes differ, or both are missing.
    """
    if available(params): params = ll(params)
    if available(stdin): stdin = ll(stdin)
    if available(params) and available(
            stdin) and len(params) != len(stdin):
        raise RuntimeError('parameters and stdins size not match')
    if missing(params) and missing(stdin):
        raise RuntimeError('both parameters and stdins are missing')
    # BUGFIX: the run count must come from whichever input is present —
    # `len(params)` alone raised TypeError when only stdin was supplied,
    # a case the guards above explicitly allow
    n = len(params) if available(params) else len(stdin)
    mpms = [(p, s, timeout)
            for p, s in zip(optional(params, [None] * n),
                            optional(stdin, [None] * n))]
    _map = partial(pmap, nprocs = nprocs) if available(nprocs) else \
           partial(tmap, nthreads = nthreads) if available(nthreads) else smap
    return _map(mpms, self._exec)
def idsof(self, names: Union[str, Iterable[str]], safe: bool = False) -> Union[None, int, List[Union[None, int]]]:
    """Translate index name(s) into integer position(s).

    A single string yields a single position; an iterable yields a list in
    which non-string items pass through untouched. Unknown names raise
    KeyError unless `safe` is set, in which case they resolve to None.
    """
    lookup = self._nidct.get
    if isstring(names):
        found = lookup(names, None)
        if not safe and missing(found):
            raise KeyError(f'unknown index name {names}')
        return found
    found = [lookup(item, None) if isstring(item) else item for item in names]
    if not safe and checkany(found, missing):
        raise KeyError('unknown index name(s)')
    return found
def asVector(val: Iterable, names: Optional[Iterable] = None) -> robj.Vector:
    """Convert a Python iterable to the matching R vector type.

    Dispatch is by numpy dtype kind: int/uint -> IntVector, float -> FloatVector,
    bool -> BoolVector, bytes/str -> StrVector. Optional `names` become the R
    vector's names attribute.

    Raises:
        TypeError: for dtype kinds with no R counterpart.
    """
    arr = np.asarray(ll(val))
    ctors = {
        'i': robj.IntVector,
        'u': robj.IntVector,
        'f': robj.FloatVector,
        'b': robj.BoolVector,
        'S': robj.StrVector,
        'U': robj.StrVector,
    }
    ctor = ctors.get(arr.dtype.kind)
    vect = ctor(arr) if ctor is not None else None
    if missing(vect):
        raise TypeError(f'unknown vector type [{arr.dtype.kind}]')
    if available(names):
        vect.names = robj.StrVector(np.asarray(ll(names), dtype=str))
    return vect
def _parseids(self, idx, axis = None, mapslice = True):
    """Normalise a 1-D/2-D table index into concrete (row ids, col ids).

    Parameters:
        idx: scalar / sequence / slice, or a tuple of up to two of them.
        axis: None -> `idx` may address both axes; 0/1 -> `idx` addresses
            only rows / only columns, and tuples are rejected.
        mapslice: when True, None and slices are expanded to explicit
            position arrays; when False they pass through unchanged.

    Returns:
        (rids, cids) ready for positional indexing.

    Raises:
        IndexError: tuple given with an explicit axis, or invalid axis value.
    """
    if missing(axis):
        # a bare index addresses rows; a 1- or 2-tuple addresses (rows[, cols])
        rids, cids = (idx, slice(None)) if not isinstance(idx, tuple) else \
                     (idx[0], slice(None)) if len(idx) == 1 else idx
    else:
        if isinstance(idx, tuple): raise IndexError('too many dimensions for array')
        if axis not in (0, 1): raise IndexError('invalid axis value')
        rids, cids = (idx, slice(None)) if axis == 0 else (slice(None), idx)
    def _wrap(ids, num, names):
        # expand None / slice per `mapslice`; anything else may contain names
        # and is resolved through `_mapids`
        if ids is None: return slice(None) if not mapslice else np.arange(num)
        if isinstance(ids, slice): return ids if not mapslice else np.arange(num)[ids]
        return self._mapids(ids, names)
    rids = _wrap(rids, self.nrow, self._rnames)
    cids = _wrap(cids, self.ncol, self._cnames)
    return rids, cids
def saverdata(self, fname, *, dataobj: str = 'data.matrix', ridxobj: Optional[str] = 'row.index', cidxobj: Optional[str] = 'col.index', transpose: bool = True) -> bool:
    """Save the table (and its indices) into an RData file via the R bridge.

    Parameters:
        fname: output file path.
        dataobj / ridxobj / cidxobj: R variable names for the matrix and the
            row / column index data frames.
        transpose: store the transposed matrix (row/column roles swapped) —
            the counterpart of `loadrdata(..., transposed = True)`.

    Returns:
        True when the output file exists afterwards.

    Raises:
        RuntimeError: when the R wrapper `rw` is not available.
    """
    if missing(rw): raise RuntimeError('RWrapper not available for this installation')
    checkOutputFile(fname)
    # pick matrix / names / indices, swapping the row and column roles on transpose
    dm, rn, cn, ri, ci = (self._dmatx, self._rnames, self._cnames, self._rindex, self._cindex) if not transpose else \
                         (self._dmatx.T, self._cnames, self._rnames, self._cindex, self._rindex)
    dmtx = rw.asMatrix(dm, rownames = rn, colnames = cn)
    rw.assign(dmtx, dataobj)
    # indices are exported as R data.frames, one column per index field
    if available(ri): rw.assign(rw.r['data.frame'](**{k: rw.asVector(v) for k,v in ri.fields}), ridxobj)
    if available(ci): rw.assign(rw.r['data.frame'](**{k: rw.asVector(v) for k,v in ci.fields}), cidxobj)
    vnames = [dataobj] + ([ridxobj] if available(ri) else []) + ([cidxobj] if available(ci) else [])
    rw.run(f'save({paste(vnames, sep = ",")}, file = "{fname}")') # avoid bug in rw.save
    return os.path.isfile(fname)
def loadrdata(cls, fname: Union[str, Path], dataobj: str, *, ridxobj: Optional[str] = None, cidxobj: Optional[str] = None, transposed: bool = True) -> Table:
    """Load a Table from an RData file via the R bridge.

    Parameters:
        fname: RData file to load.
        dataobj: R variable name holding the data matrix.
        ridxobj / cidxobj: optional R variable names of the row / column index
            data frames.
        transposed: the stored matrix has row/column roles swapped (the
            counterpart of `saverdata(..., transpose = True)`).

    Returns:
        The reconstructed Table.

    Raises:
        RuntimeError: when the R wrapper `rw` is not available.
    """
    if missing(rw): raise RuntimeError('RWrapper not available for this installation')
    checkInputFile(fname)
    rw.r.load(fname)
    # pull matrix and dim names out of R in one shot
    dm, rn, cn = np.array(rw.r[dataobj]), rw.run(f'rownames({dataobj})'), rw.run(f'colnames({dataobj})') # stupid numpy conversion
    rn = None if rn is rw.null else np.array(rn)
    cn = None if cn is rw.null else np.array(cn)
    def _parseidx(iname):
        # data.frame -> iterable of (field name, column values); note the zip is
        # lazy and consumed once by the Table constructor
        idx = rw.r[iname]
        return zip(idx.dtype.names, zip(*idx))
    ri = _parseidx(ridxobj) if available(ridxobj) else None
    ci = _parseidx(cidxobj) if available(cidxobj) else None
    # undo the storage-side transpose: swap names and indices along with the data
    if transposed: dm, rn, cn, ri, ci = dm.T, cn, rn, ci, ri
    ntab = Table(dm, rownames = rn, colnames = cn, rowindex = ri, colindex = ci)
    return ntab
def _parseids(self, idx, axis=None, mapslice=True):
    """Normalise an index into (array-name ids, element ids) for a StructuredArray.

    Parameters:
        idx: scalar / sequence / slice, or a tuple of up to two of them.
        axis: None -> `idx` may address both dimensions; 0/1 -> `idx` addresses
            only names / only elements, and tuples are rejected.
        mapslice: when True, a slice over names is resolved into concrete names.

    Returns:
        (sids, aids): selected array names and per-array element positions.

    Raises:
        IndexError: tuple given with an explicit axis, or invalid axis value.
    """
    if missing(axis):
        # a bare index addresses names; a 1- or 2-tuple addresses (names[, elements])
        sids, aids = (idx, slice(None)) if not isinstance(idx, tuple) else \
                     (idx[0], slice(None)) if len(idx) == 1 else idx
    else:
        if isinstance(idx, tuple): raise IndexError('too many dimensions for array')
        if axis not in (0, 1): raise IndexError('invalid axis value')
        sids, aids = (idx, slice(None)) if axis == 0 else (slice(None), idx)
    def _wrap(ids):
        # None -> full slice; scalars -> single-item list; slices / lists pass through
        if ids is None: return slice(None)
        if isinstance(ids, slice): return ids
        if not listable(ids): return [ids]
        return ids
    sids, aids = smap((sids, aids), _wrap)
    # resolve name ids to actual names when mapping slices, or when the list
    # contains any non-string (positional) entries
    if (isinstance(sids, slice) and mapslice) or (
            listable(sids) and checkany(sids, lambda x: not isstring(x))):
        sids = self.names[sids]
    return sids, aids
def __array_wrap__(self, arr):
    """Wrap a numpy ufunc result back into a StructuredArray.

    Raises:
        TypeError: when the result is a plain (non-structured) ndarray.
    """
    fields = arr.dtype.names
    if missing(fields):
        raise TypeError(
            'cannot assign non-structured ndarray to StructuredArray')
    return StructuredArray(arr)
def _mapids(ids, names):
    """Resolve an id or list of ids, translating any string names to positions.

    NamedIndex inputs are first flattened to a plain array; scalars are
    wrapped in a list. Purely positional ids pass through unchanged.

    Raises:
        KeyError: string ids given while the table has no names set.
    """
    resolved = np.array(ids) if isinstance(ids, NamedIndex) else ids
    if not listable(resolved):
        resolved = [resolved]
    if checkany(resolved, isstring):
        if missing(names):
            raise KeyError('table names not set')
        return names.idsof(resolved, safe = False)
    return resolved
def insert(self, pos: Indices, value: Table, axis: int = 0, inline: bool = False) -> Table:
    """Insert another Table's rows (axis 0) or columns (axis 1) at `pos`.

    The inserted table must agree with this one along the other axis: same
    count, and — when both sides carry them — same names and same index.
    With `pos` missing the rows/columns are appended at the end.

    Parameters:
        pos: insertion position (name or position); missing -> append.
        value: Table providing the rows/columns to insert.
        axis: 0 for rows, 1 for columns.
        inline: mutate self when True, otherwise work on a copy.

    Returns:
        The modified Table (self or the copy).

    Raises:
        TypeError: `value` is not a Table.
        IndexError: dimension / name / index mismatch, or unsupported axis.

    NOTE(review): validation runs after the data matrix has been rebuilt, so
    with inline=True a raised mismatch leaves the table partially modified.
    Also axis 1 compares against self._rindex where axis 0 uses ntab._cindex —
    harmless since ntab is self or a fresh copy, but asymmetric.
    """
    if not isinstance(value, Table): raise TypeError('unknown input data type')
    ntab = self if inline else self.copy()
    if axis == 0:
        if value.ncol != ntab.ncol: raise IndexError('input table has different number of columns')
        if available(pos): pos = self._mapids(pos, self._rnames)
        # append when no position, otherwise insert at the mapped row position(s)
        ntab._dmatx = np.vstack([ntab._dmatx, value._dmatx.astype(ntab.dtype)]) if missing(pos) else \
                      np.insert(ntab._dmatx, pos, value._dmatx.astype(ntab.dtype), axis = 0)
        if available(ntab._cnames) and available(value._cnames) and np.any(value._cnames != ntab._cnames): raise IndexError('input table has different column names')
        if available(ntab._cindex) and available(value._cindex) and value._cindex != ntab._cindex: raise IndexError('input table has different column index')
        # keep row names / row index aligned with the inserted rows
        if available(ntab._rnames): ntab._rnames.insert(pos, value._rnames, inline = True)
        if available(ntab._rindex): ntab._rindex.insert(pos, value._rindex, inline = True)
    elif axis == 1:
        if value.nrow != ntab.nrow: raise IndexError('input table has different number of rows')
        if available(pos): pos = self._mapids(pos, self._cnames)
        ntab._dmatx = np.hstack([ntab._dmatx, value._dmatx.astype(ntab.dtype)]) if missing(pos) else \
                      np.insert(ntab._dmatx, pos, value._dmatx.astype(ntab.dtype), axis = 1)
        if available(ntab._rnames) and available(value._rnames) and np.any(value._rnames != ntab._rnames): raise IndexError('input table has different row names')
        if available(ntab._rindex) and available(value._rindex) and value._rindex != self._rindex: raise IndexError('input table has different row index')
        # keep column names / column index aligned with the inserted columns
        if available(ntab._cnames): ntab._cnames.insert(pos, value._cnames, inline = True)
        if available(ntab._cindex): ntab._cindex.insert(pos, value._cindex, inline = True)
    else:
        raise IndexError(f'unsupported axis [{axis}]')
    return ntab
def cidx_(self, value):
    """Set the column index; missing clears it.

    The value is coerced to a StructuredArray and assigned before validation,
    mirroring the original behaviour on failure.

    Raises:
        ValueError: non-empty index whose length differs from ncol.
    """
    if missing(value):
        self._cindex = None
        return
    index = StructuredArray(value)
    self._cindex = index
    if index.size != 0 and index.length != self.ncol:
        raise ValueError('input column index size not match')
def ridx_(self, value):
    """Set the row index; missing clears it.

    The value is coerced to a StructuredArray and assigned before validation,
    mirroring the original behaviour on failure.

    Raises:
        ValueError: non-empty index whose length differs from nrow.
    """
    if missing(value):
        self._rindex = None
        return
    index = StructuredArray(value)
    self._rindex = index
    if index.size != 0 and index.length != self.nrow:
        raise ValueError('input row index size not match')
def cols_(self, value):
    """Set the column names; missing clears them.

    The value is coerced to a NamedIndex and assigned before validation,
    mirroring the original behaviour on failure.

    Raises:
        ValueError: name count differs from ncol.
    """
    if missing(value):
        self._cnames = None
        return
    names = NamedIndex(value)
    self._cnames = names
    if names.size != self.ncol:
        raise ValueError('input column names size not match')
def rows_(self, value):
    """Set the row names; missing clears them.

    The value is coerced to a NamedIndex and assigned before validation,
    mirroring the original behaviour on failure.

    Raises:
        ValueError: name count differs from nrow.
    """
    if missing(value):
        self._rnames = None
        return
    names = NamedIndex(value)
    self._rnames = names
    if names.size != self.nrow:
        raise ValueError('input row names size not match')
def commit(self) -> SQLiteWrapper:
    """Commit the pending transaction.

    Returns self to allow call chaining.

    Raises:
        IOError: database not connected.
    """
    conn = self._dbconn
    if missing(conn):
        raise IOError('database not connected')
    conn.commit()
    return self