def where(self, condition, user_dict=None):
    """Yield one descriptor per element of ``self.blzarr`` matching ``condition``.

    ``condition`` is handed unchanged to ``self.blzarr.where``.  ``user_dict``
    is accepted but not used by this implementation.
    """
    blz = self.blzarr
    # Each yielded element has lost the leading (iterated) dimension,
    # so describe it with the remaining dimensions only.
    element_type = datashape.from_numpy(blz.shape[1:], blz.dtype)
    for item in blz.where(condition):
        dynd_item = nd.array(item, type=str(element_type))
        yield DyND_DDesc(dynd_item)
def __init__(self, data, dshape=None, metadata=None, layout=None, params=None):
    """Build the table from ``data``, optionally overlaying a datashape.

    Parameters
    ----------
    data : dict, list, tuple or ByteProvider
        A dict is converted with ``self.from_dict`` and wrapped in a
        ``CTableSource``; a list/tuple is wrapped in a ``CTableSource``
        directly; a ``ByteProvider`` is stored as-is.
    dshape : str or datashape, optional
        Datashape to overlay on ``data``.  Strings are parsed with
        ``_dshape``.  When falsy, the datashape is inferred from ``data``.
    metadata : list, optional
        Entries appended to ``NDTable._metaheader``.
    layout : optional
        Layout to use; when falsy, ``self.data.default_layout()`` is used.
    params : optional
        Stored on the instance and forwarded to ``CTableSource``.

    Raises
    ------
    ValueError
        If ``data`` is not one of the supported kinds.
    """
    # Datashape
    # ---------
    if isinstance(dshape, basestring):
        dshape = _dshape(dshape)

    if not dshape:
        # The user just passed in a raw data source, try
        # and infer how it should be laid out or fall
        # back on dynamic types.
        self._datashape = dshape = CTableSource.infer_datashape(data)
    else:
        # The user overlaid their custom dshape on this
        # data, check if it makes sense
        CTableSource.check_datashape(data, given_dshape=dshape)
        self._datashape = dshape

    # Source
    # ------
    # All source kinds live in ONE chain.  Previously the ByteProvider
    # check was a separate ``if``, so a ByteProvider was stored and then
    # immediately rejected by the trailing ``else: raise ValueError``.
    # dict/list are tested first so inputs that matched those branches
    # before keep behaving exactly the same.
    if isinstance(data, dict):
        ct = self.from_dict(data)
        self._axes = data.keys()
        dshape = from_numpy(ct.shape, ct.dtype)
        self.data = CTableSource(ct, dshape=dshape, params=params)
        self._datashape = dshape
    elif isinstance(data, (list, tuple)):
        self.data = CTableSource(data, dshape=dshape, params=params)
        # Pull the labels from the datashape
        self._axes = self._datashape[-1].names
    elif isinstance(data, ByteProvider):
        # NOTE(review): ``_axes`` is not populated on this path (it never
        # was); confirm whether callers need it for byte providers.
        self.data = data
    else:
        raise ValueError(
            'Unsupported data source of type %r' % type(data).__name__)

    # children graph nodes
    self.children = []

    self.space = Space(self.data)

    # Layout
    # ------
    if layout:
        self._layout = layout
    else:
        self._layout = self.data.default_layout()

    # Metadata
    # --------
    self._metadata = NDTable._metaheader + (metadata or [])

    # Parameters
    # ----------
    self.params = params
def test_from_numpy_fields(self):
    """A comma-separated dtype yields auto-named fields f0, f1, f2."""
    import numpy as np

    record_dtype = np.dtype('i4,i8,f8')
    ds = datashape.from_numpy((), record_dtype)

    expected_names = ['f0', 'f1', 'f2']
    self.assertEqual(ds.names, expected_names)

    expected_types = [datashape.int32, datashape.int64, datashape.float64]
    self.assertEqual(ds.types, expected_types)
def where(self, condition):
    """Iterate over values fulfilling a condition.

    Opens ``self.path`` read-only, looks up the node at ``self.datapath``
    and yields one ``DyND_DDesc`` per row returned by the node's
    ``where(condition)``.

    The file is opened in a ``with`` block so that it is closed not only
    when iteration runs to completion, but also when the consumer stops
    early (generator closed or garbage collected) or an error is raised
    while iterating.  Previously the close call sat after the loop and was
    skipped in those cases, leaking the open file handle.
    """
    with tb.open_file(self.path, mode='r') as f:
        dset = f.get_node(self.datapath)
        # Get rid of the leading dimension on which we iterate
        dshape = datashape.from_numpy(dset.shape[1:], dset.dtype)
        # The element type is loop-invariant; render it once.
        element_type = str(dshape)
        for el in dset.where(condition):
            yield DyND_DDesc(nd.array(el[:], type=element_type))
def __iter__(self):
    """Yield a ``DyND_DDesc`` for every element of the stored node.

    Opens ``self.path`` read-only and iterates the node at
    ``self.datapath``.  Elements exposing an ``nrow`` attribute are copied
    with ``el[:]`` before conversion; other elements are converted as-is.

    The file is managed by a ``with`` block so it is closed even when the
    consumer abandons the iterator early or an error is raised during
    iteration.  Previously the close call sat after the loop and was
    skipped in those cases, leaking the open file handle.
    """
    with tb.open_file(self.path, mode='r') as f:
        dset = f.get_node(self.datapath)
        # Get rid of the leading dimension on which we iterate
        dshape = datashape.from_numpy(dset.shape[1:], dset.dtype)
        # The element type is loop-invariant; render it once.
        element_type = str(dshape)
        for el in dset:
            if hasattr(el, "nrow"):
                yield DyND_DDesc(nd.array(el[:], type=element_type))
            else:
                yield DyND_DDesc(nd.array(el, type=element_type))
def test_ndarray_into_table(self, dt_tb, dt_data):
    """Round-trip ``dt_data`` through a PyTables table and compare per field."""
    dtype = ds.from_numpy(dt_data.shape, dt_data.dtype)
    t = PyTables(dt_tb, '/out', dtype)
    try:
        stored = into(t, dt_data, filename=dt_tb, datapath='/out')
        res = into(np.ndarray, stored)
        for field in res.dtype.fields:
            lhs = res[field]
            rhs = dt_data[field]
            both_match = (issubclass(np.datetime64, lhs.dtype.type) and
                          issubclass(np.datetime64, rhs.dtype.type))
            if both_match:
                # Bring both sides to microsecond resolution before comparing.
                lhs = lhs.astype('M8[us]')
                rhs = rhs.astype('M8[us]')
            assert np.array_equal(lhs, rhs)
    finally:
        # Always release the underlying file, even when an assertion fails.
        t._v_file.close()
def discover_h5py_dataset(d):
    """Return the datashape of ``d``, mapping object measures to strings.

    Record measures have their ``object`` fields replaced by ``string``
    through ``record_dshape_replace``; other datashapes are returned
    unchanged unless they compare equal to ``datashape.object_``.
    """
    dshape = datashape.from_numpy(d.shape, d.dtype)
    shape = dshape.shape
    measure = dshape.measure

    if isrecord(measure):
        fields = list(record_dshape_replace(measure,
                                            datashape.object_,
                                            datashape.string))
        return DataShape(*(shape + (datashape.Record(fields),)))

    # NOTE(review): this compares the WHOLE datashape (dimensions included)
    # against ``object_``, not just the measure -- confirm that is intended.
    if dshape == datashape.object_:
        return DataShape(*(shape + (datashape.string,)))
    return dshape
def discover_h5py_dataset(d):
    """Discover the datashape of ``d``, presenting object data as strings.

    For record measures every ``object`` field is swapped for ``string``
    via ``record_dshape_replace``.  For non-record measures the datashape
    is returned as found, except when it equals ``datashape.object_``.
    """
    found = datashape.from_numpy(d.shape, d.dtype)
    dims, measure = found.shape, found.measure

    if isrecord(measure):
        replaced = record_dshape_replace(
            measure, datashape.object_, datashape.string)
        new_measure = datashape.Record(list(replaced))
        return DataShape(*(dims + (new_measure,)))

    # NOTE(review): the equality below is evaluated on the full datashape,
    # dimensions included, rather than on ``measure`` -- verify intent.
    if found == datashape.object_:
        return DataShape(*(dims + (datashape.string,)))
    return found
def __init__(self, path, datapath, mode='r', schema=None, dshape=None,
             **kwargs):
    """Bind to the dataset at ``datapath`` inside the HDF5 file ``path``.

    If the dataset does not exist it is created from ``dshape`` (or from
    ``'var * ' + str(schema)`` when only ``schema`` is given), forwarding
    ``kwargs`` to ``create_dataset``.  If it already exists, its on-disk
    shape and dtype replace whatever ``dshape`` was passed.

    Raises
    ------
    ValueError
        If the dataset is missing and ``dshape`` is ``None``.
    """
    self.path, self.datapath, self.mode = path, datapath, mode

    if schema and not dshape:
        dshape = 'var * ' + str(schema)

    # TODO: provide sane defaults for kwargs
    # Notably chunks and maxshape
    if dshape:
        dshape = datashape.dshape(dshape)
        shape = dshape.shape
        dtype = datashape.to_numpy_dtype(dshape[-1])
        if shape[0] == datashape.Var():
            # Variable leading dimension: start empty and let it grow.
            trailing = shape[1:]
            kwargs['chunks'] = True
            kwargs.setdefault('maxshape', (None,) + trailing)
            shape = (0,) + tuple(int(dim) for dim in trailing)

    with h5py.File(path, mode) as f:
        dset = f.get(datapath)
        if dset is not None:
            # An existing dataset overrides the requested datashape.
            # TODO: test provided dshape against given dshape and raise
            # ValueError('Inconsistent datashapes.') on mismatch.
            dshape = datashape.from_numpy(dset.shape, dset.dtype)
        elif dshape is None:
            raise ValueError('No dataset or dshape provided')
        else:
            f.create_dataset(datapath, shape, dtype=dtype, **kwargs)

    if self.attributes()['chunks']:
        # is there a better way to do this?
        tail = str(dshape).split(' * ')[1:]
        dshape = datashape.dshape('var * ' + ' * '.join(tail))

    self._dshape = dshape
    self._schema = schema
def discover_events(event, **kwargs):
    """Return the datashape of a one-row frame built from ``event``.

    The row is the values of ``event._asdict()`` followed by the event's
    ``startdate``, ``enddate`` and ``duration``.  ``kwargs`` is accepted
    but unused.
    """
    row = list(event._asdict().values())
    row += [event.startdate, event.enddate, event.duration]
    df = pandas.DataFrame([row], columns=columns)
    return from_numpy((len(df),), df.values.dtype)
def discover_events(event, **kwargs):
    """Return the datashape of a one-row frame built from ``event``.

    The row is the values of ``event._asdict()`` followed by the event's
    ``startdate``, ``enddate`` and ``duration``; column names come from the
    ``Event`` schema properties plus those three extra names.  ``kwargs``
    is accepted but unused.
    """
    row = list(event._asdict().values())
    row += [event.startdate, event.enddate, event.duration]

    # NOTE(review): ``self`` is not a parameter here; presumably it is bound
    # by an enclosing scope -- confirm against the surrounding code.
    schema_columns = list(self.api["schemas"]["Event"]["properties"].keys())
    column_names = schema_columns + ["startdate", "enddate", "duration"]

    df = pandas.DataFrame([row], columns=column_names)
    return from_numpy((len(df),), df.values.dtype)
def __init__(self, path, datapath, mode='r', schema=None, dshape=None,
             **kwargs):
    """Attach to (or create) the dataset ``datapath`` in HDF5 file ``path``.

    A missing dataset is created from ``dshape``; when only ``schema`` is
    supplied, ``'var * ' + str(schema)`` is used as the datashape.  Extra
    ``kwargs`` go to ``create_dataset``.  For an existing dataset, the
    shape and dtype found on disk take the place of the passed ``dshape``.

    Raises
    ------
    ValueError
        If the dataset is missing and ``dshape`` is ``None``.
    """
    self.path = path
    self.datapath = datapath
    self.mode = mode

    if schema and not dshape:
        dshape = 'var * ' + str(schema)

    # TODO: provide sane defaults for kwargs
    # Notably chunks and maxshape
    if dshape:
        dshape = datashape.dshape(dshape)
        shape = dshape.shape
        dtype = datashape.to_numpy_dtype(dshape[-1])
        leading_is_var = shape[0] == datashape.Var()
        if leading_is_var:
            rest = shape[1:]
            kwargs['chunks'] = True
            # Respect a caller-supplied maxshape; otherwise leave the
            # leading dimension unbounded.
            kwargs.setdefault('maxshape', (None,) + rest)
            shape = (0,) + tuple(int(extent) for extent in rest)

    with h5py.File(path, mode) as f:
        dset = f.get(datapath)
        if dset is not None:
            # TODO: test provided dshape against given dshape and raise
            # ValueError('Inconsistent datashapes.') on mismatch.
            dshape = datashape.from_numpy(dset.shape, dset.dtype)
        elif dshape is None:
            raise ValueError('No dataset or dshape provided')
        else:
            f.create_dataset(datapath, shape, dtype=dtype, **kwargs)

    attributes = self.attributes()
    if attributes['chunks']:
        # is there a better way to do this?
        remaining = str(dshape).split(' * ')[1:]
        dshape = datashape.dshape('var * ' + ' * '.join(remaining))

    self._dshape = dshape
    self._schema = schema
def promote(lhs, rhs):
    """Promote two scalar dshapes to a possibly larger, but compatible type.

    Each argument is unwrapped through its ``ty`` attribute when it has
    one, the two results are promoted as numpy dtypes, and the outcome is
    passed with the original arguments to ``optionify``.

    Examples
    --------
    >>> from datashape import int32, int64, Option
    >>> x = Option(int32)
    >>> y = int64
    >>> promote(x, y)
    ?int64

    Notes
    -----
    This uses ``numpy.promote_types`` for type promotion logic.  See the
    numpy documentation at
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.promote_types.html
    """
    left = getattr(lhs, 'ty', lhs)
    right = getattr(rhs, 'ty', rhs)
    left_np = datashape.to_numpy_dtype(left)
    right_np = datashape.to_numpy_dtype(right)
    promoted = np.promote_types(left_np, right_np)
    return optionify(lhs, rhs, datashape.from_numpy((), promoted))
def test_ascii_string(self):
    """An ``S7`` dtype converts to a 7-character ASCII string measure."""
    actual = from_numpy((2,), np.dtype('S7'))
    expected = dshape('2 * string[7, "ascii"]')
    assert actual == expected
def test_date(self):
    """Day, month, year and week ``M8`` units all convert to ``date``."""
    for unit in ('D', 'M', 'Y', 'W'):
        actual = from_numpy((2,), np.dtype('M8[%s]' % unit))
        expected = dshape('2 * date')
        assert actual == expected
def discover(data):
    """Return the datashape built from ``data.shape`` and ``data.dtype``."""
    shape = data.shape
    dtype = data.dtype
    return datashape.from_numpy(shape, dtype)
def test_date(self):
    """``M8`` dtypes with D/M/Y/W units are all discovered as ``date``."""
    for unit in ('D', 'M', 'Y', 'W'):
        numpy_dtype = np.dtype('M8[%s]' % unit)
        actual = from_numpy((2,), numpy_dtype)
        assert actual == dshape('2 * date')
def __getitem__(self, mask):
    """Return a new ``Table`` over the rows of ``self.data.ca`` selected by ``mask``."""
    selected = self.data.ca[mask]
    selected_dshape = from_numpy(selected.shape, selected.dtype)
    return Table(CTableSource(selected, dshape=selected_dshape),
                 dshape=selected_dshape)
def discover(data):
    """Return the datashape of ``data.value``."""
    # Read ``.value`` exactly once and reuse it for both shape and dtype.
    payload = data.value
    shape, dtype = payload.shape, payload.dtype
    return datashape.from_numpy(shape, dtype)
def discover(d):
    """Return the datashape of ``d`` with ``object`` rewritten to ``string``.

    The rewrite is textual: the datashape is rendered with ``str``, every
    occurrence of ``'object'`` is replaced, and the result is re-parsed.
    """
    rendered = str(datashape.from_numpy(d.shape, d.dtype))
    rewritten = rendered.replace('object', 'string')
    return dshape(rewritten)
def discover_dask_array(a, **kwargs):
    """Return the datashape built from ``a.shape`` and ``a.dtype``.

    ``kwargs`` is accepted but unused.
    """
    shape = a.shape
    dtype = a.dtype
    return from_numpy(shape, dtype)
def __iter__(self):
    """Yield a ``DyND_DDesc`` for each element of ``self.blzarr``."""
    blz = self.blzarr
    # Elements no longer carry the leading dimension we iterate over.
    element_type = datashape.from_numpy(blz.shape[1:], blz.dtype)
    # ``self.blzarr`` is read again for the loop, as the original did.
    for item in self.blzarr:
        dynd_item = nd.array(item, type=str(element_type))
        yield DyND_DDesc(dynd_item)
def test_string(self):
    """A ``U7`` dtype converts to a 7-character ``U32`` string measure."""
    actual = from_numpy((2,), np.dtype('U7'))
    expected = dshape('2 * string[7, "U32"]')
    assert actual == expected
def test_ascii_string(self):
    """Byte-string dtype ``S7`` is discovered as ``string[7, "ascii"]``."""
    numpy_dtype = np.dtype('S7')
    actual = from_numpy((2,), numpy_dtype)
    assert actual == dshape('2 * string[7, "ascii"]')
def test_timedelta(self):
    """Every unit in ``_units`` converts to a matching ``timedelta`` measure."""
    for unit in _units:
        actual = from_numpy((2,), np.dtype('m8[%s]' % unit))
        expected = dshape('2 * timedelta[unit=%r]' % unit)
        assert actual == expected
def handle(conn, arrname):
    """Obtain an array handle to an existing SciDB array.

    The array is wrapped through ``conn.wrap_array`` to read its shape and
    dtype; the returned descriptor carries a ``Query`` on ``arrname`` with
    an empty argument tuple.
    """
    wrapped = conn.wrap_array(arrname)
    arr_dshape = from_numpy(wrapped.shape, wrapped.dtype)
    query = Query(arrname, ())
    return SciDBDataDescriptor(arr_dshape, query, conn)
def discover_tables_node(n):
    """Return the datashape built from ``n.shape`` and ``n.dtype``."""
    shape, dtype = n.shape, n.dtype
    return datashape.from_numpy(shape, dtype)
def dshape(self):
    """Return the current datashape of ``self.blzarr``."""
    # Recomputed on every call: the BLZ can change the dshape, so the
    # result cannot be cached.
    blz = self.blzarr
    shape, dtype = blz.shape, blz.dtype
    return datashape.from_numpy(shape, dtype)
def discover(c):
    """Return the measure for ``c``, special-casing the two time types.

    ``c.type`` values ``'time64'`` and ``'time32'`` map to ``datetime_``
    and ``date_``; anything else falls back to ``subshape[1]`` of the
    numpy-derived datashape.
    """
    dshape = from_numpy(c.shape, c.dtype)
    time_measures = {'time64': datetime_, 'time32': date_}
    kind = c.type
    # The fallback is computed unconditionally, exactly as before.
    fallback = dshape.subshape[1]
    return time_measures.get(kind, fallback)
def append(self, data):
    """Append ``data`` to ``self.data.ca`` and refresh the stored datashape."""
    self.data.ca.append(data)
    # Update the shape
    container = self.data.ca
    new_shape = container.shape
    new_dtype = container.dtype
    self._datashape = from_numpy(new_shape, new_dtype)
def dshape(self):
    """Return the datashape of the node at ``self.datapath`` in ``self.path``."""
    # Read from the file on every call: the Array can change the dshape,
    # so the result cannot be cached.
    with tb.open_file(self.path, mode='r') as f:
        node = f.get_node(self.datapath)
        return datashape.from_numpy(node.shape, node.dtype)
def discover(t):
    """Return the datashape built from ``t.shape`` and ``t.dtype``."""
    shape = t.shape
    dtype = t.dtype
    return datashape.from_numpy(shape, dtype)
def discover_bcolz(c, **kwargs):
    """Return the datashape built from ``c.shape`` and ``c.dtype``.

    ``kwargs`` is accepted but unused.
    """
    shape, dtype = c.shape, c.dtype
    return datashape.from_numpy(shape, dtype)
def test_datetime(self):
    """Hour-and-finer ``M8`` units all convert to ``datetime``."""
    units = ('h', 'm', 's', 'ms', 'us', 'ns', 'ps', 'fs', 'as')
    for unit in units:
        actual = from_numpy((2,), np.dtype('M8[%s]' % unit))
        expected = dshape('2 * datetime')
        assert actual == expected
def test_int32(self):
    """Both spellings of the 32-bit integer dtype convert to ``int32``."""
    for spelling in ('int32', 'i4'):
        actual = from_numpy((2,), spelling)
        expected = dshape('2 * int32')
        assert actual == expected
def test_struct(self):
    """A two-field structured dtype converts to a record measure."""
    fields = [('x', '<i4'), ('y', '<i4')]
    actual = from_numpy((2,), np.dtype(fields))
    expected = dshape('2 * {x: int32, y: int32}')
    assert actual == expected
def test_string(self):
    """Unicode dtype ``U7`` is discovered as ``string[7, "U32"]``."""
    numpy_dtype = np.dtype('U7')
    actual = from_numpy((2,), numpy_dtype)
    assert actual == dshape('2 * string[7, "U32"]')
def test_datetime(self):
    """``M8`` dtypes with sub-day units are all discovered as ``datetime``."""
    sub_day_units = ('h', 'm', 's', 'ms', 'us', 'ns', 'ps', 'fs', 'as')
    for unit in sub_day_units:
        numpy_dtype = np.dtype('M8[%s]' % unit)
        actual = from_numpy((2,), numpy_dtype)
        assert actual == dshape('2 * datetime')
def test_struct(self):
    """Structured dtype with fields x and y maps to ``{x: int32, y: int32}``."""
    record_dtype = np.dtype([('x', '<i4'), ('y', '<i4')])
    actual = from_numpy((2,), record_dtype)
    assert actual == dshape('2 * {x: int32, y: int32}')
def test_int32(self):
    """``'int32'`` and ``'i4'`` are both discovered as ``int32``."""
    long_form = from_numpy((2,), 'int32')
    assert long_form == dshape('2 * int32')

    short_form = from_numpy((2,), 'i4')
    assert short_form == dshape('2 * int32')
def var_dshape(v):
    """Return the datashape built from ``v.shape`` and ``v.dtype``."""
    shape, dtype = v.shape, v.dtype
    return datashape.from_numpy(shape, dtype)
def _eval_blocks(expression, vars, vlen, rowsize, vm, **kwargs):
    """Perform the evaluation in blocks.

    Parameters
    ----------
    expression
        Passed to ``eval`` when ``vm == "python"`` and to
        ``numexpr.evaluate`` otherwise.
    vars : dict
        Operands by name.  Mutated in place: any value without a
        ``dshape`` attribute is replaced by ``array(value)``.
    vlen : int
        Length of the leading dimension; blocks cover ``range(0, vlen)``.
    rowsize : int
        Divisor applied to the byte budget to get the block size in
        elements.
    vm : str
        ``"numexpr"`` selects the larger byte budget; ``"python"`` selects
        evaluation through ``eval``.
    **kwargs
        Forwarded to ``empty``/``array`` for the result.  A non-``None``
        ``'ddesc'`` entry switches the output to append mode.

    Returns
    -------
    The container filled block by block, or ``array(result)`` when the
    accumulated result is a dynd array (scalar and reduction cases).
    """
    # Compute the optimal block size (in elements)
    # The next is based on experiments, but YMMV
    if vm == "numexpr":
        # If numexpr, make sure that operands fit in L3 cache
        bsize = 2**20  # 1 MB is common for L3
    else:
        # If python, make sure that operands fit in L2 cache
        # NOTE(review): 2**17 bytes is 128 KiB, while the original note
        # here read "256 KB is common for L2" -- confirm which is intended.
        bsize = 2**17
    bsize //= rowsize
    # Evaluation seems more efficient if block size is a power of 2
    bsize = 2 ** (int(math.log(bsize, 2)))
    # Shorter inputs get proportionally smaller blocks.
    if vlen < 100*1000:
        bsize //= 8
    elif vlen < 1000*1000:
        bsize //= 4
    elif vlen < 10*1000*1000:
        bsize //= 2
    # Protection against too large rowsizes
    if bsize == 0:
        bsize = 1

    # Per-block operand namespace handed to the evaluator.
    vars_ = {}
    # Convert operands into Blaze arrays and get temporaries for vars
    maxndims = 0
    for name in dict_viewkeys(vars):
        var = vars[name]
        if not hasattr(var, "dshape"):
            # Convert sequences into regular Blaze arrays
            vars[name] = var = array(var)
        if hasattr(var, "__len__"):
            # Track the highest operand rank; used below to detect
            # dimension-reducing expressions.
            ndims = len(var.dshape.shape)
            if ndims > maxndims:
                maxndims = ndims
            if len(var) > bsize:
                # Variable is too large; get a container for a chunk
                res_shape, res_dtype = datashape.to_numpy(var.dshape)
                res_shape = list(res_shape)
                res_shape[0] = bsize
                dshape = datashape.from_numpy(res_shape, res_dtype)
                vars_[name] = empty(dshape)

    # Append mode is used when the caller supplied a non-None 'ddesc'.
    if 'ddesc' in kwargs and kwargs['ddesc'] is not None:
        res_ddesc = True
    else:
        res_ddesc = False

    for i in xrange(0, vlen, bsize):
        # Correction for the block size
        # (only the final block can be shorter than the others)
        if i+bsize > vlen:
            bsize = vlen - i
        # Get buffers for vars
        for name in dict_viewkeys(vars):
            var = vars[name]
            if hasattr(var, "__len__") and len(var) > bsize:
                vars_[name] = var[i:i+bsize]
            else:
                if hasattr(var, "__getitem__"):
                    vars_[name] = var[:]
                else:
                    vars_[name] = var

        # Perform the evaluation for this block
        # We need array evals
        if vm == "python":
            # NOTE(review): eval() executes arbitrary Python; `expression`
            # must come from a trusted source.
            res_block = eval(expression, vars_)
            dynd_block = blaze_eval(res_block).ddesc.dynd_arr()
        else:
            res_block = numexpr.evaluate(expression, local_dict=vars_)
            # numexpr returns a numpy array, and we need dynd/blaze ones
            dynd_block = nd.array(res_block)
            res_block = array(res_block)

        if i == 0:
            # First block: decide the kind of result and allocate it.
            scalar = False
            dim_reduction = False
            # Detection of reduction operations
            if res_block.dshape.shape == ():
                scalar = True
                result = dynd_block
                continue
            elif len(res_block.dshape.shape) < maxndims:
                dim_reduction = True
                result = dynd_block
                continue
            block_shape, block_dtype = datashape.to_numpy(res_block.dshape)
            out_shape = list(block_shape)
            if res_ddesc:
                # Start with an empty container and grow it by appending.
                out_shape[0] = 0
                dshape = datashape.from_numpy(out_shape, block_dtype)
                result = empty(dshape, **kwargs)
                append(result, dynd_block)
            else:
                # Allocate the full-length output up front.
                out_shape[0] = vlen
                dshape = datashape.from_numpy(out_shape, block_dtype)
                result = empty(dshape, **kwargs)
                # The next is a workaround for bug #183
                #result[:bsize] = res_block
                result[:bsize] = dynd_block
        else:
            if scalar:
                # Accumulate partial scalar results across blocks.
                result += dynd_block
                result = result.eval()
            elif dim_reduction:
                # A shorter (trailing) block only touches the leading part.
                if len(res_block) < len(result):
                    result[:bsize] += dynd_block
                else:
                    result += dynd_block
                result = result.eval()
            elif res_ddesc:
                append(result, dynd_block)
            else:
                # The next is a workaround for bug #183
                #result[i:i+bsize] = res_block
                result[i:i+bsize] = dynd_block

    # Scalars and dim reductions generate dynd array for workaround
    # different issues in Blaze array operations (see #197)
    if isinstance(result, nd.array):
        if scalar:
            return array(result)
        else:
            # If not a scalar, pass the arguments (persistency, etc.)
            return array(result, **kwargs)
    return result
def handle(conn, arrname):
    """Obtain an array handle to an existing SciDB array.

    Shape and dtype are read from ``conn.wrap_array(arrname)``; the
    returned ``SciDB_DDesc`` carries a ``Query`` on ``arrname`` with an
    empty argument tuple.
    """
    wrapped = conn.wrap_array(arrname)
    arr_dshape = from_numpy(wrapped.shape, wrapped.dtype)
    query = Query(arrname, ())
    return SciDB_DDesc(arr_dshape, query, conn)
def dshape(self):
    """Return the datashape of the array stored under ``self.datapath``."""
    # Read from the file on every call: the Array can change the dshape,
    # so the result cannot be cached.
    with tb.open_file(self.filename, mode='r') as f:
        node = f.get_node(f.root, self.datapath)
        return datashape.from_numpy(node.shape, node.dtype)
def discover(c):
    """Return the measure for ``c``, special-casing the two time types.

    When ``c.type`` is ``'time64'`` or ``'time32'`` the result is
    ``datetime_`` or ``date_`` respectively; otherwise it is
    ``subshape[1]`` of the numpy-derived datashape.
    """
    numpy_dshape = from_numpy(c.shape, c.dtype)
    overrides = {'time64': datetime_, 'time32': date_}
    type_name = c.type
    # Evaluated eagerly, as in the original, even when an override applies.
    default_measure = numpy_dshape.subshape[1]
    return overrides.get(type_name, default_measure)
def dshape(self):
    """Return the datashape of the node at ``self.datapath`` in ``self.path``."""
    # Read from the file on every call: the Array can change the dshape,
    # so the result cannot be cached.
    with netCDF4.Dataset(self.path, mode='r') as f:
        node = get_node(f, self.datapath)
        return datashape.from_numpy(node.shape, node.dtype)