def out_dshape(self, input_dshape):
    cats = self.categorizer.categories(input_dshape)
    red_shape = self.reduction.out_dshape(input_dshape)
    return dshape(Record([(c, red_shape) for c in cats]))
def schema(self):
    d = self._child.schema[0].dict
    return DataShape(Record([(name, d[name]) for name in self.fields]))
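# Hedged sketch (not from the source): the field-projection pattern used by
# schema() above, written against plain datashape objects. The record and the
# projected field list here are invented for illustration.
from datashape import DataShape, Record, dshape

measure = dshape('{name: string, amount: int32, id: int64}').measure
fields = ['name', 'id']              # hypothetical projection
d = measure.dict                     # field name -> type mapping
print(DataShape(Record([(f, d[f]) for f in fields])))
# -> {name: string, id: int64}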
def schema(self):
    subs = dict(self.labels)
    param = self._child.dshape.measure.parameters[0]
    return DataShape(
        Record([[subs.get(name, name), dtype] for name, dtype in param]))
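# Hedged sketch (not from the source): the relabel substitution performed by
# schema() above, using only datashape; the label mapping is hypothetical.
from datashape import DataShape, Record, dshape

measure = dshape('{name: string, amount: int32}').measure
subs = {'amount': 'balance'}
print(DataShape(Record([(subs.get(name, name), typ)
                        for name, typ in measure.parameters[0]])))
# -> {name: string, balance: int32}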
def out_dshape(self, in_dshape):
    return dshape(
        Record([(k, v.out_dshape(in_dshape))
                for (k, v) in zip(self.keys, self.values)]))
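# Hedged sketch (not datashader's API): how zipped keys and per-reduction
# output dshapes combine into one record measure, as in out_dshape above.
# The key names and element types are invented.
from datashape import Record, dshape

keys = ['mean_price', 'max_price']
value_dshapes = [dshape('float64'), dshape('float64')]
print(dshape(Record(list(zip(keys, value_dshapes)))))
# -> {mean_price: float64, max_price: float64}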
def test_empty_struct(self):
    self.assertEqual(dshape('{}'), DataShape(Record([])))
def _schema(self):
    measure = self._child.schema.measure
    d = getattr(measure, 'value', measure).dict
    return DataShape(Record((name, d[name]) for name in self.fields))
def out_dshape(self, input_dshape):
    cats = input_dshape.measure[self.column].categories
    return dshape(Record([(c, ct.int32) for c in cats]))
def discover_h5py_group_file(g):
    return DataShape(Record([[k, discover(v)] for k, v in g.items()]))
def Data(data, dshape=None, name=None, fields=None, columns=None, schema=None,
         **kwargs):
    if columns:
        raise ValueError("columns argument deprecated, use fields instead")
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")

    # 'uri::path' strings address a sub-dataset inside a resource
    sub_uri = ''
    if isinstance(data, _strtypes):
        if '::' in data:
            data, sub_uri = data.split('::')
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)

    # Build a record measure when explicit field names are given
    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
    elif isrecord(dshape.measure) and fields:
        ds = discover(data)
        assert isrecord(ds.measure)
        names = ds.measure.names
        if names != fields:
            raise ValueError('data column names %s\n'
                             '\tnot equal to fields parameter %s,\n'
                             '\tuse Data(data).relabel(%s) to rename '
                             'fields' % (names,
                                         fields,
                                         ', '.join('%s=%r' % (k, v)
                                                   for k, v in
                                                   zip(names, fields))))
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))

    ds = datashape.dshape(dshape)
    result = InteractiveSymbol(data, ds, name)

    if sub_uri:
        for field in sub_uri.split('/'):
            if field:
                result = result[field]

    return result
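# Hedged usage sketch for Data above (assumes blaze is importable); the rows
# and field names are invented for illustration.
from blaze import Data

accounts = Data([(1, 'Alice', 100.0), (2, 'Bob', 200.0)],
                fields=['id', 'name', 'balance'])
print(accounts.dshape)   # expected: 2 * {id: int64, name: string, balance: float64}
print(accounts.fields)   # expected: ['id', 'name', 'balance']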
def discover_row_proxy(rp):
    return Record(list(zip(rp.keys(), map(discover, rp.values()))))
def _schema(self):
    """
    Examples
    --------
    >>> from blaze import symbol
    >>> t = symbol('t', 'var * {name: string, amount: int}')
    >>> s = symbol('t', 'var * {name: string, id: int}')

    >>> join(t, s).schema
    dshape("{name: string, amount: int32, id: int32}")

    >>> join(t, s, how='left').schema
    dshape("{name: string, amount: int32, id: ?int32}")

    Overlapping but non-joined fields append _left, _right

    >>> a = symbol('a', 'var * {x: int, y: int}')
    >>> b = symbol('b', 'var * {x: int, y: int}')
    >>> join(a, b, 'x').fields
    ['x', 'y_left', 'y_right']
    """
    option = lambda dt: dt if isinstance(dt, Option) else Option(dt)

    on_left = self.on_left
    if not isinstance(on_left, list):
        on_left = on_left,

    on_right = self.on_right
    if not isinstance(on_right, list):
        on_right = on_right,

    right_types = keymap(
        dict(zip(on_right, on_left)).get,
        self.rhs.dshape.measure.dict,
    )
    joined = (
        (name, promote(dt, right_types[name], promote_option=False))
        for n, (name, dt) in enumerate(
            filter(
                compose(op.contains(on_left), first),
                self.lhs.dshape.measure.fields,
            )
        )
    )

    left = [
        (name, dt)
        for name, dt in zip(self.lhs.fields,
                            types_of_fields(self.lhs.fields, self.lhs))
        if name not in on_left
    ]

    right = [
        (name, dt)
        for name, dt in zip(self.rhs.fields,
                            types_of_fields(self.rhs.fields, self.rhs))
        if name not in on_right
    ]

    # Handle overlapping but non-joined fields, e.g. suffix them with
    # _left/_right so the result names stay unique.
    left_other = set(name for name, dt in left if name not in on_left)
    right_other = set(name for name, dt in right if name not in on_right)
    overlap = left_other & right_other

    left_suffix, right_suffix = self.suffixes
    left = ((name + left_suffix if name in overlap else name, dt)
            for name, dt in left)
    right = ((name + right_suffix if name in overlap else name, dt)
             for name, dt in right)

    if self.how in ('right', 'outer'):
        left = ((name, option(dt)) for name, dt in left)
    if self.how in ('left', 'outer'):
        right = ((name, option(dt)) for name, dt in right)

    return dshape(Record(chain(joined, left, right)))
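# Hedged sketch extending the docstring above: an outer join wraps the
# non-key fields of both sides in Option (assumes blaze is importable).
from blaze import symbol, join

t = symbol('t', 'var * {name: string, amount: int}')
s = symbol('s', 'var * {name: string, id: int}')
print(join(t, s, how='outer').schema)
# expected: dshape("{name: string, amount: ?int32, id: ?int32}")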
def discover_sqlalchemy_column(c):
    meta = Option if getattr(c, 'nullable', True) else identity
    return Record([(c.name, meta(discover(c.type)))])
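# Hedged sketch of the nullable handling above; assumes sqlalchemy plus the
# discover dispatches for its column types (as registered by odo/blaze) are
# importable. Column names and types are invented.
import sqlalchemy as sa
from datashape import discover

nullable_col = sa.Column('amount', sa.Integer, nullable=True)
required_col = sa.Column('id', sa.Integer, nullable=False)
print(discover(nullable_col))   # expected: one-field record with an Option type, e.g. {amount: ?int32}
print(discover(required_col))   # expected: the plain type, e.g. {id: int32}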
def _bound_symbol(cls, data_source, dshape, name, fields, schema, **kwargs):
    if schema and dshape:
        raise ValueError(
            'Please specify one of schema= or dshape= keyword arguments',
        )

    if isinstance(data_source, BoundSymbol):
        return _bound_symbol(cls, data_source.data, dshape, name, fields,
                             schema, **kwargs)

    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)

    if isinstance(data_source, _strtypes):
        data_source = resource(data_source, schema=schema, dshape=dshape,
                               **kwargs)
    if (isinstance(data_source, Iterator) and
            not isinstance(data_source, tuple(not_an_iterator))):
        data_source = tuple(data_source)

    if not dshape:
        dshape = discover(data_source)

    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
    elif isrecord(dshape.measure) and fields:
        ds = discover(data_source)
        assert isrecord(ds.measure)
        names = ds.measure.names
        if names != fields:
            raise ValueError(
                'data column names %s\n'
                '\tnot equal to fields parameter %s,\n'
                '\tuse data(data_source).relabel(%s) to rename fields' % (
                    names,
                    fields,
                    ', '.join('%s=%r' % (k, v)
                              for k, v in zip(names, fields)),
                ),
            )
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))

    ds = datashape.dshape(dshape)

    if name is generate:
        if istabular(dshape):
            name = next(_names)
        else:
            name = None

    return cls(data_source, ds, name)
def test_id_take_last_in_group_macro(self):
    """
    output (expected):

                               other  value
    2014-01-01 Equity(65 [A])    NaN      1
               Equity(66 [B])    NaN      1
               Equity(67 [C])    NaN      1
    2014-01-02 Equity(65 [A])      1      2
               Equity(66 [B])      1      2
               Equity(67 [C])      1      2
    2014-01-03 Equity(65 [A])      2      2
               Equity(66 [B])      2      2
               Equity(67 [C])      2      2
    """
    T = pd.Timestamp
    df = pd.DataFrame(
        columns=['asof_date', 'timestamp', 'other', 'value'],
        data=[
            [T('2014-01-01'), T('2014-01-01 00'), np.nan, 1],
            [T('2014-01-01'), T('2014-01-01 01'), np.nan, np.nan],
            [T('2014-01-02'), T('2014-01-02 00'), 1, np.nan],
            [T('2014-01-02'), T('2014-01-02 01'), np.nan, 2],
            [T('2014-01-03'), T('2014-01-03 00'), 2, np.nan],
            [T('2014-01-03'), T('2014-01-03 01'), 3, 3],
        ],
    )
    fields = OrderedDict(self.macro_dshape.measure.fields)
    fields['other'] = fields['value']

    with tmp_asset_finder() as finder:
        expected = pd.DataFrame(
            columns=['other', 'value'],
            data=[
                [np.nan, 1],    # 2014-01-01 Equity(65 [A])
                [np.nan, 1],    #            Equity(66 [B])
                [np.nan, 1],    #            Equity(67 [C])
                [1, 2],         # 2014-01-02 Equity(65 [A])
                [1, 2],         #            Equity(66 [B])
                [1, 2],         #            Equity(67 [C])
                [2, 2],         # 2014-01-03 Equity(65 [A])
                [2, 2],         #            Equity(66 [B])
                [2, 2],         #            Equity(67 [C])
            ],
            index=pd.MultiIndex.from_product(
                (self.dates, finder.retrieve_all(self.sids)),
            ),
        )
        self._test_id(
            df,
            var * Record(fields),
            expected,
            finder,
            ('value', 'other'),
        )
def test_id_take_last_in_group(self):
    T = pd.Timestamp
    df = pd.DataFrame(
        columns=['asof_date', 'timestamp', 'sid', 'other', 'value'],
        data=[
            [T('2014-01-01'), T('2014-01-01 00'), 65, 0, 0],
            [T('2014-01-01'), T('2014-01-01 01'), 65, 1, np.nan],
            [T('2014-01-01'), T('2014-01-01 00'), 66, np.nan, np.nan],
            [T('2014-01-01'), T('2014-01-01 01'), 66, np.nan, 1],
            [T('2014-01-01'), T('2014-01-01 00'), 67, 2, np.nan],
            [T('2014-01-01'), T('2014-01-01 01'), 67, np.nan, np.nan],
            [T('2014-01-02'), T('2014-01-02 00'), 65, np.nan, np.nan],
            [T('2014-01-02'), T('2014-01-02 01'), 65, np.nan, 1],
            [T('2014-01-02'), T('2014-01-02 00'), 66, np.nan, np.nan],
            [T('2014-01-02'), T('2014-01-02 01'), 66, 2, np.nan],
            [T('2014-01-02'), T('2014-01-02 00'), 67, 3, 3],
            [T('2014-01-02'), T('2014-01-02 01'), 67, 3, 3],
            [T('2014-01-03'), T('2014-01-03 00'), 65, 2, np.nan],
            [T('2014-01-03'), T('2014-01-03 01'), 65, 2, np.nan],
            [T('2014-01-03'), T('2014-01-03 00'), 66, 3, 3],
            [T('2014-01-03'), T('2014-01-03 01'), 66, np.nan, np.nan],
            [T('2014-01-03'), T('2014-01-03 00'), 67, np.nan, np.nan],
            [T('2014-01-03'), T('2014-01-03 01'), 67, np.nan, 4],
        ],
    )
    fields = OrderedDict(self.dshape.measure.fields)
    fields['other'] = fields['value']

    with tmp_asset_finder() as finder:
        expected = pd.DataFrame(
            columns=['other', 'value'],
            data=[
                [1, 0],         # 2014-01-01 Equity(65 [A])
                [np.nan, 1],    #            Equity(66 [B])
                [2, np.nan],    #            Equity(67 [C])
                [1, 1],         # 2014-01-02 Equity(65 [A])
                [2, 1],         #            Equity(66 [B])
                [3, 3],         #            Equity(67 [C])
                [2, 1],         # 2014-01-03 Equity(65 [A])
                [3, 3],         #            Equity(66 [B])
                [3, 3],         #            Equity(67 [C])
            ],
            index=pd.MultiIndex.from_product(
                (self.dates, finder.retrieve_all(self.sids)),
            ),
        )
        self._test_id(
            df,
            var * Record(fields),
            expected,
            finder,
            ('value', 'other'),
        )
def out_dshape(self, input_dshape):
    cats = input_dshape.measure[self.cat_column].categories
    red_shape = self.reduction.out_dshape(input_dshape)
    return dshape(Record([(c, red_shape) for c in cats]))
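# Hedged sketch (plain datashape, not datashader's API) of the per-category
# record built by out_dshape above; the category names and the reduction's
# output dshape are invented.
from datashape import Record, dshape

categories = ['cold', 'warm', 'hot']
red_shape = dshape('int32')
print(dshape(Record([(c, red_shape) for c in categories])))
# -> {cold: int32, warm: int32, hot: int32}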