def concrete_head(expr, n=10):
    """ Return head of computed expression """
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    head = expr.head(n + 1)

    if not iscollection(expr.dshape):
        return odo(head, object)
    elif isrecord(expr.dshape.measure):
        return odo(head, DataFrame)
    else:
        df = odo(head, DataFrame)
        df.columns = [expr._name]
        return df

    # NOTE: everything below is unreachable -- every branch of the
    # if/elif/else above returns (and the scalar case already returned
    # earlier), so this older code path is dead and could be deleted.
    result = compute(head)

    if len(result) == 0:
        return DataFrame(columns=expr.fields)
    if isrecord(expr.dshape.measure):
        return odo(result, DataFrame, dshape=expr.dshape)
    else:
        df = odo(result, DataFrame, dshape=expr.dshape)
        df.columns = [expr._name]
        return df
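# Hedged usage sketch for concrete_head: assumes the blaze/odo/pandas stack
# this module imports, with the interactive Data constructor (defined in
# later variants below) in scope; the records are made up for illustration.
t = Data([('Alice', 100), ('Bob', 200)],
         dshape='var * {name: string, amount: int64}')

# Only the first n + 1 rows are computed; the extra row lets the interactive
# repr detect that a larger dataset was truncated.
df = concrete_head(t, n=10)
print(df)   # a pandas DataFrame with columns ['name', 'amount']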
def Data(data, dshape=None, name=None, fields=None, columns=None,
         schema=None, **kwargs):
    sub_uri = ''
    if isinstance(data, _strtypes):
        if '::' in data:
            data, sub_uri = data.split('::')
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)
    if columns:
        warnings.warn("columns kwarg deprecated. Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)
    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
    elif isrecord(dshape.measure) and fields:
        ds = discover(data)
        assert isrecord(ds.measure)
        names = ds.measure.names
        if names != fields:
            raise ValueError('data column names %s\n'
                             '\tnot equal to fields parameter %s,\n'
                             '\tuse Data(data).relabel(%s) to rename fields'
                             % (names,
                                fields,
                                ', '.join('%s=%r' % (k, v)
                                          for k, v in zip(names, fields))))
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    ds = datashape.dshape(dshape)

    result = InteractiveSymbol(data, ds, name)

    if sub_uri:
        for field in sub_uri.split('/'):
            if field:
                result = result[field]

    return result
def Data(data, dshape=None, name=None, fields=None, columns=None,
         schema=None, **kwargs):
    if columns:
        raise ValueError("columns argument deprecated, use fields instead")
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if isinstance(data, InteractiveSymbol):
        return Data(data.data, dshape, name, fields, columns, schema,
                    **kwargs)
    if isinstance(data, _strtypes):
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)
    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
    elif isrecord(dshape.measure) and fields:
        ds = discover(data)
        assert isrecord(ds.measure)
        names = ds.measure.names
        if names != fields:
            raise ValueError('data column names %s\n'
                             '\tnot equal to fields parameter %s,\n'
                             '\tuse Data(data).relabel(%s) to rename fields'
                             % (names,
                                fields,
                                ', '.join('%s=%r' % (k, v)
                                          for k, v in zip(names, fields))))
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    ds = datashape.dshape(dshape)
    return InteractiveSymbol(data, ds, name)
def sort(child, key=None, ascending=True):
    """ Sort a collection

    Parameters
    ----------
    key : str, list of str, or Expr
        Defines by what you want to sort.

        * A single column string: ``t.sort('amount')``
        * A list of column strings: ``t.sort(['name', 'amount'])``
        * An expression: ``t.sort(-t.amount)``

        If sorting a columnar dataset, the ``key`` is ignored, as it is not
        necessary:

        * ``t.amount.sort()``
        * ``t.amount.sort('amount')``
        * ``t.amount.sort('foobar')``

        are all equivalent.
    ascending : bool, optional
        Determines order of the sort
    """
    if ascending not in (True, False):
        # NOTE: this test is to guard against users saying `x.sort('a', 'b')`
        # when they should have said `x.sort(['a', 'b'])`.
        msg = "ascending must be True or False, given {}"
        raise ValueError(msg.format(ascending))

    if not isrecord(child.dshape.measure):
        if key is None or isinstance(key, _strtypes):
            # Handle this case separately.
            return Sort(child, None, ascending)
        msg = "sort key {!r} not valid for schema {!r}"
        raise ValueError(msg.format(key, child.dshape.measure))

    if key is None and isrecord(child.dshape.measure):
        key = child.dshape.measure.names

    if isinstance(key, (list, tuple)):
        key = keys_to_validate = tuple(key)
    else:
        keys_to_validate = (key,)

    for k in keys_to_validate:
        if k is None:
            msg = "sort key {!r} not valid for schema {!r}"
            raise ValueError(msg.format(k, child.dshape.measure))
        elif isinstance(k, _strtypes):
            if k not in child.dshape.measure.names:
                msg = "sort key {} is not a column of schema {}"
                raise ValueError(msg.format(k, child.dshape.measure))
        elif not isinstance(k, Expr):
            msg = "sort key {} is not a string column name or an expression."
            raise ValueError(msg.format(k))

    return Sort(child, key, ascending)
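# Minimal sketch of the validation above, assuming blaze is installed and
# `symbol` (the public constructor for typed expressions) is imported.
from blaze import symbol

t = symbol('t', 'var * {name: string, amount: int64}')

t.sort('amount')             # a single column name
t.sort(['name', 'amount'])   # a list of column names
t.sort(-t.amount)            # an arbitrary expression

# The `ascending` guard catches t.sort('name', 'amount'), where the second
# key would otherwise land in the boolean `ascending` slot; the fix is
# t.sort(['name', 'amount']).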
def create_from_datashape(group, ds, name=None, **kwargs):
    if isinstance(group, type):
        group = h5py.File(kwargs['path'])
    assert isrecord(ds)
    for name, sub_ds in ds[0].dict.items():
        if isrecord(sub_ds):
            g = group.require_group(name)
            create_from_datashape(g, sub_ds, **kwargs)
        else:
            dataset_from_dshape(file=group.file,
                                datapath='/'.join([group.name, name]),
                                ds=sub_ds, **kwargs)
def create_from_datashape(group, ds, name=None, **kwargs):
    assert isrecord(ds)
    if isinstance(ds, DataShape) and len(ds) == 1:
        ds = ds[0]
    for name, sub_ds in ds.dict.items():
        if isrecord(sub_ds):
            g = group.require_group(name)
            create_from_datashape(g, sub_ds, **kwargs)
        else:
            dataset_from_dshape(file=group.file,
                                datapath='/'.join([group.name, name]),
                                ds=sub_ds, **kwargs)
def test_base():
    for expr, exclusions in expressions.items():
        if iscollection(expr.dshape):
            model = into(DataFrame,
                         into(np.ndarray,
                              expr._subs({t: Data(base, t.dshape)})))
        else:
            model = compute(expr._subs({t: Data(base, t.dshape)}))
        print('\nexpr: %s\n' % expr)
        for source in sources:
            if id(source) in map(id, exclusions):
                continue
            print('%s <- %s' % (typename(model), typename(source)))
            T = Data(source)
            if iscollection(expr.dshape):
                result = into(type(model), expr._subs({t: T}))
                if isscalar(expr.dshape.measure):
                    assert set(into(list, result)) == set(into(list, model))
                else:
                    assert df_eq(result, model)
            elif isrecord(expr.dshape):
                result = compute(expr._subs({t: T}))
                assert into(tuple, result) == into(tuple, model)
            else:
                result = compute(expr._subs({t: T}))
                try:
                    result = result.scalar()
                except AttributeError:
                    pass
                assert result == model
def __init__(self, data, dshape, name=None):
    self.data = data
    self.dshape = dshape
    self._name = name or (next(names)
                          if isrecord(dshape.measure)
                          else None)
    self._hash = None
def __new__(cls, data, dshape, name=None):
    return super(Symbol, cls).__new__(
        cls,
        data,
        dshape,
        name or (next(names) if isrecord(dshape.measure) else None),
    )
def create_from_datashape(engine, ds, **kwargs):
    assert isrecord(ds)
    metadata = sa.MetaData(engine)
    for name, sub_ds in ds[0].dict.items():
        t = dshape_to_table(name, sub_ds, metadata=metadata)
        t.create()
    return engine
def create_from_datashape(engine, ds, schema=None, **kwargs):
    assert isrecord(ds), 'datashape must be Record type, got %s' % ds
    metadata = metadata_of_engine(engine, schema=schema)
    for name, sub_ds in ds[0].dict.items():
        t = dshape_to_table(name, sub_ds, metadata=metadata)
        t.create()
    return engine
def create_from_datashape(engine, ds, **kwargs):
    assert isrecord(ds)
    metadata = metadata_of_engine(engine)
    for name, sub_ds in ds[0].dict.items():
        t = dshape_to_table(name, sub_ds, metadata=metadata)
        t.create()
    return engine
def create_from_datashape(engine, ds, schema=None, foreign_keys=None,
                          primary_key=None, **kwargs):
    assert isrecord(ds), "datashape must be Record type, got %s" % ds
    metadata = sa.MetaData(engine, schema=schema)
    for name, sub_ds in ds[0].dict.items():
        t = dshape_to_table(name, sub_ds, metadata=metadata,
                            foreign_keys=foreign_keys,
                            primary_key=primary_key)
        t.create()
    return engine
def dshape_to_table(name, ds, metadata=None, foreign_keys=None,
                    primary_key=None):
    """
    Create a SQLAlchemy table from a datashape and a name

    >>> dshape_to_table('bank', '{name: string, amount: int}') # doctest: +NORMALIZE_WHITESPACE
    Table('bank', MetaData(bind=None),
          Column('name', Text(), table=<bank>, nullable=False),
          Column('amount', Integer(), table=<bank>, nullable=False),
          schema=None)
    """
    if isinstance(ds, str):
        ds = dshape(ds)
    if not isrecord(ds.measure):
        raise TypeError('dshape measure must be a record type e.g., '
                        '"{a: int64, b: int64}". Input measure is %r' %
                        ds.measure)
    if metadata is None:
        metadata = sa.MetaData()
    if foreign_keys is None:
        foreign_keys = {}

    validate_foreign_keys(ds, foreign_keys)

    cols = dshape_to_alchemy(ds, primary_key=primary_key or frozenset())
    cols.extend(sa.ForeignKeyConstraint([column_name], [referent])
                for column_name, referent in foreign_keys.items())
    t = sa.Table(name, metadata, *cols, schema=metadata.schema)
    return attach_schema(t, t.schema)
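# Hedged example of the foreign_keys/primary_key parameters above; the table
# and column names are made up, and the 'table.column' referent strings
# follow how the constraint pairs are unpacked in the code.
import sqlalchemy as sa

metadata = sa.MetaData()
products = dshape_to_table('products', '{id: int64, name: string}',
                           metadata=metadata,
                           primary_key=frozenset(['id']))
orders = dshape_to_table('orders', '{id: int64, product_id: int64}',
                         metadata=metadata,
                         primary_key=frozenset(['id']),
                         foreign_keys={'product_id': 'products.id'})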
def __dir__(self):
    result = dir(type(self))
    if (isrecord(self.dshape.measure) or
            isinstance(self.dshape.measure, datashape.Map) and self.fields):
        result.extend(map(valid_identifier, self.fields))

    result.extend(toolz.merge(schema_methods(self.dshape.measure),
                              dshape_methods(self.dshape)))

    return sorted(set(filter(isvalid_identifier, result)))
def Data(data, dshape=None, name=None, fields=None, columns=None,
         schema=None, **kwargs):
    sub_uri = ''
    if isinstance(data, _strtypes):
        if '::' in data:
            data, sub_uri = data.split('::')
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)
    if columns:
        warnings.warn("columns kwarg deprecated. Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)
    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
    elif isrecord(dshape.measure) and fields:
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    ds = datashape.dshape(dshape)

    name = name or next(names)
    result = InteractiveSymbol(data, ds, name)

    if sub_uri:
        for field in sub_uri.split('/'):
            if field:
                result = result[field]

    return result
def data(data_source, dshape=None, name=None, fields=None, schema=None,
         **kwargs):
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if isinstance(data_source, _Data):
        return data(data_source.data, dshape, name, fields, schema, **kwargs)
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if isinstance(data_source, _strtypes):
        data_source = resource(data_source, schema=schema, dshape=dshape,
                               **kwargs)
    if (isinstance(data_source, Iterator) and
            not isinstance(data_source, tuple(not_an_iterator))):
        data_source = tuple(data_source)
    if not dshape:
        dshape = discover(data_source)
    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
    elif isrecord(dshape.measure) and fields:
        ds = discover(data_source)
        assert isrecord(ds.measure)
        names = ds.measure.names
        if names != fields:
            raise ValueError(
                "data column names %s\n"
                "\tnot equal to fields parameter %s,\n"
                "\tuse data(data_source).relabel(%s) to rename fields"
                % (names,
                   fields,
                   ", ".join("%s=%r" % (k, v)
                             for k, v in zip(names, fields))))
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    ds = datashape.dshape(dshape)
    return _Data(data_source, ds, name)
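# Usage sketch for the fields= validation above, assuming pandas is installed
# and `data` is the constructor just defined.  Renaming has to go through
# relabel rather than fields=, exactly as the error message suggests.
import pandas as pd

df = pd.DataFrame({'name': ['Alice', 'Bob'], 'amount': [100, 200]})

d = data(df)                      # column names discovered from the frame
# data(df, fields=['n', 'amt'])   # would raise ValueError: names != fields
renamed = data(df).relabel(name='n', amount='amt')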
def Data(data, dshape=None, name=None, fields=None, columns=None,
         schema=None, **kwargs):
    sub_uri = ''
    if isinstance(data, _strtypes):
        if '::' in data:
            data, sub_uri = data.split('::')
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if (isinstance(data, Iterator) and
            not isinstance(data, tuple(not_an_iterator))):
        data = tuple(data)
    if columns:
        warnings.warn("columns kwarg deprecated. Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)
    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape[:-1] + (schema,)))
    elif isrecord(dshape.measure) and fields:
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    ds = datashape.dshape(dshape)

    if (hasattr(data, 'schema') and
            isinstance(data.schema, (DataShape, str, unicode)) and
            ds.measure != data.dshape.measure):
        raise TypeError('%s schema %s does not match schema %s' %
                        (type(data).__name__, data.schema, ds.measure))

    name = name or next(names)
    result = InteractiveSymbol(data, ds, name)

    if sub_uri:
        for field in sub_uri.split('/'):
            if field:
                result = result[field]

    return result
def _csv_to_DataFrame(c, dshape=None, chunksize=None, **kwargs):
    has_header = kwargs.pop('has_header', c.has_header)
    if has_header is False:
        header = None
    elif has_header is True:
        header = 0
    else:
        header = 'infer'

    sep = kwargs.pop('sep',
                     kwargs.pop('delimiter', c.dialect.get('delimiter', ',')))
    encoding = kwargs.get('encoding', c.encoding)

    if dshape:
        dtypes, parse_dates = dshape_to_pandas(dshape)
        if isrecord(dshape.measure):
            names = kwargs.get('names', dshape.measure.names)
        else:
            names = kwargs.get('names')
    else:
        dtypes = parse_dates = names = None

    usecols = kwargs.pop('usecols', None)
    if parse_dates and usecols:
        parse_dates = [col for col in parse_dates if col in usecols]

    compression = kwargs.pop('compression',
                             {'gz': 'gzip', 'bz2': 'bz2'}.get(ext(c.path)))

    # See read_csv docs for header for reasoning
    if names:
        try:
            found_names = pd.read_csv(c.path, encoding=encoding,
                                      compression=compression, nrows=1)
        except StopIteration:
            found_names = pd.read_csv(c.path, encoding=encoding,
                                      compression=compression)
    if names and header == 'infer':
        if [n.strip() for n in found_names] == [n.strip() for n in names]:
            header = 0
        elif (all(re.match(r'^\s*\D\w*\s*$', n) for n in found_names) and
                not all(dt == datashape.string
                        for dt in dshape.measure.types)):
            header = 0
        else:
            header = None

    kwargs2 = keyfilter(keywords(pandas.read_csv).__contains__, kwargs)
    return pandas.read_csv(c.path, header=header, sep=sep,
                           encoding=encoding, dtype=dtypes,
                           parse_dates=parse_dates, names=names,
                           compression=compression, chunksize=chunksize,
                           usecols=usecols, **kwargs2)
def _csv_to_dataframe(c, dshape=None, chunksize=None, **kwargs):
    header = {False: None,
              True: 0}.get(kwargs.pop('has_header', c.has_header), 'infer')

    sep = kwargs.pop('sep',
                     kwargs.pop('delimiter', c.dialect.get('delimiter', ',')))
    encoding = kwargs.pop('encoding', c.encoding)

    if dshape:
        dtypes, parse_dates = dshape_to_pandas(dshape)
        if isrecord(dshape.measure):
            names = kwargs.get('names', dshape.measure.names)
        else:
            names = kwargs.get('names')
    else:
        dtypes = parse_dates = names = None

    usecols = kwargs.pop('usecols', None)
    if parse_dates and usecols:
        parse_dates = [col for col in parse_dates if col in usecols]

    # See read_csv docs for header for reasoning
    if names:
        try:
            with c.open() as f:
                found_names = pd.read_csv(f, nrows=1, encoding=encoding,
                                          sep=sep)
        except StopIteration:
            with c.open() as f:
                found_names = pd.read_csv(f, encoding=encoding, sep=sep)
    if names and header == 'infer':
        if [n.strip() for n in found_names] == [n.strip() for n in names]:
            header = 0
        elif (all(re.match(r'^\s*\D\w*\s*$', n) for n in found_names) and
                not all(dt == datashape.string
                        for dt in dshape.measure.types)):
            header = 0
        else:
            header = None

    kwargs = keyfilter(keywords(pd.read_csv).__contains__, kwargs)
    with c.open() as f:
        return pd.read_csv(f, header=header, sep=sep, encoding=encoding,
                           dtype=dtypes, parse_dates=parse_dates,
                           names=names, chunksize=chunksize,
                           usecols=usecols, **kwargs)
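# Standalone restatement of the header-inference rule above as a pure
# function (a hypothetical helper, not part of the original API), so the
# outcomes can be exercised without a CSV file.
import re

import datashape


def infer_header(found_names, names, types):
    """Return 0 (first row is a header) or None (first row is data)."""
    if [n.strip() for n in found_names] == [n.strip() for n in names]:
        return 0    # first row repeats the expected column names verbatim
    if (all(re.match(r'^\s*\D\w*\s*$', n) for n in found_names) and
            not all(t == datashape.string for t in types)):
        return 0    # identifier-like row can't be data for non-string columns
    return None     # otherwise treat the first row as data


assert infer_header(['name', 'amount'], ['name', 'amount'],
                    [datashape.string, datashape.int64]) == 0
assert infer_header(['alice', '100'], ['name', 'amount'],
                    [datashape.string, datashape.int64]) is None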
def create_from_datashape(group, ds, name=None, **kwargs):
    if not isrecord(ds):
        raise ValueError(
            "Trying to create an HDF5 file with non-record datashape failed\n"
            "Perhaps you forgot to specify a datapath?\n"
            "\tdshape: %s\n"
            "If you're using odo consider the following change\n"
            "\tBefore: odo(data, 'myfile.hdf5')\n"
            "\tAfter: odo(data, 'myfile.hdf5::/datapath')" % ds)
    if isinstance(ds, DataShape) and len(ds) == 1:
        ds = ds[0]
    for name, sub_ds in ds.dict.items():
        if isrecord(sub_ds):
            g = group.require_group(name)
            create_from_datashape(g, sub_ds, **kwargs)
        else:
            dataset_from_dshape(file=group.file,
                                datapath='/'.join([group.name, name]),
                                ds=sub_ds, **kwargs)
def test_mean():
    (chunk, chunk_expr), (agg, agg_expr) = split(t, t.amount.mean())

    assert chunk.schema == t.schema
    assert chunk_expr.isidentical(summary(total=chunk.amount.sum(),
                                          count=chunk.amount.count(),
                                          keepdims=True))

    assert isrecord(agg.dshape.measure)
    assert agg_expr.isidentical(agg.total.sum() / agg['count'].sum())
def __dir__(self):
    result = dir(type(self))
    if isrecord(self.dshape.measure) and self.fields:
        result.extend(list(map(valid_identifier, self.fields)))

    d = toolz.merge(schema_methods(self.dshape.measure),
                    dshape_methods(self.dshape))
    result.extend(list(d))

    return sorted(set(filter(isvalid_identifier, result)))
def test_mean():
    (chunk, chunk_expr), (agg, agg_expr) = split(t, t.amount.mean())

    assert chunk.schema == t.schema
    assert chunk_expr.isidentical(summary(total=chunk.amount.sum(),
                                          count=chunk.amount.count(),
                                          keepdims=True))

    assert isrecord(agg.dshape.measure)
    assert agg_expr.isidentical(agg.total.sum() / agg.count.sum())
def jsonlines_to_sparksql(ctx, json, dshape=None, name=None, schema=None,
                          samplingRatio=0.25, **kwargs):
    # if we're passing in schema, assume that we know what we're doing and
    # bypass any automated dshape inference
    if dshape is not None and schema is None:
        schema = dshape_to_schema(dshape.measure
                                  if isrecord(dshape.measure) else dshape)
    srdd = ctx.jsonFile(json.path, schema=schema,
                        samplingRatio=samplingRatio)
    register_table(ctx, srdd, name=name)
    return srdd
def create_from_datashape(group, ds, name=None, **kwargs):
    if not isrecord(ds):
        raise ValueError(
            "Trying to create an HDF5 file with non-record datashape failed\n"
            "Perhaps you forgot to specify a datapath?\n"
            "\tdshape: %s\n"
            "If you're using into consider the following change\n"
            "\tBefore: into('myfile.hdf5', data)\n"
            "\tAfter: into('myfile.hdf5::/datapath', data)" % ds)
    if isinstance(ds, DataShape) and len(ds) == 1:
        ds = ds[0]
    for name, sub_ds in ds.dict.items():
        if isrecord(sub_ds):
            g = group.require_group(name)
            create_from_datashape(g, sub_ds, **kwargs)
        else:
            dataset_from_dshape(file=group.file,
                                datapath='/'.join([group.name, name]),
                                ds=sub_ds, **kwargs)
def fields(self):
    if isinstance(self.dshape.measure, Record):
        return self.dshape.measure.names
    elif isinstance(self.dshape.measure, datashape.Map):
        if not isrecord(self.dshape.measure.value):
            raise TypeError("Foreign key must reference a "
                            "Record datashape")
        return self.dshape.measure.value.names
    name = getattr(self, "_name", None)
    if name is not None:
        return [self._name]
    return []
def __init__(self, data, dshape=None, name=None, fields=None, columns=None,
             schema=None, **kwargs):
    if isinstance(data, _strtypes):
        data = resource(data, schema=schema, dshape=dshape, columns=columns,
                        **kwargs)
    if columns:
        warnings.warn("columns kwarg deprecated. Use fields instead",
                      DeprecationWarning)
    if columns and not fields:
        fields = columns
    if schema and dshape:
        raise ValueError("Please specify one of schema= or dshape= keyword"
                         " arguments")
    if schema and not dshape:
        dshape = var * schema
    if dshape and isinstance(dshape, _strtypes):
        dshape = datashape.dshape(dshape)
    if not dshape:
        dshape = discover(data)
    types = None
    if isinstance(dshape.measure, Tuple) and fields:
        types = dshape[1].dshapes
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isscalar(dshape.measure) and fields:
        types = (dshape.measure,) * int(dshape[-2])
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))
    elif isrecord(dshape.measure) and fields:
        types = dshape.measure.types
        schema = Record(list(zip(fields, types)))
        dshape = DataShape(*(dshape.shape + (schema,)))

    self.dshape = datashape.dshape(dshape)
    self.data = data

    if (hasattr(data, 'schema') and
            isinstance(data.schema, (DataShape, str, unicode)) and
            self.schema != data.schema):
        raise TypeError('%s schema %s does not match %s schema %s' %
                        (type(data).__name__, data.schema,
                         type(self).__name__, self.schema))

    self._name = name or next(names)
def test_std():
    (chunk, chunk_expr), (agg, agg_expr) = split(t, t.amount.std())

    assert chunk.schema == t.schema
    assert chunk_expr.isidentical(summary(x=chunk.amount.sum(),
                                          x2=(chunk.amount ** 2).sum(),
                                          n=chunk.amount.count(),
                                          keepdims=True))

    assert isrecord(agg.dshape.measure)
    assert agg_expr.isidentical(sqrt((agg.x2.sum() / (agg.n.sum()) -
                                      (agg.x.sum() / (agg.n.sum())) ** 2)))
def test_var():
    (chunk, chunk_expr), (agg, agg_expr) = split(t, t.amount.var())

    assert chunk.schema == t.schema
    assert chunk_expr.isidentical(summary(x=chunk.amount.sum(),
                                          x2=(chunk.amount ** 2).sum(),
                                          n=chunk.amount.count(),
                                          keepdims=True))

    assert isrecord(agg.dshape.measure)
    assert agg_expr.isidentical(agg.x2.sum() / (agg.n.sum()) -
                                (agg.x.sum() / (agg.n.sum())) ** 2)
def discover_h5py_dataset(d):
    dshape = datashape.from_numpy(d.shape, d.dtype)
    shape, measure = dshape.shape, dshape.measure
    if not isrecord(measure):
        if dshape == datashape.object_:
            args = shape + (datashape.string,)
            return DataShape(*args)
        return dshape
    else:
        records = list(record_dshape_replace(measure, datashape.object_,
                                             datashape.string))
        args = shape + (datashape.Record(records),)
        return DataShape(*args)
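# Hedged demo of discover_h5py_dataset above, assuming h5py and numpy are
# installed; the in-memory 'core' driver keeps the file off disk.
import h5py
import numpy as np

with h5py.File('discover-demo.h5', 'w', driver='core',
               backing_store=False) as f:
    dset = f.create_dataset('x', shape=(3,),
                            dtype=np.dtype([('a', '<i4'), ('b', '<f8')]))
    print(discover_h5py_dataset(dset))   # 3 * {a: int32, b: float64}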
def _csv_to_dataframe(c, dshape=None, chunksize=None, **kwargs):
    header = {False: None,
              True: 0}.get(kwargs.pop("has_header", c.has_header), "infer")

    sep = kwargs.pop("sep",
                     kwargs.pop("delimiter", c.dialect.get("delimiter", ",")))
    encoding = kwargs.pop("encoding", c.encoding)

    if dshape:
        dtypes, parse_dates = dshape_to_pandas(dshape)
        if isrecord(dshape.measure):
            names = kwargs.get("names", dshape.measure.names)
        else:
            names = kwargs.get("names")
    else:
        dtypes = parse_dates = names = None

    usecols = kwargs.pop("usecols", None)
    if parse_dates and usecols:
        parse_dates = [col for col in parse_dates if col in usecols]

    compression = kwargs.pop("compression",
                             {"gz": "gzip", "bz2": "bz2"}.get(ext(c.path)))

    # See read_csv docs for header for reasoning
    if names:
        try:
            found_names = pd.read_csv(c.path, encoding=encoding,
                                      compression=compression, nrows=1)
        except StopIteration:
            found_names = pd.read_csv(c.path, encoding=encoding,
                                      compression=compression)
    if names and header == "infer":
        if [n.strip() for n in found_names] == [n.strip() for n in names]:
            header = 0
        elif (all(re.match(r"^\s*\D\w*\s*$", n) for n in found_names) and
                not all(dt == datashape.string
                        for dt in dshape.measure.types)):
            header = 0
        else:
            header = None

    kwargs = keyfilter(keywords(pd.read_csv).__contains__, kwargs)
    return pd.read_csv(c.path, header=header, sep=sep, encoding=encoding,
                       dtype=dtypes, parse_dates=parse_dates, names=names,
                       compression=compression, chunksize=chunksize,
                       usecols=usecols, **kwargs)
def _split_agg(expr, leaf=None, chunk=None, agg=None, keepdims=True):
    exprs = [(name, split(leaf, val, keepdims=False)[1])
             for name, val in zip(expr.fields, expr.values)]

    d = dict()
    for name, (a, ae) in exprs:
        if isscalar(a.dshape.measure):
            # For simple reductions
            d[name] = ae._subs({a: agg[name]})
        elif isrecord(a.dshape.measure):
            # For reductions like mean/var
            names = ['%s_%s' % (name, field) for field in a.fields]
            namedict = dict(zip(a.fields, names))
            d[name] = ae._subs(toolz.merge({a: agg}, namedict))

    return summary(**d)
def is_nested_record(measure):
    """Predicate for checking whether `measure` is a nested ``Record`` dshape

    Examples
    --------
    >>> from datashape import dshape
    >>> is_nested_record(dshape('{a: int32, b: int32}').measure)
    False
    >>> is_nested_record(dshape('{a: var * ?float64, b: ?string}').measure)
    True
    """
    if not isrecord(measure):
        raise TypeError("Input must be a Record type, got %s of type %r" %
                        (measure, type(measure).__name__))
    return not all(isscalar(t) for t in measure.types)
def fields(self):
    measure = self.dshape.measure
    if isinstance(self.dshape.measure, Option):
        measure = measure.ty
    if isinstance(measure, Record):
        return measure.names
    elif isinstance(measure, datashape.Map):
        if not isrecord(self.dshape.measure.value):
            raise TypeError('Foreign key must reference a '
                            'Record datashape')
        return measure.value.names
    name = getattr(self, '_name', None)
    if name is not None:
        return [self._name]
    return []
def record_dshape_replace(dshape, old, new):
    """Recursively replace all instances of `old` with `new` in the record
    dshape `dshape`.

    Examples
    --------
    >>> from datashape import Record, string, object_, dshape
    >>> ds = DataShape(Record([('a', 'int64'),
    ...                        ('b', 10 * Record([('c', 'object')])),
    ...                        ('d', 'int64')]))
    ...
    >>> Record(list(record_dshape_replace(ds, object_, string)))
    dshape("{a: int64, b: 10 * {c: object}, d: int64}")
    """
    assert isrecord(dshape), 'input dshape must be a record'

    for name, subshape in dshape.measure.fields:
        if subshape == old:
            yield name, new
        else:
            if isrecord(subshape):
                # NOTE: this yields the sub-generator itself rather than a
                # (name, dshape) pair, so nested records are not rebuilt the
                # same way top-level fields are.
                yield record_dshape_replace(subshape, old, new)
            else:
                yield name, subshape
def concrete_type(ds):
    """ A type into which we can safely deposit streaming data

    >>> concrete_type('5 * int').__name__
    'ndarray'
    >>> concrete_type('var * {name: string, amount: int}').__name__
    'DataFrame'
    """
    if isinstance(ds, (str, unicode)):
        ds = dshape(ds)
    if not iscollection(ds):
        return type(ds)
    if ndim(ds) == 1 and isrecord(ds.measure):
        return pd.DataFrame
    if ndim(ds) > 1 or isscalar(ds.measure):
        return np.ndarray
    return list
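# The doctests above cover the ndarray and DataFrame branches; the list
# fallback fires for 1-d collections whose measure is neither scalar nor a
# record, e.g. a tuple measure (example dshape made up):
assert concrete_type('var * (int64, string)') is list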
def sort(child, key=None, ascending=True):
    """ Sort collection

    Parameters
    ----------
    key : string, list of strings, Expr
        Defines by what you want to sort.  Either:
            A single column string, ``t.sort('amount')``
            A list of column strings, ``t.sort(['name', 'amount'])``
            A Table Expression, ``t.sort(-t.amount)``
    ascending : bool
        Determines order of the sort
    """
    if not isrecord(child.dshape.measure):
        key = None
    if isinstance(key, list):
        key = tuple(key)
    return Sort(child, key, ascending)
def sort(child, key=None, ascending=True):
    """ Sort a collection

    Parameters
    ----------
    key : str, list of str, or Expr
        Defines by what you want to sort.

        * A single column string: ``t.sort('amount')``
        * A list of column strings: ``t.sort(['name', 'amount'])``
        * An expression: ``t.sort(-t.amount)``
    ascending : bool, optional
        Determines order of the sort
    """
    if not isrecord(child.dshape.measure):
        key = None
    if isinstance(key, list):
        key = tuple(key)
    return Sort(child, key, ascending)
def dshape_to_numpy(ds):
    """ Convert a datashape to a NumPy dtype

    Parameters
    ----------
    ds : DataShape
        The DataShape instance to convert

    Returns
    -------
    np.dtype

    Examples
    --------
    >>> dshape_to_numpy('int32')
    dtype('int32')
    >>> dshape_to_numpy('?int32')
    dtype('float32')
    >>> dshape_to_numpy('{name: string[5, "ascii"], amount: ?int32}')
    dtype([('name', 'S5'), ('amount', '<f4')])
    >>> dshape_to_numpy('(int32, float32)')
    dtype([('f0', '<i4'), ('f1', '<f4')])
    """
    if isinstance(ds, str):
        ds = dshape(ds)
    if isinstance(ds, DataShape):
        ds = ds.measure
    if isrecord(ds):
        return np.dtype([(str(name), unit_to_dtype(typ))
                         for name, typ in zip(ds.names, ds.types)])
    if isinstance(ds, Tuple):
        return np.dtype([('f%d' % i, unit_to_dtype(typ))
                         for i, typ in enumerate(ds.parameters[0])])
    else:
        return unit_to_dtype(ds)
def coerce_core(result, dshape, odo_kwargs=None):
    """Coerce data to a core data type."""
    if iscoretype(result):
        return result
    elif isscalar(dshape):
        result = coerce_scalar(result, dshape, odo_kwargs=odo_kwargs)
    elif istabular(dshape) and isrecord(dshape.measure):
        result = into(DataFrame, result, **(odo_kwargs or {}))
    elif iscollection(dshape):
        dim = _dimensions(dshape)
        if dim == 1:
            result = into(Series, result, **(odo_kwargs or {}))
        elif dim > 1:
            result = into(np.ndarray, result, **(odo_kwargs or {}))
        else:
            msg = ("Expr with dshape dimensions < 1 should have been handled "
                   "earlier: dim={}")
            raise ValueError(msg.format(str(dim)))
    else:
        msg = "Expr does not evaluate to a core return type"
        raise ValueError(msg)
    return result
def sparksql_dataframe_to_list(df, dshape=None, **kwargs):
    result = df.collect()
    if (dshape is not None and iscollection(dshape) and
            not isrecord(dshape.measure)):
        return list(map(get(0), result))
    return result