def test_element():
    x = symbol('x', '5 * 3 * float32')
    assert isscalar(x[1, 2].dshape)
    assert x[1, 2].dshape == dshape('float32')
    assert str(x[1, 2]) == 'x[1, 2]'

    x = symbol('x', '5 * float32')
    assert isscalar(x[3].dshape)
def compute_signature(expr):
    """Get the ``numba`` *function signature* corresponding to ``DataShape``

    Examples
    --------
    >>> from blaze import symbol
    >>> s = symbol('s', 'int64')
    >>> t = symbol('t', 'float32')
    >>> d = symbol('d', 'datetime')
    >>> expr = s + t
    >>> compute_signature(expr)
    float64(int64, float32)
    >>> expr = d.truncate(days=1)
    >>> compute_signature(expr)
    datetime64(D)(datetime64(us))
    >>> expr = d.day + 1
    >>> compute_signature(expr)  # only looks at leaf nodes
    int64(datetime64(us))

    Notes
    -----
    * This could potentially be adapted/refactored to deal with
      ``datashape.Function`` types.
    * Cannot handle ``datashape.Record`` types.
    """
    assert datashape.isscalar(expr.schema)
    restype = get_numba_type(expr.schema)
    argtypes = [get_numba_type(e.schema) for e in expr._leaves()]
    return restype(*argtypes)
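# Illustration (an assumption about how such a signature object is
# consumed, not code from the original module): the float64(int64, float32)
# value returned above is the kind of signature ``numba.vectorize`` accepts.
# A minimal, self-contained sketch:
from numba import vectorize, float64, int64, float32

@vectorize([float64(int64, float32)])
def add_example(a, b):
    # Compiled as a ufunc with the stated argument and return types.
    return a + b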
def fillvalue_for_expr(expr):
    """Default fill values for an expression."""
    fillmissing = _FILLVALUE_DEFAULTS.copy()
    fillmissing.update({
        int64_dtype: 0,             # default value 0
        categorical_dtype: '未知',  # '未知' (= 'unknown'); like object_dtype, whose default is None
    })
    ret = {}
    for name, type_ in expr.dshape.measure.fields:
        if name in (AD_FIELD_NAME, SID_FIELD_NAME, TS_FIELD_NAME):
            continue
        if isscalar(type_):
            n_type = datashape_type_to_numpy(type_)
            ret[name] = fillmissing[n_type]
    return ret
def fillvalue_for_expr(expr):
    """Fill missing values for an expression.

    Arguments:
        expr {Expr} -- the blaze expression to use
    """
    fillmissing = _FILLVALUE_DEFAULTS.copy()
    fillmissing.update({
        int64_dtype: -9999,
        categorical_dtype: 'NA',
    })
    ret = {}
    for name, type_ in expr.dshape.measure.fields:
        if isscalar(type_):
            n_type = datashape_type_to_numpy(type_)
            ret[name] = fillmissing[n_type]
    return ret
def fillvalue_for_expr(expr):
    """Default fill values for an expression."""
    fillmissing = _FILLVALUE_DEFAULTS.copy()
    fillmissing.update({int64_dtype: -1})
    ret = {}
    # Incoming types: object, float, integer, datetime, boolean
    for name, type_ in expr.dshape.measure.fields:
        if name in (AD_FIELD_NAME, SID_FIELD_NAME, TS_FIELD_NAME):
            continue
        if isscalar(type_):
            n_type = datashape_type_to_numpy(type_)
            if pd.core.dtypes.common.is_float_dtype(n_type):
                ret[name] = fillmissing[float32_dtype]
            elif pd.core.dtypes.common.is_datetime64_any_dtype(n_type):
                ret[name] = fillmissing[datetime64ns_dtype]
            elif pd.core.dtypes.common.is_integer_dtype(n_type):
                ret[name] = fillmissing[int64_dtype]
            elif pd.core.dtypes.common.is_object_dtype(n_type):
                ret[name] = ''  # fillmissing[object_dtype]
            elif pd.core.dtypes.common.is_bool_dtype(n_type):
                ret[name] = fillmissing[bool_dtype]
    return ret
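# Usage sketch (an assumption about intent, not code from the original
# module): fillvalue_for_expr returns a {column_name: fill_value} dict,
# which is exactly the form pandas.DataFrame.fillna accepts. The frame and
# fill values below are hypothetical stand-ins for that dict.
import numpy as np
import pandas as pd

fill_values = {'value': -1.0, 'note': 'NA'}  # stand-in for fillvalue_for_expr(expr)
df = pd.DataFrame({'value': [1.5, np.nan], 'note': ['a', None]})
df = df.fillna(fill_values)  # missing cells replaced column by column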
def compute_signature(expr):
    """Get the ``numba`` *function signature* corresponding to ``DataShape``

    Examples
    --------
    >>> from blaze import symbol
    >>> s = symbol('s', 'int64')
    >>> t = symbol('t', 'float32')
    >>> from numba import float64, int64, float32
    >>> expr = s + t
    >>> compute_signature(expr) == float64(int64, float32)
    True

    Notes
    -----
    * This could potentially be adapted/refactored to deal with
      ``datashape.Function`` types.
    * Cannot handle ``datashape.Record`` types.
    """
    assert datashape.isscalar(expr.schema)
    restype = get_numba_type(expr.schema)
    argtypes = [get_numba_type(e.schema) for e in expr._leaves()]
    return restype(*argtypes)
def from_blaze(expr,
               deltas='auto',
               loader=None,
               resources=None,
               no_deltas_rule=_valid_no_deltas_rules[0]):
    """Create a Pipeline API object from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The blaze expression to use.
    deltas : Expr or 'auto', optional
        The expression to use for the point in time adjustments.
        If the string 'auto' is passed, a deltas expr will be looked up
        by stepping up the expression tree and looking for another field
        with the name of ``expr`` + '_deltas'. If None is passed, no
        deltas will be used.
    loader : BlazeLoader, optional
        The blaze loader to attach this pipeline dataset to. If None is
        passed, the global blaze loader is used.
    resources : dict or any, optional
        The data to execute the blaze expressions against. This is used
        as the scope for ``bz.compute``.
    no_deltas_rule : {'warn', 'raise', 'ignore'}, optional
        What should happen if ``deltas='auto'`` but no deltas can be found.
        'warn' says to raise a warning but continue.
        'raise' says to raise an exception if no deltas can be found.
        'ignore' says take no action and proceed with no deltas.

    Returns
    -------
    pipeline_api_obj : DataSet or BoundColumn
        Either a new dataset or bound column based on the shape of the expr
        passed in. If a table shaped expression is passed, this will return
        a ``DataSet`` that represents the whole table. If an array-like
        shape is passed, a ``BoundColumn`` on the dataset that would be
        constructed from passing the parent is returned.
    """
    deltas = _get_deltas(expr, deltas, no_deltas_rule)
    if deltas is not None:
        invalid_nodes = tuple(filter(is_invalid_deltas_node, expr._subterms()))
        if invalid_nodes:
            raise TypeError(
                'expression with deltas may only contain (%s) nodes,'
                " found: %s" % (
                    ', '.join(map(getname, valid_deltas_node_types)),
                    ', '.join(set(map(compose(getname, type), invalid_nodes))),
                ),
            )

    # Check if this is a single column out of a dataset.
    if bz.ndim(expr) != 1:
        raise TypeError(
            'expression was not tabular or array-like,'
            ' %s dimensions: %d' % (
                'too many' if bz.ndim(expr) > 1 else 'not enough',
                bz.ndim(expr),
            ),
        )

    single_column = None
    if isscalar(expr.dshape.measure):
        # This is a single column. Record which column we are to return
        # but create the entire dataset.
        single_column = rename = expr._name
        field_hit = False
        if not isinstance(expr, traversable_nodes):
            raise TypeError(
                "expression '%s' was array-like but not a simple field of"
                " some larger table" % str(expr),
            )
        while isinstance(expr, traversable_nodes):
            if isinstance(expr, bz.expr.Field):
                if not field_hit:
                    field_hit = True
                else:
                    break
            rename = expr._name
            expr = expr._child
        dataset_expr = expr.relabel({rename: single_column})
    else:
        dataset_expr = expr

    measure = dataset_expr.dshape.measure
    if not isrecord(measure) or AD_FIELD_NAME not in measure.names:
        raise TypeError(
            "The dataset must be a collection of records with at least an"
            " '{ad}' field. Fields provided: '{fields}'\nhint: maybe you need"
            " to use `relabel` to change your field names".format(
                ad=AD_FIELD_NAME,
                fields=measure,
            ),
        )
    _check_datetime_field(AD_FIELD_NAME, measure)
    dataset_expr, deltas = _ensure_timestamp_field(dataset_expr, deltas)

    if deltas is not None and (sorted(deltas.dshape.measure.fields) !=
                               sorted(measure.fields)):
        raise TypeError(
            'baseline measure != deltas measure:\n%s != %s' % (
                measure,
                deltas.dshape.measure,
            ),
        )

    # Ensure that we have a data resource to execute the query against.
    _check_resources('dataset_expr', dataset_expr, resources)
    _check_resources('deltas', deltas, resources)

    # Create or retrieve the Pipeline API dataset.
    ds = new_dataset(dataset_expr, deltas)
    # Register our new dataset with the loader.
    (loader if loader is not None else global_loader)[ds] = ExprData(
        dataset_expr,
        deltas,
        resources,
    )
    if single_column is not None:
        # We were passed a single column, extract and return it.
        return getattr(ds, single_column)
    return ds
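# Usage sketch for from_blaze (an assumption about typical use, not code
# from the original module). The CSV path, dshape, and field names are
# hypothetical; the file must exist and contain at least an asof_date column.
import blaze as bz

events = bz.data(
    'events.csv',
    dshape='var * {sid: int64, asof_date: datetime, value: float64}',
)
# A table-shaped expression yields a DataSet for the whole table...
EventDataSet = from_blaze(events, deltas=None)
# ...while a single field yields a BoundColumn on that dataset.
value_column = from_blaze(events.value, deltas=None)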
def post_compute(expr, query, scope=None):
    # Execute the generated SQL string against the query's SQL context.
    result = query.context.sql(sql_string(query.query))
    if iscollection(expr.dshape) and isscalar(expr.dshape.measure):
        # A collection of scalars comes back as single-element rows; unwrap them.
        result = result.map(lambda x: x[0])
    return result.collect()