Example #1
def test_element():
    x = symbol('x', '5 * 3 * float32')
    assert isscalar(x[1, 2].dshape)
    assert x[1, 2].dshape == dshape('float32')

    assert str(x[1, 2]) == 'x[1, 2]'

    x = symbol('x', '5 * float32')
    assert isscalar(x[3].dshape)
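The check hinges on indexing a symbol with as many indices as it has dimensions, which reduces it to a scalar dshape. A minimal standalone sketch of the same idea using only the datashape package (the shape string is illustrative):

from datashape import dshape, isscalar

ds = dshape('5 * 3 * float32')
assert not isscalar(ds)              # the full 5 * 3 array shape is not scalar
assert isscalar(ds.measure)          # but its element type is
assert str(ds.measure) == 'float32'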
Example #2
def compute_signature(expr):
    """Get the ``numba`` *function signature* corresponding to ``DataShape``

    Examples
    --------
    >>> from blaze import symbol
    >>> s = symbol('s', 'int64')
    >>> t = symbol('t', 'float32')
    >>> d = symbol('d', 'datetime')

    >>> expr = s + t
    >>> compute_signature(expr)
    float64(int64, float32)

    >>> expr = d.truncate(days=1)
    >>> compute_signature(expr)
    datetime64(D)(datetime64(us))

    >>> expr = d.day + 1
    >>> compute_signature(expr)  # only looks at leaf nodes
    int64(datetime64(us))

    Notes
    -----
    * This could potentially be adapted/refactored to deal with
      ``datashape.Function`` types.
    * Cannot handle ``datashape.Record`` types.
    """
    assert datashape.isscalar(expr.schema)
    restype = get_numba_type(expr.schema)
    argtypes = [get_numba_type(e.schema) for e in expr._leaves()]
    return restype(*argtypes)
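The final line relies on numba scalar types being callable: calling a return type with the argument types builds a numba Signature object, which is what the doctest output shows. A standalone illustration, independent of blaze:

from numba import float64, int64, float32

sig = float64(int64, float32)         # restype(*argtypes) -> numba Signature
print(sig)                            # float64(int64, float32)
print(sig.return_type, sig.args)      # float64 (int64, float32)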
Example #3
def fillvalue_for_expr(expr):
    """表达式默认值"""
    fillmissing = _FILLVALUE_DEFAULTS.copy()
    fillmissing.update({
        int64_dtype: 0,            # default value 0
        categorical_dtype: '未知',  # '未知' means 'unknown'; like object_dtype, whose default is None
    })
    ret = {}
    for name, type_ in expr.dshape.measure.fields:
        if name in (AD_FIELD_NAME, SID_FIELD_NAME, TS_FIELD_NAME):
            continue
        if isscalar(type_):
            n_type = datashape_type_to_numpy(type_)
            ret[name] = fillmissing[n_type]
    return ret
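The loop walks the record measure of the expression's dshape, where each field is a (name, type) pair, and keeps only the scalar-typed fields. A hedged sketch of that traversal with datashape alone; the field names are purely illustrative:

from datashape import dshape, isscalar

measure = dshape('var * {sid: int64, asof_date: datetime, value: float32}').measure
for name, type_ in measure.fields:
    print(name, isscalar(type_))      # sid True / asof_date True / value True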
Example #4
def fillvalue_for_expr(expr):
    """为表达式填充空白值
    
    Arguments:
        expr {Expr} -- 要使用的blaze表达式
    """
    fillmissing = _FILLVALUE_DEFAULTS.copy()
    fillmissing.update({
        int64_dtype: -9999,
        categorical_dtype: 'NA',
    })

    ret = {}
    for name, type_ in expr.dshape.measure.fields:
        if isscalar(type_):
            n_type = datashape_type_to_numpy(type_)
            ret[name] = fillmissing[n_type]
    return ret
Example #5
def fillvalue_for_expr(expr):
    """表达式默认值"""
    fillmissing = _FILLVALUE_DEFAULTS.copy()
    fillmissing.update({int64_dtype: -1})
    ret = {}
    # Incoming types: object, float, integer, datetime, boolean
    for name, type_ in expr.dshape.measure.fields:
        if name in (AD_FIELD_NAME, SID_FIELD_NAME, TS_FIELD_NAME):
            continue
        if isscalar(type_):
            n_type = datashape_type_to_numpy(type_)
            if pd.core.dtypes.common.is_float_dtype(n_type):
                ret[name] = fillmissing[float32_dtype]
            elif pd.core.dtypes.common.is_datetime64_any_dtype(n_type):
                ret[name] = fillmissing[datetime64ns_dtype]
            elif pd.core.dtypes.common.is_integer_dtype(n_type):
                ret[name] = fillmissing[int64_dtype]
            elif pd.core.dtypes.common.is_object_dtype(n_type):
                ret[name] = ''  # fillmissing[object_dtype]
            elif pd.core.dtypes.common.is_bool_dtype(n_type):
                ret[name] = fillmissing[bool_dtype]
    return ret
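This variant dispatches on the numpy dtype using pandas' dtype predicates. The same predicates are exposed through the public pd.api.types namespace, which is the more conventional spelling; a brief illustration:

import numpy as np
import pandas as pd

assert pd.api.types.is_float_dtype(np.dtype('float32'))
assert pd.api.types.is_integer_dtype(np.dtype('int64'))
assert pd.api.types.is_datetime64_any_dtype(np.dtype('datetime64[ns]'))
assert pd.api.types.is_bool_dtype(np.dtype('bool'))
assert pd.api.types.is_object_dtype(np.dtype('object'))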
Example #6
def compute_signature(expr):
    """Get the ``numba`` *function signature* corresponding to ``DataShape``

    Examples
    --------
    >>> from blaze import symbol
    >>> s = symbol('s', 'int64')
    >>> t = symbol('t', 'float32')
    >>> from numba import float64, int64, float32
    >>> expr = s + t
    >>> compute_signature(expr) == float64(int64, float32)
    True

    Notes
    -----
    * This could potentially be adapted/refactored to deal with
      ``datashape.Function`` types.
    * Cannot handle ``datashape.Record`` types.
    """
    assert datashape.isscalar(expr.schema)
    restype = get_numba_type(expr.schema)
    argtypes = [get_numba_type(e.schema) for e in expr._leaves()]
    return restype(*argtypes)
Example #7
def from_blaze(expr,
               deltas='auto',
               loader=None,
               resources=None,
               no_deltas_rule=_valid_no_deltas_rules[0]):
    """Create a Pipeline API object from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The blaze expression to use.
    deltas : Expr or 'auto', optional
        The expression to use for the point in time adjustments.
        If the string 'auto' is passed, a deltas expr will be looked up
        by stepping up the expression tree and looking for another field
        with the name of ``expr`` + '_deltas'. If None is passed, no deltas
        will be used.
    loader : BlazeLoader, optional
        The blaze loader to attach this pipeline dataset to. If None is passed,
        the global blaze loader is used.
    resources : dict or any, optional
        The data to execute the blaze expressions against. This is used as the
        scope for ``bz.compute``.
    no_deltas_rule : {'warn', 'raise', 'ignore'}
        What should happen if ``deltas='auto'`` but no deltas can be found.
        'warn' says to raise a warning but continue.
        'raise' says to raise an exception if no deltas can be found.
        'ignore' says take no action and proceed with no deltas.

    Returns
    -------
    pipeline_api_obj : DataSet or BoundColumn
        Either a new dataset or bound column based on the shape of the expr
        passed in. If a table shaped expression is passed, this will return
        a ``DataSet`` that represents the whole table. If an array-like shape
        is passed, a ``BoundColumn`` on the dataset that would be constructed
        from passing the parent is returned.
    """
    deltas = _get_deltas(expr, deltas, no_deltas_rule)
    if deltas is not None:
        invalid_nodes = tuple(filter(is_invalid_deltas_node, expr._subterms()))
        if invalid_nodes:
            raise TypeError(
                'expression with deltas may only contain (%s) nodes,'
                " found: %s" % (
                    ', '.join(map(getname, valid_deltas_node_types)),
                    ', '.join(set(map(compose(getname, type), invalid_nodes))),
                ),
            )

    # Check if this is a single column out of a dataset.
    if bz.ndim(expr) != 1:
        raise TypeError(
            'expression was not tabular or array-like,'
            ' %s dimensions: %d' % (
                'too many' if bz.ndim(expr) > 1 else 'not enough',
                bz.ndim(expr),
            ),
        )

    single_column = None
    if isscalar(expr.dshape.measure):
        # This is a single column. Record which column we are to return
        # but create the entire dataset.
        single_column = rename = expr._name
        field_hit = False
        if not isinstance(expr, traversable_nodes):
            raise TypeError(
                "expression '%s' was array-like but not a simple field of"
                " some larger table" % str(expr),
            )
        while isinstance(expr, traversable_nodes):
            if isinstance(expr, bz.expr.Field):
                if not field_hit:
                    field_hit = True
                else:
                    break
            rename = expr._name
            expr = expr._child
        dataset_expr = expr.relabel({rename: single_column})
    else:
        dataset_expr = expr

    measure = dataset_expr.dshape.measure
    if not isrecord(measure) or AD_FIELD_NAME not in measure.names:
        raise TypeError(
            "The dataset must be a collection of records with at least an"
            " '{ad}' field. Fields provided: '{fields}'\nhint: maybe you need"
            " to use `relabel` to change your field names".format(
                ad=AD_FIELD_NAME,
                fields=measure,
            ),
        )
    _check_datetime_field(AD_FIELD_NAME, measure)
    dataset_expr, deltas = _ensure_timestamp_field(dataset_expr, deltas)

    if deltas is not None and (sorted(deltas.dshape.measure.fields) !=
                               sorted(measure.fields)):
        raise TypeError(
            'baseline measure != deltas measure:\n%s != %s' % (
                measure,
                deltas.dshape.measure,
            ),
        )

    # Ensure that we have a data resource to execute the query against.
    _check_resources('dataset_expr', dataset_expr, resources)
    _check_resources('deltas', deltas, resources)

    # Create or retrieve the Pipeline API dataset.
    ds = new_dataset(dataset_expr, deltas)
    # Register our new dataset with the loader.
    (loader if loader is not None else global_loader)[ds] = ExprData(
        dataset_expr,
        deltas,
        resources,
    )
    if single_column is not None:
        # We were passed a single column, extract and return it.
        return getattr(ds, single_column)
    return ds
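A hedged usage sketch: building the dataset from an in-memory frame and then pulling out one column. The import path, the field names, and the BlazeLoader instance are assumptions and may differ between zipline versions:

import blaze as bz
import pandas as pd
from zipline.pipeline.loaders.blaze import BlazeLoader, from_blaze  # assumed import path

loader = BlazeLoader()
df = pd.DataFrame({
    'sid': [1, 2],
    'asof_date': pd.to_datetime(['2016-01-04', '2016-01-04']),
    'value': [1.5, 2.5],
})
expr = bz.data(df, name='prices',
               dshape='var * {sid: int64, asof_date: datetime, value: float64}')

ds = from_blaze(expr, deltas=None, loader=loader, no_deltas_rule='ignore')
value_column = ds.value              # BoundColumn for the 'value' field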
Example #8
def post_compute(expr, query, scope=None):
    result = query.context.sql(sql_string(query.query))
    if iscollection(expr.dshape) and isscalar(expr.dshape.measure):
        result = result.map(lambda x: x[0])
    return result.collect()
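The intermediate map exists because a Spark SQL query over a single scalar column comes back as one-field Row objects; taking element 0 flattens them before collect(). A toy illustration of the flattening, with plain tuples standing in for Rows:

rows = [(1.5,), (2.5,), (3.5,)]              # pretend each tuple is a one-field Row
flattened = list(map(lambda x: x[0], rows))
assert flattened == [1.5, 2.5, 3.5]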