Beispiel #1
0
def test_base():
    """ Check every expression against a reference result for each source

    The reference ``model`` is computed from ``base``.  Each entry of
    ``sources`` that is not excluded (by object identity) for the expression
    must reproduce that reference.
    """
    for expr, exclusions in expressions.items():
        reference = expr._subs({t: Data(base, t.dshape)})
        if iscollection(expr.dshape):
            # Collections go through an ndarray before landing in a DataFrame
            model = into(DataFrame, into(np.ndarray, reference))
        else:
            model = compute(reference)
        print('\nexpr: %s\n' % expr)

        for source in sources:
            # Exclusion is decided by identity, never by equality
            if any(source is skipped for skipped in exclusions):
                continue
            print('%s <- %s' % (typename(model), typename(source)))
            bound = expr._subs({t: Data(source)})

            if iscollection(expr.dshape):
                result = into(type(model), bound)
                if isscalar(expr.dshape.measure):
                    # Compared as sets, so ordering is ignored
                    assert set(into(list, result)) == set(into(list, model))
                else:
                    assert df_eq(result, model)
                continue

            if isrecord(expr.dshape):
                result = compute(bound)
                assert into(tuple, result) == into(tuple, model)
                continue

            result = compute(bound)
            try:
                # Unwrap results that expose a ``scalar`` method
                result = result.scalar()
            except AttributeError:
                pass
            assert result == model
Beispiel #2
0
def concrete_head(expr, n=10):
    """ Return head of computed expression

    Parameters
    ----------
    expr
        Expression to evaluate; it must contain data resources.
    n : int
        Number of leading rows wanted.  ``n + 1`` rows are requested from
        the expression.

    Returns
    -------
    ``compute(expr)`` when the expression is not a collection, otherwise a
    DataFrame built from ``expr.head(n + 1)``.  When the measure is not a
    record the single column is named after ``expr._name``.

    Raises
    ------
    ValueError
        If the expression contains no data resources.
    """
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    head = expr.head(n + 1)

    # Past this point the expression is known to be a collection, so the
    # former non-collection branch and the trailing ``compute(head)`` path
    # could never execute and have been removed.
    if isrecord(expr.dshape.measure):
        return odo(head, DataFrame)

    df = odo(head, DataFrame)
    df.columns = [expr._name]
    return df
Beispiel #3
0
def test_comprehensive():
    """ Compare the Spark backends against pandas over many expressions

    Every key of ``expressions`` is an expression on ``t``; its value lists
    the backends (``rdd`` and/or ``srdd``) skipped for that expression.
    Each remaining backend must agree with the pandas DataFrame result.
    """
    L = [[100, 1, 'Alice'],
         [200, 2, 'Bob'],
         [300, 3, 'Charlie'],
         [400, 4, 'Dan'],
         [500, 5, 'Edith']]

    df = DataFrame(L, columns=['amount', 'id', 'name'])

    rdd = into(sc, df)
    srdd = into(sqlContext, df)

    t = TableSymbol('t', '{amount: int64, id: int64, name: string}')

    expressions = {
            t: [],
            t['id']: [],
            t.id.max(): [],
            t.amount.sum(): [],
            t.amount + 1: [],
            sin(t.amount): [srdd], # sparksql without hiveql doesn't support math
            exp(t.amount): [srdd], # sparksql without hiveql doesn't support math
            t.amount > 50: [],
            t[t.amount > 50]: [],
            t.sort('name'): [],
            t.sort('name', ascending=False): [],
            t.head(3): [],
            t.name.distinct(): [],
            t[t.amount > 50]['name']: [],
            t.id.map(lambda x: x + 1, '{id: int}'): [srdd], # no udfs yet
            by(t.name, t.amount.sum()): [],
            by(t.id, t.id.count()): [],
            by(t[['id', 'amount']], t.id.count()): [],
            by(t[['id', 'amount']], (t.amount + 1).sum()): [],
            by(t[['id', 'amount']], t.name.nunique()): [rdd, srdd],
            by(t.id, t.amount.count()): [],
            by(t.id, t.id.nunique()): [rdd, srdd],
            # by(t, t.count()): [],
            # by(t.id, t.count()): [df],
            t[['amount', 'id']]: [],
            t[['id', 'amount']]: [],
            }

    for e, exclusions in expressions.items():
        # Both backends go through the same check.  Previously the scalar
        # branch for ``srdd`` computed on ``rdd``, so ``srdd`` was never
        # exercised for non-collection expressions.
        for backend in (rdd, srdd):
            if backend in exclusions:
                continue
            if iscollection(e.dshape):
                # Order-insensitive comparison for collections
                assert (into(set, compute(e, backend)) ==
                        into(set, compute(e, df)))
            else:
                assert compute(e, backend) == compute(e, df)
Beispiel #4
0
def concrete_head(expr, n=10):
    """ Return head of computed expression

    Parameters
    ----------
    expr
        Expression to evaluate; it must contain data resources.
    n : int
        Number of leading rows wanted.  ``n + 1`` rows are requested from
        the expression.

    Returns
    -------
    ``compute(expr)`` for non-collections.  For collections, a DataFrame
    whose columns are ``expr.fields``: empty when the computed head has no
    rows, otherwise filled from the computed head.

    Raises
    ------
    ValueError
        If the expression contains no data resources.
    """
    if not expr.resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    head = expr.head(n + 1)
    result = compute(head)

    if not len(result):
        return DataFrame(columns=expr.fields)

    # The expression is already known to be a collection here, so the former
    # nested re-check (and its implicit ``None`` fall-through) is dropped.
    return into(DataFrame(columns=expr.fields), result)
Beispiel #5
0
def summary(keepdims=False, **kwargs):
    """ Build a ``Summary`` from named reduction expressions

    Parameters
    ----------
    keepdims : bool
        Forwarded unchanged to ``Summary``.
    **kwargs
        Mapping of output name to expression.  Entries are sorted by name.

    Returns
    -------
    ``Summary(child, names, values, keepdims=keepdims)`` where ``child`` is
    the common subexpression of the values.

    Raises
    ------
    ValueError
        If, for a single non-collection value, walking up the inputs hits a
        node that does not have exactly one ``Expr`` input.
    """
    items = sorted(kwargs.items(), key=first)
    names = tuple(map(first, items))
    values = tuple(map(toolz.second, items))
    child = common_subexpression(*values)

    if len(kwargs) == 1 and not iscollection(child.dshape):
        # With a single reduction the common subexpression is the reduction
        # itself; climb through its inputs until a collection is reached.
        while not iscollection(child.dshape):
            children = [i for i in child._inputs if isinstance(i, Expr)]
            if len(children) != 1:
                raise ValueError(
                    "Cannot find a unique collection input for %s: "
                    "expected exactly one expression input, found %d"
                    % (child, len(children)))
            child = children[0]

    return Summary(child, names, values, keepdims=keepdims)
Beispiel #6
0
def compserver(serial_format):
    """ Evaluate the expression carried by the current request

    ``serial_format`` is handed to ``_get_format`` to look up the
    serializer.  On success the serialized ``datashape``/``data`` mapping is
    returned; every failure path returns a ``(message, status)`` pair.
    """
    try:
        codec = _get_format(serial_format)
    except KeyError:
        return 'Unsupported serialization format', 404

    try:
        body = codec.loads(request.data)
    except ValueError:
        # 400: Bad Request
        return "Bad data.  Got %s " % request.data, 400

    namespace = body.get('namespace', dict())
    source = _get_data()
    namespace[':leaf'] = symbol('leaf', discover(source))

    expr = from_tree(body['expr'], namespace=namespace)
    leaves = expr._leaves()
    assert len(leaves) == 1
    leaf = leaves[0]

    try:
        computed = compute(expr, {leaf: source})

        if iscollection(expr.dshape):
            computed = odo(computed, list)
        elif isscalar(expr.dshape):
            computed = coerce_scalar(computed, str(expr.dshape))
    except NotImplementedError as err:
        # 501: Not Implemented
        return "Computation not supported:\n%s" % err, 501
    except Exception as err:
        # 500: Internal Server Error
        return "Computation failed with message:\n%s" % err, 500

    response = {'datashape': str(expr.dshape), 'data': computed}
    return codec.dumps(response)
Beispiel #7
0
def compserver(payload, serial):
    """ Evaluate the expression described by an already-decoded payload

    ``payload`` supplies ``expr`` and optionally ``namespace``,
    ``compute_kwargs`` and ``odo_kwargs``.  ``serial`` provides ``dumps`` for
    the response.  Failures during computation are returned as
    ``(message, status)`` pairs.
    """
    namespace = payload.get('namespace', dict())
    extra_compute = payload.get('compute_kwargs') or {}
    extra_odo = payload.get('odo_kwargs') or {}
    source = _get_data()
    namespace[':leaf'] = symbol('leaf', discover(source))

    expr = from_tree(payload['expr'], namespace=namespace)
    leaves = expr._leaves()
    assert len(leaves) == 1
    leaf = leaves[0]

    try:
        computed = compute(expr, {leaf: source}, **extra_compute)

        if iscollection(expr.dshape):
            computed = odo(computed, list, **extra_odo)
        elif isscalar(expr.dshape):
            computed = coerce_scalar(computed, str(expr.dshape))
    except NotImplementedError as err:
        # 501: Not Implemented
        return "Computation not supported:\n%s" % err, 501
    except Exception as err:
        # 500: Internal Server Error
        message = "Computation failed with message:\n%s: %s" % (
            type(err).__name__, err)
        return message, 500

    response = {
        'datashape': pprint(expr.dshape, width=0),
        'data': computed,
        'names': expr.fields,
    }
    return serial.dumps(response)
Beispiel #8
0
def default_materialize(data, dshape, odo_kwargs):
    """ Convert computed data according to its datashape

    Collections are converted to ``list`` through ``odo``, scalars go
    through ``coerce_scalar``, and anything else is returned untouched.
    """
    if iscollection(dshape):
        materialized = odo(data, list, **odo_kwargs)
    elif isscalar(dshape):
        materialized = coerce_scalar(data, str(dshape), odo_kwargs)
    else:
        materialized = data
    return materialized
Beispiel #9
0
def comp(datasets, name):
    """ Evaluate a JSON-encoded expression against one named dataset

    Returns a ``jsonify`` response holding ``name``, ``datashape`` and
    ``data``.  A wrong content type, undecodable JSON or an unknown dataset
    name each yield a ``(message, 404)`` pair.
    """
    if request.headers['content-type'] != 'application/json':
        return "Expected JSON data", 404

    try:
        body = json.loads(request.data)
    except ValueError:
        return "Bad JSON.  Got %s " % request.data, 404

    try:
        dataset = datasets[name]
    except KeyError:
        return "Dataset %s not found" % name, 404

    leaf = Symbol(name, discover(dataset))
    ns = body.get('namespace', dict())
    ns[name] = leaf

    expr = from_tree(body['expr'], namespace=ns)

    computed = compute(expr, dataset)
    if iscollection(expr.dshape):
        computed = into(list, computed)

    response = {'name': name,
                'datashape': str(expr.dshape),
                'data': computed}
    return jsonify(response)
Beispiel #10
0
def compserver(dataset):
    """ Evaluate a JSON-encoded expression against ``dataset``

    Returns a JSON string with ``datashape`` and ``data``.  A wrong content
    type or undecodable JSON yields ``(message, 404)``; a failing
    computation yields ``(message, 500)``.
    """
    if request.headers['content-type'] != 'application/json':
        return "Expected JSON data", 404

    try:
        body = json.loads(request.data.decode('utf-8'))
    except ValueError:
        return "Bad JSON.  Got %s " % request.data, 404

    namespace = body.get('namespace', dict())
    namespace[':leaf'] = symbol('leaf', discover(dataset))

    expr = from_tree(body['expr'], namespace=namespace)
    leaves = expr._leaves()
    assert len(leaves) == 1
    leaf = leaves[0]

    try:
        computed = compute(expr, {leaf: dataset})
    except Exception as err:
        return "Computation failed with message:\n%s" % err, 500

    if iscollection(expr.dshape):
        computed = into(list, computed)

    response = {'datashape': str(expr.dshape), 'data': computed}
    return json.dumps(response, default=json_dumps)
Beispiel #11
0
def compserver():
    """ Evaluate the JSON-encoded expression of the current request

    The dataset comes from ``_get_data``.  Returns a JSON string with
    ``datashape`` and ``data``; each failure path returns a
    ``(message, status)`` pair.
    """
    if request.headers['content-type'] != 'application/json':
        # 415: Unsupported Media Type
        return "Expected JSON data", 415

    try:
        body = json.loads(request.data.decode('utf-8'))
    except ValueError:
        # 400: Bad Request
        return "Bad JSON.  Got %s " % request.data, 400

    namespace = body.get('namespace', dict())
    source = _get_data()
    namespace[':leaf'] = symbol('leaf', discover(source))

    expr = from_tree(body['expr'], namespace=namespace)
    leaves = expr._leaves()
    assert len(leaves) == 1
    leaf = leaves[0]

    try:
        computed = compute(expr, {leaf: source})

        if iscollection(expr.dshape):
            computed = odo(computed, list)
        elif isscalar(expr.dshape):
            computed = coerce_scalar(computed, str(expr.dshape))
    except NotImplementedError as err:
        # 501: Not Implemented
        return "Computation not supported:\n%s" % err, 501
    except Exception as err:
        # 500: Internal Server Error
        return "Computation failed with message:\n%s" % err, 500

    response = {'datashape': str(expr.dshape), 'data': computed}
    return json.dumps(response, default=json_dumps)
Beispiel #12
0
def default_materialize(data, dshape, odo_kwargs):
    """ Materialize ``data`` based on ``dshape``

    A collection becomes a ``list`` via ``odo``; a scalar is passed to
    ``coerce_scalar``; any other datashape leaves ``data`` as it is.
    """
    if not iscollection(dshape):
        if isscalar(dshape):
            return coerce_scalar(data, str(dshape), odo_kwargs)
        return data
    return odo(data, list, **odo_kwargs)
Beispiel #13
0
def concrete_head(expr, n=10):
    """ Return head of computed expression

    Parameters
    ----------
    expr
        Expression to evaluate; it must contain data resources.
    n : int
        Number of leading rows wanted.  ``n + 1`` rows are requested from
        the expression.

    Returns
    -------
    ``compute(expr)`` when the expression is not a collection, otherwise a
    DataFrame built from ``expr.head(n + 1)``.  When the measure is not a
    record the single column is named after ``expr._name``.

    Raises
    ------
    ValueError
        If the expression contains no data resources.
    """
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    head = expr.head(n + 1)

    # Only collections reach this point, so the former
    # ``odo(head, object)`` branch was unreachable and has been removed.
    df = odo(head, DataFrame)
    if not isrecord(expr.dshape.measure):
        df.columns = [expr._name]
    return df
Beispiel #14
0
def fastmsgpack_materialize(data, dshape, odo_kwargs):
    """ Convert computed data into a pandas container based on ``dshape``

    Tabular shapes become a ``pd.DataFrame``, other collections a
    ``pd.Series``, scalars go through ``coerce_scalar``, and anything else
    is returned unchanged.
    """
    if istabular(dshape):
        target = pd.DataFrame
    elif iscollection(dshape):
        target = pd.Series
    elif isscalar(dshape):
        return coerce_scalar(data, str(dshape), odo_kwargs)
    else:
        return data
    return odo(data, target, **odo_kwargs)
Beispiel #15
0
def test_isnan():
    """ ``isnan`` round-trips through ``str``/``eval`` and yields booleans """
    # The local must stay named ``t``: ``eval`` resolves it from this scope.
    t = symbol('t', 'var * {name: string, amount: real, timestamp: ?date}')
    nan_mask = t.amount.isnan()

    for expr in (nan_mask, ~nan_mask):
        rebuilt = eval(str(expr))
        assert rebuilt.isidentical(expr)

    mask_shape = nan_mask.dshape
    assert iscollection(mask_shape)
    assert 'bool' in str(mask_shape)
Beispiel #16
0
def fastmsgpack_materialize(data, dshape, odo_kwargs):
    """ Materialize ``data`` for the fastmsgpack format

    Dispatches on ``dshape``: tabular -> ``pd.DataFrame``, other collection
    -> ``pd.Series``, scalar -> ``coerce_scalar``; otherwise ``data`` is
    handed back as received.
    """
    if istabular(dshape):
        materialized = odo(data, pd.DataFrame, **odo_kwargs)
    elif iscollection(dshape):
        materialized = odo(data, pd.Series, **odo_kwargs)
    elif isscalar(dshape):
        materialized = coerce_scalar(data, str(dshape), odo_kwargs)
    else:
        materialized = data
    return materialized
Beispiel #17
0
def test_chunks_compute():
    """ Computing on ``cL`` must give the same answer as computing on ``L`` """
    for e in (s, s + 1, s.max(), s.mean() + 1, s.head()):
        chunked = compute(e, {s: cL})
        plain = compute(e, {s: L})
        if iscollection(e.dshape):
            # Normalize both sides to lists before comparing
            chunked, plain = into(list, chunked), into(list, plain)
        assert chunked == plain
Beispiel #18
0
def test_isnan():
    """ ``isnan`` round-trips through ``str``/``eval`` and yields booleans """
    # Kept in scope for ``eval`` below - presumably the printed form of the
    # expression can reference ``isnan`` by name (TODO confirm).
    from blaze import isnan
    # The local must stay named ``t``: ``eval`` resolves it from this scope.
    t = TableSymbol('t', '{name: string, amount: int, timestamp: ?date}')
    nan_mask = t.amount.isnan()

    for expr in (nan_mask, ~nan_mask):
        rebuilt = eval(str(expr))
        assert rebuilt.isidentical(expr)

    mask_shape = nan_mask.dshape
    assert iscollection(mask_shape)
    assert 'bool' in str(mask_shape)
Beispiel #19
0
def test_chunks_compute():
    """ Results from the ``cL`` source must match results from ``L`` """
    candidates = (s, s + 1, s.max(), s.mean() + 1, s.head())
    for e in candidates:
        got = compute(e, {s: cL})
        want = compute(e, {s: L})
        if iscollection(e.dshape):
            # Collections are compared after conversion to plain lists
            got = into(list, got)
            want = into(list, want)
        assert got == want
Beispiel #20
0
def test_isnan():
    """ ``isnan`` round-trips through ``str``/``eval`` and yields booleans """
    # Kept in scope for ``eval`` below - presumably the printed form of the
    # expression can reference ``isnan`` by name (TODO confirm).
    from blaze import isnan

    # The local must stay named ``t``: ``eval`` resolves it from this scope.
    t = TableSymbol("t", "{name: string, amount: real, timestamp: ?date}")
    nan_mask = t.amount.isnan()

    for expr in (nan_mask, ~nan_mask):
        rebuilt = eval(str(expr))
        assert rebuilt.isidentical(expr)

    mask_shape = nan_mask.dshape
    assert iscollection(mask_shape)
    assert "bool" in str(mask_shape)
Beispiel #21
0
def summary(keepdims=False, axis=None, **kwargs):
    """ Build a ``Summary`` from named reduction expressions

    ``kwargs`` maps output names to expressions and is sorted by name.
    ``axis`` may be ``None`` (every dimension of the child), a set, list or
    tuple of axes, or a single axis; it is always passed on as a tuple.
    """
    ordered = sorted(kwargs.items(), key=toolz.first)
    names = tuple(name for name, _ in ordered)
    values = tuple(value for _, value in ordered)
    child = common_subexpression(*values)

    if len(kwargs) == 1:
        # Climb through the inputs until a collection-valued node is found
        while not iscollection(child.dshape):
            parents = [node for node in child._inputs
                       if isinstance(node, Expr)]
            child = (parents[0] if len(parents) == 1
                     else common_subexpression(*parents))

    if axis is None:
        axis = tuple(range(ndim(child)))
    elif isinstance(axis, (set, list)):
        axis = tuple(axis)
    elif not isinstance(axis, tuple):
        axis = (axis,)

    return Summary(child, names, values, keepdims=keepdims, axis=axis)
Beispiel #22
0
def summary(keepdims=False, axis=None, **kwargs):
    """ Combine named reductions into a single ``Summary`` expression

    Keyword arguments map result names to expressions; they are ordered by
    name.  ``axis`` is normalized to a tuple: ``None`` expands to all
    dimensions of the child, sets and lists are converted, and a lone value
    is wrapped.
    """
    pairs = sorted(kwargs.items(), key=toolz.first)
    names = tuple(pair[0] for pair in pairs)
    values = tuple(pair[1] for pair in pairs)
    child = common_subexpression(*values)

    single_reduction = len(kwargs) == 1
    while single_reduction and not iscollection(child.dshape):
        # Step to the input(s) of the current node
        inputs = [node for node in child._inputs if isinstance(node, Expr)]
        if len(inputs) == 1:
            child = inputs[0]
        else:
            child = common_subexpression(*inputs)

    if axis is None:
        axis = tuple(range(ndim(child)))
    elif isinstance(axis, (set, list)):
        axis = tuple(axis)
    elif not isinstance(axis, tuple):
        axis = (axis, )

    return Summary(child, names, values, keepdims=keepdims, axis=axis)
Beispiel #23
0
def test_multiple_csv_files():
    """ A glob over two CSV files computes like the equivalent tuples """
    files = {
        "mult1.csv": "name,val\nAlice,1\nBob,2",
        "mult2.csv": "name,val\nAlice,3\nCharlie,4",
    }
    rows = [("Alice", 1), ("Bob", 2), ("Alice", 3), ("Charlie", 4)]

    with filetexts(files):
        r = resource("mult*.csv")
        s = symbol("s", discover(r))

        candidates = (s, s.name, s.name.nunique(), s.name.count_values(),
                      s.val.mean())
        for e in candidates:
            from_files = compute(e, {s: r})
            from_rows = compute(e, {s: rows})
            if iscollection(e.dshape):
                # Collections are compared without regard to order
                from_files = into(set, from_files)
                from_rows = into(set, from_rows)
            assert from_files == from_rows
Beispiel #24
0
def compserver():
    """ Evaluate the JSON payload of the current request

    Delegates to ``_compserver`` and returns a JSON string with
    ``datashape``, ``names`` and ``data``.  Raises ``ServerException`` with
    status 404 when the request carries no JSON.
    """
    payload = request.json
    if not payload:
        raise ServerException('Expected JSON data', status_code=404)

    expr, computed = _compserver(payload)
    if iscollection(expr.dshape):
        computed = into(list, computed)

    response = {
        'datashape': str(expr.dshape),
        'names': expr.fields,
        'data': computed,
    }
    return json.dumps(response, default=json_dumps)
Beispiel #25
0
def test_summary():
    """ ``summary`` has the expected dshape, repr and child expression """
    # ``t`` keeps its name because ``eval`` below resolves it from this scope
    t = TableSymbol("t", "{id: int32, name: string, amount: int32}")
    total = t.amount.sum()
    s = summary(total=total, num=t.id.count())

    assert s.dshape == dshape("{num: int32, total: int64}")
    assert hash(s)

    text = str(s)
    assert eval(text).isidentical(s)

    for fragment in ("summary(", "total=", "num=", str(total)):
        assert fragment in text

    single = summary(total=total)
    assert not single._child.isidentical(total)
    assert iscollection(summary(total=total + 1)._child.dshape)
Beispiel #26
0
def test_summary():
    """ ``summary`` has the expected dshape, repr and child expression """
    # ``t`` keeps its name because ``eval`` below resolves it from this scope
    t = symbol('t', 'var * {id: int32, name: string, amount: int32}')
    total = t.amount.sum()
    s = summary(total=total, num=t.id.count())

    assert s.dshape == dshape('{num: int32, total: int64}')
    assert hash(s)

    rendered = str(s)
    assert eval(rendered).isidentical(s)

    expected_fragments = ('summary(', 'total=', 'num=', str(total))
    for fragment in expected_fragments:
        assert fragment in rendered

    lone = summary(total=total)
    assert not lone._child.isidentical(total)
    assert iscollection(summary(total=total + 1)._child.dshape)
Beispiel #27
0
 def __getitem__(self, key):
     """ Index the expression by field name, predicate, projection or slice

     Dispatch on ``key``:

     * a string naming one of ``self.fields`` -> that field
     * a collection-valued ``Expr`` -> ``selection(self, key)``
     * a list of strings -> projection onto those fields
     * a tuple of ints/slices/None/lists/arrays, or one such value ->
       ``sliceit``

     Raises
     ------
     ValueError
         If a list of names is not a subset of ``self.fields``, or the key
         matches none of the cases above.
     """
     index_types = (int, slice, type(None), list, np.ndarray)

     if isinstance(key, _strtypes) and key in self.fields:
         return self._get_field(key)
     elif isinstance(key, Expr) and iscollection(key.dshape):
         return selection(self, key)
     elif (isinstance(key, list) and
           builtins.all(isinstance(k, _strtypes) for k in key)):
         if set(key).issubset(self.fields):
             return self._project(key)
         else:
             raise ValueError("Names %s not consistent with known names %s" % (key, self.fields))
     # ``builtins.all`` is used here as well, matching the list branch, so
     # a module-level name ``all`` cannot change the meaning of this test.
     elif (isinstance(key, tuple) and
           builtins.all(isinstance(k, index_types) for k in key)):
         return sliceit(self, key)
     elif isinstance(key, index_types):
         return sliceit(self, (key,))
     raise ValueError("Not understood %s[%s]" % (self, key))
Beispiel #28
0
def test_multiple_csv_files():
    """ A glob over two CSV files computes like the equivalent tuples """
    files = {
        'mult1.csv': 'name,val\nAlice,1\nBob,2',
        'mult2.csv': 'name,val\nAlice,3\nCharlie,4',
    }
    rows = [('Alice', 1), ('Bob', 2), ('Alice', 3), ('Charlie', 4)]

    with filetexts(files):
        r = data('mult*.csv')
        s = symbol('s', discover(r))

        candidates = (s, s.name, s.name.nunique(), s.name.count_values(),
                      s.val.mean())
        for e in candidates:
            from_files = compute(e, {s: r})
            from_rows = compute(e, {s: rows})
            if iscollection(e.dshape):
                # Collections are compared without regard to order
                from_files = into(set, from_files)
                from_rows = into(set, from_rows)
            assert from_files == from_rows
Beispiel #29
0
def test_multiple_csv_files():
    """ Computing over ``mult*.csv`` agrees with computing over raw tuples """
    contents = {
        'mult1.csv': 'name,val\nAlice,1\nBob,2',
        'mult2.csv': 'name,val\nAlice,3\nCharlie,4',
    }
    records = [('Alice', 1), ('Bob', 2), ('Alice', 3), ('Charlie', 4)]

    with filetexts(contents):
        r = resource('mult*.csv')
        s = symbol('s', discover(r))

        exprs = [s, s.name, s.name.nunique(), s.name.count_values(),
                 s.val.mean()]
        for e in exprs:
            a = compute(e, {s: r})
            b = compute(e, {s: records})
            if iscollection(e.dshape):
                # Order is irrelevant for collections, so compare as sets
                a = into(set, a)
                b = into(set, b)
            assert a == b
Beispiel #30
0
def compserver():
    """ Evaluate the expression of the current request

    The serializer is chosen from the ``content-type`` header via
    ``mimetype_regex`` and ``_get_format``.  On success the serialized
    ``datashape``/``data``/``names`` mapping is returned; every failure path
    returns a ``(message, status)`` pair.
    """
    content_type = request.headers['content-type']
    matched = mimetype_regex.match(content_type)

    if matched is None:
        return 'Unsupported serialization format %s' % content_type, 415

    format_name = matched.groups()[0]
    try:
        codec = _get_format(format_name)
    except KeyError:
        return "Unsupported serialization format '%s'" % format_name, 415

    try:
        body = codec.loads(request.data)
    except ValueError:
        # 400: Bad Request
        return "Bad data.  Got %s " % request.data, 400

    namespace = body.get('namespace', dict())
    source = _get_data()
    namespace[':leaf'] = symbol('leaf', discover(source))

    expr = from_tree(body['expr'], namespace=namespace)
    leaves = expr._leaves()
    assert len(leaves) == 1
    leaf = leaves[0]

    try:
        computed = compute(expr, {leaf: source})

        if iscollection(expr.dshape):
            computed = odo(computed, list)
        elif isscalar(expr.dshape):
            computed = coerce_scalar(computed, str(expr.dshape))
    except NotImplementedError as err:
        # 501: Not Implemented
        return "Computation not supported:\n%s" % err, 501
    except Exception as err:
        # 500: Internal Server Error
        return "Computation failed with message:\n%s" % err, 500

    response = {
        'datashape': str(expr.dshape),
        'data': computed,
        'names': expr.fields,
    }
    return codec.dumps(response)
Beispiel #31
0
def compserver():
    """ Compute the request's expression and serialize the answer

    The ``content-type`` header selects the serialization format.  Failures
    are reported as ``(message, status)`` pairs: 415 for an unknown format,
    400 for an undecodable body, 501/500 for computation errors.
    """
    mimetype = request.headers['content-type']
    match = mimetype_regex.match(mimetype)
    if match is None:
        return 'Unsupported serialization format %s' % mimetype, 415

    requested = match.groups()[0]
    try:
        serializer = _get_format(requested)
    except KeyError:
        return "Unsupported serialization format '%s'" % requested, 415

    try:
        decoded = serializer.loads(request.data)
    except ValueError:
        # 400: Bad Request
        return "Bad data.  Got %s " % request.data, 400

    ns = decoded.get('namespace', dict())
    dataset = _get_data()
    ns[':leaf'] = symbol('leaf', discover(dataset))

    expr = from_tree(decoded['expr'], namespace=ns)
    found = expr._leaves()
    assert len(found) == 1
    leaf = found[0]

    try:
        answer = compute(expr, {leaf: dataset})

        if iscollection(expr.dshape):
            answer = odo(answer, list)
        elif isscalar(expr.dshape):
            answer = coerce_scalar(answer, str(expr.dshape))
    except NotImplementedError as exc:
        # 501: Not Implemented
        return "Computation not supported:\n%s" % exc, 501
    except Exception as exc:
        # 500: Internal Server Error
        return "Computation failed with message:\n%s" % exc, 500

    return serializer.dumps(dict([
        ('datashape', str(expr.dshape)),
        ('data', answer),
        ('names', expr.fields),
    ]))
Beispiel #32
0
def concrete_type(ds):
    """ A type into which we can safely deposit streaming data

    ``ds`` may be a datashape or its string form.  One-dimensional record
    collections map to ``pd.DataFrame``; collections with more than one
    dimension or a scalar measure map to ``np.ndarray``; other collections
    map to ``list``.  Non-collections yield ``type(ds)``.

    >>> concrete_type('5 * int').__name__
    'ndarray'
    >>> concrete_type('var * {name: string, amount: int}').__name__
    'DataFrame'
    """
    if isinstance(ds, (str, unicode)):
        ds = dshape(ds)
    if not iscollection(ds):
        return type(ds)

    dims = ndim(ds)
    if dims == 1 and isrecord(ds.measure):
        chosen = pd.DataFrame
    elif dims > 1 or isscalar(ds.measure):
        chosen = np.ndarray
    else:
        chosen = list
    return chosen
Beispiel #33
0
def concrete_type(ds):
    """ A type into which we can safely deposit streaming data

    Accepts a datashape or a string describing one.  The result is
    ``pd.DataFrame`` for one-dimensional record collections, ``np.ndarray``
    for multi-dimensional or scalar-measure collections, ``list`` for any
    other collection, and ``type(ds)`` for non-collections.

    >>> concrete_type('5 * int').__name__
    'ndarray'
    >>> concrete_type('var * {name: string, amount: int}').__name__
    'DataFrame'
    """
    shape = dshape(ds) if isinstance(ds, (str, unicode)) else ds

    if not iscollection(shape):
        return type(shape)

    rank = ndim(shape)
    if rank == 1 and isrecord(shape.measure):
        return pd.DataFrame
    if rank > 1 or isscalar(shape.measure):
        return np.ndarray
    return list
Beispiel #34
0
def concrete_head(expr, n=10):
    """ Return head of computed expression

    Non-collections are computed directly.  Collections are evaluated on
    ``expr.head(n + 1)`` and returned as a DataFrame: empty (with
    ``expr.fields`` as columns) when no rows come back, otherwise converted
    with the expression's dshape.  When the measure is not a record the
    single column is named after ``expr._name``.

    Raises ``ValueError`` when the expression has no data resources.
    """
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    computed = compute(expr.head(n + 1))
    if not len(computed):
        return DataFrame(columns=expr.fields)

    frame = into(DataFrame, computed, dshape=expr.dshape)
    if not isrecord(expr.dshape.measure):
        frame.columns = [expr._name]
    return frame
Beispiel #35
0
 def __getitem__(self, key):
     """ Index the expression by field name, predicate, projection or slice

     Dispatch on ``key``:

     * a string naming one of ``self.fields`` -> that field
     * a collection-valued ``Expr`` -> ``selection(self, key)``
     * a list of strings -> projection onto those fields
     * a tuple of ints/slices, or a single int/slice -> ``Slice``

     Raises
     ------
     ValueError
         If a list of names is not a subset of ``self.fields``, or the key
         matches none of the cases above.
     """
     if isinstance(key, _strtypes) and key in self.fields:
         return self._get_field(key)
     elif isinstance(key, Expr) and iscollection(key.dshape):
         return selection(self, key)
     elif (isinstance(key, list)
             and builtins.all(isinstance(k, _strtypes) for k in key)):
         if set(key).issubset(self.fields):
             return self._project(key)
         else:
             raise ValueError('Names %s not consistent with known names %s'
                     % (key, self.fields))
     # ``builtins.all`` is used here as well, matching the list branch, so
     # a module-level name ``all`` cannot change the meaning of this test.
     elif (isinstance(key, tuple)
             and builtins.all(isinstance(k, (int, slice)) for k in key)):
         return Slice(self, key)
     elif isinstance(key, (slice, int)):
         return Slice(self, (key,))
     raise ValueError("Not understood %s[%s]" % (self, key))
Beispiel #36
0
def coerce_core(result, dshape, odo_kwargs=None):
    """Coerce data to a core data type.

    Values that already are a core type are returned as they are.
    Otherwise the target depends on ``dshape``: scalars go through
    ``coerce_scalar``, tabular record shapes become a ``DataFrame``,
    one-dimensional collections a ``Series`` and higher-dimensional ones an
    ``np.ndarray``.  ``ValueError`` is raised for anything else.
    """
    if iscoretype(result):
        return result
    if isscalar(dshape):
        return coerce_scalar(result, dshape, odo_kwargs=odo_kwargs)

    extra = odo_kwargs or {}
    if istabular(dshape) and isrecord(dshape.measure):
        return into(DataFrame, result, **extra)
    if not iscollection(dshape):
        raise ValueError("Expr does not evaluate to a core return type")

    dim = _dimensions(dshape)
    if dim == 1:
        return into(Series, result, **extra)
    if dim > 1:
        return into(np.ndarray, result, **extra)
    raise ValueError("Expr with dshape dimensions < 1 should have been handled earlier: dim={}".format(str(dim)))
Beispiel #37
0
def test_base():
    """ Check every expression against a reference result for each source

    The reference ``model`` is computed from ``base``; every non-excluded
    entry of ``sources`` must reproduce it.
    """
    for expr, exclusions in expressions.items():
        model = compute(expr._subs({t: Table(base, t.schema)}))
        print('\nexpr: %s\n' % expr)

        for source in sources:
            # Exclusion is decided by identity, never by equality
            if any(source is skipped for skipped in exclusions):
                continue
            print('%s <- %s' % (typename(model), typename(source)))
            bound = expr._subs({t: Table(source)})

            if iscollection(expr.dshape):
                result = into(model, bound)
                if isscalar(expr.dshape.measure):
                    # Compared as sets, so ordering is ignored
                    assert set(into([], result)) == set(into([], model))
                else:
                    assert df_eq(result, model)
                continue

            result = compute(bound)
            if isrecord(expr.dshape):
                assert into(tuple, result) == into(tuple, model)
            else:
                assert result == model
Beispiel #38
0
def test_base():
    """ Every source must agree with the result computed from ``base``

    Sources listed (by identity) in an expression's exclusions are skipped.
    """
    for expr, exclusions in expressions.items():
        model = compute(expr._subs({t: Data(base, t.dshape)}))
        print('\nexpr: %s\n' % expr)

        excluded_ids = [id(item) for item in exclusions]
        for source in sources:
            if id(source) in excluded_ids:
                continue
            print('%s <- %s' % (typename(model), typename(source)))
            substituted = expr._subs({t: Data(source)})

            if iscollection(expr.dshape):
                result = into(model, substituted)
                if isscalar(expr.dshape.measure):
                    # Scalar measures are compared order-insensitively
                    assert set(into([], result)) == set(into([], model))
                else:
                    assert df_eq(result, model)
                continue

            result = compute(substituted)
            if isrecord(expr.dshape):
                assert into(tuple, result) == into(tuple, model)
            else:
                assert result == model
Beispiel #39
0
def TableSymbol(name, dshape):
    """ A Symbol for Tabular data

    This is a leaf in the expression tree

    ``dshape`` may be a datashape or its string form.  When it does not
    describe a collection it is treated as the shape of a single row and
    prefixed with a ``var`` dimension.

    Examples
    --------

    >>> accounts = TableSymbol('accounts',
    ...                        '{name: string, amount: int, id: int}')
    >>> accounts.amount + 1
    accounts.amount + 1

    We define a TableSymbol with a name like ``accounts`` and the datashape of
    a single row, called a schema.
    """
    shape = datashape.dshape(dshape) if isinstance(dshape, _strtypes) else dshape
    if iscollection(shape):
        return symbol(name, shape)
    # A bare row schema: make it a variable-length collection of rows
    return symbol(name, datashape.var * shape)
Beispiel #40
0
def compserver(datasets):
    """ Evaluate a JSON-encoded expression against named datasets

    Parameters
    ----------
    datasets
        Mapping of dataset name to data.  Each name becomes a ``Symbol``
        available to the expression.

    Returns
    -------
    A ``jsonify`` response with ``datashape`` and ``data``, or a
    ``(message, 404)`` pair for a wrong content type or undecodable JSON.
    """
    if request.headers['content-type'] != 'application/json':
        return ("Expected JSON data", 404)
    try:
        data = json.loads(request.data)
    except ValueError:
        return ("Bad JSON.  Got %s " % request.data, 404)

    # Discover each dataset once and reuse the resulting Symbol for both the
    # parsing namespace and the compute namespace; previously ``discover``
    # ran twice per dataset on every request.
    symbols = dict(
        (name, Symbol(name, discover(datasets[name]))) for name in datasets)

    tree_ns = dict(symbols)
    if 'namespace' in data:
        tree_ns = merge(tree_ns, data['namespace'])

    expr = from_tree(data['expr'], namespace=tree_ns)

    compute_ns = dict((symbols[name], datasets[name]) for name in datasets)
    result = compute(expr, compute_ns)
    if iscollection(expr.dshape):
        result = into(list, result)

    return jsonify({'datashape': str(expr.dshape), 'data': result})
Beispiel #41
0
def broadcast_collect(expr, Broadcastable=Broadcastable,
                            WantToBroadcast=WantToBroadcast):
    """ Collapse expression down using Broadcast - Tabular cases only

    Expressions of type Broadcastables are swallowed into Broadcast
    operations

    The rewrite is applied at this node when it is a collection-valued
    ``WantToBroadcast`` instance, and then recursively to every input.

    >>> t = Symbol('t', 'var * {x: int, y: int, z: int, when: datetime}')
    >>> expr = (t.x + 2*t.y).distinct()

    >>> broadcast_collect(expr)
    distinct(Broadcast(_children=(t,), _scalars=(t,), _scalar_expr=t.x + (2 * t.y)))
    """
    wants_broadcast = isinstance(expr, WantToBroadcast)
    if wants_broadcast and iscollection(expr.dshape):
        # Leaves are ordered by their string form
        ordered_leaves = sorted(leaves_of_type(Broadcastable, expr), key=str)
        expr = broadcast(expr, ordered_leaves)

    # Recurse down
    inputs = expr._inputs
    rewritten = dict(
        (child, broadcast_collect(child, Broadcastable, WantToBroadcast))
        for child in inputs)
    return expr._subs(rewritten)
Beispiel #42
0
def coerce_core(result, dshape, odo_kwargs=None):
    """Coerce data to a core data type.

    A ``result`` that already is a core type is handed back unchanged.
    Scalars are passed to ``coerce_scalar``; tabular record shapes are
    converted to ``DataFrame``; other collections become a ``Series`` (one
    dimension) or an ``np.ndarray`` (more than one).  Any other datashape
    raises ``ValueError``.
    """
    if iscoretype(result):
        return result
    if isscalar(dshape):
        return coerce_scalar(result, dshape, odo_kwargs=odo_kwargs)

    conversion_kwargs = odo_kwargs or {}
    if istabular(dshape) and isrecord(dshape.measure):
        return into(DataFrame, result, **conversion_kwargs)

    if not iscollection(dshape):
        msg = "Expr does not evaluate to a core return type"
        raise ValueError(msg)

    dim = _dimensions(dshape)
    if dim == 1:
        return into(Series, result, **conversion_kwargs)
    if dim > 1:
        return into(np.ndarray, result, **conversion_kwargs)

    msg = "Expr with dshape dimensions < 1 should have been handled earlier: dim={}"
    raise ValueError(msg.format(str(dim)))
Beispiel #43
0
def broadcast_collect(expr,
                      Broadcastable=Broadcastable,
                      WantToBroadcast=WantToBroadcast):
    """ Collapse expression down using Broadcast - Tabular cases only

    Expressions of type Broadcastables are swallowed into Broadcast
    operations

    A collection-valued node of type ``WantToBroadcast`` is rewritten with
    ``broadcast`` over its ``Broadcastable`` leaves; the function then
    recurses into every input of the resulting expression.

    >>> t = symbol('t', 'var * {x: int, y: int, z: int, when: datetime}')
    >>> expr = (t.x + 2*t.y).distinct()

    >>> broadcast_collect(expr)
    distinct(Broadcast(_children=(t,), _scalars=(t,), _scalar_expr=t.x + (2 * t.y)))
    """
    if isinstance(expr, WantToBroadcast):
        if iscollection(expr.dshape):
            found = leaves_of_type(Broadcastable, expr)
            # Leaves are ordered by their string form
            expr = broadcast(expr, sorted(found, key=str))

    # Recurse down
    substitutions = {}
    for child in expr._inputs:
        substitutions[child] = broadcast_collect(
            child, Broadcastable, WantToBroadcast)
    return expr._subs(substitutions)
Beispiel #44
0
def compserver(datasets):
    """Evaluate a JSON-encoded expression against ``datasets``.

    The request body must be JSON with an ``'expr'`` entry (an expression
    tree understood by ``from_tree``) and, optionally, a ``'namespace'``
    entry that is merged over the per-dataset symbols before the tree is
    decoded.

    Parameters
    ----------
    datasets : mapping
        Maps dataset names to the data they refer to.

    Returns
    -------
    Either a ``(message, 404)`` tuple when the content type is not
    ``application/json`` or the body is not valid JSON, or the result of
    ``jsonify`` on a dict with the expression's datashape (as a string)
    and the computed data.
    """
    # NOTE(review): 404 is an unusual status for a wrong content type or a
    # malformed body (415 / 400 would be conventional).  It is kept because
    # existing clients may rely on it -- confirm before changing.
    if request.headers['content-type'] != 'application/json':
        return ("Expected JSON data", 404)
    try:
        data = json.loads(request.data)
    except ValueError:
        return ("Bad JSON.  Got %s " % request.data, 404)

    # Discover every dataset exactly once and reuse the resulting symbol
    # for both the parsing namespace and the compute namespace, so the
    # leaves of the decoded expression are the very keys used by compute.
    symbols = dict((name, Symbol(name, discover(datasets[name])))
                   for name in datasets)

    tree_ns = dict(symbols)
    if 'namespace' in data:
        tree_ns = merge(tree_ns, data['namespace'])

    expr = from_tree(data['expr'], namespace=tree_ns)

    compute_ns = dict((symbols[name], datasets[name]) for name in datasets)
    result = compute(expr, compute_ns)
    if iscollection(expr.dshape):
        # Collections are converted to a plain list before serialising.
        result = into(list, result)

    return jsonify({'datashape': str(expr.dshape),
                    'data': result})
Beispiel #45
0
    See np.linalg.norm
    """
    if ord is None or ord == 'fro':
        ord = 2
    if ord == inf:
        return max(abs(expr), axis=axis, keepdims=keepdims)
    elif ord == -inf:
        return min(abs(expr), axis=axis, keepdims=keepdims)
    elif ord == 1:
        return sum(abs(expr), axis=axis, keepdims=keepdims)
    elif ord % 2 == 0:
        return sum(expr ** ord, axis=axis, keepdims=keepdims) ** (1.0 / ord)
    return sum(abs(expr) ** ord, axis=axis, keepdims=keepdims) ** (1.0 / ord)


# Each entry pairs a predicate over a datashape with the set of functions
# registered for datashapes satisfying it.  In this module ``min``, ``max``
# and ``sum`` are called with ``axis=``/``keepdims=`` (see ``vnorm``), so
# they are presumably the module's own reductions rather than the builtins.
dshape_method_list.extend([
    (iscollection, {count, nelements}),
    (
        lambda ds: (
            iscollection(ds) and (
                isstring(ds) or
                isnumeric(ds) or
                isboolean(ds) or
                isdatelike(ds) or
                isinstance(ds, TimeDelta)
            )
        ),
        {min, max},
    ),
    (lambda ds: len(ds.shape) == 1, {nrows, nunique}),
    (lambda ds: iscollection(ds) and isboolean(ds), {any, all}),
    (
        lambda ds: iscollection(ds) and (isnumeric(ds) or isboolean(ds)),
        {mean, sum, std, var, vnorm},
    ),
])

method_properties.update([nrows])
Beispiel #46
0
def valid_reducer(expr):
    """Tell whether ``expr`` has a dshape acceptable for a reducer.

    The result is truthy only when the dshape is not a collection and its
    measure is either scalar or a record that is not nested.
    """
    ds = expr.dshape
    measure = ds.measure
    if iscollection(ds):
        return False
    return isscalar(measure) or (
        isrecord(measure) and not is_nested_record(measure))
Beispiel #47
0
 def schema(self):
     """Return the first subshape of ``self.dshape``.

     Raises ``TypeError`` when ``self.dshape`` is not a collection.
     """
     if not iscollection(self.dshape):
         raise TypeError("Non-tabular datashape, %s" % self.dshape)
     return self.dshape.subshape[0]
Beispiel #48
0
def sparksql_dataframe_to_list(df, dshape=None, **kwargs):
    """Collect ``df`` and return the collected rows.

    When ``dshape`` is given, is a collection and its measure is not a
    record, ``get(0)`` is mapped over the rows and the result is returned
    as a list instead of the rows themselves.
    """
    rows = df.collect()
    if dshape is None or not iscollection(dshape) or isrecord(dshape.measure):
        return rows
    return list(map(get(0), rows))
Beispiel #49
0
def compute_up(t, seq, **kwargs):
    """Apply the row function of ``t`` to ``seq``.

    If the child of ``t`` has a collection dshape, the function is applied
    through ``deepmap`` at a depth of ``ndim(child(t))``; otherwise it is
    called directly on ``seq``.
    """
    func = rowfunc(t)
    if not iscollection(t._child.dshape):
        return func(seq)
    depth = ndim(child(t))
    return deepmap(func, seq, n=depth)
Beispiel #50
0
def concrete_type(ds):
    """Pick a concrete type able to hold data of the given datashape.

    Parameters
    ----------
    ds : DataShape or str
        Strings are parsed with ``dshape`` first.

    Returns
    -------
    type : type
        The concrete type corresponding to the DataShape `ds`

    Notes
    -----
    * Scalar measures map to ``int``, ``float``, ``bool`` or ``complex``
      when they belong to the matching family; other scalars fall back to
      the type of their numpy dtype.
    * Option types are not handled specially: the ``ty`` attribute of the
      measure is used when present.
    * Non-collection, non-scalar datashapes yield ``type(ds)``.
    * One-dimensional collections yield a pandas ``DataFrame`` (record
      measure) or ``Series``; higher-dimensional ones yield ``np.ndarray``.

    Examples
    --------
    >>> concrete_type('5 * int')
    <class 'pandas.core.series.Series'>
    >>> concrete_type('var * {name: string, amount: int}')
    <class 'pandas.core.frame.DataFrame'>
    >>> concrete_type('float64')
    <... 'float'>
    >>> concrete_type('float32')
    <... 'float'>
    >>> concrete_type('int64')
    <... 'int'>
    >>> concrete_type('int32')
    <... 'int'>
    >>> concrete_type('uint8')
    <... 'int'>
    >>> concrete_type('bool')
    <... 'bool'>
    >>> concrete_type('complex[float64]')
    <... 'complex'>
    >>> concrete_type('complex[float32]')
    <... 'complex'>
    >>> concrete_type('?int64')
    <... 'int'>
    """
    if isinstance(ds, (str, unicode)):
        ds = dshape(ds)

    collection = iscollection(ds)

    if not collection and isscalar(ds.measure):
        # Look through an option wrapper (if any) to the underlying type.
        unit = getattr(ds.measure, 'ty', ds.measure)
        python_types = (
            (integral, int),
            (floating, float),
            (boolean, bool),
            (complexes, complex),
        )
        for family, python_type in python_types:
            if unit in family.types:
                return python_type
        return ds.measure.to_numpy_dtype().type

    if not collection:
        return type(ds)

    rank = ndim(ds)
    if rank == 1:
        if isrecord(ds.measure):
            return pd.DataFrame
        return pd.Series
    if rank > 1:
        return np.ndarray
    return list
Beispiel #51
0
def test_keepdims_equals_true_doesnt_mess_up_agg_shape():
    """The aggregate produced by ``split`` must have a collection dshape.

    NOTE(review): the test name mentions ``keepdims`` being true while the
    call passes ``keepdims=False`` -- confirm which one is intended.
    """
    x = symbol('x', '10 * int')
    total = x.sum()
    (_chunk, _chunk_expr), (agg, _agg_expr) = split(x, total, keepdims=False)

    assert iscollection(agg.dshape)
Beispiel #52
0
def valid_grouper(expr):
    """Tell whether ``expr`` has a dshape acceptable for a grouper.

    The result is truthy only when the dshape is a collection and its
    measure (or the measure's ``key`` attribute, when it has one) is scalar,
    or the measure is a record that is not nested.
    """
    ds = expr.dshape
    measure = ds.measure
    collection = iscollection(ds)
    if not collection:
        return collection
    key = getattr(measure, 'key', measure)
    return isscalar(key) or (
        isrecord(measure) and not is_nested_record(measure))
Beispiel #53
0
    return tuple(s)


def ndim(expr):
    """ Number of dimensions of expression

    Computed as the length of ``shape(expr)``.

    >>> symbol('s', '3 * var * int32').ndim
    2
    """
    return len(shape(expr))


# Each entry pairs a predicate with the set of functions registered for
# datashapes (first table) or schemas (second table) satisfying it.
dshape_method_list.extend([
    (lambda ds: True, {apply}),
    (iscollection, {shape, ndim}),
    (
        lambda ds: iscollection(ds) and isscalar(ds.measure),
        {coerce},
    ),
])

schema_method_list.extend([
    (isscalar, {label, relabel, coerce}),
    (isrecord, {relabel}),
    (lambda ds: isinstance(ds, Option), {coalesce}),
])

method_properties.update([shape, ndim])


@dispatch(Expr)
def discover(expr):
    """Return the ``dshape`` attribute of ``expr`` (``Expr`` overload)."""
    return expr.dshape
Beispiel #54
0
def valid_reducer(expr):
    """Report whether the dshape of ``expr`` qualifies it as a reducer.

    A collection dshape never qualifies.  Otherwise the measure must be
    scalar, or a record that is not nested.
    """
    shape_ = expr.dshape
    measure = shape_.measure
    if iscollection(shape_):
        return False
    return (isscalar(measure) or
            (isrecord(measure) and not is_nested_record(measure)))
Beispiel #55
0
    return broadcast(USub, a)


def _invert(a):
    """Return ``broadcast(Not, a)``.

    NOTE(review): presumably backs the unary ``~`` operator -- confirm
    where this function is registered.
    """
    return broadcast(Not, a)


def isnan(expr):
    """Return ``broadcast(math.isnan, expr)``.

    NOTE(review): ``math`` is whatever module is bound to that name in this
    file; it may not be the standard library one -- confirm.
    """
    return broadcast(math.isnan, expr)


from .expressions import dshape_method_list


def isreal(ds):
    """Return True when ``ds`` is a ``Unit`` whose string form contains 'float'.

    A ``DataShape`` of length one is replaced by its single element first,
    and an ``Option`` by its ``ty`` attribute.
    """
    unwrapped = ds
    if isinstance(unwrapped, DataShape) and len(unwrapped) == 1:
        unwrapped = unwrapped[0]
    if isinstance(unwrapped, Option):
        unwrapped = unwrapped.ty
    if not isinstance(unwrapped, Unit):
        return False
    return 'float' in str(unwrapped)


# Each entry pairs a predicate over a datashape with the set of functions
# registered for datashapes satisfying it.
dshape_method_list.extend([
    (
        lambda ds: iscollection(ds) and isscalar(ds.measure),
        {
            _eq, _ne, _lt, _le, _gt, _ge,
            _add, _radd, _mul, _rmul, _div, _rdiv,
            _floordiv, _rfloordiv, _sub, _rsub,
            _pow, _rpow, _mod, _rmod,
            _or, _ror, _and, _rand,
            _neg, _invert,
        },
    ),
    (
        lambda ds: iscollection(ds) and isreal(ds.measure),
        {isnan},
    ),
])
Beispiel #56
0
    """ Vector norm

    See np.linalg.norm
    """
    if ord is None or ord == 'fro':
        ord = 2
    if ord == inf:
        return max(abs(expr), axis=axis, keepdims=keepdims)
    elif ord == -inf:
        return min(abs(expr), axis=axis, keepdims=keepdims)
    elif ord == 1:
        return sum(abs(expr), axis=axis, keepdims=keepdims)
    elif ord % 2 == 0:
        return sum(expr**ord, axis=axis, keepdims=keepdims)**(1.0 / ord)
    return sum(abs(expr)**ord, axis=axis, keepdims=keepdims)**(1.0 / ord)


# Each entry pairs a predicate over a datashape with the set of functions
# registered for datashapes satisfying it.  In this module ``min``, ``max``
# and ``sum`` are called with ``axis=``/``keepdims=`` (see the vector norm
# code above), so they are presumably the module's own reductions rather
# than the builtins.
dshape_method_list.extend([
    (iscollection, {count, nelements}),
    (
        lambda ds: (
            iscollection(ds) and (
                isstring(ds) or
                isnumeric(ds) or
                isboolean(ds) or
                isdatelike(ds)
            )
        ),
        {min, max},
    ),
    (lambda ds: len(ds.shape) == 1, {nrows, nunique}),
    (lambda ds: iscollection(ds) and isboolean(ds), {any, all}),
    (
        lambda ds: iscollection(ds) and (isnumeric(ds) or isboolean(ds)),
        {mean, sum, std, var, vnorm},
    ),
])

method_properties.update([nrows])
Beispiel #57
0
        if field not in to_remove:
            new_fields.append(field)
        else:
            to_remove.remove(field)

    if to_remove:
        raise ValueError(
            'fields %r were not in the fields of expr (%r)' %
            (sorted(to_remove), expr.fields), )
    return expr[new_fields]


# Each entry pairs a predicate with the set of functions registered for
# datashapes (first table) or schemas (second table) satisfying it.
# NOTE(review): presumably these tables decide which functions are exposed
# on an expression -- confirm where dshape_method_list and
# schema_method_list are consumed.
dshape_method_list.extend([
    (lambda ds: True, {apply}),  # predicate is always true
    (iscollection, {shape, ndim}),
    (lambda ds: iscollection(ds) and isscalar(ds.measure), {coerce}),
    (istabular, {drop_field}),
])

schema_method_list.extend([
    (isscalar, {label, relabel, coerce}),
    (isrecord, {relabel}),
    (lambda ds: isinstance(ds, Option), {coalesce}),
])

# NOTE(review): presumably marks shape and ndim as attribute-style
# (property) accessors; the ndim doctest elsewhere uses ``.ndim`` without a
# call -- confirm at the definition of method_properties.
method_properties.update([shape, ndim])


@dispatch(Expr)
def discover(expr):
    """Return the ``dshape`` attribute of ``expr`` (``Expr`` overload)."""
    return expr.dshape