Example 1
def test_window_bind_to_table(t):
    w = ibis.window(group_by="g", order_by=ibis.desc("f"))

    w2 = w.bind(t)
    expected = ibis.window(group_by=t.g, order_by=ibis.desc(t.f))

    assert_equal(w2, expected)
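
These bind tests only assume a table exposing a string column g and a sortable column f. A minimal sketch of such a fixture (the column names come from the test; the types are assumptions):

import ibis

t = ibis.table([("g", "string"), ("f", "double")], name="t")
w = ibis.window(group_by="g", order_by=ibis.desc("f"))
w2 = w.bind(t)  # string references are now resolved against t's schema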
Example 2
    def test_analytic_functions(self):
        t = self.alltypes.limit(1000)

        g = t.group_by("string_col").order_by("double_col")
        f = t.float_col

        exprs = [
            f.lag(),
            f.lead(),
            f.rank(),
            f.dense_rank(),
            f.first(),
            f.last(),
            f.first().over(ibis.window(preceding=10)),
            f.first().over(ibis.window(following=10)),
            ibis.row_number(),
            f.cumsum(),
            f.cummean(),
            f.cummin(),
            f.cummax(),
            # boolean cumulative reductions
            (f == 0).cumany(),
            (f == 0).cumall(),
            f.sum(),
            f.mean(),
            f.min(),
            f.max(),
        ]

        proj_exprs = [expr.name("e%d" % i) for i, expr in enumerate(exprs)]

        proj_table = g.mutate(proj_exprs)
        proj_table.execute()
Example 3
    def test_window_bind_to_table(self):
        w = ibis.window(group_by='g', order_by=ibis.desc('f'))

        w2 = w.bind(self.t)
        expected = ibis.window(group_by=self.t.g,
                               order_by=ibis.desc(self.t.f))

        assert_equal(w2, expected)
Example 4
def test_window_bind_to_table(alltypes):
    t = alltypes
    w = ibis.window(group_by='g', order_by=ibis.desc('f'))

    w2 = w.bind(alltypes)
    expected = ibis.window(group_by=t.g, order_by=ibis.desc(t.f))

    assert_equal(w2, expected)
Example 5
def test_window_function_bind(alltypes):
    # GH #532
    t = alltypes

    w = ibis.window(group_by=lambda x: x.g, order_by=lambda x: x.f)

    expr = t.f.lag().over(w)

    actual_window = expr.op().args[1]
    expected = ibis.window(group_by=t.g, order_by=t.f)
    assert_equal(actual_window, expected)
Example 6
    def test_over_auto_bind(self):
        # GH #542
        t = self.t

        w = ibis.window(group_by='g', order_by='f')

        expr = t.f.lag().over(w)

        actual_window = expr.op().args[1]
        expected = ibis.window(group_by=t.g, order_by=t.f)
        assert_equal(actual_window, expected)
Example 7
def test_over_auto_bind(t):
    # GH #542

    w = ibis.window(group_by="g", order_by="f")

    expr = t.f.lag().over(w)

    actual_window = expr.op().args[1]
    expected = ibis.window(group_by=t.g, order_by=t.f)
    assert_equal(actual_window, expected)
Example 8
def test_combine_windows(t):
    w1 = ibis.window(group_by=t.g, order_by=t.f)
    w2 = ibis.window(preceding=5, following=5)

    w3 = w1.combine(w2)
    expected = ibis.window(group_by=t.g, order_by=t.f, preceding=5, following=5)
    assert_equal(w3, expected)

    w4 = ibis.window(group_by=t.a, order_by=t.e)
    w5 = w3.combine(w4)
    expected = ibis.window(group_by=[t.g, t.a], order_by=[t.f, t.e], preceding=5, following=5)
    assert_equal(w5, expected)
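
The fixture table here must expose at least the four columns used as grouping and ordering keys. A sketch of a compatible schema (names from the test, types assumed):

import ibis

t = ibis.table(
    [("g", "string"), ("f", "double"), ("a", "int64"), ("e", "float64")],
    name="t",
)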
Example 9
def test_unsupported_aggregate_functions(alltypes, column, op):
    t = alltypes
    w = ibis.window(order_by=t.d)
    expr = getattr(t[column], op)()
    proj = t.projection([expr.over(w).name('foo')])
    with pytest.raises(com.TranslationError):
        to_sql(proj)
Example 10
    def test_partitioned_window(self):
        t = self.alltypes
        df = t.execute()
        window = ibis.window(
            group_by=t.string_col,
            order_by=t.timestamp_col,
            preceding=6,
            following=0,
        )

        def roller(func):
            def rolled(df):
                torder = df.sort_values('timestamp_col')
                rolling = torder.double_col.rolling(7, min_periods=0)
                return getattr(rolling, func)()
            return rolled

        for func in 'mean sum min max'.split():
            f = getattr(t.double_col, func)
            expr = f().over(window).name('double_col')
            result = t.projection([expr]).execute().double_col
            expected = df.groupby('string_col').apply(
                roller(func)
            ).reset_index(drop=True)
            tm.assert_series_equal(result, expected)
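
Note the arithmetic behind the check: a frame with preceding=6, following=0 spans the current row plus the six rows before it, seven rows in total, which is why the pandas equivalent is rolling(7, min_periods=0).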
Example 11
    def test_window_frame_specs(self):
        t = self.con.table('alltypes')

        ex_template = """\
SELECT sum(d) OVER (ORDER BY f {0}) AS `foo`
FROM alltypes"""

        cases = [
            (window(preceding=0),
             'range between current row and unbounded following'),

            (window(following=0),
             'range between unbounded preceding and current row'),

            (window(preceding=5),
             'rows between 5 preceding and unbounded following'),
            (window(preceding=5, following=0),
             'rows between 5 preceding and current row'),
            (window(preceding=5, following=2),
             'rows between 5 preceding and 2 following'),
            (window(following=2),
             'rows between unbounded preceding and 2 following'),
            (window(following=2, preceding=0),
             'rows between current row and 2 following'),
            (window(preceding=5),
             'rows between 5 preceding and unbounded following'),
            (window(following=[5, 10]),
             'rows between 5 following and 10 following'),
            (window(preceding=[10, 5]),
             'rows between 10 preceding and 5 preceding'),

            # cumulative windows
            (ibis.cumulative_window(),
             'range between unbounded preceding and current row'),

            # trailing windows
            (ibis.trailing_window(10),
             'rows between 10 preceding and current row'),
        ]

        for w, frame in cases:
            w2 = w.order_by(t.f)
            expr = t.projection([t.d.sum().over(w2).name('foo')])
            expected = ex_template.format(frame.upper())
            self._check_sql(expr, expected)
Example 12
def test_row_number_properly_composes_with_arithmetic(con):
    t = con.table('alltypes')
    w = ibis.window(order_by=t.f)
    expr = t.mutate(new=ibis.row_number().over(w) / 2)

    expected = """\
SELECT *, (row_number() OVER (ORDER BY `f`) - 1) / 2 AS `new`
FROM alltypes"""
    assert_sql_equal(expr, expected)
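
The `- 1` in the expected SQL is deliberate: ibis.row_number() is zero-based, while SQL's row_number() is one-based, so the compiler subtracts one before any further arithmetic is applied (the same pattern appears in Example 38).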
Example 13
    def test_window_with_arithmetic(self):
        t = self.alltypes
        w = ibis.window(order_by=t.timestamp_col)
        expr = t.mutate(new_col=ibis.row_number().over(w) / 2)

        df = t.projection(['timestamp_col']).sort_by('timestamp_col').execute()
        expected = df.assign(new_col=[x / 2. for x in range(len(df))])
        result = expr['timestamp_col', 'new_col'].execute()
        tm.assert_frame_equal(result, expected)
Example 14
def test_row_number_properly_composes_with_arithmetic(alltypes):
    t = alltypes
    w = ibis.window(order_by=t.f)
    expr = t.mutate(new=ibis.row_number().over(w) / 2)

    expected = """\
SELECT *, (row_number() OVER (ORDER BY `f`) - 1) / 2 AS `new`
FROM ibis_testing.`alltypes`"""
    assert_sql_equal(expr, expected)
Example 15
def test_nested_analytic_function(con):
    t = con.table('alltypes')

    w = window(order_by=t.f)
    expr = (t.f - t.f.lag()).lag().over(w).name('foo')
    result = t.projection([expr])
    expected = """\
SELECT lag(`f` - lag(`f`) OVER (ORDER BY `f`)) \
OVER (ORDER BY `f`) AS `foo`
FROM alltypes"""
    assert_sql_equal(result, expected)
Example 16
def test_nested_analytic_function(alltypes):
    t = alltypes

    w = window(order_by=t.f)
    expr = (t.f - t.f.lag()).lag().over(w).name('foo')
    result = t.projection([expr])
    expected = """\
SELECT lag(`f` - lag(`f`) OVER (ORDER BY `f`)) \
OVER (ORDER BY `f`) AS `foo`
FROM ibis_testing.`alltypes`"""
    assert_sql_equal(result, expected)
Example 17
    def test_nested_analytic_function(self):
        t = self.con.table('alltypes')

        w = window(order_by=t.f)
        expr = (t.f - t.f.lag()).lag().over(w).name('foo')
        result = t.projection([expr])
        expected = """\
SELECT lag(f - lag(f) OVER (ORDER BY f)) \
OVER (ORDER BY f) AS `foo`
FROM alltypes"""
        self._check_sql(result, expected)
Example 18
def test_udaf_analytic_group_by(con, t, df):
    expr = zscore(t.c).over(ibis.window(group_by=t.key))

    assert isinstance(expr, ir.ColumnExpr)

    result = expr.execute()

    def f(s):
        return s.sub(s.mean()).div(s.std())

    expected = df.groupby('key').c.transform(f)
    tm.assert_series_equal(result, expected)
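
The zscore fixture is not part of this snippet. A plausible definition for the pandas backend, using the same @udf.analytic decorator that appears in the benchmark setup later in this section (the import path is an assumption and has moved between ibis versions):

import ibis.pandas.udf as udf  # assumed import path; varies by version

@udf.analytic(['double'], 'double')
def zscore(series):
    # standardize each value within its window/partition
    return (series - series.mean()) / series.std()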
Example 19
    def test_multiple_windows(self):
        t = self.con.table('alltypes')

        w = window(group_by=t.g)

        expr = t.f.sum().over(w) - t.f.sum()
        proj = t.projection([t.g, expr.name('result')])

        expected = """\
SELECT g, sum(f) OVER (PARTITION BY g) - sum(f) OVER () AS `result`
FROM alltypes"""
        self._check_sql(proj, expected)
Example 20
def test_multiple_windows(alltypes):
    t = alltypes

    w = window(group_by=t.g)

    expr = t.f.sum().over(w) - t.f.sum()
    proj = t.projection([t.g, expr.name('result')])

    expected = """\
SELECT `g`, sum(`f`) OVER (PARTITION BY `g`) - sum(`f`) OVER () AS `result`
FROM ibis_testing.`alltypes`"""
    assert_sql_equal(proj, expected)
Example 21
def test_cumulative_functions(alltypes, cumulative, static):
    t = alltypes

    w = ibis.window(order_by=t.d)

    actual = cumulative(t, w).name('foo')
    expected = static(t, w).over(ibis.cumulative_window()).name('foo')

    expr1 = t.projection(actual)
    expr2 = t.projection(expected)

    assert to_sql(expr1) == to_sql(expr2)
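
The cumulative and static arguments come from pytest parametrization that is not shown here. A plausible pairing (an assumption, consistent with the cumulative reductions listed in Example 2):

# hypothetical parametrized pair: a cumulative op and its static counterpart
cumulative = lambda t, w: t.f.cumsum().over(w)
static = lambda t, w: t.f.sum().over(w)

Under such a pairing the test asserts that a cumulative reduction compiles to the same SQL as the plain reduction wrapped in an explicitly cumulative window.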
Example 22
def test_multiple_windows(con):
    t = con.table('alltypes')

    w = window(group_by=t.g)

    expr = t.f.sum().over(w) - t.f.sum()
    proj = t.projection([t.g, expr.name('result')])

    expected = """\
SELECT `g`, sum(`f`) OVER (PARTITION BY `g`) - sum(`f`) OVER () AS `result`
FROM alltypes"""
    assert_sql_equal(proj, expected)
Example 23
def test_window_function(alltypes, project_id):
    t = alltypes
    w1 = ibis.window(
        preceding=1, following=0, group_by='year', order_by='timestamp_col'
    )
    expr = t.mutate(win_avg=t.float_col.mean().over(w1))
    result = expr.compile()
    expected = """\
SELECT *,
       avg(`float_col`) OVER (PARTITION BY `year` ORDER BY `timestamp_col` ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) AS `win_avg`
FROM `{}.testing.functional_alltypes`""".format(  # noqa: E501
        project_id
    )
    assert result == expected

    w2 = ibis.window(
        preceding=0, following=2, group_by='year', order_by='timestamp_col'
    )
    expr = t.mutate(win_avg=t.float_col.mean().over(w2))
    result = expr.compile()
    expected = """\
SELECT *,
       avg(`float_col`) OVER (PARTITION BY `year` ORDER BY `timestamp_col` ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING) AS `win_avg`
FROM `{}.testing.functional_alltypes`""".format(  # noqa: E501
        project_id
    )
    assert result == expected

    w3 = ibis.window(
        preceding=(4, 2), group_by='year', order_by='timestamp_col'
    )
    expr = t.mutate(win_avg=t.float_col.mean().over(w3))
    result = expr.compile()
    expected = """\
SELECT *,
       avg(`float_col`) OVER (PARTITION BY `year` ORDER BY `timestamp_col` ROWS BETWEEN 4 PRECEDING AND 2 PRECEDING) AS `win_avg`
FROM `{}.testing.functional_alltypes`""".format(  # noqa: E501
        project_id
    )
    assert result == expected
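
Note the third case: a two-element preceding tuple pins both frame endpoints on the preceding side, so preceding=(4, 2) compiles to ROWS BETWEEN 4 PRECEDING AND 2 PRECEDING. The farther bound comes first, matching the validation rules exercised in Example 48.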
Example 24
def test_window_unbounded(kind, begin, end, expected):
    t = ibis.table([('a', 'int64')], name='t')
    kwargs = {kind: (begin, end)}
    expr = t.a.sum().over(ibis.window(**kwargs))
    result = ibis.bigquery.compile(expr)
    assert (
        result
        == """\
SELECT sum(`a`) OVER (ROWS BETWEEN {}) AS `tmp`
FROM t""".format(
            expected
        )
    )
Example 25
def test_combine_windows(alltypes):
    t = alltypes
    w1 = ibis.window(group_by=t.g, order_by=t.f)
    w2 = ibis.window(preceding=5, following=5)

    w3 = w1.combine(w2)
    expected = ibis.window(
        group_by=t.g, order_by=t.f, preceding=5, following=5
    )
    assert_equal(w3, expected)

    w4 = ibis.window(group_by=t.a, order_by=t.e)
    w5 = w3.combine(w4)
    expected = ibis.window(
        group_by=[t.g, t.a], order_by=[t.f, t.e], preceding=5, following=5
    )
    assert_equal(w5, expected)

    # Cannot combine windows of varying types.
    w6 = ibis.range_window(preceding=5, following=5)
    with pytest.raises(ibis.common.IbisInputError):
        w1.combine(w6)
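
The last assertion hinges on frame semantics: ibis.window builds a row-based frame and ibis.range_window a range-based one, and the two kinds cannot be merged. A sketch of the distinction (the error type is the one named in the test):

w_rows = ibis.window(preceding=5, following=5)          # ROWS frame
w_range = ibis.range_window(preceding=5, following=5)   # RANGE frame
# w_rows.combine(w_range) raises ibis.common.IbisInputError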
Example 26
    def test_rolling_window(self):
        t = self.alltypes
        df = (
            t[['double_col', 'timestamp_col']]
            .execute()
            .sort_values('timestamp_col')
            .reset_index(drop=True)
        )
        window = ibis.window(
            order_by=t.timestamp_col,
            preceding=6,
            following=0,
        )
        for func in 'mean sum min max'.split():
            f = getattr(t.double_col, func)
            df_f = getattr(df.double_col.rolling(7, min_periods=0), func)
            result = (
                t.projection([f().over(window).name('double_col')])
                .execute()
                .double_col
            )
            expected = df_f()
            tm.assert_series_equal(result, expected)
Example 27
    def test_unsupported_aggregate_functions(self):
        t = self.con.table('alltypes')
        w = ibis.window(order_by=t.d)

        exprs = [
            t.f.approx_nunique(),
            t.f.approx_median(),
            t.g.group_concat(),
        ]

        for expr in exprs:
            with self.assertRaises(com.TranslationError):
                proj = t.projection([expr.over(w).name('foo')])
                to_sql(proj)
Example 28
def test_auto_windowize_analysis_bug(con):
    # GH #544
    t = con.table("airlines")

    def metric(x):
        return x.arrdelay.mean().name("avg_delay")

    annual_delay = (
        t[t.dest.isin(["JFK", "SFO"])]
        .group_by(["dest", "year"])
        .aggregate(metric)
    )
    what = annual_delay.group_by("dest")
    enriched = what.mutate(grand_avg=annual_delay.avg_delay.mean())

    expr = (
        annual_delay.avg_delay.mean()
        .name("grand_avg")
        .over(ibis.window(group_by=annual_delay.dest))
    )
    expected = annual_delay[annual_delay, expr]

    assert_equal(enriched, expected)
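
The behavior pinned down here (GH #544): mutate on a grouped table auto-windowizes scalar reductions, so annual_delay.avg_delay.mean() is rewritten as the same mean over ibis.window(group_by=annual_delay.dest), which is exactly what expected spells out by hand.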
Example 29
    def test_auto_windowize_analysis_bug(self):
        # GH #544
        t = self.con.table('airlines')

        annual_delay = (t[t.dest.isin(['JFK', 'SFO'])]
                        .group_by(['dest', 'year'])
                        .aggregate(t.arrdelay.mean().name('avg_delay')))
        what = annual_delay.group_by('dest')
        enriched = what.mutate(grand_avg=annual_delay.avg_delay.mean())

        expr = (annual_delay.avg_delay.mean().name('grand_avg')
                .over(ibis.window(group_by=annual_delay.dest)))
        expected = annual_delay[annual_delay, expr]

        assert_equal(enriched, expected)
Example 30
    def test_compose_group_by_apis(self):
        t = self.t
        w = ibis.window(group_by=t.g, order_by=t.f)

        diff = t.d - t.d.lag()
        grouped = t.group_by('g').order_by('f')

        expr = grouped[t, diff.name('diff')]
        expr2 = grouped.mutate(diff=diff)
        expr3 = grouped.mutate([diff.name('diff')])

        window_expr = (t.d - t.d.lag().over(w)).name('diff')
        expected = t.projection([t, window_expr])

        assert_equal(expr, expected)
        assert_equal(expr, expr2)
        assert_equal(expr, expr3)
Example 31
def test_partitioned_window(alltypes, func, df):
    t = alltypes
    window = ibis.window(
        group_by=t.string_col,
        order_by=t.timestamp_col,
        preceding=6,
        following=0,
    )

    def roller(func):
        def rolled(df):
            torder = df.sort_values('timestamp_col')
            rolling = torder.double_col.rolling(7, min_periods=0)
            return getattr(rolling, func)()

        return rolled

    f = getattr(t.double_col, func)
    expr = f().over(window).name('double_col')
    result = t.projection([expr]).execute().double_col
    expected = (
        df.groupby('string_col').apply(roller(func)).reset_index(drop=True)
    )
    tm.assert_series_equal(result, expected)
Example 32
def test_window_has_pre_execute_scope():
    called = [0]

    @pre_execute.register(ops.Lag, Backend)
    def test_pre_execute(op, client, **kwargs):
        called[0] += 1
        return Scope()

    data = {'key': list('abc'), 'value': [1, 2, 3], 'dup': list('ggh')}
    df = pd.DataFrame(data, columns=['key', 'value', 'dup'])
    client = ibis.pandas.connect({'df': df})
    t = client.table('df')
    window = ibis.window(order_by='value')
    expr = t.key.lag(1).over(window).name('foo')
    result = expr.execute()
    assert result is not None

    # pre_execute fires once in the window op at the top, to pick up any
    # scope changes before computing, and twice more when executing the
    # ops.Lag node: once at the beginning of execute and once before the
    # actual computation.
    #
    # The whole sequence runs twice because of the pre_execute call on the
    # Alias operation, hence 3 + 3.
    assert called[0] == 3 + 3
Example 33
def test_project_scope_does_not_override(t, df):
    col = t.plain_int64
    expr = t[
        [
            col.name('new_col'),
            col.sum()
            .over(ibis.window(group_by='dup_strings'))
            .name('grouped'),
        ]
    ]
    result = expr.execute()
    expected = pd.concat(
        [
            df[['plain_int64', 'dup_strings']].rename(
                columns={'plain_int64': 'new_col'}
            ),
            df.groupby('dup_strings')
            .plain_int64.transform('sum')
            .reset_index(drop=True)
            .rename('grouped'),
        ],
        axis=1,
    )[['new_col', 'grouped']]
    tm.assert_frame_equal(result, expected)
Example 34
def test_window(backend, alltypes, df, con, result_fn, expected_fn):
    if not backend.supports_window_operations:
        pytest.skip(
            'Backend {} does not support window operations'.format(backend)
        )

    expr = alltypes.mutate(
        val=result_fn(
            alltypes,
            win=ibis.window(
                following=0,
                group_by=[alltypes.string_col],
                order_by=[alltypes.id],
            ),
        )
    )

    result = expr.execute().set_index('id').sort_index()
    column = expected_fn(df.sort_values('id').groupby('string_col'))
    expected = df.assign(val=column).set_index('id').sort_index()

    left, right = result.val, expected.val

    backend.assert_series_equal(left, right)
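
result_fn and expected_fn are parametrized and not shown. A plausible pair (purely an assumption) that fits a grouped, ordered window with following=0 would be a running sum checked against pandas' grouped cumsum:

# hypothetical parametrized pair
result_fn = lambda t, win: t.double_col.sum().over(win)
expected_fn = lambda gb: gb.double_col.cumsum()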
Example 35
    columns = [group_by, order_by, 'G']
    expected = (
        batting_df[columns]
        .set_index(order_by)
        .groupby(group_by)
        .G.rolling(4, min_periods=1)
        .sum()
        .rename('rolled')
    )

    tm.assert_series_equal(
        result.set_index([group_by, order_by]).sort_index().rolled,
        expected.sort_index().astype("int64"),
    )


@pytest.mark.parametrize(
    'window',
    [
        ibis.window(order_by='yearID'),
        ibis.window(order_by='yearID', group_by='playerID'),
    ],
)
def test_window_failure_mode(batting, batting_df, window):
    # can't have order by without a following value of 0
    expr = batting.mutate(more_values=batting.G.sum().over(window))
    with pytest.raises(ibis.common.exceptions.OperationNotDefinedError):
        expr.execute()


def test_scalar_broadcasting(batting, batting_df):
    expr = batting.mutate(demeaned=batting.G - batting.G.mean())
    result = expr.execute()
    expected = batting_df.assign(demeaned=batting_df.G - batting_df.G.mean())
    tm.assert_frame_equal(result, expected)
Example 36
def row_window():
    return ibis.window(following=0, order_by='plain_int64')
Example 37
def range_window():
    return ibis.window(following=0, order_by='plain_datetimes_naive')
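
Both fixtures build trailing frames (following=0); the names presumably reflect how they are consumed: ordering by plain_int64 exercises row-oriented windows, while ordering by plain_datetimes_naive exercises time-ordered, range-like windows.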
Example 38
import pytest

import ibis
from ibis.backends.impala.tests.conftest import translate


@pytest.fixture(scope="module")
def table(mockcon):
    return mockcon.table("functional_alltypes")


@pytest.mark.parametrize(
    ("expr_fn", "expected"),
    [
        pytest.param(
            lambda t: ibis.row_number().over(
                ibis.window(order_by=t.float_col)
            ),
            '(row_number() OVER (ORDER BY `float_col`) - 1)',
        ),
        pytest.param(lambda t: t.string_col.lag(),
                     'lag(`string_col`)',
                     id="lag_default"),
        pytest.param(lambda t: t.string_col.lag(2),
                     'lag(`string_col`, 2)',
                     id="lag_arg"),
        pytest.param(
            lambda t: t.string_col.lag(default=0),
            'lag(`string_col`, 1, 0)',
            id="lag_explicit_default",
        ),
        pytest.param(
Example 39
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    run_import_queries,
    etl_keys,
):
    tmp_table_name = "tmp_table"

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }

        # SQL statement templates for the data file import queries
        connect_to_db_sql_template = r"\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"] +
            ["var_%s DOUBLE, \n" % i for i in range(199)] + ["var_199 DOUBLE"])
        import_query_cols_str = "".join(import_query_cols_list)

        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str)
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true")
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename)

        # data file import by ibis
        columns_types_import_query = (
            ["string", "int64"] + ["float64" for _ in range(200)]
        )
        schema_table_import = ibis.Schema(names=columns_names,
                                          types=columns_types_import_query)
        omnisci_server_worker.create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
        )

        table_import_query = omnisci_server_worker.database(
            database_name).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times_import["t_readcsv_by_ibis"] = round((timer() - t0) * 1000)

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times_import["t_readcsv_by_FSI"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)

        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times_import["t_readcsv_by_COPY"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        etl_times.update(etl_times_import)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
        )

        table_import = omnisci_server_worker.database(database_name).table(
            table_name)
        t0 = timer()
        table_import.read_csv(filename, delimiter=",")
        etl_times["t_readcsv"] = round((timer() - t0) * 1000)

    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We create 400 new columns and insert them into the original table,
    # avoiding nested SQL requests
    t_etl_start = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ['var_%s' % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(ibis.case().when(
            table[col].count().over(w).name(col_count) > 1,
            table[col].cast("float32"),
        ).else_(ibis.null()).end().name(col_gt1))
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    return table_df, etl_times
Example 40
    proj = grouped.mutate([lag, diff, first, last, lag2])
    expected = """\
SELECT *, lag(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `lag`,
       lead(`f`) OVER (PARTITION BY `g` ORDER BY `f`) - `f` AS `fwd_diff`,
       first_value(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `first`,
       last_value(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `last`,
       lag(`f`) OVER (PARTITION BY `g` ORDER BY `d`) AS `lag2`
FROM ibis_testing.`alltypes`"""
    assert_sql_equal(proj, expected)


@pytest.mark.impala
@pytest.mark.parametrize(
    ['window', 'frame'],
    [
        (window(preceding=0),
         'range between current row and unbounded following'),
        (window(following=0),
         'range between unbounded preceding and current row'),
        (window(preceding=5),
         'rows between 5 preceding and unbounded following'),
        (window(preceding=5,
                following=0), 'rows between 5 preceding and current row'),
        (window(preceding=5,
                following=2), 'rows between 5 preceding and 2 following'),
        (window(following=2),
         'rows between unbounded preceding and 2 following'),
        (window(following=2,
                preceding=0), 'rows between current row and 2 following'),
        (window(preceding=5),
         'rows between 5 preceding and unbounded following'),
Example 41
def test_array_return_type_reduction_window(con, t, df, qs):
    expr = quantiles(t.b, quantiles=qs).over(ibis.window())
    result = expr.execute()
    expected_raw = df.b.quantile(qs).tolist()
    expected = pd.Series([expected_raw] * len(df))
    tm.assert_series_equal(result, expected)
Example 42
def test_window_unbounded_invalid(kind, begin, end):
    kwargs = {kind: (begin, end)}
    with pytest.raises(com.IbisInputError):
        ibis.window(**kwargs)
Example 43
            marks=pytest.mark.xfail,
        ),
        param(
            lambda t: t.float_col.first(),
            lambda t: t.float_col.head(1),
            id='first',
            marks=pytest.mark.xfail,
        ),
        param(
            lambda t: t.float_col.last(),
            lambda t: t.float_col.tail(1),
            id='last',
            marks=pytest.mark.xfail,
        ),
        param(
            lambda t: t.float_col.first().over(ibis.window(preceding=10)),
            lambda t: t,
            id='first_preceding',
            marks=pytest.mark.xfail,
        ),
        param(
            lambda t: t.float_col.first().over(ibis.window(following=10)),
            lambda t: t,
            id='first_following',
            marks=pytest.mark.xfail,
        ),
        param(
            lambda t: ibis.row_number(),
            lambda t: pd.Series(np.arange(len(t))),
            id='row_number',
            marks=pytest.mark.xfail,
Example 44
            level=1).reset_index(drop=True)).set_index('id').sort_index())

    # discard first 2 rows of each group to account for the shift
    n = len(gdf) * 2
    left, right = result.val.shift(-n), expected.val.shift(-n)

    backend.assert_series_equal(left, right)


@pytest.mark.parametrize(
    'window_fn',
    [
        param(
            lambda t: ibis.window(
                preceding=2,
                following=0,
                group_by=[t.string_col],
                order_by=[t.id],
            ),
            id='preceding-2-following-0',
        ),
        param(
            lambda t: ibis.trailing_window(
                preceding=2, group_by=[t.string_col], order_by=[t.id]),
            id='trailing-2',
        ),
    ],
)
@pytest.mark.xfail_unsupported
def test_grouped_bounded_preceding_windows(backend, alltypes, df, con,
                                           window_fn):
    if not backend.supports_window_operations:
Example 45
    def setup(self):
        n = 30 * int(2e5)
        self.data = pd.DataFrame({
            'key': np.random.choice(16000, size=n),
            'low_card_key': np.random.choice(30, size=n),
            'value': np.random.rand(n),
            'timestamps': pd.date_range(
                start='now', periods=n, freq='s'
            ).values,
            'timestamp_strings': pd.date_range(
                start='now', periods=n, freq='s'
            ).values.astype(str),
            'repeated_timestamps': pd.date_range(
                start='2018-09-01', periods=30
            ).repeat(int(n / 30)),
        })

        t = ibis.pandas.connect({'df': self.data}).table('df')

        self.high_card_group_by = t.groupby(t.key).aggregate(
            avg_value=t.value.mean()
        )

        self.cast_to_dates = t.timestamps.cast(dt.date)
        self.cast_to_dates_from_strings = t.timestamp_strings.cast(dt.date)

        self.multikey_group_by_with_mutate = (
            t.mutate(dates=t.timestamps.cast('date'))
            .groupby(['low_card_key', 'dates'])
            .aggregate(avg_value=lambda t: t.value.mean())
        )

        self.simple_sort = t.sort_by([t.key])

        self.simple_sort_projection = t[['key', 'value']].sort_by(['key'])

        self.multikey_sort = t.sort_by(['low_card_key', 'key'])

        self.multikey_sort_projection = t[
            ['low_card_key', 'key', 'value']
        ].sort_by(['low_card_key', 'key'])

        low_card_rolling_window = ibis.trailing_range_window(
            ibis.interval(days=2),
            order_by=t.repeated_timestamps,
            group_by=t.low_card_key,
        )
        self.low_card_grouped_rolling = t.value.mean().over(
            low_card_rolling_window)

        high_card_rolling_window = ibis.trailing_range_window(
            ibis.interval(days=2),
            order_by=t.repeated_timestamps,
            group_by=t.key,
        )
        self.high_card_grouped_rolling = t.value.mean().over(
            high_card_rolling_window)

        @udf.reduction(['double'], 'double')
        def my_mean(series):
            return series.mean()

        self.low_card_grouped_rolling_udf_mean = my_mean(
            t.value).over(low_card_rolling_window)
        self.high_card_grouped_rolling_udf_mean = my_mean(
            t.value).over(high_card_rolling_window)

        @udf.analytic(['double'], 'double')
        def my_zscore(series):
            return (series - series.mean()) / series.std()

        low_card_window = ibis.window(group_by=t.low_card_key)

        high_card_window = ibis.window(group_by=t.key)

        self.low_card_window_analytics_udf = my_zscore(
            t.value).over(low_card_window)
        self.high_card_window_analytics_udf = my_zscore(
            t.value).over(high_card_window)

        @udf.reduction(['double', 'double'], 'double')
        def my_wm(v, w):
            return np.average(v, weights=w)

        self.low_card_grouped_rolling_udf_wm = my_wm(
            t.value, t.value).over(low_card_rolling_window)

        self.high_card_grouped_rolling_udf_wm = my_wm(
            t.value, t.value).over(high_card_rolling_window)
Example 46
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    run_import_queries,
    etl_keys,
    import_mode,
):
    tmp_table_name = "tmp_table"

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }

        # SQL statement templates for the data file import queries
        connect_to_db_sql_template = r"\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"] +
            ["var_%s DOUBLE, \n" % i for i in range(199)] + ["var_199 DOUBLE"])
        import_query_cols_str = "".join(import_query_cols_list)

        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str)
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true")
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename)

        # data file import by ibis
        columns_types_import_query = (
            ["string", "int64"] + ["float64" for _ in range(200)]
        )
        schema_table_import = ibis.Schema(names=columns_names,
                                          types=columns_types_import_query)
        omnisci_server_worker.create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
        )

        table_import_query = omnisci_server_worker.database(
            database_name).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times_import["t_readcsv_by_ibis"] = round((timer() - t0) * 1000)

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times_import["t_readcsv_by_FSI"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)

        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times_import["t_readcsv_by_COPY"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        etl_times.update(etl_times_import)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
            )
            table_import = omnisci_server_worker.database(database_name).table(
                table_name)

            t0 = timer()
            table_import.read_csv(filename,
                                  header=True,
                                  quotechar="",
                                  delimiter=",")
            etl_times["t_readcsv"] = round((timer() - t0) * 1000)

        elif import_mode == "pandas":
            # Datafiles import
            columns_types_converted = [
                "float64" if (x.startswith("decimal")) else x
                for x in columns_types
            ]
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types_converted,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith("gz") else None,
                validation=validation,
            )
            etl_times["t_readcsv"] = round(
                (t_import_pandas + t_import_ibis) * 1000)

        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith("gz"):
                    import gzip

                    unzip_name = "/tmp/santander-fsi.csv"

                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name, unzip_name or filename, schema_table)
                etl_times["t_readcsv"] = round((timer() - t0) * 1000)

            finally:
                if filename.endswith("gz"):
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We create 400 new columns and insert them into the original table,
    # avoiding nested SQL requests
    t_etl_start = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(ibis.case().when(
            table[col].count().over(w).name(col_count) > 1,
            table[col].cast("float32"),
        ).else_(ibis.null()).end().name(col_gt1))
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    return table_df, etl_times
Example 47
def etl_ibis(args, run_import_queries, columns_names, columns_types, validation=False):

    filename = args.file
    database_name = args.name
    table_name = args.table
    delete_old_database = not args.dnd
    create_new_table = not args.dni
    run_import_queries = str_arg_to_bool(run_import_queries)
    validation = str_arg_to_bool(validation)

    tmp_table_name = "tmp_table"

    etl_times = {"t_groupby_merge_where": 0.0, "t_train_test_split": 0.0, "t_etl": 0.0}

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }
        etl_times.update(etl_times_import)

    omnisci_server = OmnisciServer(
        omnisci_executable=args.omnisci_executable,
        omnisci_port=args.omnisci_port,
        database_name=args.name,
        user=args.user,
        password=args.password,
        debug_timer=True,
        columnar_output=args.server_columnar_output,
        lazy_fetch=args.server_lazy_fetch,
    )
    omnisci_server.launch()

    import ibis
    from server_worker import OmnisciServerWorker

    omnisci_server_worker = OmnisciServerWorker(omnisci_server)
    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    time.sleep(2)
    omnisci_server_worker.connect_to_server()

    if run_import_queries:
        # SQL statement templates for the data file import queries
        connect_to_db_sql_template = r"\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"]
        )
        import_query_cols_str = "".join(import_query_cols_list)

        connect_to_db_sql = connect_to_db_sql_template.format(database_name)
        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str
        )
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true"
        )
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename
        )

        # data file import by ibis
        columns_types_import_query = ["string", "int64"] + [
            "float64" for _ in range(200)
        ]
        schema_table_import = ibis.Schema(
            names=columns_names, types=columns_types_import_query
        )
        omnisci_server_worker.get_conn().create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
            fragment_size=args.fragment_size,
        )

        table_import_query = omnisci_server_worker.database(database_name).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times["t_readcsv_by_ibis"] = timer() - t0

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times["t_readcsv_by_FSI"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)

        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times["t_readcsv_by_COPY"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.get_conn().create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
            fragment_size=args.fragment_size,
        )

        table_import = omnisci_server_worker.database(database_name).table(table_name)
        table_import.read_csv(filename, delimiter=",")

    if args.server_conn_type == "regular":
        omnisci_server_worker.connect_to_server()
    elif args.server_conn_type == "ipc":
        omnisci_server_worker.ipc_connect_to_server()
    else:
        print("Wrong connection type is specified!")
        sys.exit(0)

    db = omnisci_server_worker.database(database_name)
    table = db.table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We create 400 new columns and insert them into the original table,
    # avoiding nested SQL requests
    t0 = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name("var_%d_gt1" % i)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()
    etl_times["t_groupby_merge_where"] = timer() - t0

    # rows split query
    t0 = timer()
    training_part, validation_part = table_df[:-10000], table_df[-10000:]
    etl_times["t_train_test_split"] = timer() - t0
    
    etl_times["t_etl"] = etl_times["t_groupby_merge_where"] + etl_times["t_train_test_split"]
    
    x_train = training_part.drop(['target0'], axis=1)
    y_train = training_part['target0']
    x_valid = validation_part.drop(['target0'], axis=1)
    y_valid = validation_part['target0']
    
    omnisci_server.terminate()
    omnisci_server = None

    return x_train, y_train, x_valid, y_valid, etl_times
Example 48
def test_preceding_following_validate(alltypes):
    # these all work
    [
        ibis.window(preceding=0),
        ibis.window(following=0),
        ibis.window(preceding=0, following=0),
        ibis.window(preceding=(None, 4)),
        ibis.window(preceding=(10, 4)),
        ibis.window(following=(4, None)),
        ibis.window(following=(4, 10)),
    ]

    # these are ill-specified
    error_cases = [
        lambda: ibis.window(preceding=(1, 3)),
        lambda: ibis.window(preceding=(3, 1), following=2),
        lambda: ibis.window(preceding=(3, 1), following=(2, 4)),
        lambda: ibis.window(preceding=-1),
        lambda: ibis.window(following=-1),
        lambda: ibis.window(preceding=(-1, 2)),
        lambda: ibis.window(following=(2, -1)),
    ]

    for case in error_cases:
        with pytest.raises(Exception):
            case()
Example 49
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    etl_keys,
    import_mode,
    fragments_size,
):
    etl_times = {key: 0.0 for key in etl_keys}

    fragments_size = check_fragments_size(fragments_size,
                                          count_table=1,
                                          import_mode=import_mode)

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            t0 = timer()
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
                fragment_size=fragments_size[0],
            )
            table_import = omnisci_server_worker.database(database_name).table(
                table_name)
            etl_times["t_connect"] += timer() - t0

            t0 = timer()
            table_import.read_csv(filename,
                                  header=True,
                                  quotechar="",
                                  delimiter=",")
            etl_times["t_readcsv"] = timer() - t0

        elif import_mode == "pandas":
            # decimal(8, 4) is widened to decimal(9, 6) to improve conversion
            # accuracy when importing from pandas into OmniSciDB, so that
            # results validate properly
            columns_types = [
                "decimal(9, 6)" if (x == "decimal(8, 4)") else x
                for x in columns_types
            ]
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith(".gz") else None,
                use_columns_types_for_pd=False,
            )
            etl_times["t_readcsv"] = t_import_pandas + t_import_ibis
            etl_times["t_connect"] += (
                omnisci_server_worker.get_conn_creation_time()
            )

        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith(".gz"):
                    import gzip

                    unzip_name = get_tmp_filepath("santander-fsi.csv")

                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name,
                    unzip_name or filename,
                    schema_table,
                    fragment_size=fragments_size[0],
                )
                etl_times["t_readcsv"] = timer() - t0
                etl_times["t_connect"] += (
                    omnisci_server_worker.get_conn_creation_time()
                )

            finally:
                if filename.endswith("gz"):
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    t0 = timer()
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)
    etl_times["t_connect"] += timer() - t0

    # group_by/count, merge (join) and filtration queries
    # We create 400 new columns and insert them into the original table,
    # avoiding nested SQL requests
    t_etl_start = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(ibis.case().when(
            table[col].count().over(w).name(col_count) > 1,
            table[col].cast("float32"),
        ).else_(ibis.null()).end().name(col_gt1))
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = timer() - t_etl_start
    return table_df, etl_times
Example 50
def test_window_group_by():
    t = ibis.table(dict(a="int64", b="string"), name="t")
    expr = t.a.mean().over(ibis.window(group_by=t.b))
    result = repr(expr)
    assert "preceding=0" not in result
    assert "group_by=[r0.b]" in result
Example 51
def low_card_window(t):
    return ibis.window(group_by=t.low_card_key)
Example 52
    proj = grouped.mutate([lag, diff, first, last, lag2])
    expected = """\
SELECT *, lag(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `lag`,
       lead(`f`) OVER (PARTITION BY `g` ORDER BY `f`) - `f` AS `fwd_diff`,
       first_value(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `first`,
       last_value(`f`) OVER (PARTITION BY `g` ORDER BY `f`) AS `last`,
       lag(`f`) OVER (PARTITION BY `g` ORDER BY `d`) AS `lag2`
FROM ibis_testing.`alltypes`"""
    assert_sql_equal(proj, expected)


@pytest.mark.parametrize(
    ['window', 'frame'],
    [
        (
            window(preceding=0),
            'range between current row and unbounded following',
        ),
        (
            window(following=0),
            'range between unbounded preceding and current row',
        ),
        (
            window(preceding=5),
            'rows between 5 preceding and unbounded following',
        ),
        (
            window(preceding=5, following=0),
            'rows between 5 preceding and current row',
        ),
        (
Example 53
def high_card_window(t):
    return ibis.window(group_by=t.key)