def create_joined_df(perf_table):
    delinquency_12_expr = (
        ibis.case()
        .when(
            perf_table["current_loan_delinquency_status"].notnull(),
            perf_table["current_loan_delinquency_status"],
        )
        .else_(-1)
        .end()
    )
    upb_12_expr = (
        ibis.case()
        .when(perf_table["current_actual_upb"].notnull(), perf_table["current_actual_upb"])
        .else_(999999999)
        .end()
    )
    joined_df = perf_table[
        "loan_id",
        perf_table["monthly_reporting_period"].month().name("timestamp_month").cast("int32"),
        perf_table["monthly_reporting_period"].year().name("timestamp_year").cast("int32"),
        delinquency_12_expr.name("delinquency_12"),
        upb_12_expr.name("upb_12"),
    ]
    return joined_df
def test_case_where(backend, alltypes, df):
    table = alltypes
    table = table.mutate(
        new_col=(
            ibis.case()
            .when(table['int_col'] == 1, 20)
            .when(table['int_col'] == 0, 10)
            .else_(0)
            .end()
            .cast('int64')
        )
    )

    result = table.execute()

    expected = df.copy()
    mask_0 = expected['int_col'] == 1
    mask_1 = expected['int_col'] == 0

    expected['new_col'] = 0
    expected.loc[mask_0, 'new_col'] = 20
    expected.loc[mask_1, 'new_col'] = 10
    # cast so the expected dtype matches the int64 cast in the ibis expression
    expected['new_col'] = expected['new_col'].astype('int64')

    backend.assert_frame_equal(result, expected)
def test_multiple_case_null_else(table):
    expr = ibis.case().when(table.g == "foo", "bar").end()
    op = expr.op()

    assert isinstance(expr, ir.StringColumn)
    assert isinstance(op.default, ir.ValueExpr)
    assert isinstance(op.default.op(), ops.NullLiteral)
def test_case_in_projection(alltypes):
    t = alltypes
    expr = t.g.case().when('foo', 'bar').when('baz', 'qux').else_('default').end()
    expr2 = ibis.case().when(t.g == 'foo', 'bar').when(t.g == 'baz', t.g).end()
    proj = t[expr.name('col1'), expr2.name('col2'), t]

    result = Compiler.to_sql(proj)
    expected = """\
SELECT
  CASE `g`
    WHEN 'foo' THEN 'bar'
    WHEN 'baz' THEN 'qux'
    ELSE 'default'
  END AS `col1`,
  CASE
    WHEN `g` = 'foo' THEN 'bar'
    WHEN `g` = 'baz' THEN `g`
    ELSE CAST(NULL AS string)
  END AS `col2`, *
FROM alltypes"""
    assert result == expected
def test_select_filter_mutate(backend, alltypes, df):
    """Test that select, filter and mutate are executed in the right order.

    Before PR 2635, try_fusion in analysis.py would fuse these operations
    together in a way that made the order of the operations wrong
    (mutate was executed before filter).
    """
    t = alltypes

    # Prepare the float_col so that filter must execute
    # before the cast to get the correct result.
    t = t.mutate(
        float_col=ibis.case()
        .when(t['bool_col'], t['float_col'])
        .else_(np.nan)
        .end()
    )

    # Actual test
    t = t[t.columns]
    t = t[~t['float_col'].isnan()]
    t = t.mutate(float_col=t['float_col'].cast('int32'))
    result = t.execute()

    expected = df.copy()
    expected.loc[~df['bool_col'], 'float_col'] = None
    expected = expected[~expected['float_col'].isna()]
    expected = expected.assign(float_col=expected['float_col'].astype('int32'))

    backend.assert_frame_equal(result, expected)
def test_multiple_case_null_else(self):
    expr = ibis.case().when(self.table.g == "foo", "bar").end()
    op = expr.op()

    assert isinstance(expr, ir.StringArray)
    assert isinstance(op.default, ir.ValueExpr)
    assert isinstance(op.default.op(), ir.NullLiteral)
def ifelse(
    self,
    true_expr: ir.Value,
    false_expr: ir.Value,
) -> ir.Value:
    """Construct a ternary conditional expression.

    Parameters
    ----------
    true_expr
        Expression to return if `self` evaluates to `True`
    false_expr
        Expression to return if `self` evaluates to `False`

    Returns
    -------
    Value
        The value of `true_expr` if `arg` is `True` else `false_expr`

    Examples
    --------
    >>> import ibis
    >>> t = ibis.table([("is_person", "boolean")], name="t")
    >>> expr = t.is_person.ifelse("yes", "no")
    >>> print(ibis.impala.compile(expr))
    SELECT CASE WHEN `is_person` THEN 'yes' ELSE 'no' END AS `tmp`
    FROM t
    """
    import ibis

    # Result will be the result of promotion of true/false exprs. These
    # might be conflicting types; same type resolution as case expressions
    # must be used.
    return ibis.case().when(self, true_expr).else_(false_expr).end()
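# Hedged usage sketch (not part of the snippet above): assumes the ibis pandas
# backend is installed and shows that `ifelse` builds the same expression as an
# explicit ibis.case() chain.
import ibis
import pandas as pd

con = ibis.pandas.connect({"t": pd.DataFrame({"is_person": [True, False, True]})})
t = con.table("t")

via_ifelse = t.is_person.ifelse("yes", "no")
via_case = ibis.case().when(t.is_person, "yes").else_("no").end()

print(via_ifelse.execute().tolist())  # ['yes', 'no', 'yes']
print(via_case.execute().tolist())    # same result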
def _extract_quarter(t, expr):
    (arg,) = expr.op().args
    expr_new = ops.ExtractMonth(arg).to_expr()
    expr_new = (
        ibis.case()
        .when(expr_new.isin([1, 2, 3]), 1)
        .when(expr_new.isin([4, 5, 6]), 2)
        .when(expr_new.isin([7, 8, 9]), 3)
        .else_(4)
        .end()
    )
    return sa.cast(t.translate(expr_new), sa.Integer)
def test_search_case(self):
    expr = (
        ibis.case()
        .when(self.table.f > 0, self.table.d * 2)
        .when(self.table.c < 0, self.table.a * 2)
        .end()
    )
    result = self._translate(expr)
    expected = """CASE
  WHEN f > 0 THEN d * 2
  WHEN c < 0 THEN a * 2
  ELSE NULL
END"""
    assert result == expected
def _get_hist_disease_expr(
    cls,
    disease: str,
    geocodes: List[int],
    year_week_start: int,
    year_week_end: int,
) -> ibis.expr.types.Expr:
    """
    Return an ibis expression with the history for a given disease.

    Parameters
    ----------
    disease : str, {'dengue', 'chik', 'zika'}
    geocodes : List[int]
    year_week_start : int
        The starting Year/Week, e.g.: 202002
    year_week_end : int
        The ending Year/Week, e.g.: 202005

    Returns
    -------
    ibis.expr.types.Expr
    """
    table_suffix = ''
    if disease != 'dengue':
        table_suffix = '_{}'.format(disease)

    schema_city = con.schema('Municipio')
    t_hist = schema_city.table('Historico_alerta{}'.format(table_suffix))

    case_level = (
        ibis.case()
        .when((t_hist.nivel.cast('string') == '1'), 'verde')
        .when((t_hist.nivel.cast('string') == '2'), 'amarelo')
        .when((t_hist.nivel.cast('string') == '3'), 'laranja')
        .when((t_hist.nivel.cast('string') == '4'), 'vermelho')
        .else_('-')
        .end()
    ).name(f'nivel_{disease}')

    hist_keys = [
        t_hist.SE.name(f'SE_{disease}'),
        t_hist.casos.name(f'casos_{disease}'),
        t_hist.p_rt1.name(f'p_rt1_{disease}'),
        t_hist.casos_est.name(f'casos_est_{disease}'),
        t_hist.p_inc100k.name(f'p_inc100k_{disease}'),
        t_hist.nivel.name(f'level_code_{disease}'),
        case_level,
        t_hist.municipio_geocodigo.name(f'geocode_{disease}'),
    ]

    hist_filter = (
        t_hist['SE'].between(year_week_start, year_week_end)
    ) & (t_hist['municipio_geocodigo'].isin(geocodes))

    return t_hist[hist_filter][hist_keys].sort_by(f'SE_{disease}')
def test_searched_case_column(batting, batting_df):
    t = batting
    df = batting_df
    expr = (
        ibis.case()
        .when(t.RBI < 5, 'really bad team')
        .when(t.teamID == 'PH1', 'ph1 team')
        .else_(t.teamID)
        .end()
    )
    result = expr.compile()
    expected = dd.from_array(
        np.select(
            [df.RBI < 5, df.teamID == 'PH1'],
            ['really bad team', 'ph1 team'],
            df.teamID,
        )
    )
    tm.assert_series_equal(result.compute(), expected.compute())
def test_substr_with_null_values(backend, alltypes, df):
    table = alltypes.mutate(
        substr_col_null=ibis.case()
        .when(alltypes['bool_col'], alltypes['string_col'])
        .else_(None)
        .end()
        .substr(0, 2)
    )
    result = table.execute()

    expected = df.copy()
    mask = ~expected['bool_col']
    expected['substr_col_null'] = expected['string_col']
    expected.loc[mask, 'substr_col_null'] = None
    expected['substr_col_null'] = expected['substr_col_null'].str.slice(0, 2)

    backend.assert_frame_equal(result, expected)
def test_search_case(con, alltypes, translate):
    t = alltypes
    expr = (
        ibis.case()
        .when(t.float_col > 0, t.int_col * 2)
        .when(t.float_col < 0, t.int_col)
        .else_(0)
        .end()
    )

    expected = """CASE
  WHEN `float_col` > 0 THEN `int_col` * 2
  WHEN `float_col` < 0 THEN `int_col`
  ELSE 0
END"""
    assert translate(expr) == expected
    assert len(con.execute(expr))
def test_pickle_multiple_case_node(table):
    case1 = table.a == 5
    case2 = table.b == 128
    case3 = table.c == 1000

    result1 = table.f
    result2 = table.b * 2
    result3 = table.e

    default = table.d
    expr = (
        ibis.case()
        .when(case1, result1)
        .when(case2, result2)
        .when(case3, result3)
        .else_(default)
        .end()
    )

    op = expr.op()
    assert_pickle_roundtrip(op)
def _bucket(expr):
    op = expr.op()
    stmt = ibis.case()

    if op.closed == 'left':
        l_cmp = operator.le
        r_cmp = operator.lt
    else:
        l_cmp = operator.lt
        r_cmp = operator.le

    user_num_buckets = len(op.buckets) - 1

    bucket_id = 0
    if op.include_under:
        if user_num_buckets > 0:
            cmp = operator.lt if op.close_extreme else r_cmp
        else:
            cmp = operator.le if op.closed == 'right' else operator.lt
        stmt = stmt.when(cmp(op.arg, op.buckets[0]), bucket_id)
        bucket_id += 1

    for j, (lower, upper) in enumerate(zip(op.buckets, op.buckets[1:])):
        if op.close_extreme and (
            (op.closed == 'right' and j == 0)
            or (op.closed == 'left' and j == (user_num_buckets - 1))
        ):
            stmt = stmt.when((lower <= op.arg) & (op.arg <= upper), bucket_id)
        else:
            stmt = stmt.when(
                l_cmp(lower, op.arg) & r_cmp(op.arg, upper), bucket_id
            )
        bucket_id += 1

    if op.include_over:
        if user_num_buckets > 0:
            cmp = operator.lt if op.close_extreme else l_cmp
        else:
            cmp = operator.lt if op.closed == 'right' else operator.le

        stmt = stmt.when(cmp(op.buckets[-1], op.arg), bucket_id)
        bucket_id += 1

    result = stmt.end()
    if expr.has_name():
        result = result.name(expr.get_name())

    return result
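# Hedged illustration (hypothetical table and column names, not from the source
# above): the searched case that a bucket rewrite like `_bucket` is meant to
# produce for left-closed bins [0, 18), [18, 65), [65, 100] with no under/over
# buckets and close_extreme closing the last bin.
import ibis

t = ibis.table([("age", "int64")], name="people")

bucketed = (
    ibis.case()
    .when((t.age >= 0) & (t.age < 18), 0)
    .when((t.age >= 18) & (t.age < 65), 1)
    .when((t.age >= 65) & (t.age <= 100), 2)  # close_extreme closes the last bin
    .end()
    .name("age_bucket")
)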
def _bucket(translator, expr):
    import operator

    op = expr.op()
    stmt = ibis.case()

    if op.closed == 'left':
        l_cmp = operator.le
        r_cmp = operator.lt
    else:
        l_cmp = operator.lt
        r_cmp = operator.le

    user_num_buckets = len(op.buckets) - 1

    bucket_id = 0
    if op.include_under:
        if user_num_buckets > 0:
            cmp = operator.lt if op.close_extreme else r_cmp
        else:
            cmp = operator.le if op.closed == 'right' else operator.lt
        stmt = stmt.when(cmp(op.arg, op.buckets[0]), bucket_id)
        bucket_id += 1

    for j, (lower, upper) in enumerate(zip(op.buckets, op.buckets[1:])):
        if op.close_extreme and (
            (op.closed == 'right' and j == 0)
            or (op.closed == 'left' and j == (user_num_buckets - 1))
        ):
            stmt = stmt.when((lower <= op.arg) & (op.arg <= upper), bucket_id)
        else:
            stmt = stmt.when(
                l_cmp(lower, op.arg) & r_cmp(op.arg, upper), bucket_id
            )
        bucket_id += 1

    if op.include_over:
        if user_num_buckets > 0:
            cmp = operator.lt if op.close_extreme else l_cmp
        else:
            cmp = operator.lt if op.closed == 'right' else operator.le

        stmt = stmt.when(cmp(op.buckets[-1], op.arg), bucket_id)
        bucket_id += 1

    case_expr = stmt.end().name(expr._name)
    return _searched_case(translator, case_expr)
def test_multiple_case_expr(self):
    case1 = self.table.a == 5
    case2 = self.table.b == 128
    case3 = self.table.c == 1000

    result1 = self.table.f
    result2 = self.table.b * 2
    result3 = self.table.e

    default = self.table.d
    expr = (
        ibis.case()
        .when(case1, result1)
        .when(case2, result2)
        .when(case3, result3)
        .else_(default)
        .end()
    )

    op = expr.op()
    assert isinstance(expr, ir.DoubleArray)
    assert isinstance(op, ops.SearchedCase)
    assert op.default is default
def test_multiple_case_expr(table):
    case1 = table.a == 5
    case2 = table.b == 128
    case3 = table.c == 1000

    result1 = table.f
    result2 = table.b * 2
    result3 = table.e

    default = table.d
    expr = (
        ibis.case()
        .when(case1, result1)
        .when(case2, result2)
        .when(case3, result3)
        .else_(default)
        .end()
    )

    op = expr.op()
    assert isinstance(expr, ir.FloatingColumn)
    assert isinstance(op, ops.SearchedCase)
    assert op.default is default
def tpc_h08(
    con,
    NATION="BRAZIL",
    REGION="AMERICA",
    TYPE="ECONOMY ANODIZED STEEL",
    DATE="1995-01-01",
):
    part = con.table("part")
    supplier = con.table("supplier")
    lineitem = con.table("lineitem")
    orders = con.table("orders")
    customer = con.table("customer")
    region = con.table("region")
    n1 = con.table("nation")
    n2 = n1.view()

    q = part
    q = q.join(lineitem, part.p_partkey == lineitem.l_partkey)
    q = q.join(supplier, supplier.s_suppkey == lineitem.l_suppkey)
    q = q.join(orders, lineitem.l_orderkey == orders.o_orderkey)
    q = q.join(customer, orders.o_custkey == customer.c_custkey)
    q = q.join(n1, customer.c_nationkey == n1.n_nationkey)
    q = q.join(region, n1.n_regionkey == region.r_regionkey)
    q = q.join(n2, supplier.s_nationkey == n2.n_nationkey)

    q = q[
        orders.o_orderdate.year().cast("string").name("o_year"),
        (lineitem.l_extendedprice * (1 - lineitem.l_discount)).name("volume"),
        n2.n_name.name("nation"),
        region.r_name,
        orders.o_orderdate,
        part.p_type,
    ]
    q = q.filter([
        q.r_name == REGION,
        q.o_orderdate.between(DATE, add_date(DATE, dy=2, dd=-1)),
        q.p_type == TYPE,
    ])
    q = q.mutate(
        nation_volume=ibis.case().when(q.nation == NATION, q.volume).else_(0).end()
    )
    gq = q.group_by([q.o_year])
    q = gq.aggregate(mkt_share=q.nation_volume.sum() / q.volume.sum())
    q = q.sort_by([q.o_year])
    return q
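# Hedged sketch (hypothetical table and column names): the market-share
# calculation in tpc_h08 above is a conditional aggregation — sum a CASE-gated
# copy of the measure, then divide by the total of the measure.
import ibis

sales = ibis.table([("nation", "string"), ("volume", "float64")], name="sales")
brazil_volume = ibis.case().when(sales.nation == "BRAZIL", sales.volume).else_(0).end()
mkt_share = (brazil_volume.sum() / sales.volume.sum()).name("mkt_share")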
def test_searched_case_column(batting, batting_df):
    t = batting
    df = batting_df
    expr = (
        ibis.case()
        .when(t.RBI < 5, 'really bad team')
        .when(t.teamID == 'PH1', 'ph1 team')
        .else_(t.teamID)
        .end()
    )
    result = expr.execute()
    expected = pd.Series(
        np.select(
            [df.RBI < 5, df.teamID == 'PH1'],
            ['really bad team', 'ph1 team'],
            df.teamID,
        )
    )
    tm.assert_series_equal(result, expected)
def case_expression(
    self, when_expressions: List[Union[Tuple[Value, Value], Value]]
):
    """
    Handles sql_to_ibis case expressions

    :param when_expressions:
    :return:
    """
    case_expression = ibis.case()
    for i, when_expression in enumerate(when_expressions):
        if isinstance(when_expression, tuple):
            conditional_boolean = when_expression[0].get_value()
            conditional_value = when_expression[1].get_value()
            case_expression = case_expression.when(
                conditional_boolean, conditional_value
            )
        else:
            case_expression = case_expression.else_(
                when_expression.get_value()
            ).end()
    return Expression(value=case_expression)
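# Hedged sketch (hypothetical table name): the ibis chain a handler like
# `case_expression` above builds for a SQL statement such as
#     CASE WHEN t.x > 0 THEN 'pos' WHEN t.x < 0 THEN 'neg' ELSE 'zero' END
import ibis

t = ibis.table([("x", "int64")], name="t")
expr = (
    ibis.case()
    .when(t.x > 0, 'pos')   # one .when() per (condition, value) tuple
    .when(t.x < 0, 'neg')
    .else_('zero')          # a bare value becomes the ELSE branch
    .end()                  # .end() finalizes the builder into a value expression
)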
def create_12_mon_features(joined_df):
    delinq_df = None
    n_months = 12
    for y in range(1, n_months + 1):
        year_dec = (
            ibis.case()
            .when(joined_df["timestamp_month"] < ibis.literal(y), 1)
            .else_(0)
            .end()
        )
        tmp_df = joined_df[
            "loan_id",
            "delinquency_12",
            "upb_12",
            (joined_df["timestamp_year"] - year_dec).name("timestamp_year"),
        ]
        delinquency_12 = (tmp_df["delinquency_12"].max() > 3).cast("int32") + (
            tmp_df["upb_12"].min() == 0
        ).cast("int32")
        tmp_df = tmp_df.groupby(["loan_id", "timestamp_year"]).aggregate(
            delinquency_12.name("delinquency_12")
        )
        tmp_df = tmp_df.mutate(timestamp_month=ibis.literal(y, "int32"))

        if delinq_df is None:
            delinq_df = tmp_df
        else:
            delinq_df = delinq_df.union(tmp_df)

    return delinq_df
def test_searched_case_scalar(client):
    expr = ibis.case().when(True, 1).when(False, 2).end()
    result = client.execute(expr)
    expected = np.int8(1)
    assert result == expected
def search_case(con):
    t = con.table('alltypes')
    return ibis.case().when(t.f > 0, t.d * 2).when(t.c < 0, t.a * 2).end()
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    run_import_queries,
    etl_keys,
    import_mode,
):
    tmp_table_name = "tmp_table"

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }

        # SQL statements preparation for data file import queries
        connect_to_db_sql_template = "\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"]
        )
        import_query_cols_str = "".join(import_query_cols_list)

        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str
        )
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true"
        )
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename
        )

        # data file import by ibis
        columns_types_import_query = ["string", "int64"] + [
            "float64" for _ in range(200)
        ]
        schema_table_import = ibis.Schema(
            names=columns_names, types=columns_types_import_query
        )
        omnisci_server_worker.create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
        )

        table_import_query = omnisci_server_worker.database(database_name).table(
            tmp_table_name
        )
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times_import["t_readcsv_by_ibis"] = round((timer() - t0) * 1000)

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times_import["t_readcsv_by_FSI"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times_import["t_readcsv_by_COPY"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        etl_times.update(etl_times_import)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
            )
            table_import = omnisci_server_worker.database(database_name).table(
                table_name
            )

            t0 = timer()
            table_import.read_csv(filename, header=True, quotechar="", delimiter=",")
            etl_times["t_readcsv"] = round((timer() - t0) * 1000)
        elif import_mode == "pandas":
            # Datafiles import
            columns_types_converted = [
                "float64" if (x.startswith("decimal")) else x for x in columns_types
            ]
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types_converted,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith("gz") else None,
                validation=validation,
            )
            etl_times["t_readcsv"] = round((t_import_pandas + t_import_ibis) * 1000)
        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith("gz"):
                    import gzip

                    unzip_name = "/tmp/santander-fsi.csv"
                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name, unzip_name or filename, schema_table
                )
                etl_times["t_readcsv"] = round((timer() - t0) * 1000)
            finally:
                if filename.endswith("gz"):
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We are making 400 columns and then insert them into original table thus avoiding
    # nested sql requests
    t_etl_start = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name("var_%d_gt1" % i)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    return table_df, etl_times
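# Hedged sketch (hypothetical table): the gt1_cols pattern above in isolation —
# a per-group count computed over a window, with ibis.case() keeping the value
# only for groups that occur more than once and emitting NULL otherwise.
import ibis

t = ibis.table([("var_0", "float64")], name="train")
w = ibis.window(group_by="var_0")
var_0_count = t.var_0.count().over(w).name("var_0_count")
var_0_gt1 = (
    ibis.case()
    .when(var_0_count > 1, t.var_0.cast("float32"))
    .else_(ibis.null())
    .end()
    .name("var_0_gt1")
)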
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    etl_keys,
    import_mode,
):
    import ibis

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    # Create table and import data
    if create_new_table:
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            # Create table and import data for ETL queries
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
            )
            table_import = omnisci_server_worker.database(database_name).table(
                table_name
            )
            t0 = timer()
            table_import.read_csv(filename, header=True, quotechar="", delimiter=",")
            etl_times["t_readcsv"] = round((timer() - t0) * 1000)
        elif import_mode == "pandas":
            # Datafiles import
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith("gz") else None,
                validation=validation,
            )
            etl_times["t_readcsv"] = round((t_import_pandas + t_import_ibis) * 1000)
        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith("gz"):
                    import gzip

                    unzip_name = "/tmp/census-fsi.csv"
                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())
                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name, unzip_name or filename, schema_table
                )
                etl_times["t_readcsv"] = round((timer() - t0) * 1000)
            finally:
                if filename.endswith("gz"):
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    t_etl_start = timer()

    keep_cols = [
        "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ",
        "PERNUM", "SEX", "AGE", "INCTOT", "EDUC", "EDUCD", "EDUC_HEAD",
        "EDUC_POP", "EDUC_MOM", "EDUCD_MOM2", "EDUCD_POP2", "INCTOT_MOM",
        "INCTOT_POP", "INCTOT_MOM2", "INCTOT_POP2", "INCTOT_HEAD", "SEX_HEAD",
    ]
    if import_mode == "pandas" and validation:
        keep_cols.append("id")

    table = table[keep_cols]

    # first, we do all filters and eliminate redundant fillna operations for EDUC and EDUCD
    table = table[table.INCTOT != 9999999]
    table = table[table["EDUC"].notnull()]
    table = table[table["EDUCD"].notnull()]

    table = table.set_column("INCTOT", table["INCTOT"] * table["CPI99"])

    cols = []
    # final fillna and casting for necessary columns
    for column in keep_cols:
        cols.append(
            ibis.case()
            .when(table[column].notnull(), table[column])
            .else_(-1)
            .end()
            .cast("float64")
            .name(column)
        )
    table = table.mutate(cols)

    df = table.execute()
    if import_mode == "pandas" and validation:
        df.index = df["id"].values

    # here we use pandas to split table
    y = df["EDUC"]
    X = df.drop(["EDUC", "CPI99"], axis=1)

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    print("DataFrame shape:", X.shape)

    return df, X, y, etl_times
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    etl_keys,
    import_mode,
    fragments_size,
):
    etl_times = {key: 0.0 for key in etl_keys}

    fragments_size = check_fragments_size(
        fragments_size, count_table=1, import_mode=import_mode
    )

    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            t0 = timer()
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
                fragment_size=fragments_size[0],
            )
            table_import = omnisci_server_worker.database(database_name).table(
                table_name
            )
            etl_times["t_connect"] += timer() - t0

            t0 = timer()
            table_import.read_csv(filename, header=True, quotechar="", delimiter=",")
            etl_times["t_readcsv"] = timer() - t0
        elif import_mode == "pandas":
            # decimal(8, 4) is converted to decimal(9, 6) in order to provide better data conversion
            # accuracy during import from Pandas into OmniSciDB for proper results validation
            columns_types = [
                "decimal(9, 6)" if (x == "decimal(8, 4)") else x for x in columns_types
            ]
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith(".gz") else None,
                use_columns_types_for_pd=False,
            )
            etl_times["t_readcsv"] = t_import_pandas + t_import_ibis
            etl_times["t_connect"] += omnisci_server_worker.get_conn_creation_time()
        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith(".gz"):
                    import gzip

                    unzip_name = get_tmp_filepath("santander-fsi.csv")
                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())
                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name,
                    unzip_name or filename,
                    schema_table,
                    fragment_size=fragments_size[0],
                )
                etl_times["t_readcsv"] = timer() - t0
                etl_times["t_connect"] += omnisci_server_worker.get_conn_creation_time()
            finally:
                if filename.endswith("gz"):
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    t0 = timer()
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)
    etl_times["t_connect"] += timer() - t0

    # group_by/count, merge (join) and filtration queries
    # We are making 400 columns and then insert them into original table thus avoiding
    # nested sql requests
    t_etl_start = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name(col_gt1)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = timer() - t_etl_start
    return table_df, etl_times
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    etl_keys,
):
    import ibis

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    # Create table and import data
    if create_new_table:
        # Datafiles import
        t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
            table_name=table_name,
            data_files_names=filename,
            files_limit=1,
            columns_names=columns_names,
            columns_types=columns_types,
            header=0,
            nrows=None,
            compression_type="gzip",
            validation=validation,
        )
        etl_times["t_readcsv"] = t_import_pandas + t_import_ibis

    # Second connection - this is ibis's ipc connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    t_etl_start = timer()

    keep_cols = [
        "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ",
        "PERNUM", "SEX", "AGE", "INCTOT", "EDUC", "EDUCD", "EDUC_HEAD",
        "EDUC_POP", "EDUC_MOM", "EDUCD_MOM2", "EDUCD_POP2", "INCTOT_MOM",
        "INCTOT_POP", "INCTOT_MOM2", "INCTOT_POP2", "INCTOT_HEAD", "SEX_HEAD",
    ]
    if validation:
        keep_cols.append("id")

    table = table[keep_cols]

    # first, we do all filters and eliminate redundant fillna operations for EDUC and EDUCD
    table = table[table.INCTOT != 9999999]
    table = table[table["EDUC"].notnull()]
    table = table[table["EDUCD"].notnull()]

    table = table.set_column("INCTOT", table["INCTOT"] * table["CPI99"])

    cols = []
    # final fillna and casting for necessary columns
    for column in keep_cols:
        cols.append(
            ibis.case()
            .when(table[column].notnull(), table[column])
            .else_(-1)
            .end()
            .cast("float64")
            .name(column)
        )
    table = table.mutate(cols)

    df = table.execute()

    # here we use pandas to split table
    y = df["EDUC"]
    X = df.drop(["EDUC", "CPI99"], axis=1)

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    print("DataFrame shape:", X.shape)

    return df, X, y, etl_times
def _calculate_difference(field_differences, datatype, validation, is_value_comparison):
    pct_threshold = ibis.literal(validation.threshold)

    if isinstance(datatype, ibis.expr.datatypes.Timestamp):
        source_value = field_differences["differences_source_value"].epoch_seconds()
        target_value = field_differences["differences_target_value"].epoch_seconds()
    elif isinstance(datatype, ibis.expr.datatypes.Float64):
        # Float64 type results from AVG() aggregation
        source_value = field_differences["differences_source_value"].round(digits=4)
        target_value = field_differences["differences_target_value"].round(digits=4)
    elif isinstance(datatype, ibis.expr.datatypes.Decimal):
        source_value = (
            field_differences["differences_source_value"].cast("float64").round(digits=4)
        )
        target_value = (
            field_differences["differences_target_value"].cast("float64").round(digits=4)
        )
    else:
        source_value = field_differences["differences_source_value"]
        target_value = field_differences["differences_target_value"]

    # Does not calculate difference between agg values for row hash due to int64 overflow
    if is_value_comparison:
        difference = pct_difference = ibis.null()
        validation_status = (
            ibis.case()
            .when(
                target_value.isnull() & source_value.isnull(),
                consts.VALIDATION_STATUS_SUCCESS,
            )
            .when(target_value == source_value, consts.VALIDATION_STATUS_SUCCESS)
            .else_(consts.VALIDATION_STATUS_FAIL)
            .end()
        )
    # String data types i.e "None" can be returned for NULL timestamp/datetime aggs
    elif isinstance(datatype, ibis.expr.datatypes.String):
        difference = pct_difference = ibis.null().cast("float64")
        validation_status = (
            ibis.case()
            .when(
                target_value.isnull() & source_value.isnull(),
                consts.VALIDATION_STATUS_SUCCESS,
            )
            .else_(consts.VALIDATION_STATUS_FAIL)
            .end()
        )
    else:
        difference = (target_value - source_value).cast("float64")

        pct_difference_nonzero = (
            ibis.literal(100.0)
            * difference
            / (
                source_value.case()
                .when(ibis.literal(0), target_value)
                .else_(source_value)
                .end()
            ).cast("float64")
        ).cast("float64")

        # Considers case that source and target agg values can both be 0
        pct_difference = (
            ibis.case()
            .when(difference == ibis.literal(0), ibis.literal(0).cast("float64"))
            .else_(pct_difference_nonzero)
            .end()
        )

        th_diff = (pct_difference.abs() - pct_threshold).cast("float64")
        validation_status = (
            ibis.case()
            .when(
                source_value.isnull() & target_value.isnull(),
                consts.VALIDATION_STATUS_SUCCESS,
            )
            .when(th_diff.isnan() | (th_diff > 0.0), consts.VALIDATION_STATUS_FAIL)
            .else_(consts.VALIDATION_STATUS_SUCCESS)
            .end()
        )

    return (
        difference.name("difference"),
        pct_difference.name("pct_difference"),
        pct_threshold.name("pct_threshold"),
        validation_status.name("validation_status"),
    )
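# Hedged, scalar-level illustration (hypothetical helper, not part of the
# validator above): the same percent-difference rule the ibis expressions
# encode, i.e. 100 * (target - source) / source, with a guard that maps a
# zero source to the target so a 0 -> 0 comparison yields 0% difference.
def pct_difference_scalar(source: float, target: float) -> float:
    difference = target - source
    if difference == 0:
        return 0.0
    denominator = target if source == 0 else source
    return 100.0 * difference / denominator

assert pct_difference_scalar(100.0, 110.0) == 10.0
assert pct_difference_scalar(0.0, 0.0) == 0.0     # both zero -> 0%
assert pct_difference_scalar(0.0, 5.0) == 100.0   # zero source divides by target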
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    run_import_queries,
    etl_keys,
):
    tmp_table_name = "tmp_table"

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }

        # SQL statements preparation for data file import queries
        connect_to_db_sql_template = "\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"]
        )
        import_query_cols_str = "".join(import_query_cols_list)

        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str
        )
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true"
        )
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename
        )

        # data file import by ibis
        columns_types_import_query = ["string", "int64"] + [
            "float64" for _ in range(200)
        ]
        schema_table_import = ibis.Schema(
            names=columns_names, types=columns_types_import_query
        )
        omnisci_server_worker.create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
        )

        table_import_query = omnisci_server_worker.database(database_name).table(
            tmp_table_name
        )
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times_import["t_readcsv_by_ibis"] = round((timer() - t0) * 1000)

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times_import["t_readcsv_by_FSI"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times_import["t_readcsv_by_COPY"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        etl_times.update(etl_times_import)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
        )
        table_import = omnisci_server_worker.database(database_name).table(table_name)

        t0 = timer()
        table_import.read_csv(filename, delimiter=",")
        etl_times["t_readcsv"] = round((timer() - t0) * 1000)

    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We are making 400 columns and then insert them into original table thus avoiding
    # nested sql requests
    t_etl_start = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ['var_%s' % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name("var_%d_gt1" % i)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    return table_df, etl_times
def etl_ibis(args, run_import_queries, columns_names, columns_types, validation=False):
    filename = args.file
    database_name = args.name
    table_name = args.table
    delete_old_database = not args.dnd
    create_new_table = not args.dni
    run_import_queries = str_arg_to_bool(run_import_queries)
    validation = str_arg_to_bool(validation)

    tmp_table_name = "tmp_table"

    etl_times = {"t_groupby_merge_where": 0.0, "t_train_test_split": 0.0, "t_etl": 0.0}
    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }
        etl_times.update(etl_times_import)

    omnisci_server = OmnisciServer(
        omnisci_executable=args.omnisci_executable,
        omnisci_port=args.omnisci_port,
        database_name=args.name,
        user=args.user,
        password=args.password,
        debug_timer=True,
        columnar_output=args.server_columnar_output,
        lazy_fetch=args.server_lazy_fetch,
    )
    omnisci_server.launch()

    import ibis
    from server_worker import OmnisciServerWorker

    omnisci_server_worker = OmnisciServerWorker(omnisci_server)
    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    time.sleep(2)
    omnisci_server_worker.connect_to_server()

    if run_import_queries:
        # SQL statements preparation for data file import queries
        connect_to_db_sql_template = "\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"]
        )
        import_query_cols_str = "".join(import_query_cols_list)

        connect_to_db_sql = connect_to_db_sql_template.format(database_name)
        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str
        )
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true"
        )
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename
        )

        # data file import by ibis
        columns_types_import_query = ["string", "int64"] + [
            "float64" for _ in range(200)
        ]
        schema_table_import = ibis.Schema(
            names=columns_names, types=columns_types_import_query
        )
        omnisci_server_worker.get_conn().create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
            fragment_size=args.fragment_size,
        )

        table_import_query = omnisci_server_worker.database(database_name).table(
            tmp_table_name
        )
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times["t_readcsv_by_ibis"] = timer() - t0

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times["t_readcsv_by_FSI"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times["t_readcsv_by_COPY"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.get_conn().create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
            fragment_size=args.fragment_size,
        )

        table_import = omnisci_server_worker.database(database_name).table(table_name)
        table_import.read_csv(filename, delimiter=",")

    if args.server_conn_type == "regular":
        omnisci_server_worker.connect_to_server()
    elif args.server_conn_type == "ipc":
        omnisci_server_worker.ipc_connect_to_server()
    else:
        print("Wrong connection type is specified!")
        sys.exit(0)

    db = omnisci_server_worker.database(database_name)
    table = db.table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We are making 400 columns and then insert them into original table thus avoiding
    # nested sql requests
    t0 = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ['var_%s' % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name("var_%d_gt1" % i)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()
    etl_times["t_groupby_merge_where"] = timer() - t0

    # rows split query
    t0 = timer()
    training_part, validation_part = table_df[:-10000], table_df[-10000:]
    etl_times["t_train_test_split"] = timer() - t0
    etl_times["t_etl"] = (
        etl_times["t_groupby_merge_where"] + etl_times["t_train_test_split"]
    )

    x_train = training_part.drop(['target0'], axis=1)
    y_train = training_part['target0']
    x_valid = validation_part.drop(['target0'], axis=1)
    y_valid = validation_part['target0']

    omnisci_server.terminate()
    omnisci_server = None

    return x_train, y_train, x_valid, y_valid, etl_times