def do_connect(
    self,
    config: Mapping[str, str | Path] | df.ExecutionContext,
) -> None:
    """Create a Datafusion backend for use with Ibis.

    Parameters
    ----------
    config
        Mapping of table names to files.
    """
    if isinstance(config, df.ExecutionContext):
        self._context = config
    else:
        self._context = df.ExecutionContext()

        for name, path in config.items():
            strpath = str(path)
            if strpath.endswith('.csv'):
                self.register_csv(name, path)
            elif strpath.endswith('.parquet'):
                self.register_parquet(name, path)
            else:
                raise ValueError(
                    "Currently the DataFusion backend only supports CSV "
                    "files with the extension .csv and Parquet files with "
                    "the .parquet extension."
                )
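A minimal usage sketch for the do_connect mapping above, assuming the backend is exposed as ibis.datafusion and using hypothetical placeholder file paths; each entry in the mapping is routed to register_csv or register_parquet based on its file extension.

import ibis

# hypothetical local files; the keys become DataFusion table names
con = ibis.datafusion.connect(
    {
        "trips": "trips.parquet",  # registered via register_parquet
        "zones": "zones.csv",      # registered via register_csv
    }
)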
def test_join(self):
    ctx = datafusion.ExecutionContext()

    batch = pyarrow.RecordBatch.from_arrays(
        [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
        names=["a", "b"],
    )
    df = ctx.create_dataframe([[batch]])

    batch = pyarrow.RecordBatch.from_arrays(
        [pyarrow.array([1, 2]), pyarrow.array([8, 10])],
        names=["a", "c"],
    )
    df1 = ctx.create_dataframe([[batch]])

    df = df.join(df1, on="a", how="inner")

    # execute and collect the first (and only) batch
    batch = df.collect()[0]

    if batch.column(0) == pyarrow.array([1, 2]):
        self.assertEqual(batch.column(0), pyarrow.array([1, 2]))
        self.assertEqual(batch.column(1), pyarrow.array([8, 10]))
        self.assertEqual(batch.column(2), pyarrow.array([4, 5]))
    else:
        self.assertEqual(batch.column(0), pyarrow.array([2, 1]))
        self.assertEqual(batch.column(1), pyarrow.array([10, 8]))
        self.assertEqual(batch.column(2), pyarrow.array([5, 4]))
def do_connect(self, config):
    """
    Create a DataFusionClient for use with Ibis

    Parameters
    ----------
    config : DataFusionContext or dict

    Returns
    -------
    DataFusionClient
    """
    if isinstance(config, df.ExecutionContext):
        self._context = config
    else:
        self._context = df.ExecutionContext()

        for name, path in config.items():
            strpath = str(path)
            if strpath.endswith('.csv'):
                self.register_csv(name, path)
            elif strpath.endswith('.parquet'):
                self.register_parquet(name, path)
            else:
                raise ValueError(
                    "Currently the DataFusion backend only supports CSV "
                    "files with the extension .csv and Parquet files with "
                    "the .parquet extension.")
def test_execute(self):
    data = [1, 1, 2, 2, 3, 11, 12]
    ctx = datafusion.ExecutionContext()

    # single column, "a"
    path = write_parquet(
        os.path.join(self.test_dir, "a.parquet"), pyarrow.array(data)
    )
    ctx.register_parquet("t", path)
    self.assertEqual(ctx.tables(), {"t"})

    # count
    result = ctx.sql("SELECT COUNT(a) FROM t").collect()

    expected = pyarrow.array([7], pyarrow.uint64())
    expected = [pyarrow.RecordBatch.from_arrays([expected], ["COUNT(a)"])]
    self.assertEqual(expected, result)

    # where
    expected = pyarrow.array([2], pyarrow.uint64())
    expected = [pyarrow.RecordBatch.from_arrays([expected], ["COUNT(a)"])]
    self.assertEqual(
        expected, ctx.sql("SELECT COUNT(a) FROM t WHERE a > 10").collect()
    )

    # group by
    results = ctx.sql(
        "SELECT CAST(a as int), COUNT(a) FROM t GROUP BY CAST(a as int)"
    ).collect()

    # group by returns batches
    result_keys = []
    result_values = []
    for result in results:
        pydict = result.to_pydict()
        result_keys.extend(pydict["CAST(a AS Int32)"])
        result_values.extend(pydict["COUNT(a)"])

    result_keys, result_values = (
        list(t) for t in zip(*sorted(zip(result_keys, result_values)))
    )

    self.assertEqual(result_keys, [1, 2, 3, 11, 12])
    self.assertEqual(result_values, [2, 2, 1, 1, 1])

    # order by: the two largest values of a are 12 and 11
    result = ctx.sql(
        "SELECT a, CAST(a AS int) FROM t ORDER BY a DESC LIMIT 2"
    ).collect()
    expected_a = pyarrow.array([12, 11])
    expected_cast = pyarrow.array([12, 11], pyarrow.int32())
    expected = [
        pyarrow.RecordBatch.from_arrays(
            [expected_a, expected_cast], ["a", "CAST(a AS Int32)"]
        )
    ]
    # compare the CAST column of the query result against the expected batch
    numpy.testing.assert_equal(result[0].column(1), expected[0].column(1))
def test_register(self):
    ctx = datafusion.ExecutionContext()

    path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data())
    ctx.register_parquet("t", path)

    self.assertEqual(ctx.tables(), {"t"})
def _prepare(self):
    ctx = datafusion.ExecutionContext()

    # create a RecordBatch and a new DataFrame from it
    batch = pyarrow.RecordBatch.from_arrays(
        [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
        names=["a", "b"],
    )
    return ctx.create_dataframe([[batch]])
def _test_data(self, data):
    ctx = datafusion.ExecutionContext()

    # write to disk
    path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data)
    ctx.register_parquet("t", path)

    batches = ctx.sql("SELECT a AS tt FROM t").collect()
    result = batches[0].column(0)

    numpy.testing.assert_equal(data, result)
def _test_udf(self, udf, args, return_type, array, expected):
    ctx = datafusion.ExecutionContext()

    # write to disk
    path = write_parquet(os.path.join(self.test_dir, "a.parquet"), array)
    ctx.register_parquet("t", path)

    ctx.register_udf("udf", udf, args, return_type)

    batches = ctx.sql("SELECT udf(a) AS tt FROM t").collect()
    result = batches[0].column(0)

    self.assertEqual(expected, result)
def make_execution_context(self, tmpdir, store, context):
    import datafusion as daf

    ctx = daf.ExecutionContext()
    register = self.data.get("register", [])
    store = store.root_store()
    path = Path(tmpdir)
    for query in register:
        context.info(f"Register {query}")
        try:
            q = parse(query)
        except Exception:
            context.warning(
                f"Could not parse query '{query}' in parquet_sql recipe {self.recipe_name()}",
                traceback=traceback.format_exc(),
            )
            continue
        if q.is_resource_query():
            key = q.resource_query().path()
            if store.is_dir(key):
                context.info(f"Registering directory {key}")
                for k in store.listdir_keys(key):
                    if not store.is_dir(k) and key_extension(k) == "parquet":
                        (path / key_name(k)).write_bytes(store.get_bytes(k))
                        context.info(
                            f"Registering {key_name_without_extension(k)} from {key}"
                        )
                        ctx.register_parquet(
                            key_name_without_extension(k),
                            str(path / key_name(k)),
                        )
            else:
                (path / key_name(key)).write_bytes(store.get_bytes(key))
                context.info(f"Registering resource {key}")
                ctx.register_parquet(
                    key_name_without_extension(key), str(path / key_name(key))
                )
        else:
            filename = q.filename()
            if filename is None:
                context.warning(
                    f"Skipping '{query}' registering because it is lacking a filename"
                )
                continue
            v = filename.split(".")
            context.info(f"Evaluating query {query}")
            context.evaluate_and_save(
                query, target_directory=str(tmpdir), target_file=filename
            )
            context.info(f"Registering {v[0]} from query {query}")
            ctx.register_parquet(v[0], str(path / filename))
    return ctx
def test_execute(self):
    ctx = datafusion.ExecutionContext()

    # single column, "a"
    path = write_parquet(os.path.join(self.test_dir, 'a.parquet'), data())
    ctx.register_parquet("t", path)
    self.assertEqual(ctx.tables(), {"t"})

    # count
    expected = pyarrow.array([100], pyarrow.uint64())
    expected = [pyarrow.RecordBatch.from_arrays([expected], ['COUNT(a)'])]
    self.assertEqual(expected, ctx.sql("SELECT COUNT(a) FROM t").collect())

    # where
    expected = pyarrow.array([50], pyarrow.uint64())
    expected = [pyarrow.RecordBatch.from_arrays([expected], ['COUNT(a)'])]
    self.assertEqual(
        expected, ctx.sql("SELECT COUNT(a) FROM t WHERE a > 10").collect())

    # group by
    result = ctx.sql(
        "SELECT CAST(a as int), COUNT(a) FROM t GROUP BY CAST(a as int)"
    ).collect()
    expected_cast = pyarrow.array([50, 0, 49], pyarrow.int32())
    expected_count = pyarrow.array([31, 50, 19], pyarrow.uint64())
    expected = [
        pyarrow.RecordBatch.from_arrays([expected_cast, expected_count],
                                        ['CAST(a as Int32)', 'COUNT(a)'])
    ]
    numpy.testing.assert_equal(expected, result)

    # order by
    result = ctx.sql(
        "SELECT a, CAST(a AS int) FROM t ORDER BY a DESC LIMIT 2").collect()
    expected_a = pyarrow.array([50.0219, 50.0152], pyarrow.float64())
    expected_cast = pyarrow.array([50, 50], pyarrow.int32())
    expected = [
        pyarrow.RecordBatch.from_arrays([expected_a, expected_cast],
                                        ['a', 'CAST(a as Int32)'])
    ]
    # compare the CAST column of the query result against the expected batch
    numpy.testing.assert_equal(result[0].column(1), expected[0].column(1))
def test_join(self):
    ctx = datafusion.ExecutionContext()

    batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
        names=["a", "b"],
    )
    df = ctx.create_dataframe([[batch]])

    batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2]), pa.array([8, 10])],
        names=["a", "c"],
    )
    df1 = ctx.create_dataframe([[batch]])

    df = df.join(df1, on="a", how="inner")
    df = df.sort([f.col("a").sort(ascending=True)])
    table = pa.Table.from_batches(df.collect())

    expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]}
    self.assertEqual(table.to_pydict(), expected)
def test_cast(self):
    """
    Verify that we can cast
    """
    ctx = datafusion.ExecutionContext()

    path = write_parquet(os.path.join(self.test_dir, 'a.parquet'), data())
    ctx.register_parquet("t", path)

    valid_types = [
        'smallint',
        'int',
        'bigint',
        'float(32)',
        'float(64)',
        'float',
    ]

    select = ', '.join(
        [f'CAST(9 AS {t}) AS A{i}' for i, t in enumerate(valid_types)])

    # can execute, which implies that we can cast
    ctx.sql(f'SELECT {select} FROM t').collect()
def test_cast(self):
    """
    Verify that we can cast
    """
    ctx = datafusion.ExecutionContext()

    path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data())
    ctx.register_parquet("t", path)

    valid_types = [
        "smallint",
        "int",
        "bigint",
        "float(32)",
        "float(64)",
        "float",
    ]

    select = ", ".join(
        [f"CAST(9 AS {t}) AS A{i}" for i, t in enumerate(valid_types)]
    )

    # can execute, which implies that we can cast
    ctx.sql(f"SELECT {select} FROM t").collect()
    a = np.linspace(0, 2 * np.pi, n)
    segment = np.array(a * 10 / (2 * np.pi), dtype=int)
    return pd.DataFrame(
        dict(
            a=a,
            x2=np.sin(2 * a),
            y2=np.cos(2 * a),
            segment=segment,
            label=[f"{i+1}/{n}" for i in range(n)],
        )
    )


evaluate_and_save("harmonic/harmonic.parquet", ".")
evaluate_and_save("harmonic2/harmonic2.parquet", ".")

ctx = daf.ExecutionContext()
ctx.register_parquet("a", "harmonic.parquet")
ctx.register_parquet("b", "harmonic2.parquet")
df = ctx.sql("""
SELECT * FROM a WHERE a > 1
""")
print(df.show())

table = pyarrow.Table.from_batches(df.collect())
print("To Pandas")
print(table.to_pandas())

pq.write_table(table, 'result.parquet')

df = pd.read_parquet("result.parquet")
print(df)
def execution_context():
    ctx = daf.ExecutionContext()
    ctx.register_parquet("a", str(path))
    return ctx
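A hedged pytest-style sketch of how the execution_context helper above might be consumed, assuming it is registered as a pytest fixture and that the parquet file behind `path` is queryable as table "a"; the query itself is illustrative only.

def test_table_a_is_registered(execution_context):
    # "a" was registered in the fixture; LIMIT keeps the scan cheap
    batches = execution_context.sql("SELECT * FROM a LIMIT 1").collect()
    assert isinstance(batches, list)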