def df():
    """Fixture: a two-column DataFrame with columns ``a`` and ``b``."""
    context = ExecutionContext()
    # Build a single RecordBatch and wrap it in a new DataFrame.
    record_batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2, 3]), pa.array([4, 4, 6])],
        names=["a", "b"],
    )
    return context.create_dataframe([[record_batch]])
def do_connect(
    self,
    config: Mapping[str, str | Path] | SessionContext,
) -> None:
    """Create a Datafusion backend for use with Ibis.

    Parameters
    ----------
    config
        Mapping of table names to files, or an existing session.

    Examples
    --------
    >>> import ibis
    >>> config = {"t": "path/to/file.parquet", "s": "path/to/file.csv"}
    >>> ibis.datafusion.connect(config)
    """
    # An existing session is adopted as-is; otherwise build a fresh one
    # and register each file by extension.
    if isinstance(config, SessionContext):
        self._context = config
        return

    self._context = SessionContext()
    for table_name, file_path in config.items():
        path_str = str(file_path)
        if path_str.endswith('.csv'):
            self.register_csv(table_name, file_path)
        elif path_str.endswith('.parquet'):
            self.register_parquet(table_name, file_path)
        else:
            raise ValueError(
                "Currently the DataFusion backend only supports CSV "
                "files with the extension .csv and Parquet files with "
                "the .parquet extension.")
def test_join():
    """Inner-join two DataFrames on column ``a`` and check the result."""
    context = ExecutionContext()

    left_batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
        names=["a", "b"],
    )
    left = context.create_dataframe([[left_batch]])

    right_batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2]), pa.array([8, 10])],
        names=["a", "c"],
    )
    right = context.create_dataframe([[right_batch]])

    joined = left.join(right, join_keys=(["a"], ["a"]), how="inner")
    # Sort so row order is deterministic before comparing.
    ordered = joined.sort([f.col("a").sort(ascending=True)])

    result = pa.Table.from_batches(ordered.collect())
    assert result.to_pydict() == {"a": [1, 2], "c": [8, 10], "b": [4, 5]}
def df():
    """Fixture: a one-column DataFrame of float values named ``value``."""
    context = ExecutionContext()
    # A single RecordBatch with one "value" column backs the DataFrame.
    values = pa.array([0.1, -0.7, 0.55])
    record_batch = pa.RecordBatch.from_arrays([values], names=["value"])
    return context.create_dataframe([[record_batch]])
def ctx():
    """Fixture: a fresh, empty ExecutionContext."""
    context = ExecutionContext()
    return context
class Backend(BaseBackend):
    """Ibis backend that compiles and executes expressions with Apache DataFusion."""

    name = 'datafusion'
    builder = None

    @property
    def version(self):
        """Return the installed ``datafusion`` package version string."""
        try:
            import importlib.metadata as importlib_metadata
        except ImportError:
            # TODO: remove this when Python 3.9 support is dropped
            import importlib_metadata
        return importlib_metadata.version("datafusion")

    def do_connect(
        self,
        config: Mapping[str, str | Path] | SessionContext,
    ) -> None:
        """Create a Datafusion backend for use with Ibis.

        Parameters
        ----------
        config
            Mapping of table names to files, or an existing session.

        Examples
        --------
        >>> import ibis
        >>> config = {"t": "path/to/file.parquet", "s": "path/to/file.csv"}
        >>> ibis.datafusion.connect(config)
        """
        if isinstance(config, SessionContext):
            # Adopt an existing session unchanged.
            self._context = config
        else:
            self._context = SessionContext()
            # Register each file with the session, dispatching on extension.
            for name, path in config.items():
                strpath = str(path)
                if strpath.endswith('.csv'):
                    self.register_csv(name, path)
                elif strpath.endswith('.parquet'):
                    self.register_parquet(name, path)
                else:
                    raise ValueError(
                        "Currently the DataFusion backend only supports CSV "
                        "files with the extension .csv and Parquet files with "
                        "the .parquet extension.")

    def current_database(self) -> str:
        """Not supported by the DataFusion backend.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError()

    def list_databases(self, like: str | None = None) -> list[str]:
        """Not supported by the DataFusion backend.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError()

    def list_tables(
        self,
        like: str | None = None,
        database: str | None = None,
    ) -> list[str]:
        """List the available tables.

        Parameters
        ----------
        like
            Optional regular expression; only table names it matches are
            returned.
        database
            Unused by this backend; accepted for API compatibility.

        Returns
        -------
        list[str]
            Table names registered with the session.
        """
        tables = list(self._context.tables())
        if like is not None:
            pattern = re.compile(like)
            return list(filter(lambda t: pattern.findall(t), tables))
        return tables

    def table(
        self,
        name: str,
        schema: sch.Schema | None = None,
    ) -> ir.Table:
        """Get an ibis expression representing a DataFusion table.

        Parameters
        ----------
        name
            The name of the table to retrieve
        schema
            An optional schema for the table

        Returns
        -------
        Table
            A table expression
        """
        catalog = self._context.catalog()
        database = catalog.database('public')
        table = database.table(name)
        # The schema is always inferred from the DataFusion table; the
        # `schema` argument is accepted for interface compatibility.
        schema = sch.infer(table.schema)
        return self.table_class(name, schema, self).to_expr()

    def register_csv(
        self,
        name: str,
        path: str | Path,
        schema: sch.Schema | None = None,
    ) -> None:
        """Register a CSV file with `name` located at `path`.

        Parameters
        ----------
        name
            The name of the table
        path
            The path to the CSV file
        schema
            An optional schema
        """
        self._context.register_csv(name, str(path), schema=schema)

    def register_parquet(
        self,
        name: str,
        path: str | Path,
    ) -> None:
        """Register a parquet file with `name` located at `path`.

        Parameters
        ----------
        name
            The name of the table
        path
            The path to the Parquet file
        """
        self._context.register_parquet(name, str(path))

    def execute(
        self,
        expr: ir.Expr,
        params: Mapping[ir.Expr, object] | None = None,
        limit: str = 'default',
        **kwargs: Any,
    ):
        """Compile and execute `expr`, returning pandas/Python objects.

        Tables come back as ``pandas.DataFrame``, columns as
        ``pandas.Series``, and scalars as plain Python values.
        """
        if isinstance(expr, ir.Table):
            frame = self.compile(expr, params, **kwargs)
            table = _to_pyarrow_table(frame)
            return table.to_pandas()
        elif isinstance(expr, ir.Column):
            # expression must be named for the projection
            expr = expr.name('tmp').to_projection()
            frame = self.compile(expr, params, **kwargs)
            table = _to_pyarrow_table(frame)
            return table['tmp'].to_pandas()
        elif isinstance(expr, ir.Scalar):
            if expr.op().root_tables():
                # there are associated datafusion tables so convert the expr
                # to a selection which we can directly convert to a datafusion
                # plan
                expr = expr.name('tmp').to_projection()
                frame = self.compile(expr, params, **kwargs)
            else:
                # doesn't have any tables associated so create a plan from a
                # dummy datafusion table
                compiled = self.compile(expr, params, **kwargs)
                frame = self._context.empty_table().select(compiled)
            table = _to_pyarrow_table(frame)
            return table[0][0].as_py()
        else:
            raise com.IbisError(
                f"Cannot execute expression of type: {type(expr)}")

    def compile(
        self,
        expr: ir.Expr,
        params: Mapping[ir.Expr, object] | None = None,
        **kwargs: Any,
    ):
        """Translate `expr` into a DataFusion plan; `params` is unused."""
        return translate(expr)

    @classmethod
    @lru_cache
    def _get_operations(cls):
        """Return the frozen set of value operations the compiler supports."""
        from ibis.backends.datafusion.compiler import translate

        return frozenset(op for op in translate.registry
                         if issubclass(op, ops.Value))

    @classmethod
    def has_operation(cls, operation: type[ops.Value]) -> bool:
        """Whether `operation` (or a subclass match) is supported."""
        op_classes = cls._get_operations()
        return operation in op_classes or any(
            issubclass(operation, op_impl) for op_impl in op_classes)