Example #1
    def do_connect(
        self,
        config: Mapping[str, str | Path] | df.ExecutionContext,
    ) -> None:
        """Create a Datafusion backend for use with Ibis.

        Parameters
        ----------
        config
            Mapping of table names to file paths, or an existing DataFusion
            ``ExecutionContext`` to use directly.
        """
        if isinstance(config, df.ExecutionContext):
            # reuse the provided context; there are no files to register
            self._context = config
            return

        self._context = df.ExecutionContext()

        for name, path in config.items():
            strpath = str(path)
            if strpath.endswith('.csv'):
                self.register_csv(name, path)
            elif strpath.endswith('.parquet'):
                self.register_parquet(name, path)
            else:
                raise ValueError(
                    "Currently the DataFusion backend only supports CSV "
                    "files with the extension .csv and Parquet files with "
                    "the .parquet extension."
                )
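The method above registers each file by extension, so a caller only hands it a name-to-path mapping (or an already-configured ExecutionContext). A minimal usage sketch, assuming the backend is reachable as ibis.datafusion and that the referenced files exist; both paths below are hypothetical:

    import ibis

    # hypothetical file paths: the .parquet entry goes through register_parquet,
    # the .csv entry goes through register_csv
    conn = ibis.datafusion.connect(
        {
            "orders": "data/orders.parquet",
            "customers": "data/customers.csv",
        }
    )
    orders = conn.table("orders")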
Example #2
    def test_join(self):
        ctx = datafusion.ExecutionContext()

        batch = pyarrow.RecordBatch.from_arrays(
            [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
            names=["a", "b"],
        )
        df = ctx.create_dataframe([[batch]])

        batch = pyarrow.RecordBatch.from_arrays(
            [pyarrow.array([1, 2]), pyarrow.array([8, 10])],
            names=["a", "c"],
        )
        df1 = ctx.create_dataframe([[batch]])

        df = df.join(df1, on="a", how="inner")

        # execute and collect the first (and only) batch
        batch = df.collect()[0]

        # the join result order is not guaranteed, so accept either row ordering
        if batch.column(0) == pyarrow.array([1, 2]):
            self.assertEqual(batch.column(0), pyarrow.array([1, 2]))
            self.assertEqual(batch.column(1), pyarrow.array([8, 10]))
            self.assertEqual(batch.column(2), pyarrow.array([4, 5]))
        else:
            self.assertEqual(batch.column(0), pyarrow.array([2, 1]))
            self.assertEqual(batch.column(1), pyarrow.array([10, 8]))
            self.assertEqual(batch.column(2), pyarrow.array([5, 4]))
Example #3
    def do_connect(self, config):
        """
        Create a DataFusionClient for use with Ibis

        Parameters
        ----------
        config : DataFusionContext or dict

        Returns
        -------
        DataFusionClient
        """
        if isinstance(config, df.ExecutionContext):
            # reuse the provided context; there is nothing to register
            self._context = config
            return

        self._context = df.ExecutionContext()

        for name, path in config.items():
            strpath = str(path)
            if strpath.endswith('.csv'):
                self.register_csv(name, path)
            elif strpath.endswith('.parquet'):
                self.register_parquet(name, path)
            else:
                raise ValueError(
                    "Currently the DataFusion backend only supports CSV "
                    "files with the extension .csv and Parquet files with "
                    "the .parquet extension.")
Example #4
    def test_execute(self):
        data = [1, 1, 2, 2, 3, 11, 12]

        ctx = datafusion.ExecutionContext()

        # single column, "a"
        path = write_parquet(
            os.path.join(self.test_dir, "a.parquet"), pyarrow.array(data)
        )
        ctx.register_parquet("t", path)

        self.assertEqual(ctx.tables(), {"t"})

        # count
        result = ctx.sql("SELECT COUNT(a) FROM t").collect()

        expected = pyarrow.array([7], pyarrow.uint64())
        expected = [pyarrow.RecordBatch.from_arrays([expected], ["COUNT(a)"])]
        self.assertEqual(expected, result)

        # where
        expected = pyarrow.array([2], pyarrow.uint64())
        expected = [pyarrow.RecordBatch.from_arrays([expected], ["COUNT(a)"])]
        self.assertEqual(
            expected, ctx.sql("SELECT COUNT(a) FROM t WHERE a > 10").collect()
        )

        # group by
        results = ctx.sql(
            "SELECT CAST(a as int), COUNT(a) FROM t GROUP BY CAST(a as int)"
        ).collect()

        # group by returns batches
        result_keys = []
        result_values = []
        for result in results:
            pydict = result.to_pydict()
            result_keys.extend(pydict["CAST(a AS Int32)"])
            result_values.extend(pydict["COUNT(a)"])

        result_keys, result_values = (
            list(t) for t in zip(*sorted(zip(result_keys, result_values)))
        )

        self.assertEqual(result_keys, [1, 2, 3, 11, 12])
        self.assertEqual(result_values, [2, 2, 1, 1, 1])

        # order by
        result = ctx.sql(
            "SELECT a, CAST(a AS int) FROM t ORDER BY a DESC LIMIT 2"
        ).collect()
        # data is [1, 1, 2, 2, 3, 11, 12], so the two largest values are 12 and 11
        expected_a = pyarrow.array([12, 11])
        expected_cast = pyarrow.array([12, 11], pyarrow.int32())
        expected = [
            pyarrow.RecordBatch.from_arrays(
                [expected_a, expected_cast], ["a", "CAST(a AS Int32)"]
            )
        ]
        numpy.testing.assert_equal(result[0].column(0), expected[0].column(0))
        numpy.testing.assert_equal(result[0].column(1), expected[0].column(1))
Example #5
    def test_register(self):
        ctx = datafusion.ExecutionContext()

        path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data())

        ctx.register_parquet("t", path)

        self.assertEqual(ctx.tables(), {"t"})
Example #6
    def _prepare(self):
        ctx = datafusion.ExecutionContext()

        # create a RecordBatch and a new DataFrame from it
        batch = pyarrow.RecordBatch.from_arrays(
            [pyarrow.array([1, 2, 3]), pyarrow.array([4, 5, 6])],
            names=["a", "b"],
        )
        return ctx.create_dataframe([[batch]])
Example #7
    def _test_data(self, data):
        ctx = datafusion.ExecutionContext()

        # write to disk
        path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data)
        ctx.register_parquet("t", path)

        batches = ctx.sql("SELECT a AS tt FROM t").collect()

        result = batches[0].column(0)

        numpy.testing.assert_equal(data, result)
Example #8
    def _test_udf(self, udf, args, return_type, array, expected):
        ctx = datafusion.ExecutionContext()

        # write to disk
        path = write_parquet(os.path.join(self.test_dir, "a.parquet"), array)
        ctx.register_parquet("t", path)

        ctx.register_udf("udf", udf, args, return_type)

        batches = ctx.sql("SELECT udf(a) AS tt FROM t").collect()

        result = batches[0].column(0)

        self.assertEqual(expected, result)
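The helper above wires a Python UDF into the SQL engine and checks the projected column. A hypothetical invocation, assuming (as in older datafusion releases) that the UDF receives and returns pyarrow arrays and that write_parquet stores the array under a column named a:

    def test_udf_is_null(self):
        # null-checking UDF: takes a pyarrow array, returns a boolean array
        self._test_udf(
            lambda arr: arr.is_null(),
            [pyarrow.float64()],   # argument types
            pyarrow.bool_(),       # return type
            pyarrow.array([-1.2, None, 1.2]),
            pyarrow.array([False, True, False]),
        )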
Example #9
    def make_execution_context(self, tmpdir, store, context):
        import datafusion as daf
        ctx = daf.ExecutionContext()
        register = self.data.get("register", [])
        store = store.root_store()

        path = Path(tmpdir)
        for query in register:
            context.info(f"Register {query}")
            try:
                q = parse(query)
            except Exception:
                context.warning(
                    f"Could not parse query '{query}' in parquet_sql recipe {self.recipe_name()}",
                    traceback=traceback.format_exc())
                # without a parsed query there is nothing to register
                continue
            if q.is_resource_query():
                key = q.resource_query().path()
                if store.is_dir(key):
                    context.info(f"Registering directory {key}")
                    for k in store.listdir_keys(key):
                        if not store.is_dir(k) and key_extension(k) == "parquet":
                            (path / key_name(k)).write_bytes(
                                store.get_bytes(k))
                            context.info(
                                f"Registering {key_name_without_extension(k)} from {key}"
                            )
                            ctx.register_parquet(key_name_without_extension(k),
                                                 str(path / key_name(k)))
                else:
                    (path / key_name(key)).write_bytes(store.get_bytes(key))
                    context.info(f"Registering resource {key}")
                    ctx.register_parquet(key_name_without_extension(key),
                                         str(path / key_name(key)))
            else:
                filename = q.filename()
                if filename is None:
                    context.warning(
                        f"Skipping '{query}' registering because it is lacking a filename"
                    )
                    continue
                v = filename.split(".")
                context.info(f"Evaluating query {query}")
                context.evaluate_and_save(query,
                                          target_directory=str(tmpdir),
                                          target_file=filename)
                context.info(f"Registering {v[0]} from query {query}")
                ctx.register_parquet(v[0], str(path / filename))
        return ctx
Example #10
    def test_execute(self):
        ctx = datafusion.ExecutionContext()

        # single column, "a"
        path = write_parquet(os.path.join(self.test_dir, 'a.parquet'), data())
        ctx.register_parquet("t", path)

        self.assertEqual(ctx.tables(), {"t"})

        # count
        expected = pyarrow.array([100], pyarrow.uint64())
        expected = [pyarrow.RecordBatch.from_arrays([expected], ['COUNT(a)'])]
        self.assertEqual(expected, ctx.sql("SELECT COUNT(a) FROM t").collect())

        # where
        expected = pyarrow.array([50], pyarrow.uint64())
        expected = [pyarrow.RecordBatch.from_arrays([expected], ['COUNT(a)'])]
        self.assertEqual(
            expected,
            ctx.sql("SELECT COUNT(a) FROM t WHERE a > 10").collect())

        # group by
        result = ctx.sql(
            "SELECT CAST(a as int), COUNT(a) FROM t GROUP BY CAST(a as int)"
        ).collect()

        expected_cast = pyarrow.array([50, 0, 49], pyarrow.int32())
        expected_count = pyarrow.array([31, 50, 19], pyarrow.uint64())
        expected = [
            pyarrow.RecordBatch.from_arrays([expected_cast, expected_count],
                                            ['CAST(a as Int32)', 'COUNT(a)'])
        ]
        numpy.testing.assert_equal(expected, result)

        # order by
        result = ctx.sql(
            "SELECT a, CAST(a AS int) FROM t ORDER BY a DESC LIMIT 2").collect(
            )
        expected_a = pyarrow.array([50.0219, 50.0152], pyarrow.float64())
        expected_cast = pyarrow.array([50, 50], pyarrow.int32())
        expected = [
            pyarrow.RecordBatch.from_arrays([expected_a, expected_cast],
                                            ['a', 'CAST(a as Int32)'])
        ]
        # compare only the integer CAST column; the float values come from the
        # random data() fixture and are not compared exactly
        numpy.testing.assert_equal(result[0].column(1),
                                   expected[0].column(1))
Example #11
    def test_join(self):
        ctx = datafusion.ExecutionContext()

        batch = pa.RecordBatch.from_arrays(
            [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
            names=["a", "b"],
        )
        df = ctx.create_dataframe([[batch]])

        batch = pa.RecordBatch.from_arrays(
            [pa.array([1, 2]), pa.array([8, 10])],
            names=["a", "c"],
        )
        df1 = ctx.create_dataframe([[batch]])

        df = df.join(df1, on="a", how="inner")
        df = df.sort([f.col("a").sort(ascending=True)])
        table = pa.Table.from_batches(df.collect())

        expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]}
        self.assertEqual(table.to_pydict(), expected)
Example #12
    def test_cast(self):
        """
        Verify that we can cast
        """
        ctx = datafusion.ExecutionContext()

        path = write_parquet(os.path.join(self.test_dir, 'a.parquet'), data())
        ctx.register_parquet("t", path)

        valid_types = [
            'smallint',
            'int',
            'bigint',
            'float(32)',
            'float(64)',
            'float',
        ]

        select = ', '.join(
            [f'CAST(9 AS {t}) AS A{i}' for i, t in enumerate(valid_types)])

        # can execute, which implies that we can cast
        ctx.sql(f'SELECT {select} FROM t').collect()
Example #13
    def test_cast(self):
        """
        Verify that we can cast
        """
        ctx = datafusion.ExecutionContext()

        path = write_parquet(os.path.join(self.test_dir, "a.parquet"), data())
        ctx.register_parquet("t", path)

        valid_types = [
            "smallint",
            "int",
            "bigint",
            "float(32)",
            "float(64)",
            "float",
        ]

        select = ", ".join(
            [f"CAST(9 AS {t}) AS A{i}" for i, t in enumerate(valid_types)]
        )

        # can execute, which implies that we can cast
        ctx.sql(f"SELECT {select} FROM t").collect()
Example #14
    a = np.linspace(0,2*np.pi,n)
    segment = np.array(a*10/(2*np.pi),dtype=int)
    return pd.DataFrame(
        dict(
            a=a,
            x2=np.sin(2*a),
            y2=np.cos(2*a),
            segment=segment,
            label=[f"{i+1}/{n}" for i in range(n)]
        )
    )

evaluate_and_save("harmonic/harmonic.parquet",".")
evaluate_and_save("harmonic2/harmonic2.parquet",".")

ctx = daf.ExecutionContext()
ctx.register_parquet("a","harmonic.parquet")
ctx.register_parquet("b","harmonic2.parquet")
df = ctx.sql("""
SELECT * FROM a WHERE a > 1
""")
df.show()  # show() already prints the result batches, so no print() is needed

table = pyarrow.Table.from_batches(df.collect())
print("To Pandas")
print(table.to_pandas())

pq.write_table(table, 'result.parquet')

df = pd.read_parquet("result.parquet")
print(df)
Example #15
    def execution_context():
        ctx = daf.ExecutionContext()
        ctx.register_parquet("a", str(path))
        return ctx