def _select_top(self, df: DataFrame, top_n: int):
    if top_n > 0:
        if len(self.partition_spec.partition_by) > 0:
            # partitioned case: rank rows within each partition with ROW_NUMBER()
            # and keep only the first top_n rows of each partition
            p_keys = ", ".join(self.partition_spec.partition_by)
            if len(self.partition_spec.presort) > 0:
                sort_expr = f"ORDER BY {self.partition_spec.presort_expr}"
            else:
                sort_expr = ""
            cols = ", ".join(df.schema.names)
            sql = """
            SELECT {cols} FROM (
                SELECT *, ROW_NUMBER() OVER(PARTITION BY {p_keys} {sort_expr})
                    AS __top_row_number__
                FROM __plot_df__)
            WHERE __top_row_number__ <= {top_n}
            """.format(cols=cols, p_keys=p_keys, sort_expr=sort_expr, top_n=top_n)
            df = self.execution_engine.default_sql_engine.select(
                DataFrames(__plot_df__=df), sql
            )
        else:
            # non-partitioned case: optionally sort, then take the first top_n rows
            order_expr = ""
            if "order_by" in self.params:
                order_by = parse_presort_exp(
                    self.params.get_or_throw("order_by", object)
                )
                if len(order_by) > 0:
                    order_expr = "ORDER BY " + ", ".join(
                        k + " " + ("ASC" if v else "DESC")
                        for k, v in order_by.items()
                    )
            sql = """
            SELECT * FROM __plot_df__ {order_expr} LIMIT {top_n}
            """.format(order_expr=order_expr, top_n=top_n)
            df = self.execution_engine.default_sql_engine.select(
                DataFrames(__plot_df__=df), sql
            )
    return df
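# --- A minimal, self-contained sketch (not part of the code above) of the
# ROW_NUMBER() pattern the partitioned branch generates. The sample data, the
# reuse of the __plot_df__ view name, and the DESC presort are illustrative
# assumptions, run directly against DuckDB rather than through the SQL engine.
import duckdb
import pandas as pd

pdf = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]})
con = duckdb.connect()
con.register("__plot_df__", pdf)
top_sql = """
SELECT a, b FROM (
    SELECT *, ROW_NUMBER() OVER(PARTITION BY a ORDER BY b DESC)
        AS __top_row_number__
    FROM __plot_df__)
WHERE __top_row_number__ <= 1
"""
print(con.execute(top_sql).fetch_df())  # one row per value of a: the largest b
con.close()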
def test_comap_with_key(self):
    e = self.engine
    a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
    b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
    c = e.to_df([[6, 1]], "c:int,a:int")
    z1 = e.persist(e.zip(a, b, df1_name="x", df2_name="y"))
    z2 = e.persist(e.zip_all(DataFrames(x=a, y=b, z=b)))
    z3 = e.persist(
        e.zip_all(DataFrames(z=c), partition_spec=PartitionSpec(by=["a"]))
    )

    def comap(cursor, dfs):
        assert dfs.has_key
        v = ",".join([k + str(v.count()) for k, v in dfs.items()])
        keys = cursor.key_value_array
        # if len(keys) == 0:
        #     return ArrayDataFrame([[v]], "v:str")
        return ArrayDataFrame([keys + [v]], cursor.key_schema + "v:str")

    def on_init(partition_no, dfs):
        assert dfs.has_key
        assert partition_no >= 0
        assert len(dfs) > 0

    res = e.comap(
        z1,
        comap,
        "a:int,v:str",
        PartitionSpec(),
        metadata=dict(a=1),
        on_init=on_init,
    )
    df_eq(res, [[1, "x2,y1"]], "a:int,v:str", metadata=dict(a=1), throw=True)

    res = e.comap(
        z2,
        comap,
        "a:int,v:str",
        PartitionSpec(),
        metadata=dict(a=1),
        on_init=on_init,
    )
    df_eq(res, [[1, "x2,y1,z1"]], "a:int,v:str", metadata=dict(a=1), throw=True)

    res = e.comap(
        z3,
        comap,
        "a:int,v:str",
        PartitionSpec(),
        metadata=dict(a=1),
        on_init=on_init,
    )
    df_eq(res, [[1, "z1"]], "a:int,v:str", metadata=dict(a=1), throw=True)
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    conn = duckdb.connect()
    try:
        for k, v in dfs.items():
            conn.register_arrow(k, v.as_arrow())
        result = conn.execute(statement).arrow()
        # see this issue to understand why the pandas output is used:
        # https://github.com/duckdb/duckdb/issues/2446
        # TODO: switch to ArrowDataFrame when duckdb 0.3.1 is released
        return PandasDataFrame(result.to_pandas())
    finally:
        conn.close()
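# --- A minimal standalone sketch (not part of the engine above) of the same
# register-then-query flow against a plain DuckDB connection; the table name
# and data are illustrative, and plain register() is used here instead of the
# engine's register_arrow() call.
import duckdb
import pyarrow as pa

tbl = pa.table({"a": [1, 2, 3], "b": [10, 20, 30]})
con = duckdb.connect()
con.register("t", tbl)  # register() also accepts Arrow tables
out = con.execute("SELECT a, b FROM t WHERE b > 10").arrow()
print(out.to_pandas())  # results pulled back through pandas, as above
con.close()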
def select(
    self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], ir.TableExpr]
) -> DataFrame:  # pragma: no cover
    pdfs = {k: v.as_pandas() for k, v in dfs.items()}
    be = _BackendWrapper().connect(pdfs)
    be.set_schemas(dfs)
    expr = ibis_func(be)
    schema = to_schema(expr.schema())
    result = expr.execute()
    assert_or_throw(
        isinstance(result, pd.DataFrame), "result must be a pandas DataFrame"
    )
    return PandasDataFrame(result, schema=schema)
def select(
    self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], ir.TableExpr]
) -> DataFrame:
    for k, v in dfs.items():
        self.execution_engine.register(v, k)  # type: ignore
    con = ibis.pyspark.connect(
        self.execution_engine.spark_session  # type: ignore
    )
    expr = ibis_func(con)
    schema = to_schema(expr.schema())
    result = expr.compile()
    assert_or_throw(
        isinstance(result, PySparkDataFrame),
        lambda: ValueError(f"result must be a PySpark DataFrame ({type(result)})"),
    )
    return SparkDataFrame(result, schema=schema)
def test_zip_all(self):
    e = self.engine
    a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
    z = e.persist(e.zip_all(DataFrames(a)))
    assert 1 == z.count()
    assert z.metadata.get("serialized", False)
    assert not z.metadata.get("serialized_has_name", False)
    z = e.persist(e.zip_all(DataFrames(x=a)))
    assert 1 == z.count()
    assert z.metadata.get("serialized", False)
    assert z.metadata.get("serialized_has_name", False)
    z = e.persist(
        e.zip_all(DataFrames(x=a), partition_spec=PartitionSpec(by=["a"]))
    )
    assert 2 == z.count()
    assert z.metadata.get("serialized", False)
    assert z.metadata.get("serialized_has_name", False)

    b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
    c = e.to_df([[6, 1], [2, 7]], "d:int,a:int")
    z = e.persist(e.zip_all(DataFrames(a, b, c)))
    assert 1 == z.count()
    assert not z.metadata.get("serialized_has_name", False)
    z = e.persist(e.zip_all(DataFrames(x=a, y=b, z=c)))
    assert 1 == z.count()
    assert z.metadata.get("serialized_has_name", False)

    z = e.persist(e.zip_all(DataFrames(b, b)))
    assert 2 == z.count()
    assert not z.metadata.get("serialized_has_name", False)
    assert ["a", "c"] in z.schema
    z = e.persist(e.zip_all(DataFrames(x=b, y=b)))
    assert 2 == z.count()
    assert z.metadata.get("serialized_has_name", False)
    assert ["a", "c"] in z.schema
    z = e.persist(
        e.zip_all(DataFrames(b, b), partition_spec=PartitionSpec(by=["a"]))
    )
    assert 2 == z.count()
    assert not z.metadata.get("serialized_has_name", False)
    assert "c" not in z.schema
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    pd_dfs = {
        k: self.execution_engine.to_df(v).as_pandas() for k, v in dfs.items()
    }
    df = run_sql_on_pandas(statement, pd_dfs)
    return PandasDataFrame(df)
def set_schemas(self, dfs: DataFrames) -> None:
    self._schemas = {k: to_ibis_schema(v.schema) for k, v in dfs.items()}