Code example #1
 def _select_top(self, df: DataFrame, top_n: int):
     if top_n > 0:
         if len(self.partition_spec.partition_by) > 0:
             # keyed case: keep the top rows of each partition using a
             # ROW_NUMBER() window, optionally presorted
             p_keys = ", ".join(self.partition_spec.partition_by)
             if len(self.partition_spec.presort) > 0:
                 sort_expr = f"ORDER BY {self.partition_spec.presort_expr}"
             else:
                 sort_expr = ""
             cols = ", ".join(df.schema.names)
             sql = """
             SELECT {cols} FROM (
                 SELECT *, ROW_NUMBER() OVER(PARTITION BY {p_keys} {sort_expr})
                                             AS __top_row_number__
                 FROM __plot_df__) WHERE __top_row_number__ <= {top_n}
             """.format(cols=cols,
                        p_keys=p_keys,
                        sort_expr=sort_expr,
                        top_n=top_n)
             df = self.execution_engine.default_sql_engine.select(
                 DataFrames(__plot_df__=df), sql)
         else:
             # unkeyed case: a plain ORDER BY ... LIMIT over the whole frame
             order_expr = ""
             if "order_by" in self.params:
                 order_by = parse_presort_exp(
                     self.params.get_or_throw("order_by", object))
                 if len(order_by) > 0:
                     order_expr = "ORDER BY " + ", ".join(
                         k + " " + ("ASC" if v else "DESC")
                         for k, v in order_by.items())
             sql = """
             SELECT * FROM __plot_df__ {order_expr} LIMIT {top_n}
             """.format(order_expr=order_expr, top_n=top_n)
             df = self.execution_engine.default_sql_engine.select(
                 DataFrames(__plot_df__=df), sql)
     return df
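For reference, the per-partition top-N that the windowed branch computes can be sketched in plain pandas; the frame and column names below are made up for illustration and are not part of the source:

import pandas as pd

df = pd.DataFrame({"color": ["r", "r", "r", "b", "b"],
                   "price": [1, 5, 3, 9, 4]})
# equivalent to ROW_NUMBER() OVER(PARTITION BY color ORDER BY price DESC) <= 2
top2 = (df.sort_values("price", ascending=False)
          .groupby("color", group_keys=False)
          .head(2))
print(top2)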
Code example #2
        def test_comap_with_key(self):
            e = self.engine
            a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
            b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
            c = e.to_df([[6, 1]], "c:int,a:int")
            z1 = e.persist(e.zip(a, b, df1_name="x", df2_name="y"))
            z2 = e.persist(e.zip_all(DataFrames(x=a, y=b, z=b)))
            z3 = e.persist(
                e.zip_all(DataFrames(z=c), partition_spec=PartitionSpec(by=["a"]))
            )

            def comap(cursor, dfs):
                # summarize each zipped group as "<name><row count>" pairs,
                # prefixed with the group's key values
                assert dfs.has_key
                v = ",".join([k + str(v.count()) for k, v in dfs.items()])
                keys = cursor.key_value_array
                # if len(keys) == 0:
                #    return ArrayDataFrame([[v]], "v:str")
                return ArrayDataFrame([keys + [v]], cursor.key_schema + "v:str")

            def on_init(partition_no, dfs):
                assert dfs.has_key
                assert partition_no >= 0
                assert len(dfs) > 0

            res = e.comap(
                z1,
                comap,
                "a:int,v:str",
                PartitionSpec(),
                metadata=dict(a=1),
                on_init=on_init,
            )
            df_eq(res, [[1, "x2,y1"]], "a:int,v:str", metadata=dict(a=1), throw=True)

            res = e.comap(
                z2,
                comap,
                "a:int,v:str",
                PartitionSpec(),
                metadata=dict(a=1),
                on_init=on_init,
            )
            df_eq(res, [[1, "x2,y1,z1"]], "a:int,v:str", metadata=dict(a=1), throw=True)

            res = e.comap(
                z3,
                comap,
                "a:int,v:str",
                PartitionSpec(),
                metadata=dict(a=1),
                on_init=on_init,
            )
            df_eq(res, [[1, "z1"]], "a:int,v:str", metadata=dict(a=1), throw=True)
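The expected value "x2,y1" in the first assertion can be reproduced with a plain-Python sketch (no Fugue API involved; this assumes zip keeps only keys present in every input, i.e. inner-join semantics on the partition key):

x_rows = [[1, 2], [3, 4], [1, 5]]  # schema a:int,b:int; key column: a
y_rows = [[6, 1], [2, 7]]          # schema c:int,a:int; key column: a

x_by_a, y_by_a = {}, {}
for a, b in x_rows:
    x_by_a.setdefault(a, []).append(b)
for c, a in y_rows:
    y_by_a.setdefault(a, []).append(c)

# only a=1 appears in both inputs: x contributes 2 rows, y contributes 1
for a in sorted(set(x_by_a) & set(y_by_a)):
    print([a, f"x{len(x_by_a[a])},y{len(y_by_a[a])}"])  # -> [1, 'x2,y1']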
Code example #3
 def select(self, dfs: DataFrames, statement: str) -> DataFrame:
     conn = duckdb.connect()
     try:
         for k, v in dfs.items():
             conn.register_arrow(k, v.as_arrow())
         result = conn.execute(statement).arrow()
         # see this issue for why pandas output is used here:
         # https://github.com/duckdb/duckdb/issues/2446
         # TODO: switch to ArrowDataFrame when duckdb 0.3.1 is released
         return PandasDataFrame(result.to_pandas())
     finally:
         conn.close()
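The same register-then-query pattern works as a standalone script with plain duckdb and pyarrow; the table name and data below are illustrative:

import duckdb
import pyarrow as pa

tbl = pa.table({"a": [1, 2, 3]})
conn = duckdb.connect()
try:
    conn.register("t", tbl)  # expose the Arrow table to SQL as a view
    out = conn.execute("SELECT a * 2 AS b FROM t").arrow()
finally:
    conn.close()
print(out)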
Code example #4
File: pandas_backend.py  Project: gityow/fugue
 def select(
     self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend],
                                                ir.TableExpr]
 ) -> DataFrame:  # pragma: no cover
     pdfs = {k: v.as_pandas() for k, v in dfs.items()}
     be = _BackendWrapper().connect(pdfs)
     be.set_schemas(dfs)
     expr = ibis_func(be)
     schema = to_schema(expr.schema())
     result = expr.execute()
     assert_or_throw(isinstance(result, pd.DataFrame),
                     "result must be a pandas DataFrame")
     return PandasDataFrame(result, schema=schema)
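Underneath, this rides on ibis's pandas backend. A minimal self-contained sketch of that flow follows; the names and data are illustrative, and the exact connect API may differ across ibis versions:

import ibis
import pandas as pd

pdf = pd.DataFrame({"a": [1, -2, 3]})
con = ibis.pandas.connect({"t": pdf})  # register frames by name
t = con.table("t")
expr = t[t.a > 0]
print(expr.schema())   # schema is known before execution
print(expr.execute())  # executes to a pandas DataFrame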
Code example #5
 def select(
         self, dfs: DataFrames,
         ibis_func: Callable[[ibis.BaseBackend],
                             ir.TableExpr]) -> DataFrame:
     # register each input with the Spark-backed engine so ibis can see it
     for k, v in dfs.items():
         self.execution_engine.register(v, k)  # type: ignore
     con = ibis.pyspark.connect(
         self.execution_engine.spark_session)  # type: ignore
     expr = ibis_func(con)
     schema = to_schema(expr.schema())
     # in the pyspark backend, compile() returns a PySpark DataFrame
     result = expr.compile()
     assert_or_throw(
         isinstance(result, PySparkDataFrame),
         lambda: ValueError(
             f"result must be a PySpark DataFrame ({type(result)})"),
     )
     return SparkDataFrame(result, schema=schema)
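A standalone sketch of the same ibis-on-PySpark flow; the Spark session setup is illustrative and assumes pyspark plus an ibis version that ships the pyspark backend:

import ibis
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
spark.createDataFrame([(1,), (2,)], "a int").createOrReplaceTempView("t")

con = ibis.pyspark.connect(spark)
t = con.table("t")
sdf = t[t.a > 1].compile()  # compile() returns a PySpark DataFrame here
sdf.show()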
Code example #6
        def test_zip_all(self):
            e = self.engine
            a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
            z = e.persist(e.zip_all(DataFrames(a)))
            assert 1 == z.count()
            assert z.metadata.get("serialized", False)
            assert not z.metadata.get("serialized_has_name", False)
            z = e.persist(e.zip_all(DataFrames(x=a)))
            assert 1 == z.count()
            assert z.metadata.get("serialized", False)
            assert z.metadata.get("serialized_has_name", False)
            z = e.persist(
                e.zip_all(DataFrames(x=a), partition_spec=PartitionSpec(by=["a"]))
            )
            assert 2 == z.count()
            assert z.metadata.get("serialized", False)
            assert z.metadata.get("serialized_has_name", False)

            b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
            c = e.to_df([[6, 1], [2, 7]], "d:int,a:int")
            z = e.persist(e.zip_all(DataFrames(a, b, c)))
            assert 1 == z.count()
            assert not z.metadata.get("serialized_has_name", False)
            z = e.persist(e.zip_all(DataFrames(x=a, y=b, z=c)))
            assert 1 == z.count()
            assert z.metadata.get("serialized_has_name", False)

            z = e.persist(e.zip_all(DataFrames(b, b)))
            assert 2 == z.count()
            assert not z.metadata.get("serialized_has_name", False)
            assert ["a", "c"] in z.schema
            z = e.persist(e.zip_all(DataFrames(x=b, y=b)))
            assert 2 == z.count()
            assert z.metadata.get("serialized_has_name", False)
            assert ["a", "c"] in z.schema

            z = e.persist(
                e.zip_all(DataFrames(b, b), partition_spec=PartitionSpec(by=["a"]))
            )
            assert 2 == z.count()
            assert not z.metadata.get("serialized_has_name", False)
            assert "c" not in z.schema
Code example #7
 def select(self, dfs: DataFrames, statement: str) -> DataFrame:
     pd_dfs = {k: self.execution_engine.to_df(v).as_pandas() for k, v in dfs.items()}
     df = run_sql_on_pandas(statement, pd_dfs)
     return PandasDataFrame(df)
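The same name-to-frame mapping pattern, sketched standalone with duckdb standing in for run_sql_on_pandas (illustrative only; not the engine's actual SQL backend):

import duckdb
import pandas as pd

pd_dfs = {"t1": pd.DataFrame({"a": [1, 2]}),
          "t2": pd.DataFrame({"a": [2, 3]})}
conn = duckdb.connect()
try:
    for name, pdf in pd_dfs.items():
        conn.register(name, pdf)
    out = conn.execute("SELECT a FROM t1 JOIN t2 USING (a)").df()
finally:
    conn.close()
print(out)  # -> the single overlapping value, a=2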
Code example #8
 def set_schemas(self, dfs: DataFrames) -> None:
     self._schemas = {k: to_ibis_schema(v.schema) for k, v in dfs.items()}
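For context, an ibis Schema like the ones this method caches can be built directly; this uses plain ibis, while to_ibis_schema in the source performs the equivalent conversion from a Fugue schema:

import ibis

sch = ibis.schema([("a", "int64"), ("b", "string")])
print(sch)  # a: int64, b: string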