def _test_expr(func, **dfs):
    # build the expected result by running the expression on a real ibis pandas backend
    con = ibis.pandas.connect(dfs)
    kwargs = {k: con.table(k) for k in dfs.keys()}
    expected = func(**kwargs).execute()
    # build the same expression lazily, then materialize it against the real tables
    lkwargs = {k: LazyIbisObject(k) for k in dfs.keys()}
    expr = func(**lkwargs)
    actual = materialize(expr, lambda k: kwargs[k]).execute()
    _df_eq(PandasDataFrame(actual), PandasDataFrame(expected), throw=True)
def trim_index(
    compute_engine: FugueExecutionEngine,
    df_graph: FugueDataFrame,
    indexed: bool = False,
    directed: bool = True,
    max_out_deg: int = 0,
    random_seed: Optional[int] = None,
) -> Tuple[FugueDataFrame, Optional[FugueDataFrame]]:
    """
    The very first steps to treat the input graph:
      1) Basic validation of the input graph format: it must have at least the
         ["src", "dst"] columns; it is treated as an unweighted graph if there
         is no "weight" column.
      2) Trim edges to avoid super hotspot vertices: the edges of a vertex are
         randomly sampled if their number exceeds a threshold. This is critical
         to reduce data skewness and save disk space.
      3) Index the graph vertices with sequential integers. This is critical to
         save memory.

    :param compute_engine: an execution engine supported by Fugue
    :param df_graph: the input graph data as a general Fugue dataframe
    :param indexed: whether the input graph already uses sequential integers to
        denote vertices
    :param directed: whether the graph is directed
    :param max_out_deg: the threshold for trimming hotspot vertices; set it to
        <= 0 to turn off trimming
    :param random_seed: optional random seed, for testing only

    Returns a validated, trimmed, and indexed graph.
    """
    logging.info("trim_index(): start validating, trimming, and indexing ...")
    if "src" not in df_graph.schema or "dst" not in df_graph.schema:
        raise ValueError(f"Input graph NOT in the right format: {df_graph.schema}")

    params = {"max_out_degree": max_out_deg, "random_seed": random_seed}
    dag = FugueWorkflow(compute_engine)
    df = (
        dag.df(df_graph)
        .partition(by=["src"])
        .transform(
            trim_hotspot_vertices,
            schema="*",
            params=params,
        )
        .compute()
    )

    name_id = None
    if indexed is True:
        return df, name_id
    if isinstance(compute_engine, SparkExecutionEngine):
        df_res, name_id = index_graph_spark(df.native, directed)  # type: ignore
        return SparkDataFrame(df_res), SparkDataFrame(name_id)
    else:
        df_res, name_id = index_graph_pandas(df.as_pandas(), directed)
        return PandasDataFrame(df_res), PandasDataFrame(name_id)
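# A minimal usage sketch of trim_index(), assuming the node2vec.fugue module above is
# importable and the standard top-level fugue imports are available; the toy graph and
# threshold are illustrative and mirror the test further below.
from fugue import ArrayDataFrame, NativeExecutionEngine
from node2vec.fugue import trim_index

graph = [[0, 2, 0.41], [0, 4, 0.85], [3, 4, 0.36], [2, 0, 0.68], [4, 0, 0.1], [4, 3, 0.37]]
df = ArrayDataFrame(graph, schema="src:int,dst:int,weight:double")
# already indexed, so only validation and trimming (to at most 1 outgoing edge) happen
df_res, name_id = trim_index(NativeExecutionEngine(), df, indexed=True, max_out_deg=1)
print(df_res.as_pandas())  # name_id is None because indexed=True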
def test_transform():
    pdf = pd.DataFrame([[1, 10], [0, 0], [1, 1], [0, 20]], columns=["a", "b"])

    def f1(df: pd.DataFrame) -> pd.DataFrame:
        return df.sort_values("b").head(1)

    result = transform(pdf, f1, schema="*")
    assert isinstance(result, pd.DataFrame)
    assert result.values.tolist() == [[0, 0]]

    # schema: *
    def f2(df: pd.DataFrame) -> pd.DataFrame:
        return df.sort_values("b").head(1)

    result = transform(pdf, f2)
    assert isinstance(result, pd.DataFrame)
    assert result.values.tolist() == [[0, 0]]

    result = transform(pdf, f2, partition=dict(by=["a"]))
    assert isinstance(result, pd.DataFrame)
    assert sorted(result.values.tolist(), key=lambda x: x[0]) == [[0, 0], [1, 1]]

    ppdf = PandasDataFrame(pdf)
    assert isinstance(transform(ppdf, f2), DataFrame)
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    sql, sqlite_file = transform_sqlite_sql(statement, self._validate_database)
    if sqlite_file is None:
        # not a sqlite query, delegate to the wrapped SQL engine
        return self.engine.select(dfs, statement)
    assert_or_throw(len(dfs) == 0, "sql to query sqlite can't have other tables")
    with sqlite3.connect(
        os.path.join(self.database_path, sqlite_file)
    ) as connection:
        df = pd.read_sql_query(sql, connection)
    return PandasDataFrame(df)
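# A minimal standalone sketch of the sqlite branch above, using only sqlite3 and pandas;
# the database directory, file name, and query are hypothetical, not from the source.
import os
import sqlite3

import pandas as pd

database_path = "/tmp/databases"  # assumed directory holding the .db files
with sqlite3.connect(os.path.join(database_path, "example.db")) as connection:
    df = pd.read_sql_query("SELECT * FROM my_table LIMIT 10", connection)
print(df.head())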
def test_trim_index():
    """
    test Fugue func trim_index()
    """
    from node2vec.fugue import trim_index

    graph = [[0, 2, 0.41], [0, 4, 0.85], [3, 4, 0.36], [2, 0, 0.68],
             [4, 0, 0.1], [4, 3, 0.37]]
    df = ArrayDataFrame(graph, schema="src:int,dst:int,weight:double")
    df_res, name_id = trim_index(NativeExecutionEngine(), df, indexed=True)
    assert len(df_res.as_pandas()) == 6 and name_id is None
    df_res, name_id = trim_index(
        NativeExecutionEngine(), df, indexed=True, max_out_deg=1,
    )
    assert len(df_res.as_pandas()) == 4 and name_id is None

    spark = SparkSession.builder.config("spark.executor.cores", 4).getOrCreate()
    dat1 = {
        'src': ['a1', 'a1', 'a1', 'a2', 'b2'],
        'dst': ['a2', 'b1', 'b2', 'b1', 'a2'],
    }
    dat2 = {
        'dst': ['a2', 'b1', 'b2', 'a1'],
        'weight': [0.8, 1.1, 1.0, 0.3],
    }
    df = spark.createDataFrame(pd.DataFrame.from_dict(dat1))
    df_res, name_id = trim_index(
        SparkExecutionEngine(spark), SparkDataFrame(df), indexed=False, max_out_deg=2,
    )
    assert df_res.count() == 4 and name_id.count() == 4
    df = spark.createDataFrame(pd.DataFrame.from_dict(dat2))
    pytest.raises(
        ValueError, trim_index, SparkExecutionEngine(spark), SparkDataFrame(df), True,
    )

    df = pd.DataFrame.from_dict(dat1)
    df_res, name_id = trim_index(
        NativeExecutionEngine(), PandasDataFrame(df), indexed=False,
    )
    assert len(df_res.as_pandas()) == 5 and len(name_id.as_pandas()) == 4
    df = pd.DataFrame.from_dict(dat2)
    pytest.raises(
        ValueError, trim_index, NativeExecutionEngine(), PandasDataFrame(df), False,
    )
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    conn = duckdb.connect()
    try:
        for k, v in dfs.items():
            conn.register_arrow(k, v.as_arrow())
        result = conn.execute(statement).arrow()
        # see this issue to understand why pandas output is used
        # https://github.com/duckdb/duckdb/issues/2446
        # TODO: switch to ArrowDataFrame when duckdb 0.3.1 is released
        return PandasDataFrame(result.to_pandas())
    finally:
        conn.close()
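# A minimal standalone sketch of the register-then-select pattern above, using plain
# duckdb and pyarrow with the generic register() call; the table name, data, and
# query are illustrative assumptions.
import duckdb
import pandas as pd
import pyarrow as pa

conn = duckdb.connect()
try:
    tbl = pa.Table.from_pandas(pd.DataFrame({"a": [1, 2, 3]}))
    conn.register("t", tbl)  # expose the Arrow table to SQL under the name "t"
    result = conn.execute("SELECT a * 2 AS b FROM t").arrow()
    print(result.to_pandas())
finally:
    conn.close()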
def select(
    self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], ir.TableExpr]
) -> DataFrame:  # pragma: no cover
    pdfs = {k: v.as_pandas() for k, v in dfs.items()}
    be = _BackendWrapper().connect(pdfs)
    be.set_schemas(dfs)
    expr = ibis_func(be)
    schema = to_schema(expr.schema())
    result = expr.execute()
    assert_or_throw(
        isinstance(result, pd.DataFrame), "result must be a pandas DataFrame"
    )
    return PandasDataFrame(result, schema=schema)
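# A minimal standalone sketch of executing an ibis expression on the pandas backend,
# as the engine above does; the table name and data are illustrative assumptions.
import ibis
import pandas as pd

con = ibis.pandas.connect({"t": pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})})
t = con.table("t")
expr = t[t.a > 1]        # any ibis table expression built against this backend
result = expr.execute()  # the pandas backend returns a pandas DataFrame
print(result)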
def select(
    self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], ir.TableExpr]
) -> DataFrame:  # pragma: no cover
    be = _BackendWrapper().connect({})
    be.set_schemas(dfs)
    expr = ibis_func(be)
    schema = to_schema(expr.schema())
    # compile the ibis expression to SQL with literals inlined, then run it on DuckDB
    sql = str(
        ibis.postgres.compile(expr).compile(
            compile_kwargs={"literal_binds": True}
        )
    )
    engine = DuckDBEngine(self.execution_engine)
    return PandasDataFrame(engine.select(dfs, sql).as_pandas(), schema=schema)
def test_transform():
    pdf = pd.DataFrame([[1, 10], [0, 0], [1, 1], [0, 20]], columns=["a", "b"])

    def f1(df: pd.DataFrame) -> pd.DataFrame:
        return df.sort_values("b").head(1)

    result = transform(pdf, f1, schema="*")
    assert isinstance(result, pd.DataFrame)
    assert result.values.tolist() == [[0, 0]]

    # schema: *
    def f2(df: pd.DataFrame) -> pd.DataFrame:
        return df.sort_values("b").head(1)

    result = transform(pdf, f2)
    assert isinstance(result, pd.DataFrame)
    assert result.values.tolist() == [[0, 0]]

    result = transform(pdf, f2, partition=dict(by=["a"]))
    assert isinstance(result, pd.DataFrame)
    assert sorted(result.values.tolist(), key=lambda x: x[0]) == [[0, 0], [1, 1]]

    result = transform(
        pdf, f2, partition=dict(by=["a"]), force_output_fugue_dataframe=True
    )
    assert isinstance(result, DataFrame)

    ppdf = PandasDataFrame(pdf)
    assert isinstance(transform(ppdf, f2), DataFrame)

    # schema: *
    def f3(df: pd.DataFrame, called: callable) -> pd.DataFrame:
        called()
        return df

    cb = Callback()
    result = transform(pdf, f3, callback=cb.called)
    assert 1 == cb.ct
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    pd_dfs = {
        k: self.execution_engine.to_df(v).as_pandas() for k, v in dfs.items()
    }
    df = run_sql_on_pandas(statement, pd_dfs)
    return PandasDataFrame(df)
def with_nat(cursor, data):
    # append a null datetime ("nat") column to the input dataframe
    df = data.as_pandas()
    df["nat"] = pd.NaT
    schema = data.schema + "nat:datetime"
    return PandasDataFrame(df, schema)