Example #1
def _test_expr(func, **dfs):
    con = ibis.pandas.connect(dfs)
    kwargs = {k: con.table(k) for k in dfs.keys()}
    expected = func(**kwargs).execute()

    lkwargs = {k: LazyIbisObject(k) for k in dfs.keys()}
    expr = func(**lkwargs)
    actual = materialize(expr, lambda k: kwargs[k]).execute()

    _df_eq(PandasDataFrame(actual), PandasDataFrame(expected), throw=True)
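For context, this helper takes a function that builds an ibis expression from the supplied tables, then checks that lazy and eager evaluation agree. A hypothetical call (the dataframe and the filter are illustrative, not from the original test suite):

import pandas as pd

# Illustrative only: compare eager vs. lazy evaluation of the same
# ibis expression over a small dataframe named "t".
df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
_test_expr(lambda t: t[t.a > 1], t=df)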
Example #2
def trim_index(
    compute_engine: FugueExecutionEngine,
    df_graph: FugueDataFrame,
    indexed: bool = False,
    directed: bool = True,
    max_out_deg: int = 0,
    random_seed: Optional[int] = None,
) -> Tuple[FugueDataFrame, Optional[FugueDataFrame]]:
    """
    The very first steps to treat the input graph:
    1) basic validation of the input graph format: at least have ["src", "dst"] cols,
       it will be an unweighted graph if no "weight" col.
    2) trim some edges to avoid super hotspot vertices: random sampling will be done
       on all the edges of a vertex if the number of edges is greater than a threshold,
       this is critical to reduce data skewness and save disk space
    3) index the graph vertices by using sequential integers to represent vertices,
       this is critical to save memory

    :param compute_engine: an execution engine supported by Fugue
    :param df_graph: the input graph data as general Fugue dataframe
    :param indexed: if the input graph is using sequential integers to note vertices
    :param directed: if the graph is directed or not
    :param max_out_deg: the threshold for trimming hotspot vertices, set it to <= 0
                        to turn off trimming
    :param random_seed: optional random seed, for testing only

    Returns a validated, trimmed, and indexed graph
    """
    logging.info("trim_index(): start validating, trimming, and indexing ...")
    if "src" not in df_graph.schema or "dst" not in df_graph.schema:
        raise ValueError(
            f"Input graph NOT in the right format: {df_graph.schema}")

    params = {"max_out_degree": max_out_deg, "random_seed": random_seed}
    dag = FugueWorkflow(compute_engine)
    df = (dag.df(df_graph).partition(by=["src"]).transform(
        trim_hotspot_vertices,
        schema="*",
        params=params,
    ).compute())

    name_id = None
    if indexed:
        return df, name_id
    if isinstance(compute_engine, SparkExecutionEngine):
        df_res, name_id = index_graph_spark(df.native,
                                            directed)  # type: ignore
        return SparkDataFrame(df_res), SparkDataFrame(name_id)
    else:
        df_res, name_id = index_graph_pandas(df.as_pandas(), directed)
        return PandasDataFrame(df_res), PandasDataFrame(name_id)
Example #3
def test_transform():
    pdf = pd.DataFrame([[1, 10], [0, 0], [1, 1], [0, 20]], columns=["a", "b"])

    def f1(df: pd.DataFrame) -> pd.DataFrame:
        return df.sort_values("b").head(1)

    result = transform(pdf, f1, schema="*")
    assert isinstance(result, pd.DataFrame)
    assert result.values.tolist() == [[0, 0]]

    # schema: *
    def f2(df: pd.DataFrame) -> pd.DataFrame:
        return df.sort_values("b").head(1)

    result = transform(pdf, f2)
    assert isinstance(result, pd.DataFrame)
    assert result.values.tolist() == [[0, 0]]

    result = transform(pdf, f2, partition=dict(by=["a"]))
    assert isinstance(result, pd.DataFrame)
    assert sorted(result.values.tolist(), key=lambda x: x[0]) == [[0, 0],
                                                                  [1, 1]]

    ppdf = PandasDataFrame(pdf)
    assert isinstance(transform(ppdf, f2), DataFrame)
Example #4
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    sql, sqlite_file = transform_sqlite_sql(statement, self._validate_database)
    if sqlite_file is None:
        return self.engine.select(dfs, statement)
    assert_or_throw(len(dfs) == 0, "sql to query sqlite can't have other tables")
    with sqlite3.connect(
        os.path.join(self.database_path, sqlite_file)
    ) as connection:
        df = pd.read_sql_query(sql, connection)
    return PandasDataFrame(df)
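The sqlite branch above is the standard sqlite3 + pandas pattern; transform_sqlite_sql is a project-specific helper that splits out the sqlite file reference. A self-contained sketch of the underlying pattern (the table and query are illustrative):

import sqlite3

import pandas as pd

# Illustrative only: load a query result from a sqlite database into
# pandas, the same pattern select() above relies on.
with sqlite3.connect(":memory:") as connection:
    connection.execute("CREATE TABLE t (a INT)")
    connection.executemany("INSERT INTO t VALUES (?)", [(1,), (2,), (3,)])
    df = pd.read_sql_query("SELECT * FROM t WHERE a > 1", connection)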
Example #5
def test_trim_index():
    """
    Test the Fugue function trim_index().
    """
    from node2vec.fugue import trim_index

    graph = [[0, 2, 0.41], [0, 4, 0.85], [3, 4, 0.36], [2, 0, 0.68], [4, 0, 0.1],
             [4, 3, 0.37]]
    df = ArrayDataFrame(graph, schema="src:int,dst:int,weight:double")

    df_res, name_id = trim_index(NativeExecutionEngine(), df, indexed=True)
    assert len(df_res.as_pandas()) == 6 and name_id is None
    df_res, name_id = trim_index(
        NativeExecutionEngine(), df, indexed=True, max_out_deg=1,
    )
    assert len(df_res.as_pandas()) == 4 and name_id is None

    spark = SparkSession.builder.config("spark.executor.cores", 4).getOrCreate()
    dat1 = {
        'src': ['a1', 'a1', 'a1', 'a2', 'b2'], 'dst': ['a2', 'b1', 'b2', 'b1', 'a2'],
    }
    dat2 = {
        'dst': ['a2', 'b1', 'b2', 'a1'], 'weight': [0.8, 1.1, 1.0, 0.3]
    }
    df = spark.createDataFrame(pd.DataFrame.from_dict(dat1))
    df_res, name_id = trim_index(
        SparkExecutionEngine(spark), SparkDataFrame(df), indexed=False, max_out_deg=2
    )
    assert df_res.count() == 4 and name_id.count() == 4
    df = spark.createDataFrame(pd.DataFrame.from_dict(dat2))
    pytest.raises(
        ValueError, trim_index, SparkExecutionEngine(spark), SparkDataFrame(df), True,
    )

    df = pd.DataFrame.from_dict(dat1)
    df_res, name_id = trim_index(
        NativeExecutionEngine(), PandasDataFrame(df), indexed=False,
    )
    assert len(df_res.as_pandas()) == 5 and len(name_id.as_pandas()) == 4
    df = pd.DataFrame.from_dict(dat2)
    pytest.raises(
        ValueError, trim_index, NativeExecutionEngine(), PandasDataFrame(df), False,
    )
Example #6
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    conn = duckdb.connect()
    try:
        for k, v in dfs.items():
            conn.register_arrow(k, v.as_arrow())
        result = conn.execute(statement).arrow()
        # see this issue for why pandas output is used here:
        # https://github.com/duckdb/duckdb/issues/2446
        # TODO: switch to ArrowDataFrame when duckdb 0.3.1 is released
        return PandasDataFrame(result.to_pandas())
    finally:
        conn.close()
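The same register-then-query pattern works with pandas input as well; a minimal sketch using the duckdb Python client (the table name and data are illustrative):

import duckdb
import pandas as pd

# Illustrative only: register a dataframe under a name, query it with
# SQL, and fetch the result back as a pandas dataframe.
conn = duckdb.connect()
try:
    conn.register("t", pd.DataFrame({"a": [1, 2, 3]}))
    result = conn.execute("SELECT a * 2 AS b FROM t").df()
finally:
    conn.close()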
Example #7
def select(
    self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], ir.TableExpr]
) -> DataFrame:  # pragma: no cover
    pdfs = {k: v.as_pandas() for k, v in dfs.items()}
    be = _BackendWrapper().connect(pdfs)
    be.set_schemas(dfs)
    expr = ibis_func(be)
    schema = to_schema(expr.schema())
    result = expr.execute()
    assert_or_throw(
        isinstance(result, pd.DataFrame), "result must be a pandas DataFrame"
    )
    return PandasDataFrame(result, schema=schema)
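This select() is driven by a callable that builds a table expression from the backend; a hypothetical ibis_func (the table name "t" and the filter are illustrative):

# Illustrative only: an ibis_func as expected by select() above; it
# looks up a registered table on the backend and returns a filtered
# table expression.
def ibis_func(backend):
    t = backend.table("t")
    return t[t.a > 0]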
Example #8
def select(
    self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], ir.TableExpr]
) -> DataFrame:  # pragma: no cover
    be = _BackendWrapper().connect({})
    be.set_schemas(dfs)
    expr = ibis_func(be)
    schema = to_schema(expr.schema())
    sql = str(
        ibis.postgres.compile(expr).compile(compile_kwargs={"literal_binds": True})
    )
    engine = DuckDBEngine(self.execution_engine)
    return PandasDataFrame(engine.select(dfs, sql).as_pandas(), schema=schema)
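Note the design choice here: ibis is used only for building the expression and inferring the output schema. The expression is compiled to a SQL string in the Postgres dialect, with literal binds so no parameters remain, and execution is then delegated to the DuckDB engine.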
Example #9
def test_transform():
    pdf = pd.DataFrame([[1, 10], [0, 0], [1, 1], [0, 20]], columns=["a", "b"])

    def f1(df: pd.DataFrame) -> pd.DataFrame:
        return df.sort_values("b").head(1)

    result = transform(pdf, f1, schema="*")
    assert isinstance(result, pd.DataFrame)
    assert result.values.tolist() == [[0, 0]]

    # schema: *
    def f2(df: pd.DataFrame) -> pd.DataFrame:
        return df.sort_values("b").head(1)

    result = transform(pdf, f2)
    assert isinstance(result, pd.DataFrame)
    assert result.values.tolist() == [[0, 0]]

    result = transform(pdf, f2, partition=dict(by=["a"]))
    assert isinstance(result, pd.DataFrame)
    assert sorted(result.values.tolist(), key=lambda x: x[0]) == [[0, 0],
                                                                  [1, 1]]
    result = transform(pdf,
                       f2,
                       partition=dict(by=["a"]),
                       force_output_fugue_dataframe=True)
    assert isinstance(result, DataFrame)

    ppdf = PandasDataFrame(pdf)
    assert isinstance(transform(ppdf, f2), DataFrame)

    # schema: *
    def f3(df: pd.DataFrame, called: callable) -> pd.DataFrame:
        called()
        return df

    cb = Callback()
    result = transform(pdf, f3, callback=cb.called)
    assert 1 == cb.ct
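The Callback class is not shown in this excerpt; a minimal stand-in consistent with how the test uses it (a counter incremented on each invocation) could look like:

# Hypothetical stand-in for the Callback used above: `ct` counts how
# many times `called` has been invoked.
class Callback:
    def __init__(self):
        self.ct = 0

    def called(self):
        self.ct += 1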
Example #10
def select(self, dfs: DataFrames, statement: str) -> DataFrame:
    pd_dfs = {k: self.execution_engine.to_df(v).as_pandas() for k, v in dfs.items()}
    df = run_sql_on_pandas(statement, pd_dfs)
    return PandasDataFrame(df)


def with_nat(cursor, data):
    df = data.as_pandas()
    df["nat"] = pd.NaT
    schema = data.schema + "nat:datetime"
    return PandasDataFrame(df, schema)
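Since with_nat() only touches data.as_pandas() and data.schema, it can be exercised directly; an illustrative call (the input data is made up, and the unused cursor is passed as None):

import pandas as pd

# Illustrative only: append a NaT datetime column to a two-row frame.
data = PandasDataFrame(pd.DataFrame({"a": [1, 2]}), "a:int")
res = with_nat(None, data)
assert res.schema == "a:int,nat:datetime"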