def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    """Check partitioned ``pd.read_sql`` results against ``pandas.read_sql``.

    The comparison only runs when ``Engine.get()`` returns ``"Ray"``; for any
    other engine the function returns immediately without asserting anything.
    Both a SQL query and a bare table name are read through ``pd.read_sql``
    with identical partitioning arguments, and each result is compared with
    the frame produced by ``pandas.read_sql``.

    Parameters
    ----------
    make_sql_connection : callable
        Fixture called as ``make_sql_connection(filename, table)``; its return
        value is passed to ``read_sql`` as the connection argument.
    """
    if Engine.get() != "Ray":
        return

    db_file = "test_from_sql_distributed.db"
    table_name = "test_from_sql_distributed"
    connection = make_sql_connection(db_file, table_name)
    select_all = "select * from {0}".format(table_name)

    # Partitioning arguments shared by the query-based and table-based reads.
    partition_kwargs = dict(
        partition_column="col1",
        lower_bound=0,
        upper_bound=6,
        max_sessions=2,
    )

    expected = pandas.read_sql(select_all, connection)
    from_query = pd.read_sql(select_all, connection, **partition_kwargs)
    from_table = pd.read_sql(table_name, connection, **partition_kwargs)

    df_equals(from_query, expected)
    df_equals(from_table, expected)
def test_from_sql_distributed():
    """Check partitioned ``pd.read_sql`` results against ``pandas.read_sql``.

    The test body only runs when the ``MODIN_ENGINE`` environment variable is
    ``"Ray"``; otherwise the function returns without asserting anything.
    A SQLite file is created via ``setup_sql_file`` and is removed again with
    ``teardown_sql_file`` even when a read or an assertion fails.
    """
    if os.environ.get("MODIN_ENGINE", "") != "Ray":
        return

    filename = "test_from_sql_distributed.db"
    table = "test_from_sql_distributed"
    db_uri = "sqlite:///" + filename

    # Remove any leftover database file before creating a fresh one.
    teardown_sql_file(filename)
    try:
        setup_sql_file(db_uri, filename, table, True)
        query = "select * from {0}".format(table)

        pandas_df = pandas.read_sql(query, db_uri)
        modin_df_from_query = pd.read_sql(
            query, db_uri, partition_column="col1", lower_bound=0, upper_bound=6
        )
        modin_df_from_table = pd.read_sql(
            table, db_uri, partition_column="col1", lower_bound=0, upper_bound=6
        )

        assert modin_df_equals_pandas(modin_df_from_query, pandas_df)
        assert modin_df_equals_pandas(modin_df_from_table, pandas_df)
    finally:
        # Always clean up so a failing run does not leave the file behind.
        teardown_sql_file(filename)
def test_from_sql_defaults(make_sql_connection):  # noqa: F811
    """Check ``pd.read_sql`` without partitioning arguments.

    Each ``pd.read_sql`` call (one with a query, one with a table name) must
    emit a ``UserWarning``, and each result is compared with the frame
    returned by ``pandas.read_sql`` for the same query.

    Parameters
    ----------
    make_sql_connection : callable
        Fixture called as ``make_sql_connection(filename, table)``; its return
        value is passed to ``read_sql`` as the connection argument.
    """
    db_file = "test_from_sql_distributed.db"
    table_name = "test_from_sql_distributed"
    connection = make_sql_connection(db_file, table_name)
    select_all = "select * from {0}".format(table_name)

    expected = pandas.read_sql(select_all, connection)

    # The two reads are wrapped separately so that each one is individually
    # required to raise the warning.
    with pytest.warns(UserWarning):
        from_query = pd.read_sql(select_all, connection)
    with pytest.warns(UserWarning):
        from_table = pd.read_sql(table_name, connection)

    df_equals(from_query, expected)
    df_equals(from_table, expected)
def test_from_sql_defaults():
    """Check ``pd.read_sql`` without partitioning arguments.

    A SQLite file is created via ``setup_sql_file``; each ``pd.read_sql`` call
    (one with a query, one with a table name) must emit a ``UserWarning`` and
    its result must match the frame returned by ``pandas.read_sql``.  The
    database file is removed with ``teardown_sql_file`` even when a read or an
    assertion fails.
    """
    filename = "test_from_sql_distributed.db"
    table = "test_from_sql_distributed"
    db_uri = "sqlite:///" + filename

    # Remove any leftover database file before creating a fresh one.
    teardown_sql_file(filename)
    try:
        setup_sql_file(db_uri, filename, table, True)
        query = "select * from {0}".format(table)

        pandas_df = pandas.read_sql(query, db_uri)

        # Wrapped separately so each read is individually required to warn.
        with pytest.warns(UserWarning):
            modin_df_from_query = pd.read_sql(query, db_uri)
        with pytest.warns(UserWarning):
            modin_df_from_table = pd.read_sql(table, db_uri)

        assert modin_df_equals_pandas(modin_df_from_query, pandas_df)
        assert modin_df_equals_pandas(modin_df_from_table, pandas_df)
    finally:
        # Always clean up so a failing run does not leave the file behind.
        teardown_sql_file(filename)
def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    """Check partitioned ``pd.read_sql`` results against ``pandas.read_sql``.

    When ``Engine.get()`` returns ``"Ray"`` the test is marked as an expected
    failure via ``pytest.xfail`` and nothing else runs.  On other engines a
    query and a bare table name are both read through ``pd.read_sql`` with
    the same partitioning arguments and compared with the pandas result.

    Parameters
    ----------
    make_sql_connection : callable
        Fixture called as ``make_sql_connection(filename, table)``; its return
        value is passed to ``read_sql`` as the connection argument.
    """
    running_on_ray = Engine.get() == "Ray"
    if running_on_ray:
        pytest.xfail("Distributed read_sql is broken, see GH#2194")

    db_file = "test_from_sql_distributed.db"
    table_name = "test_from_sql_distributed"
    connection = make_sql_connection(db_file, table_name)
    select_all = "select * from {0}".format(table_name)

    # Partitioning arguments shared by the query-based and table-based reads.
    partition_kwargs = dict(partition_column="col1", lower_bound=0, upper_bound=6)

    expected = pandas.read_sql(select_all, connection)
    from_query = pd.read_sql(select_all, connection, **partition_kwargs)
    from_table = pd.read_sql(table_name, connection, **partition_kwargs)

    df_equals(from_query, expected)
    df_equals(from_table, expected)
def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    """Check partitioned ``pd.read_sql`` results against ``pandas.read_sql``.

    The test body only runs when the ``MODIN_ENGINE`` environment variable is
    ``"Ray"``; otherwise the function returns without asserting anything.
    A query and a bare table name are both read through ``pd.read_sql`` with
    the same partitioning arguments, and each result must satisfy
    ``modin_df_equals_pandas`` against the pandas frame.

    Parameters
    ----------
    make_sql_connection : callable
        Fixture called as ``make_sql_connection(filename, table)``; its return
        value is passed to ``read_sql`` as the connection argument.
    """
    if os.environ.get("MODIN_ENGINE", "") != "Ray":
        return

    db_file = "test_from_sql_distributed.db"
    table_name = "test_from_sql_distributed"
    connection = make_sql_connection(db_file, table_name)
    select_all = "select * from {0}".format(table_name)

    # Partitioning arguments shared by the query-based and table-based reads.
    partition_kwargs = dict(partition_column="col1", lower_bound=0, upper_bound=6)

    expected = pandas.read_sql(select_all, connection)
    from_query = pd.read_sql(select_all, connection, **partition_kwargs)
    from_table = pd.read_sql(table_name, connection, **partition_kwargs)

    assert modin_df_equals_pandas(from_query, expected)
    assert modin_df_equals_pandas(from_table, expected)
if __name__ == "__main__":
    # Script entry point: read one table through Modin's experimental
    # ``read_sql`` on Ray and print the elapsed time plus the first rows.
    args = docopt(__doc__, version="1.0")

    # Connection target and table name are taken from the environment.
    conn = os.environ["POSTGRES_URL"]
    table = os.environ["POSTGRES_TABLE"]
    partitions = int(args["<num>"])

    # Ray is started before Modin is imported.
    # (An earlier variant also passed _plasma_directory="/tmp" to ray.init.)
    ray.init(num_cpus=partitions, object_store_memory=10**10)

    import modin.config as config
    import modin.experimental.pandas as pd

    config.NPartitions.put(partitions)

    date_columns = [
        "l_shipdate",
        "l_commitdate",
        "l_receiptdate",
    ]
    read_kwargs = dict(
        parse_dates=date_columns,
        partition_column="l_orderkey",
        lower_bound=0,
        upper_bound=60000000,
        max_sessions=partitions,
    )

    with Timer() as timer:
        # Use the table name here; a bug exists in Modin's experimental
        # read_sql when a query is passed instead.
        df = pd.read_sql(f"{table}", conn, **read_kwargs)

    print(f"[Total] {timer.elapsed:.2f}s")
    print(df.head())