def test_complex_query(c):
    df = timeseries(freq="1d").persist()
    c.create_table("timeseries", df)

    result = c.sql(
        """
        SELECT
            lhs.name, lhs.id, lhs.x
        FROM
            timeseries AS lhs
        JOIN (
            SELECT
                name AS max_name,
                MAX(x) AS max_x
            FROM timeseries
            GROUP BY name
        ) AS rhs
        ON
            lhs.name = rhs.max_name AND
            lhs.x = rhs.max_x
        """
    ).compute()

    assert len(result) > 0
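# Illustrative sketch (not part of the original test): the SQL join above is
# equivalent to the following pure-Dask expression, assuming `df` is the same
# frame registered as "timeseries". The helper name `_expected_max_rows` is
# hypothetical; it can serve as a cross-check of the SQL result.
def _expected_max_rows(df):
    per_name_max = (
        df.groupby("name").x.max()
        .reset_index()
        .rename(columns={"name": "max_name", "x": "max_x"})
    )
    return df.merge(
        per_name_max,
        left_on=["name", "x"],
        right_on=["max_name", "max_x"],
    )[["name", "id", "x"]].compute()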
def timeseries_df(c):
    pdf = timeseries(freq="1d").compute().reset_index(drop=True)

    # Inject NaNs into roughly 20% of "x" and 30% of "y" in the pandas dataframe
    col1_index = np.random.randint(0, 30, size=int(pdf.shape[0] * 0.2))
    col2_index = np.random.randint(0, 30, size=int(pdf.shape[0] * 0.3))
    pdf.loc[col1_index, "x"] = np.nan
    pdf.loc[col2_index, "y"] = np.nan

    c.create_table("timeseries", pdf, persist=True)
    return None
def main():  # pragma: no cover
    parser = ArgumentParser()
    parser.add_argument(
        "--scheduler-address",
        default=None,
        help="Connect to this dask scheduler if given",
    )
    parser.add_argument(
        "--log-level",
        default=None,
        help="Set the log level of the server. Defaults to info.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    parser.add_argument(
        "--load-test-data",
        default=False,
        action="store_true",
        help="Preload some test data.",
    )
    parser.add_argument(
        "--startup",
        default=False,
        action="store_true",
        help="Wait until Apache Calcite has been properly loaded",
    )
    args = parser.parse_args()

    client = None
    if args.scheduler_address:
        client = Client(args.scheduler_address)

    context = Context()
    if args.load_test_data:
        df = timeseries(freq="1d").reset_index(drop=False)
        context.create_table("timeseries", df.persist())

    cmd_loop(
        context=context,
        client=client,
        startup=args.startup,
        log_level=args.log_level,
    )
import time

from dask.dataframe.shuffle import shuffle
from dask.datasets import timeseries
from dask.distributed import Client, wait

if __name__ == "__main__":
    client = Client("127.0.0.1:8786")

    ddf_h = timeseries(start="2000-01-01", end="2000-01-02", partition_freq="1min")
    result = shuffle(ddf_h, "id", shuffle="tasks")

    ddf = client.persist(result)
    _ = wait(ddf)

    client.shutdown()
    time.sleep(0.5)
def gpu_training_df(c):
    if dask_cudf:
        df = timeseries(freq="1d").reset_index(drop=True)
        df = dask_cudf.from_dask_dataframe(df)
        c.create_table("timeseries", input_table=df)
    return None
def training_df(c):
    df = timeseries(freq="1d").reset_index(drop=True)
    c.create_table("timeseries", df, persist=True)
    return None
@classmethod
def setUpClass(cls):
    cls.c = Context()

    df = timeseries(freq="1d").persist()
    cls.c.register_dask_table(df, "timeseries")
dd = pytest.importorskip("dask.dataframe")
pyspark = pytest.importorskip("pyspark")
pytest.importorskip("pyarrow")
pytest.importorskip("fastparquet")

from dask.dataframe.utils import assert_eq

if not sys.platform.startswith("linux"):
    pytest.skip(
        "Unnecessary, and hard to get spark working on non-linux platforms",
        allow_module_level=True,
    )

# pyspark auto-converts timezones -- round-tripping timestamps is easier if
# we set everything to UTC.
pdf = timeseries(freq="1H").compute()
pdf.index = pdf.index.tz_localize("UTC")
pdf = pdf.reset_index()


@pytest.fixture(scope="module")
def spark_session():
    # Spark registers a global signal handler that can cause problems elsewhere
    # in the test suite. In particular, the handler fails if the spark session
    # is stopped (a bug in pyspark).
    prev = signal.getsignal(signal.SIGINT)

    # Create a spark session. Note that we set the timezone to UTC to avoid
    # conversion to local time when reading parquet files.
    spark = (
        pyspark.sql.SparkSession.builder.master("local")
        .appName("Dask Testing")
        .config("spark.sql.session.timeZone", "UTC")
        .getOrCreate()
    )

    yield spark

    spark.stop()
    # Restore the SIGINT handler saved above once the session is stopped.
    signal.signal(signal.SIGINT, prev)
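# Illustrative sketch (not part of the original module): the kind of round
# trip these fixtures are set up for -- write `pdf` with Spark, read it back
# with Dask, and compare. The test name and the use of pytest's built-in
# `tmpdir` fixture are assumptions for this example.
def test_spark_roundtrip_sketch(spark_session, tmpdir):
    path = str(tmpdir)

    sdf = spark_session.createDataFrame(pdf)
    sdf.write.parquet(path, mode="overwrite")

    ddf = dd.read_parquet(path, engine="pyarrow")
    got = ddf.compute().sort_values("timestamp").reset_index(drop=True)

    # Depending on how Spark encodes timestamps, the column may come back
    # timezone-naive; normalise to UTC before comparing.
    if got["timestamp"].dt.tz is None:
        got["timestamp"] = got["timestamp"].dt.tz_localize("UTC")

    assert_eq(got, pdf.sort_values("timestamp").reset_index(drop=True))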