Example #1
import os

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

from ray.util import data as ml_data


def test_read_parquet(ray_start_regular_shared, tmp_path):
    df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    table = pa.Table.from_pandas(df1)
    pq.write_table(table, os.path.join(tmp_path, "test1.parquet"))
    df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
    table = pa.Table.from_pandas(df2)
    pq.write_table(table, os.path.join(tmp_path, "test2.parquet"))

    # without columns
    ds = ml_data.read_parquet(tmp_path, num_shards=2)
    result = list(ds.gather_sync())
    assert df1.equals(result[0])
    assert df2.equals(result[1])

    # with columns one
    ds = ml_data.read_parquet(tmp_path, num_shards=2, columns=["one"])
    result = list(ds.gather_sync())
    assert df1[["one"]].equals(result[0])
    assert df2[["one"]].equals(result[1])

    # with columns two
    ds = ml_data.read_parquet(tmp_path, num_shards=2, columns=["two"])
    result = list(ds.gather_sync())
    assert df1[["two"]].equals(result[0])
    assert df2[["two"]].equals(result[1])
Example #2
    def testDetectDistributed(self):
        with tempfile.TemporaryDirectory() as dir:
            parquet_file = os.path.join(dir, "file.parquet")
            csv_file = os.path.join(dir, "file.csv")

            data_df = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
            data_df["label"] = pd.Series(self.y)

            data_df.to_parquet(parquet_file)
            data_df.to_csv(csv_file)

            mat = RayDMatrix(parquet_file, lazy=True)
            self.assertTrue(mat.distributed)

            mat = RayDMatrix(csv_file, lazy=True)
            # Single CSV files should not be distributed
            self.assertFalse(mat.distributed)

            mat = RayDMatrix([parquet_file] * 3, lazy=True)
            self.assertTrue(mat.distributed)

            mat = RayDMatrix([csv_file] * 3, lazy=True)
            self.assertTrue(mat.distributed)

            try:
                from ray.util import data as ml_data
                mat = RayDMatrix(ml_data.read_parquet(parquet_file,
                                                      num_shards=1),
                                 lazy=True)
                self.assertTrue(mat.distributed)
            except ImportError:
                print("MLDataset not available in current Ray version. "
                      "Skipping part of test.")
Example #3
    def testFromMLDataset(self):
        try:
            from ray.util import data as ml_data
        except ImportError:
            self.skipTest("MLDataset not available in current Ray version.")
            return

        with tempfile.TemporaryDirectory() as dir:
            data_file_1 = os.path.join(dir, "data_1.parquet")
            data_file_2 = os.path.join(dir, "data_2.parquet")

            data_df = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
            data_df["label"] = pd.Series(self.y)

            df_1 = data_df[0:len(data_df) // 2]
            df_2 = data_df[len(data_df) // 2:]

            df_1.to_parquet(data_file_1)
            df_2.to_parquet(data_file_2)

            dataset = ml_data.read_parquet([data_file_1, data_file_2],
                                           num_shards=2)

            self._testMatrixCreation(dataset, "label", distributed=False)
            self._testMatrixCreation(dataset, "label", distributed=True)
Example #4
import time

from ray.util.data import read_parquet
from xgboost_ray import RayDMatrix, RayParams, train


def main(fname, num_actors=2):
    ml_dataset = read_parquet(fname, num_shards=num_actors)

    dtrain = RayDMatrix(ml_dataset, label="labels", ignore=["partition"])

    config = {
        "tree_method": "hist",
        "eval_metric": ["logloss", "error"],
    }

    evals_result = {}

    start = time.time()
    bst = train(config,
                dtrain,
                evals_result=evals_result,
                ray_params=RayParams(max_actor_restarts=1,
                                     num_actors=num_actors),
                num_boost_round=10,
                evals=[(dtrain, "train")])
    taken = time.time() - start
    print(f"TRAIN TIME TAKEN: {taken:.2f} seconds")

    bst.save_model("test_data.xgb")
    print("Final training error: {:.4f}".format(
        evals_result["train"]["error"][-1]))
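A minimal entry point for running this script is sketched below; the Parquet path is a hypothetical placeholder, and the input file is assumed to contain a "labels" column plus a "partition" column to match the RayDMatrix arguments above.

if __name__ == "__main__":
    import ray

    ray.init()
    # "example.parquet" is a placeholder; point this at a real Parquet dataset.
    main("example.parquet", num_actors=2)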
Example #5
from typing import Optional

import ray.util.data as ml_dataset
import ray.util.iter as parallel_it
from pyspark import sql
from ray.util.data import MLDataset

# Note: _save_spark_df_to_object_store is a helper defined elsewhere in the
# same module as this function.


def create_ml_dataset_from_spark(
        df: sql.DataFrame,
        num_shards: int,
        batch_size: int,
        fs_directory: Optional[str] = None,
        compression: Optional[str] = None) -> MLDataset:
    """ Create a MLDataset from Spark DataFrame

    This method will create a MLDataset from Spark DataFrame.

    :param df: the pyspark.sql.DataFrame
    :param num_shards: the number of shards will be created for the MLDataset
    :param batch_size: the batch size for the MLDataset
    :param fs_directory: an optional distributed file system directory for cache the
           DataFrame. We will write the DataFrame to the given directory with parquet
           format if this is provided. Otherwise, we will write the DataFrame to ray
           object store.
    :param compression: the optional compression for write the DataFrame as parquet
           file. This is only useful when the fs_directory set.
    :return: a MLDataset
    """
    df = df.repartition(num_shards)
    if fs_directory is None:
        # fs_directory was not provided, so save the Spark DataFrame to the Ray object store
        record_batch_set = _save_spark_df_to_object_store(df, num_shards)
        # TODO: we should specify the resource spec for each shard
        it = parallel_it.from_iterators(generators=record_batch_set,
                                        name="Spark DataFrame",
                                        repeat=False)
        ds = ml_dataset.from_parallel_iter(it,
                                           need_convert=False,
                                           batch_size=batch_size,
                                           repeated=False)
        return ds
    else:
        # fs_directory was provided, so write the Spark DataFrame as Parquet files
        df.write.parquet(fs_directory, compression=compression)
        # create the MLDataset from the Parquet files
        ds = ml_dataset.read_parquet(fs_directory, num_shards)
        return ds
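A rough usage sketch for the function above (the SparkSession setup, column names, and cache directory are illustrative assumptions, not part of the original example):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame(
    [(1.0, 0), (2.0, 1), (3.0, 0), (4.0, 1)], ["feature", "label"])

# In-memory path: shards are kept in the Ray object store.
ds = create_ml_dataset_from_spark(df, num_shards=2, batch_size=2)

# Cached path: shards are written as Parquet files to a shared directory.
# "hdfs:///tmp/cache" is a hypothetical placeholder.
ds_cached = create_ml_dataset_from_spark(
    df, num_shards=2, batch_size=2,
    fs_directory="hdfs:///tmp/cache", compression="snappy")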