def test_local_shuffle(ray_start_regular_shared):
    """Local shuffling reorders batches unless both batch and buffer are size 1."""
    source = parallel_it.from_range(100).for_each(lambda x: [x])

    def gather_twice(batch_size, buffer_size):
        # Build the dataset once, then draw two independently shuffled streams.
        dataset = ml_data.from_parallel_iter(source, batch_size=batch_size)
        first = dataset.local_shuffle(shuffle_buffer_size=buffer_size)
        second = dataset.local_shuffle(shuffle_buffer_size=buffer_size)
        return list(first.gather_sync()), list(second.gather_sync())

    # batch_size larger than 1 and shuffle_buffer_size larger than 1
    left, right = gather_twice(10, 5)
    assert not all(a.equals(b) for a, b in zip(left, right))

    # batch_size equals 1 and shuffle_buffer_size larger than 1
    left, right = gather_twice(1, 5)
    assert not all(a.equals(b) for a, b in zip(left, right))

    # batch_size equals 1 and shuffle_buffer_size equals 1: shuffle is a no-op
    left, right = gather_twice(1, 1)
    assert all(a.equals(b) for a, b in zip(left, right))
def _create_ml_dataset(name: str,
                       record_pieces: List[RecordPiece],
                       record_sizes: List[int],
                       num_shards: int,
                       shuffle: bool,
                       shuffle_seed: int,
                       RecordBatchCls,
                       node_hints: List[str] = None) -> MLDataset:
    """Build an MLDataset of ``num_shards`` shards from record pieces.

    :param name: prefix used for the shard batches and the parallel iterator.
    :param record_pieces: one piece per block of source data.
    :param record_sizes: number of records in each corresponding piece.
    :param num_shards: number of dataset shards (one worker actor per shard).
    :param shuffle: whether to shuffle blocks across and within shards.
    :param shuffle_seed: numpy seed for sampling/shuffling; falls back to 0
        when falsy.
    :param RecordBatchCls: class used to wrap the pieces of one shard.
    :param node_hints: optional node addresses to pin shard actors to;
        when given, num_shards must be a multiple of its length so the
        shards spread evenly across the hinted nodes.
    :return: the constructed MLDataset (non-repeated, unknown batch size).
    """
    if node_hints is not None:
        assert num_shards % len(node_hints) == 0,\
            f"num_shards: {num_shards} should be a multiple of length of node_hints: {node_hints}"
    # Seed numpy so the sampling/shuffling below is reproducible.
    np.random.seed(shuffle_seed if shuffle_seed else 0)
    # split the piece into num_shards partitions
    divided_blocks = divide_blocks(blocks=record_sizes,
                                   world_size=num_shards,
                                   shuffle=shuffle,
                                   shuffle_seed=shuffle_seed)
    record_batches = []
    for rank, blocks in divided_blocks.items():
        pieces = []
        for index, num_samples in blocks:
            record_size = record_sizes[index]
            piece = record_pieces[index]
            if num_samples != record_size:
                # Block was truncated by divide_blocks: pick num_samples row
                # ids (np.random.choice samples with replacement by default).
                assert num_samples < record_size
                new_row_ids = np.random.choice(
                    record_size, size=num_samples).tolist()
                piece = piece.with_row_ids(new_row_ids)
            pieces.append(piece)
        if shuffle:
            np.random.shuffle(pieces)
        record_batches.append(RecordBatchCls(shard_id=rank,
                                             prefix=name,
                                             record_pieces=pieces,
                                             shuffle=shuffle,
                                             shuffle_seed=shuffle_seed))
    worker_cls = ray.remote(ParallelIteratorWorkerWithLen)
    if node_hints is not None:
        # Pin each shard actor to its hinted node via a tiny custom resource.
        actors = []
        multiplier = num_shards // len(node_hints)
        resource_keys = [f"node:{node_hints[i // multiplier]}"
                         for i in range(num_shards)]
        for g, resource_key in zip(record_batches, resource_keys):
            actor = worker_cls.options(
                resources={resource_key: 0.01}).remote(g, False, len(g))
            actors.append(actor)
    else:
        # No placement hints: let Ray schedule the shard actors freely.
        # (worker_cls is already defined above; no need to re-wrap it.)
        actors = [worker_cls.remote(g, False, len(g)) for g in record_batches]
    it = parallel_it.from_actors(actors, name)
    ds = ml_dataset.from_parallel_iter(
        it, need_convert=False, batch_size=0, repeated=False)
    return ds
def test_tf_dataset(ray_start_4_cpus):  # noqa: F811
    """End-to-end: an MLDataset fed through to_tf trains a model to f(x) ~= x."""
    total = 32 * 100 * 2
    points = [i * (1 / total) for i in range(total)]
    source = parallel_it.from_items(points, 2, False).for_each(lambda x: [x, x])
    # this will create MLDataset with column RangeIndex(range(2))
    dataset = ml_data.from_parallel_iter(source, True, batch_size=32,
                                         repeated=False)
    tf_dataset = dataset.to_tf(feature_columns=[0], label_column=1)

    trainer_config = {
        "batch_size": 32,
        "fit_config": {
            "steps_per_epoch": 100,
        },
    }
    trainer = TFTrainer(
        model_creator=model_creator,
        data_creator=make_data_creator(tf_dataset),
        num_replicas=2,
        config=trainer_config,
    )
    for _ in range(10):
        trainer.train()

    model = trainer.get_model()
    prediction = model.predict([0.5])[0][0]
    # The learned identity-like function should map 0.5 near 0.5.
    assert 0.4 <= prediction <= 0.6
    trainer.shutdown()
def main():
    """Train a small TF model on an MLDataset and print its prediction at 0.5."""
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # this will create MLDataset with column RangeIndex(range(2))
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    tf_ds = ds.to_tf(feature_columns=[0], label_column=1)
    trainer = TFTrainer(
        model_creator=model_creator,
        data_creator=make_data_creator(tf_ds),
        num_replicas=2,
        config={
            "batch_size": 32,
            "fit_config": {
                "steps_per_epoch": 100,
            },
        },
    )
    for _ in range(10):
        trainer.train()
    model = trainer.get_model()
    # predict() returns a (1, 1) array; index the scalar out explicitly --
    # float() on a non-0d ndarray is deprecated since NumPy 1.25 and raises
    # in NumPy 2.x. Printed value is unchanged.
    print("f(0.5)=", float(model.predict([0.5])[0][0]))
def test_read_ray_mldataset(ray_start_regular):
    """read_ray_mldataset exposes shard schema and tiles into one chunk per shard."""
    def make_frame():
        # Both shards carry the same 10-row schema.
        return pd.DataFrame(
            {
                "a": np.arange(10).astype(np.int64, copy=False),
                "b": [f"s{i}" for i in range(10)],
            }
        )

    test_df1 = make_frame()
    test_df2 = make_frame()
    expected = pd.concat([test_df1, test_df2])

    import ray.util.iter
    from ray.util.data import from_parallel_iter

    ml_dataset = from_parallel_iter(
        ray.util.iter.from_items([test_df1, test_df2], num_shards=2),
        need_convert=False
    )
    mdf = read_ray_mldataset(ml_dataset)
    assert mdf.shape[1] == 2
    pd.testing.assert_index_equal(expected.columns,
                                  mdf.columns_value.to_pandas())
    pd.testing.assert_series_equal(expected.dtypes, mdf.dtypes)

    # Tiling should yield one read chunk per shard.
    mdf = tile(mdf)
    assert len(mdf.chunks) == 2
    for chunk in mdf.chunks:
        assert isinstance(chunk.op, DataFrameReadMLDataset)
def test_from_parallel_it(ray_start_regular_shared):
    """from_parallel_iter should batch a range iterator into pandas frames."""
    source = parallel_it.from_range(4).for_each(lambda x: [x])
    dataset = ml_data.from_parallel_iter(source, batch_size=2)

    expected_repr = ("MLDataset[from_range[4, shards=2]"
                     ".for_each().batch(2).to_pandas()]")
    assert repr(dataset) == expected_repr

    frames = list(dataset.gather_sync())
    assert len(frames) == 2
    # Each gathered frame is one batch of two single-column rows.
    assert all(frame.shape == (2, 1) for frame in frames)

    expected = source.flatten().batch(2).gather_sync().flatten()
    actual = dataset.gather_sync().for_each(lambda x: x[0].to_list()).flatten()
    assert list(actual) == list(expected)
def test_union(ray_start_regular_shared):
    """union() rejects mixing repeated/non-repeated datasets and tracks batch size."""
    base_it = parallel_it.from_range(4, 2, False).for_each(lambda x: [x])
    ds1 = ml_data.from_parallel_iter(base_it, True, 2, False)

    # Unioning a non-repeated dataset with a repeated one must fail.
    repeated_it = parallel_it.from_range(4, 2, True).for_each(lambda x: [x])
    repeated_ds = ml_data.from_parallel_iter(repeated_it, True, 2, True)
    with pytest.raises(TypeError) as ex:
        ds1.union(repeated_ds)
    assert "two MLDataset which have different repeated type" in str(ex.value)

    # union two MLDataset with same batch size
    same_it = parallel_it.from_range(4, 2, False).for_each(lambda x: [x])
    same_ds = ml_data.from_parallel_iter(same_it, True, 2, False)
    assert ds1.union(same_ds).batch_size == 2

    # union two MLDataset with different batch size
    other_it = parallel_it.from_range(4, 2, False).for_each(lambda x: [x])
    other_ds = ml_data.from_parallel_iter(other_it, True, 1, False)
    # batch_size 0 means batch_size unknown
    assert ds1.union(other_ds).batch_size == 0
def main():
    """Train a small torch model from an MLDataset and print f(0.5)."""
    total = 32 * 100 * 2
    points = [i * (1 / total) for i in range(total)]
    source = parallel_it.from_items(points, 2, False).for_each(lambda x: [x, x])
    # this will create MLDataset with column RangeIndex(range(2))
    dataset = ml_data.from_parallel_iter(source, True, batch_size=32,
                                         repeated=False)
    torch_dataset = dataset.to_torch(feature_columns=[0], label_column=1)

    trainer = TorchTrainer(
        num_workers=2,
        training_operator_cls=make_train_operator(torch_dataset),
        add_dist_sampler=False,
        config={"batch_size": 32})
    for _ in range(10):
        trainer.train(num_steps=100)

    model = trainer.get_model()
    print("f(0.5)=", float(model(torch.tensor([[0.5]]).float())[0][0]))
def test_torch_dataset(ray_start_4_cpus, use_local):
    """Training through MLDataset.to_torch should learn f(x) ~= x."""
    total = 32 * 100 * 2
    points = [i * (1 / total) for i in range(total)]
    source = parallel_it.from_items(points, 2, False).for_each(lambda x: [x, x])
    dataset = ml_data.from_parallel_iter(source, batch_size=32)
    torch_dataset = dataset.to_torch(feature_columns=[0], label_column=1)

    trainer = TorchTrainer(
        training_operator_cls=make_train_operator(torch_dataset),
        num_workers=2,
        use_local=use_local,
        add_dist_sampler=False,
        config={"batch_size": 32})
    for _ in range(10):
        trainer.train(num_steps=100)

    model = trainer.get_model()
    prediction = float(model(torch.tensor([[0.5]]).float())[0][0])
    # The learned identity-like function should map 0.5 near 0.5.
    assert 0.4 <= prediction <= 0.6
    trainer.shutdown()
def create_ml_dataset_from_spark(
        df: sql.DataFrame,
        num_shards: int,
        batch_size: int,
        fs_directory: Optional[str] = None,
        compression: Optional[str] = None) -> MLDataset:
    """Create a MLDataset from a Spark DataFrame.

    The DataFrame is first repartitioned into ``num_shards`` partitions.

    :param df: the pyspark.sql.DataFrame
    :param num_shards: the number of shards will be created for the MLDataset
    :param batch_size: the batch size for the MLDataset
    :param fs_directory: an optional distributed file system directory for
        cache the DataFrame. We will write the DataFrame to the given
        directory with parquet format if this is provided. Otherwise, we
        will write the DataFrame to ray object store.
    :param compression: the optional compression for write the DataFrame as
        parquet file. This is only useful when the fs_directory set.
    :return: a MLDataset
    """
    df = df.repartition(num_shards)
    if fs_directory is not None:
        # fs_directory provided: persist the DataFrame as parquet files and
        # build the MLDataset from them.
        df.write.parquet(fs_directory, compression=compression)
        return ml_dataset.read_parquet(fs_directory, num_shards)

    # No directory: stash each partition in the Ray object store.
    record_batch_set = _save_spark_df_to_object_store(df, num_shards)
    # TODO: we should specify the resource spec for each shard
    shard_iter = parallel_it.from_iterators(generators=record_batch_set,
                                            name="Spark DataFrame",
                                            repeat=False)
    return ml_dataset.from_parallel_iter(shard_iter,
                                         need_convert=False,
                                         batch_size=batch_size,
                                         repeated=False)
def test_read_ray_mldataset(ray_start_regular, ray_create_mars_cluster):
    """md.read_ray_mldataset should round-trip shard contents and support head()."""
    test_dfs = [
        pd.DataFrame({
            "a": np.arange(i * 10, (i + 1) * 10).astype(np.int64, copy=False),
            "b": [f"s{j}" for j in range(i * 10, (i + 1) * 10)],
        })
        for i in range(5)
    ]

    import ray.util.iter
    from ray.util.data import from_parallel_iter

    ml_dataset = from_parallel_iter(
        ray.util.iter.from_items(test_dfs, num_shards=4), need_convert=False)

    # Expected frame is whatever order the shards actually yield.
    shard_frames = []
    for shard in ml_dataset.shards():
        shard_frames.extend(list(shard))
    expected = pd.concat(shard_frames).reset_index(drop=True)

    mdf = md.read_ray_mldataset(ml_dataset)
    pd.testing.assert_frame_equal(expected, mdf.execute().fetch())
    pd.testing.assert_frame_equal(expected.head(5),
                                  mdf.head(5).execute().fetch())
    pd.testing.assert_frame_equal(expected.head(15),
                                  mdf.head(15).execute().fetch())