def test_metrics_union(ray_start_regular_shared):
    it1 = from_items([1, 2, 3, 4], num_shards=1)
    it2 = from_items([1, 2, 3, 4], num_shards=1)

    def foo_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["foo"] += x
        return metrics.counters["foo"]

    def bar_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["bar"] += 100
        return metrics.counters["bar"]

    def verify_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["n"] += 1
        # Check the metrics context is shared.
        if metrics.counters["n"] >= 2:
            assert "foo" in metrics.counters
            assert "bar" in metrics.counters
        return x

    it1 = it1.gather_async().for_each(foo_metrics)
    it2 = it2.gather_async().for_each(bar_metrics)
    it3 = it1.union(it2, deterministic=True)
    it3 = it3.for_each(verify_metrics)
    assert it3.take(10) == [1, 100, 3, 200, 6, 300, 10, 400]

def test_union(ray_start_regular_shared):
    it1 = from_items(["a", "b", "c"], 1)
    it2 = from_items(["x", "y", "z"], 1)
    it = it1.union(it2)
    assert (repr(it) == "ParallelIterator[ParallelUnion[ParallelIterator["
            "from_items[str, 3, shards=1]], ParallelIterator["
            "from_items[str, 3, shards=1]]]]")
    assert list(it.gather_sync()) == ["a", "x", "b", "y", "c", "z"]

def test_local_shuffle(ray_start_regular_shared):
    # confirm that no data disappears, and they all stay within the same shard
    it = from_range(8, num_shards=2).local_shuffle(shuffle_buffer_size=2)
    assert repr(it) == ("ParallelIterator[from_range[8, shards=2]"
                        ".local_shuffle(shuffle_buffer_size=2, seed=None)]")
    shard_0 = it.get_shard(0)
    shard_1 = it.get_shard(1)
    assert set(shard_0) == {0, 1, 2, 3}
    assert set(shard_1) == {4, 5, 6, 7}

    # check that shuffling results in different orders
    it1 = from_range(100, num_shards=10).local_shuffle(shuffle_buffer_size=5)
    it2 = from_range(100, num_shards=10).local_shuffle(shuffle_buffer_size=5)
    assert list(it1.gather_sync()) != list(it2.gather_sync())

    # buffer size of 1 should not result in any shuffling
    it3 = from_range(10, num_shards=1).local_shuffle(shuffle_buffer_size=1)
    assert list(it3.gather_sync()) == list(range(10))

    # statistical test
    it4 = from_items(
        [0, 1] * 10000, num_shards=1).local_shuffle(shuffle_buffer_size=100)
    result = "".join(it4.gather_sync().for_each(str))
    freq_counter = Counter(zip(result[:-1], result[1:]))
    assert len(freq_counter) == 4
    for key, value in freq_counter.items():
        assert value / len(freq_counter) > 0.2

def main():
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # this will create MLDataset with column RangeIndex(range(2))
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    tf_ds = ds.to_tf(feature_columns=[0], label_column=1)
    trainer = TFTrainer(
        model_creator=model_creator,
        data_creator=make_data_creator(tf_ds),
        num_replicas=2,
        config={
            "batch_size": 32,
            "fit_config": {
                "steps_per_epoch": 100,
            },
        },
    )

    for _ in range(10):
        trainer.train()

    model = trainer.get_model()
    print("f(0.5)=", float(model.predict([0.5])))

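# The helpers `model_creator` and `make_data_creator` used by the TF examples in this
# file are not defined here. Below is a minimal sketch of what they might look like,
# assuming TFTrainer's usual conventions (model_creator(config) returns a compiled
# Keras model; data_creator(config) returns a (train, validation) dataset pair). The
# single-unit linear model and the reuse of tf_ds for validation are assumptions
# chosen because the example data maps x -> x, not part of the original code.
import tensorflow as tf


def model_creator(config):
    # One linear unit is enough to fit the identity mapping used above.
    model = tf.keras.models.Sequential(
        [tf.keras.layers.Dense(1, input_shape=(1, ))])
    model.compile(
        optimizer="adam", loss="mean_squared_error", metrics=["mse"])
    return model


def make_data_creator(tf_ds):
    def data_creator(config):
        # Assumption: the trainer expects a (train_dataset, validation_dataset)
        # pair; the same dataset is reused for both here.
        return tf_ds, tf_ds

    return data_creator
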
def test_for_each_concur_sync(ray_start_regular_shared):
    main_wait = Semaphore.remote(value=0)
    test_wait = Semaphore.remote(value=0)

    def task(x):
        i, main_wait, test_wait = x
        ray.get(main_wait.release.remote())
        ray.get(test_wait.acquire.remote())
        return i + 10

    @ray.remote(num_cpus=0.01)
    def to_list(it):
        return list(it)

    it = from_items(
        [(i, main_wait, test_wait) for i in range(8)], num_shards=2)
    it = it.for_each(task, max_concurrency=2, resources={"num_cpus": 0.01})

    list_promise = to_list.remote(it.gather_sync())

    for i in range(4):
        assert i in [0, 1, 2, 3]
        ray.get(main_wait.acquire.remote())

    # There should be exactly 4 tasks executing at this point.
    assert ray.get(main_wait.locked.remote()) is True, "Too much parallelism"

    for i in range(8):
        ray.get(test_wait.release.remote())

    assert repr(
        it) == "ParallelIterator[from_items[tuple, 8, shards=2].for_each()]"

    result_list = ray.get(list_promise)
    assert set(result_list) == set(range(10, 18))

def test_tf_dataset(ray_start_4_cpus):  # noqa: F811
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # this will create MLDataset with column RangeIndex(range(2))
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    tf_ds = ds.to_tf(feature_columns=[0], label_column=1)
    trainer = TFTrainer(
        model_creator=model_creator,
        data_creator=make_data_creator(tf_ds),
        num_replicas=2,
        config={
            "batch_size": 32,
            "fit_config": {
                "steps_per_epoch": 100,
            },
        },
    )

    for _ in range(10):
        trainer.train()

    model = trainer.get_model()
    prediction = model.predict([0.5])[0][0]
    assert 0.4 <= prediction <= 0.6
    trainer.shutdown()

def test_zip_with_source_actor(ray_start_regular_shared):
    it = from_items([1, 2, 3, 4], num_shards=2)
    counts = collections.defaultdict(int)
    for actor, value in it.gather_async().zip_with_source_actor():
        counts[actor] += 1
    assert len(counts) == 2
    for a, count in counts.items():
        assert count == 2

def test_serialization(ray_start_regular_shared):
    it = (from_items([1, 2, 3, 4]).gather_sync().for_each(lambda x: x).filter(
        lambda x: True).batch(2).flatten())
    assert (repr(it) == "LocalIterator[ParallelIterator["
            "from_items[int, 4, shards=2]].gather_sync()."
            "for_each().filter().batch(2).flatten()]")

    @ray.remote
    def get(it):
        return list(it)

    assert ray.get(get.remote(it)) == [1, 2, 3, 4]

def test_metrics_union_recursive(ray_start_regular_shared):
    it1 = from_items([1, 2, 3, 4], num_shards=1)
    it2 = from_items([1, 2, 3, 4], num_shards=1)
    it3 = from_items([1, 2, 3, 4], num_shards=1)

    def foo_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["foo"] += 1
        return metrics.counters["foo"]

    def bar_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["bar"] += 1
        return metrics.counters["bar"]

    def baz_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["baz"] += 1
        return metrics.counters["baz"]

    def verify_metrics(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["n"] += 1
        # Check the metrics context is shared recursively.
        print(metrics.counters)
        if metrics.counters["n"] >= 3:
            assert "foo" in metrics.counters
            assert "bar" in metrics.counters
            assert "baz" in metrics.counters
        return x

    it1 = it1.gather_async().for_each(foo_metrics)
    it2 = it2.gather_async().for_each(bar_metrics)
    it3 = it3.gather_async().for_each(baz_metrics)
    it12 = it1.union(it2, deterministic=True)
    it123 = it12.union(it3, deterministic=True)
    out = it123.for_each(verify_metrics)
    assert out.take(20) == [1, 1, 1, 2, 2, 3, 2, 4, 3, 3, 4, 4]

def test_metrics(ray_start_regular_shared):
    it = from_items([1, 2, 3, 4], num_shards=1)
    it2 = from_items([1, 2, 3, 4], num_shards=1)

    def f(x):
        metrics = LocalIterator.get_metrics()
        metrics.counters["foo"] += x
        return metrics.counters["foo"]

    it = it.gather_sync().for_each(f)
    it2 = it2.gather_sync().for_each(f)

    # Context cannot be accessed outside the iterator.
    with pytest.raises(ValueError):
        LocalIterator.get_metrics()

    # Tests iterators have isolated contexts.
    assert it.take(4) == [1, 3, 6, 10]
    assert it2.take(4) == [1, 3, 6, 10]

    # Context cannot be accessed outside the iterator.
    with pytest.raises(ValueError):
        LocalIterator.get_metrics()

def main():
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    it = parallel_it.from_items(data, 2, False).for_each(lambda x: [x, x])
    # this will create MLDataset with column RangeIndex(range(2))
    ds = ml_data.from_parallel_iter(it, True, batch_size=32, repeated=False)
    torch_ds = ds.to_torch(feature_columns=[0], label_column=1)
    trainer = TorchTrainer(
        num_workers=2,
        training_operator_cls=make_train_operator(torch_ds),
        add_dist_sampler=False,
        config={"batch_size": 32})
    for i in range(10):
        trainer.train(num_steps=100)
    model = trainer.get_model()
    print("f(0.5)=", float(model(torch.tensor([[0.5]]).float())[0][0]))

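# The Torch examples in this file rely on make_train_operator and an unnamed model,
# neither of which is defined here. Below is a minimal sketch of a model that would
# fit the final prediction call above (a [batch, 1] float input mapped to a
# [batch, 1] output). The class name `Net` and its use inside make_train_operator
# are assumptions, not part of the original code.
import torch
import torch.nn as nn


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        # The example data maps x -> x, so a single linear unit suffices.
        self.fc = nn.Linear(1, 1)

    def forward(self, x):
        return self.fc(x)
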
def from_modin(cls, df, num_shards: int = 2):
    """Create an MLDataset from a Modin DataFrame.

    Args:
        df (modin.pandas.DataFrame): A Modin DataFrame.
        num_shards (int): The number of worker actors to create.
    """
    try:
        import modin.pandas as pd
    except ImportError:
        raise ImportError("Cannot convert from Modin because "
                          "Modin is not installed.") from None
    if not isinstance(df, (pd.DataFrame, pd.Series)):
        raise ValueError("Must provide a modin.pandas DataFrame or Series")
    from modin.distributed.dataframe.pandas.partitions import (
        unwrap_partitions)
    parts = unwrap_partitions(df)
    modin_iter = from_items(parts, num_shards=num_shards, repeat=False)
    return cls.from_parallel_it(modin_iter, batch_size=0, repeated=False)

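# A hypothetical usage sketch for from_modin, assuming it is exposed as a classmethod
# on MLDataset (which the `cls.from_parallel_it(...)` call above suggests). The import
# path and the example DataFrame contents are assumptions.
import modin.pandas as mpd
from ray.util.data import MLDataset

mdf = mpd.DataFrame({"x": list(range(100)), "y": list(range(100))})
ds = MLDataset.from_modin(mdf, num_shards=2)
# The resulting MLDataset can then be consumed like the ones above,
# e.g. via ds.to_torch(...) or ds.to_tf(...).
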
def test_torch_dataset(ray_start_4_cpus, use_local):
    num_points = 32 * 100 * 2
    data = [i * (1 / num_points) for i in range(num_points)]
    para_it = parallel_it.from_items(data, 2, False).for_each(
        lambda x: [x, x])
    ds = ml_data.from_parallel_iter(para_it, batch_size=32)
    torch_ds = ds.to_torch(feature_columns=[0], label_column=1)
    operator = make_train_operator(torch_ds)
    trainer = TorchTrainer(
        training_operator_cls=operator,
        num_workers=2,
        use_local=use_local,
        add_dist_sampler=False,
        config={"batch_size": 32})
    for i in range(10):
        trainer.train(num_steps=100)

    model = trainer.get_model()
    prediction = float(model(torch.tensor([[0.5]]).float())[0][0])
    assert 0.4 <= prediction <= 0.6
    trainer.shutdown()

def process_data(data_set_type: str, parallel=True):
    files = recurse_files(path.join("./data/webnlg", "raw", data_set_type))
    xml_objs = [parse_xml_file(f) for f in files]

    entries = []
    if not parallel:
        print("[Info] Processing data...")
        chunks = [RDFFileReader(x).data for x in xml_objs]
        entries = flatten_list(chunks)
    else:
        num_shards: int = num_cpus if parallel else 1
        print(f"[Info] Processing data in {num_shards} shards...")
        iterator = (
            pariter.from_items(xml_objs[0:5], num_shards=num_shards)
            # .for_each(lambda f: cleaner.clean)
            .for_each(lambda xmldata: RDFFileReader(xmldata).data).flatten())
        entries = iterator.gather_async()

    return tqdm(entries, desc="WebNLG", unit="entry")

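# `recurse_files` and `flatten_list` are referenced above but not defined in this
# snippet. Minimal sketches of what they might do, inferred from how they are used;
# the names exist in the original, but these bodies are assumptions.
import os
from os import path


def recurse_files(directory):
    # Return every file path found under `directory`, recursively.
    return [
        path.join(root, name) for root, _, names in os.walk(directory)
        for name in names
    ]


def flatten_list(chunks):
    # Flatten a list of per-file entry lists into a single list of entries.
    return [entry for chunk in chunks for entry in chunk]
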
def test_for_each_concur(ray_start_regular_shared):
    main_wait = Semaphore.remote(value=0)
    test_wait = Semaphore.remote(value=0)

    def task(x):
        i, main_wait, test_wait = x
        ray.get(main_wait.release.remote())
        ray.get(test_wait.acquire.remote())
        return i + 10

    @ray.remote(num_cpus=0.1)
    def to_list(it):
        return list(it)

    it = from_items(
        [(i, main_wait, test_wait) for i in range(8)], num_shards=2)
    it = it.for_each(task, max_concurrency=2, resources={"num_cpus": 0.1})

    for i in range(4):
        ray.get(main_wait.acquire.remote())

    # There should be exactly 4 tasks executing at this point.
    assert ray.get(main_wait.locked.remote()) is True, "Too much parallelism"

    # When we finish one task, exactly one more should start.
    ray.get(test_wait.release.remote())
    ray.get(main_wait.acquire.remote())
    assert ray.get(main_wait.locked.remote()) is True, "Too much parallelism"

    # Finish everything and make sure the output matches a regular iterator.
    for i in range(3):
        ray.get(test_wait.release.remote())

    assert repr(
        it) == "ParallelIterator[from_items[tuple, 8, shards=2].for_each()]"
    assert ray.get(to_list.remote(it.gather_sync())) == list(range(10, 18))

def test_from_items_repeat(ray_start_regular_shared):
    it = from_items([1, 2, 3, 4], repeat=True)
    assert repr(
        it) == "ParallelIterator[from_items[int, 4, shards=2, repeat=True]]"
    assert it.take(8) == [1, 2, 3, 4, 1, 2, 3, 4]

def test_select_shards(ray_start_regular_shared):
    it = from_items([1, 2, 3, 4], num_shards=4)
    it1 = it.select_shards([0, 2])
    it2 = it.select_shards([1, 3])
    assert it1.take(4) == [1, 3]
    assert it2.take(4) == [2, 4]

def test_union_local(ray_start_regular_shared):
    it1 = from_items(["a", "b", "c"], 1).gather_async()
    it2 = from_range(5, 2).for_each(str).gather_async()
    it = it1.union(it2)
    assert sorted(it) == ["0", "1", "2", "3", "4", "a", "b", "c"]

def test_from_items(ray_start_regular_shared):
    it = from_items([1, 2, 3, 4])
    assert repr(it) == "ParallelIterator[from_items[int, 4, shards=2]]"
    assert list(it.gather_sync()) == [1, 2, 3, 4]
    assert next(it.gather_sync()) == 1

def test_flatten(ray_start_regular_shared):
    it = from_items([[1, 2], [3, 4]], 1).flatten()
    assert repr(
        it) == "ParallelIterator[from_items[list, 2, shards=1].flatten()]"
    assert list(it.gather_sync()) == [1, 2, 3, 4]