Beispiel #1
0
def test_examples_iterable_shuffle_data_sources(generate_examples_fn):
    ex_iterable = ExamplesIterable(generate_examples_fn,
                                   {"filepaths": ["0.txt", "1.txt"]})
    ex_iterable = ex_iterable.shuffle_data_sources(np.random.default_rng(40))
    expected = list(generate_examples_fn(
        filepaths=["1.txt", "0.txt"]))  # shuffle the filepaths
    assert list(ex_iterable) == expected
def test_randomly_cycling_multi_sources_examples_iterable(generate_examples_fn, probabilities):
    seed = 42
    generator = np.random.default_rng(seed)
    ex_iterable1 = ExamplesIterable(generate_examples_fn, {"text": "foo"})
    ex_iterable2 = ExamplesIterable(generate_examples_fn, {"text": "bar"})
    ex_iterable = RandomlyCyclingMultiSourcesExamplesIterable(
        [ex_iterable1, ex_iterable2], generator=generator, probabilities=probabilities
    )

    # The source used randomly changes at each example. It stops when one of the iterators is empty.
    rng = deepcopy(generator)
    iterators = (generate_examples_fn(text="foo"), generate_examples_fn(text="bar"))
    indices_iterator = RandomlyCyclingMultiSourcesExamplesIterable._iter_random_indices(
        rng, len(iterators), p=probabilities
    )
    expected = []
    for i in indices_iterator:
        for key, example in iterators[i]:
            expected.append((key, example))
            break
        else:
            break

    assert next(iter(ex_iterable)) == expected[0]
    assert list(ex_iterable) == expected
def test_cycling_multi_sources_examples_iterable(generate_examples_fn):
    ex_iterable1 = ExamplesIterable(generate_examples_fn, {"text": "foo"})
    ex_iterable2 = ExamplesIterable(generate_examples_fn, {"text": "bar"})
    ex_iterable = CyclingMultiSourcesExamplesIterable([ex_iterable1, ex_iterable2])
    expected = list(chain(*zip(generate_examples_fn(text="foo"), generate_examples_fn(text="bar"))))

    assert next(iter(ex_iterable)) == expected[0]
    assert list(ex_iterable) == expected
    assert all((x["id"], x["text"]) == (i // 2, "bar" if i % 2 else "foo") for i, (_, x) in enumerate(ex_iterable))
def test_iterable_dataset_shuffle(dataset: IterableDataset, generate_examples_fn, seed, epoch):
    buffer_size = 3
    dataset = deepcopy(dataset)
    dataset._ex_iterable.kwargs["filepaths"] = ["0.txt", "1.txt"]
    dataset = dataset.shuffle(seed, buffer_size=buffer_size)
    assert isinstance(dataset._shuffling, ShufflingConfig)
    assert isinstance(dataset._shuffling.generator, np.random.Generator)
    assert is_rng_equal(dataset._shuffling.generator, np.random.default_rng(seed))
    # Effective seed is sum of seed and epoch
    if epoch is None or epoch == 0:
        effective_seed = seed
    else:
        dataset.set_epoch(epoch)
        effective_seed = np.random.default_rng(seed).integers(0, 1 << 63) - epoch
    # Shuffling adds a shuffle buffer
    expected_first_example_index = next(
        iter(BufferShuffledExamplesIterable._iter_random_indices(np.random.default_rng(effective_seed), buffer_size))
    )
    assert isinstance(dataset._ex_iterable, BufferShuffledExamplesIterable)
    # It also shuffles the underlying examples iterable
    expected_ex_iterable = ExamplesIterable(
        generate_examples_fn, {"filepaths": ["0.txt", "1.txt"]}
    ).shuffle_data_sources(np.random.default_rng(effective_seed))
    assert isinstance(dataset._ex_iterable.ex_iterable, ExamplesIterable)
    assert next(iter(dataset)) == list(islice(expected_ex_iterable, expected_first_example_index + 1))[-1][1]
def test_mapped_examples_iterable(generate_examples_fn, n, func, batch_size):
    base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n})
    ex_iterable = MappedExamplesIterable(base_ex_iterable, func, batched=batch_size is not None, batch_size=batch_size)
    all_examples = list(generate_examples_fn(n=n))
    if batch_size is None:
        expected = [(key, func(x)) for key, x in all_examples]
    else:
        # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function
        expected_examples_per_batch = [
            list(_batch_to_examples(func(_examples_to_batch([x for _, x in all_examples[i : i + batch_size]]))))
            for i in range(0, len(all_examples), batch_size)
        ]
        # The new key is the concatenation of the keys of each example in the batch
        expected_keys_per_batch = [
            ["_".join(key for key, _ in all_examples[i : i + batch_size])] * len(examples)
            for i, examples in zip(range(0, len(all_examples), batch_size), expected_examples_per_batch)
        ]
        # Combine keys and examples
        expected = [
            (key, example)
            for expected_keys, expected_examples in zip(expected_keys_per_batch, expected_examples_per_batch)
            for key, example in zip(expected_keys, expected_examples)
        ]
    assert next(iter(ex_iterable)) == expected[0]
    assert list(ex_iterable) == expected
def test_take_examples_iterable(generate_examples_fn):
    total, count = 10, 2
    base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": total})
    take_ex_iterable = TakeExamplesIterable(base_ex_iterable, n=count)
    expected = list(generate_examples_fn(n=total))[:count]
    assert list(take_ex_iterable) == expected
    assert take_ex_iterable.shuffle_data_sources(42) is take_ex_iterable, "skip examples makes the shards order fixed"
Beispiel #7
0
def test_mapped_examples_iterable_with_indices(generate_examples_fn, n, func,
                                               batch_size):
    base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n})
    ex_iterable = MappedExamplesIterable(base_ex_iterable,
                                         func,
                                         batched=batch_size is not None,
                                         batch_size=batch_size,
                                         with_indices=True)
    all_examples = [x for _, x in generate_examples_fn(n=n)]
    if batch_size is None:
        expected = [{
            **x,
            **func(x, idx)
        } for idx, x in enumerate(all_examples)]
    else:
        # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function
        all_transformed_examples = []
        for batch_offset in range(0, len(all_examples), batch_size):
            examples = all_examples[batch_offset:batch_offset + batch_size]
            batch = _examples_to_batch(examples)
            indices = list(range(batch_offset, batch_offset + len(examples)))
            transformed_batch = func(batch, indices)
            all_transformed_examples.extend(
                _batch_to_examples(transformed_batch))
        expected = _examples_to_batch(all_examples)
        expected.update(_examples_to_batch(all_transformed_examples))
        expected = list(_batch_to_examples(expected))
    assert next(iter(ex_iterable))[1] == expected[0]
    assert list(x for _, x in ex_iterable) == expected
def test_iterable_dataset_cast(generate_examples_fn):
    ex_iterable = ExamplesIterable(generate_examples_fn, {"label": 10})
    features = Features({"id": Value("int64"), "label": Value("int64")})
    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))
    new_features = Features({"id": Value("int64"), "label": Value("bool")})
    casted_dataset = dataset.cast(new_features)
    assert list(casted_dataset) == [new_features.encode_example(ex) for _, ex in ex_iterable]
Beispiel #9
0
def test_filtered_examples_iterable_input_columns(generate_examples_fn, n,
                                                  func, batch_size,
                                                  input_columns):
    base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n})
    ex_iterable = FilteredExamplesIterable(base_ex_iterable,
                                           func,
                                           batched=batch_size is not None,
                                           batch_size=batch_size,
                                           input_columns=input_columns)
    all_examples = [x for _, x in generate_examples_fn(n=n)]
    columns_to_input = input_columns if isinstance(input_columns,
                                                   list) else [input_columns]
    if batch_size is None:
        expected = [
            x for x in all_examples
            if func(*[x[col] for col in columns_to_input])
        ]
    else:
        # For batched filter we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function
        expected = []
        for batch_offset in range(0, len(all_examples), batch_size):
            examples = all_examples[batch_offset:batch_offset + batch_size]
            batch = _examples_to_batch(examples)
            mask = func(*[batch[col] for col in columns_to_input])
            expected.extend(
                [x for x, to_keep in zip(examples, mask) if to_keep])
    assert next(iter(ex_iterable))[1] == expected[0]
    assert list(x for _, x in ex_iterable) == expected
Beispiel #10
0
def test_mapped_examples_iterable_input_columns(generate_examples_fn, n, func,
                                                batch_size, input_columns):
    base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n})
    ex_iterable = MappedExamplesIterable(base_ex_iterable,
                                         func,
                                         batched=batch_size is not None,
                                         batch_size=batch_size,
                                         input_columns=input_columns)
    all_examples = [x for _, x in generate_examples_fn(n=n)]
    columns_to_input = input_columns if isinstance(input_columns,
                                                   list) else [input_columns]
    if batch_size is None:
        expected = [{
            **x,
            **func(*[x[col] for col in columns_to_input])
        } for x in all_examples]
    else:
        # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function
        all_transformed_examples = []
        for batch_offset in range(0, len(all_examples), batch_size):
            examples = all_examples[batch_offset:batch_offset + batch_size]
            batch = _examples_to_batch(examples)
            transformed_batch = func(*[batch[col] for col in columns_to_input])
            all_transformed_examples.extend(
                _batch_to_examples(transformed_batch))
        expected = _examples_to_batch(all_examples)
        expected.update(_examples_to_batch(all_transformed_examples))
        expected = list(_batch_to_examples(expected))
    assert next(iter(ex_iterable))[1] == expected[0]
    assert list(x for _, x in ex_iterable) == expected
def test_buffer_shuffled_examples_iterable(generate_examples_fn, seed):
    n, buffer_size = 100, 30
    generator = np.random.default_rng(seed)
    base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n})
    ex_iterable = BufferShuffledExamplesIterable(base_ex_iterable, buffer_size=buffer_size, generator=generator)

    rng = deepcopy(generator)
    expected_indices_used_for_shuffling = list(
        islice(BufferShuffledExamplesIterable._iter_random_indices(rng, buffer_size=buffer_size), n - buffer_size)
    )
    # indices to pick in the shuffle buffer should all be in the right range
    assert all(0 <= index_to_pick < buffer_size for index_to_pick in expected_indices_used_for_shuffling)
    # it should be random indices
    assert expected_indices_used_for_shuffling != list(range(buffer_size))

    # The final order of examples is the result of a shuffle buffer.
    all_examples = list(generate_examples_fn(n=n))
    # We create a buffer and we pick random examples from it.
    buffer, rest = all_examples[:buffer_size], all_examples[buffer_size:]
    expected = []
    for i, index_to_pick in enumerate(expected_indices_used_for_shuffling):
        expected.append(buffer[index_to_pick])
        # The picked examples are directly replaced by the next examples from the iterable.
        buffer[index_to_pick] = rest.pop(0)
    # Once we have reached the end of the iterable, we shuffle the buffer and return the remaining examples.
    rng.shuffle(buffer)
    expected += buffer

    assert next(iter(ex_iterable)) == expected[0]
    assert list(ex_iterable) == expected
    assert sorted(list(ex_iterable)) == sorted(all_examples)
Beispiel #12
0
def test_skip_examples_iterable(generate_examples_fn):
    total, count = 10, 2
    base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": total})
    skip_ex_iterable = SkipExamplesIterable(base_ex_iterable, n=count)
    expected = list(generate_examples_fn(n=total))[count:]
    assert list(skip_ex_iterable) == expected
    assert (skip_ex_iterable.shuffle_data_sources(np.random.default_rng(42)) is
            skip_ex_iterable), "skip examples makes the shards order fixed"
def test_examples_iterable_shuffle_shards_and_metadata():
    def gen(filepaths, all_metadata):
        for i, (filepath, metadata) in enumerate(zip(filepaths, all_metadata)):
            yield i, {"filepath": filepath, "metadata": metadata}

    ex_iterable = ExamplesIterable(
        gen,
        {
            "filepaths": [f"{i}.txt" for i in range(100)],
            "all_metadata": [{"id": str(i)} for i in range(100)],
        },
    )
    ex_iterable = ex_iterable.shuffle_data_sources(np.random.default_rng(42))
    out = list(ex_iterable)
    filepaths_ids = [x["filepath"].split(".")[0] for _, x in out]
    metadata_ids = [x["metadata"]["id"] for _, x in out]
    assert filepaths_ids == metadata_ids, "entangled lists of shards/metadata should be shuffled the same way"
def test_iterable_dataset_info(generate_examples_fn):
    info = DatasetInfo(description="desc", citation="@article{}", size_in_bytes=42)
    ex_iterable = ExamplesIterable(generate_examples_fn, {})
    dataset = IterableDataset(ex_iterable, info=info)
    assert dataset.info == info
    assert dataset.description == info.description
    assert dataset.citation == info.citation
    assert dataset.size_in_bytes == info.size_in_bytes
def test_iterable_dataset_features(generate_examples_fn, features):
    ex_iterable = ExamplesIterable(generate_examples_fn, {"label": 0})
    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))
    if features:
        expected = [features.encode_example(x) for _, x in ex_iterable]
    else:
        expected = [x for _, x in ex_iterable]
    assert list(dataset) == expected
def test_iterable_dataset_factory_torch_integration(generate_examples_fn):
    import torch

    ex_iterable = ExamplesIterable(generate_examples_fn, {})
    dataset = iterable_dataset(ex_iterable, format_type="torch")
    assert isinstance(dataset, IterableDataset)
    assert isinstance(dataset, torch.utils.data.IterableDataset)
    assert dataset._format_type == "torch"
    assert dataset._ex_iterable is ex_iterable
def test_iterable_dataset_shuffle_after_skip_or_take(generate_examples_fn, method):
    seed = 42
    n, n_shards = 3, 10
    count = 7
    ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n, "filepaths": [f"{i}.txt" for i in range(n_shards)]})
    dataset = IterableDataset(ex_iterable)
    dataset = dataset.skip(n) if method == "skip" else dataset.take(count)
    shuffled_dataset = dataset.shuffle(seed, buffer_size=DEFAULT_N_EXAMPLES)
    # shuffling a skip/take dataset should keep the same examples and don't shuffle the shards
    key = lambda x: f"{x['filepath']}_{x['id']}"  # noqa: E731
    assert sorted(dataset, key=key) == sorted(shuffled_dataset, key=key)
Beispiel #18
0
def test_examples_iterable_with_kwargs(generate_examples_fn):
    ex_iterable = ExamplesIterable(generate_examples_fn, {
        "filepaths": ["0.txt", "1.txt"],
        "split": "train"
    })
    expected = list(
        generate_examples_fn(filepaths=["0.txt", "1.txt"], split="train"))
    assert list(ex_iterable) == expected
    assert all("split" in ex for _, ex in ex_iterable)
    assert sorted({ex["filepath"]
                   for _, ex in ex_iterable}) == ["0.txt", "1.txt"]
Beispiel #19
0
def dataset_with_several_columns(generate_examples_fn):
    ex_iterable = ExamplesIterable(
        generate_examples_fn,
        {
            "filepath": ["data0.txt", "data1.txt", "data2.txt"],
            "metadata": {
                "sources": ["https://foo.bar"]
            }
        },
    )
    return IterableDataset(ex_iterable,
                           info=DatasetInfo(description="dummy"),
                           split="train")
def test_interleave_datasets_with_features(dataset: IterableDataset, generate_examples_fn):
    features = Features(
        {
            "id": Value("int64"),
            "label": ClassLabel(names=["negative", "positive"]),
        }
    )
    ex_iterable = ExamplesIterable(generate_examples_fn, {"label": 0})
    dataset_with_features = IterableDataset(ex_iterable, info=DatasetInfo(features=features))

    merged_dataset = interleave_datasets([dataset, dataset_with_features], probabilities=[0, 1])
    assert isinstance(merged_dataset._ex_iterable, CyclingMultiSourcesExamplesIterable)
    assert isinstance(merged_dataset._ex_iterable.ex_iterables[1], TypedExamplesIterable)
    assert merged_dataset._ex_iterable.ex_iterables[1].features == features
    assert next(iter(merged_dataset)) == next(iter(dataset_with_features))
def test_iterable_dataset_map_complex_features(dataset: IterableDataset, generate_examples_fn):
    # https://github.com/huggingface/datasets/issues/3505
    ex_iterable = ExamplesIterable(generate_examples_fn, {"label": "positive"})
    features = Features(
        {
            "id": Value("int64"),
            "label": Value("string"),
        }
    )
    dataset = IterableDataset(ex_iterable, info=DatasetInfo(features=features))
    dataset = dataset.cast_column("label", ClassLabel(names=["negative", "positive"]))
    dataset = dataset.map(lambda x: {"id+1": x["id"] + 1, **x})
    assert isinstance(dataset._ex_iterable, MappedExamplesIterable)
    features["label"] = ClassLabel(names=["negative", "positive"])
    assert [{k: v for k, v in ex.items() if k != "id+1"} for ex in dataset] == [
        features.encode_example(ex) for _, ex in ex_iterable
    ]
def test_filtered_examples_iterable_with_indices(generate_examples_fn, n, func, batch_size):
    base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n})
    ex_iterable = FilteredExamplesIterable(
        base_ex_iterable, func, batched=batch_size is not None, batch_size=batch_size, with_indices=True
    )
    all_examples = [x for _, x in generate_examples_fn(n=n)]
    if batch_size is None:
        expected = [x for idx, x in enumerate(all_examples) if func(x, idx)]
    else:
        # For batched filter we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function
        expected = []
        for batch_offset in range(0, len(all_examples), batch_size):
            examples = all_examples[batch_offset : batch_offset + batch_size]
            batch = _examples_to_batch(examples)
            indices = list(range(batch_offset, batch_offset + len(examples)))
            mask = func(batch, indices)
            expected.extend([x for x, to_keep in zip(examples, mask) if to_keep])
    assert next(iter(ex_iterable))[1] == expected[0]
    assert list(x for _, x in ex_iterable) == expected
Beispiel #23
0
def test_mapped_examples_iterable_drop_last_batch(generate_examples_fn, n,
                                                  func, batch_size):
    base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n})
    ex_iterable = MappedExamplesIterable(base_ex_iterable,
                                         func,
                                         batched=batch_size is not None,
                                         batch_size=batch_size,
                                         drop_last_batch=True)
    all_examples = [x for _, x in generate_examples_fn(n=n)]
    is_empty = False
    if batch_size is None:
        # `drop_last_batch` has no effect here
        expected = [{**x, **func(x)} for x in all_examples]
    else:
        # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function
        all_transformed_examples = []
        for batch_offset in range(0, len(all_examples), batch_size):
            examples = all_examples[batch_offset:batch_offset + batch_size]
            if len(examples) < batch_size:  # ignore last batch
                break
            batch = _examples_to_batch(examples)
            transformed_batch = func(batch)
            all_transformed_examples.extend(
                _batch_to_examples(transformed_batch))
        all_examples = all_examples if n % batch_size == 0 else all_examples[:n
                                                                             //
                                                                             batch_size
                                                                             *
                                                                             batch_size]
        if all_examples:
            expected = _examples_to_batch(all_examples)
            expected.update(_examples_to_batch(all_transformed_examples))
            expected = list(_batch_to_examples(expected))
        else:
            is_empty = True

    if not is_empty:
        assert next(iter(ex_iterable))[1] == expected[0]
        assert list(x for _, x in ex_iterable) == expected
    else:
        with pytest.raises(StopIteration):
            next(iter(ex_iterable))
def test_mapped_examples_iterable_remove_columns(generate_examples_fn, n, func, batch_size, remove_columns):
    base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n, "extra_column": "foo"})
    ex_iterable = MappedExamplesIterable(
        base_ex_iterable, func, batched=batch_size is not None, batch_size=batch_size, remove_columns=remove_columns
    )
    all_examples = [x for _, x in generate_examples_fn(n=n)]
    columns_to_remove = remove_columns if isinstance(remove_columns, list) else [remove_columns]
    if batch_size is None:
        expected = [{**{k: v for k, v in x.items() if k not in columns_to_remove}, **func(x)} for x in all_examples]
    else:
        # For batched map we have to format the examples as a batch (i.e. in one single dictionary) to pass the batch to the function
        all_transformed_examples = []
        for batch_offset in range(0, len(all_examples), batch_size):
            examples = all_examples[batch_offset : batch_offset + batch_size]
            batch = _examples_to_batch(examples)
            transformed_batch = func(batch)
            all_transformed_examples.extend(_batch_to_examples(transformed_batch))
        expected = {k: v for k, v in _examples_to_batch(all_examples).items() if k not in columns_to_remove}
        expected.update(_examples_to_batch(all_transformed_examples))
        expected = list(_batch_to_examples(expected))
    assert next(iter(ex_iterable))[1] == expected[0]
    assert list(x for _, x in ex_iterable) == expected
def test_examples_iterable(generate_examples_fn):
    ex_iterable = ExamplesIterable(generate_examples_fn, {})
    expected = list(generate_examples_fn())
    assert next(iter(ex_iterable)) == expected[0]
    assert list(ex_iterable) == expected
def test_iterable_dataset(generate_examples_fn):
    dataset = IterableDataset(ExamplesIterable(generate_examples_fn, {}))
    expected = [x for _, x in generate_examples_fn()]
    assert next(iter(dataset)) == expected[0]
    assert list(dataset) == expected
def test_iterable_dataset_factory(generate_examples_fn):
    ex_iterable = ExamplesIterable(generate_examples_fn, {})
    dataset = iterable_dataset(ex_iterable)
    assert isinstance(dataset, IterableDataset)
    assert dataset._ex_iterable is ex_iterable
def dataset(generate_examples_fn):
    ex_iterable = ExamplesIterable(generate_examples_fn, {})
    return IterableDataset(ex_iterable, info=DatasetInfo(description="dummy"), split="train")