Example 1
import json
import os
import tempfile

import nlp  # the library that was later renamed `datasets`
import transformers

# SPEED_TEST_N_EXAMPLES, RESULTS_FILE_PATH, generate_example_dataset, and the
# timed `map` and `filter` wrappers are assumed to come from the benchmark
# suite's shared utilities (see the sketch after this example).


def benchmark_map_filter():
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    with tempfile.TemporaryDirectory() as tmp_dir:
        features = nlp.Features({"text": nlp.Value("string"), "numbers": nlp.Value("float32")})
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"), features, num_examples=SPEED_TEST_N_EXAMPLES
        )

        tokenizer = transformers.AutoTokenizer.from_pretrained(
            "bert-base-cased", use_fast=True)

        def tokenize(examples):
            return tokenizer(examples["text"])

        times["map identity"] = map(dataset)

        times["map identity batched"] = map(dataset, batched=True)

        times["map no-op batched"] = map(dataset,
                                         function=lambda x: None,
                                         batched=True)

        with dataset.formatted_as(type="numpy"):
            times["map no-op batched numpy"] = map(dataset,
                                                   function=lambda x: None,
                                                   batched=True)

        with dataset.formatted_as(type="pandas"):
            times["map no-op batched pandas"] = map(dataset,
                                                    function=lambda x: None,
                                                    batched=True)

        with dataset.formatted_as(type="torch", columns="numbers"):
            times["map no-op batched pytorch"] = map(dataset,
                                                     function=lambda x: None,
                                                     batched=True)

        with dataset.formatted_as(type="tensorflow", columns="numbers"):
            times["map no-op batched tensorflow"] = map(
                dataset, function=lambda x: None, batched=True)

        times["map fast-tokenizer batched"] = map(dataset,
                                                  function=tokenize,
                                                  batched=True)

        times["filter"] = filter(dataset)

        # Activate later when the tokenizer supports batched inputs
        # with dataset.formatted_as(type="numpy"):
        #     times["map fast-tokenizer batched numpy"] = map(dataset, function=tokenize, batched=True)

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
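
Note that `map(dataset)` and `filter(dataset)` above are not the Python built-ins: the benchmark shadows them with module-level timed wrappers around `Dataset.map` and `Dataset.filter`. A minimal sketch of what those helpers could look like, assuming a `get_duration` decorator that returns the wrapped call's wall-clock time in seconds instead of its result:

import timeit

import nlp


def get_duration(func):
    # Run the wrapped function and return its elapsed time rather than its result.
    def wrapper(*args, **kwargs):
        start = timeit.default_timer()
        func(*args, **kwargs)
        return timeit.default_timer() - start

    wrapper.__name__ = func.__name__
    return wrapper


@get_duration
def map(dataset: nlp.Dataset, **kwargs):  # intentionally shadows the built-in
    _ = dataset.map(**kwargs)


@get_duration
def filter(dataset: nlp.Dataset, **kwargs):  # intentionally shadows the built-in
    _ = dataset.filter(**kwargs)

With this shape, `times["map identity"] = map(dataset)` stores a duration in seconds, which is why the dictionary can be dumped to JSON as-is at the end.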
Example 2
import json
import os
import tempfile

import nlp

# As in Example 1, SPEED_TEST_N_EXAMPLES, RESULTS_FILE_PATH,
# generate_example_dataset, and the timed helper functions are assumed to be
# defined in the suite's shared utilities (see the sketch after this example).


def benchmark_indices_mapping():
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    functions = (select, sort, shuffle, train_test_split, shard)
    with tempfile.TemporaryDirectory() as tmp_dir:
        print("generating dataset")
        features = nlp.Features({"text": nlp.Value("string"), "numbers": nlp.Value("float32")})
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"), features, num_examples=SPEED_TEST_N_EXAMPLES
        )
        print("Functions")
        for func in functions:
            print(func.__name__)
            times[func.__name__] = func(dataset)

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
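
Here `select`, `sort`, `shuffle`, `train_test_split`, and `shard` are again assumed to be `get_duration`-decorated wrappers around the corresponding `nlp.Dataset` methods; since the loop passes only `dataset`, each wrapper has to fix its own arguments. A plausible sketch (the concrete arguments are assumptions, not the suite's exact values):

@get_duration
def select(dataset: nlp.Dataset):
    _ = dataset.select(range(0, len(dataset), 2))  # keep every other row


@get_duration
def sort(dataset: nlp.Dataset):
    _ = dataset.sort("numbers")


@get_duration
def shuffle(dataset: nlp.Dataset):
    _ = dataset.shuffle()


@get_duration
def train_test_split(dataset: nlp.Dataset):
    _ = dataset.train_test_split(test_size=0.1)


@get_duration
def shard(dataset: nlp.Dataset, num_shards=10):
    for shard_id in range(num_shards):
        _ = dataset.shard(num_shards, shard_id)

All of these operations build or apply an indices mapping over the underlying Arrow table, which is what this benchmark is timing.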
Example 3
import json
import os
import tempfile

import datasets

# This example targets the renamed `datasets` library; SMALL_TEST and the
# read* helpers are assumed to come from the shared utils (sketched below).


def benchmark_iterating():
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    functions = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
        (read_formatted, {"type": "numpy", "length": SMALL_TEST}),
        (read_formatted, {"type": "pandas", "length": SMALL_TEST}),
        (read_formatted, {"type": "torch", "length": SMALL_TEST}),
        (read_formatted, {"type": "tensorflow", "length": SMALL_TEST}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
    ]

    functions_shuffled = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
        (read_formatted, {"type": "numpy", "length": SMALL_TEST}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
    ]
    with tempfile.TemporaryDirectory() as tmp_dir:
        print("generating dataset")
        features = datasets.Features(
            {"list": datasets.Sequence(datasets.Value("float32")), "numbers": datasets.Value("float32")}
        )
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"),
            features,
            num_examples=SPEED_TEST_N_EXAMPLES,
            seq_shapes={"list": (100,)},
        )
        print("first set of iterations")
        for func, kwargs in functions:
            print(func.__name__, str(kwargs))
            times[func.__name__ + " " +
                  " ".join(str(v) for v in kwargs.values())] = func(
                      dataset, **kwargs)

        print("shuffling dataset")
        dataset = dataset.shuffle()
        print("Second set of iterations (after shuffling")
        for func, kwargs in functions_shuffled:
            print("shuffled ", func.__name__, str(kwargs))
            times["shuffled " + func.__name__ + " " +
                  " ".join(str(v) for v in kwargs.values())] = func(
                      dataset, **kwargs)

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
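
The `read*` functions iterate over the dataset row by row or in contiguous slices, so the recorded times measure indexing and format-conversion overhead rather than any computation. A minimal sketch of the assumed helpers, reusing the `get_duration` decorator from Example 1; the parameter names match the kwargs keys used in the `functions` lists above (including `type`, which deliberately shadows the built-in so the dicts can be splatted directly):

@get_duration
def read(dataset: datasets.Dataset, length):
    for i in range(length):
        _ = dataset[i]


@get_duration
def read_batch(dataset: datasets.Dataset, length, batch_size):
    for i in range(0, length, batch_size):
        _ = dataset[i : i + batch_size]


@get_duration
def read_formatted(dataset: datasets.Dataset, type, length):
    with dataset.formatted_as(type=type):
        for i in range(length):
            _ = dataset[i]


@get_duration
def read_formatted_batch(dataset: datasets.Dataset, type, length, batch_size):
    with dataset.formatted_as(type=type):
        for i in range(0, length, batch_size):
            _ = dataset[i : i + batch_size]

Running the same reads before and after `dataset.shuffle()` is the point of the second loop: shuffling adds an indices mapping, so post-shuffle reads also pay the cost of indirect row lookup.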