import json
import os
import tempfile

import datasets
import transformers


def benchmark_map_filter():
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    with tempfile.TemporaryDirectory() as tmp_dir:
        features = datasets.Features({"text": datasets.Value("string"), "numbers": datasets.Value("float32")})
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"), features, num_examples=SPEED_TEST_N_EXAMPLES
        )

        tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

        def tokenize(examples):
            return tokenizer(examples["text"])

        # `map` and `filter` here are the timed benchmark wrappers sketched after
        # this function, not the Python builtins.
        times["map identity"] = map(dataset)

        times["map identity batched"] = map(dataset, batched=True)

        times["map no-op batched"] = map(dataset, function=lambda x: None, batched=True)

        with dataset.formatted_as(type="numpy"):
            times["map no-op batched numpy"] = map(dataset, function=lambda x: None, batched=True)

        with dataset.formatted_as(type="pandas"):
            times["map no-op batched pandas"] = map(dataset, function=lambda x: None, batched=True)

        with dataset.formatted_as(type="torch", columns="numbers"):
            times["map no-op batched pytorch"] = map(dataset, function=lambda x: None, batched=True)

        with dataset.formatted_as(type="tensorflow", columns="numbers"):
            times["map no-op batched tensorflow"] = map(dataset, function=lambda x: None, batched=True)

        times["map fast-tokenizer batched"] = map(dataset, function=tokenize, batched=True)

        times["filter"] = filter(dataset)

        # Activate later when the tokenizer supports batched inputs
        # with dataset.formatted_as(type="numpy"):
        #     times["map fast-tokenizer batched numpy"] = map(dataset, function=tokenize, batched=True)

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
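# The constants and helpers referenced above are not defined in this file. What
# follows is a minimal sketch of what they are assumed to look like; the values
# and bodies are illustrative assumptions consistent with how they are called,
# not the original definitions.
import functools
import timeit

import numpy as np
from datasets.arrow_writer import ArrowWriter

SPEED_TEST_N_EXAMPLES = 50_000  # assumed value
SMALL_TEST = 5_000  # assumed value
RESULTS_FILE_PATH = "./benchmark_results.json"  # assumed path


def get_duration(func):
    # Assumed timing decorator: runs the wrapped call once and returns its
    # wall-clock duration in seconds instead of its result. functools.wraps
    # preserves __name__, which the benchmarks use to build the `times` keys.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = timeit.default_timer()
        func(*args, **kwargs)
        return timeit.default_timer() - start

    return wrapper


# Timed wrappers around the Dataset methods under test. Note that these
# deliberately shadow the `map` and `filter` builtins at module level.
@get_duration
def map(dataset: datasets.Dataset, **kwargs):
    _ = dataset.map(**kwargs)


@get_duration
def filter(dataset: datasets.Dataset, **kwargs):
    _ = dataset.filter(**kwargs)


def generate_example_dataset(dataset_path, features, num_examples, seq_shapes=None):
    # Hypothetical generator: fills each column with random data matching
    # `features`, writes the rows to `dataset_path` as an Arrow file, and
    # memory-maps the file back as a Dataset.
    seq_shapes = seq_shapes or {}
    columns = {}
    for name, feature in features.items():
        if isinstance(feature, datasets.Sequence):
            columns[name] = np.random.rand(num_examples, *seq_shapes[name]).tolist()
        elif getattr(feature, "dtype", None) == "string":
            columns[name] = ["hello world"] * num_examples
        else:
            columns[name] = np.random.rand(num_examples).tolist()
    with ArrowWriter(features=features, path=dataset_path) as writer:
        writer.write_batch(columns)
        writer.finalize()
    return datasets.Dataset.from_file(dataset_path)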
def benchmark_indices_mapping():
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    functions = (select, sort, shuffle, train_test_split, shard)
    with tempfile.TemporaryDirectory() as tmp_dir:
        print("generating dataset")
        features = datasets.Features({"text": datasets.Value("string"), "numbers": datasets.Value("float32")})
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"), features, num_examples=SPEED_TEST_N_EXAMPLES
        )
        print("Functions")
        for func in functions:
            print(func.__name__)
            times[func.__name__] = func(dataset)

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
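# Sketches of the indices-mapping wrappers used above (`select`, `sort`,
# `shuffle`, `train_test_split`, `shard`). The specific arguments are
# assumptions; the point is that each call only builds a new indices mapping
# over the same underlying Arrow table, so the benchmark measures index
# manipulation rather than data copying.
@get_duration
def select(dataset: datasets.Dataset):
    _ = dataset.select(range(0, len(dataset), 2))


@get_duration
def sort(dataset: datasets.Dataset):
    _ = dataset.sort("numbers")


@get_duration
def shuffle(dataset: datasets.Dataset):
    _ = dataset.shuffle()


@get_duration
def train_test_split(dataset: datasets.Dataset):
    _ = dataset.train_test_split(test_size=0.1)


@get_duration
def shard(dataset: datasets.Dataset):
    _ = dataset.shard(num_shards=4, index=0)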
def benchmark_iterating():
    # The read* helpers are the timed iteration wrappers sketched after this function.
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    functions = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
        (read_formatted, {"type": "numpy", "length": SMALL_TEST}),
        (read_formatted, {"type": "pandas", "length": SMALL_TEST}),
        (read_formatted, {"type": "torch", "length": SMALL_TEST}),
        (read_formatted, {"type": "tensorflow", "length": SMALL_TEST}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
    ]

    functions_shuffled = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
        (read_formatted, {"type": "numpy", "length": SMALL_TEST}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
    ]
    with tempfile.TemporaryDirectory() as tmp_dir:
        print("generating dataset")
        features = datasets.Features(
            {"list": datasets.Sequence(datasets.Value("float32")), "numbers": datasets.Value("float32")}
        )
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"),
            features,
            num_examples=SPEED_TEST_N_EXAMPLES,
            seq_shapes={"list": (100,)},
        )
        print("first set of iterations")
        for func, kwargs in functions:
            print(func.__name__, str(kwargs))
            times[func.__name__ + " " + " ".join(str(v) for v in kwargs.values())] = func(dataset, **kwargs)

        print("shuffling dataset")
        dataset = dataset.shuffle()
        print("Second set of iterations (after shuffling)")
        for func, kwargs in functions_shuffled:
            print("shuffled ", func.__name__, str(kwargs))
            times["shuffled " + func.__name__ + " " + " ".join(str(v) for v in kwargs.values())] = func(
                dataset, **kwargs
            )

    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
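# Sketches of the iteration helpers assumed above: `read` fetches examples one
# by one, `read_batch` slices contiguous batches, and the `*_formatted`
# variants do the same under a given output format (the parameter name `type`
# matches the kwargs keys used in the benchmark). The bodies are assumptions
# consistent with how the helpers are called, not the original implementations.
@get_duration
def read(dataset: datasets.Dataset, length):
    for i in range(length):
        _ = dataset[i]


@get_duration
def read_batch(dataset: datasets.Dataset, length, batch_size):
    for i in range(0, length, batch_size):
        _ = dataset[i : i + batch_size]


@get_duration
def read_formatted(dataset: datasets.Dataset, length, type):
    with dataset.formatted_as(type=type):
        for i in range(length):
            _ = dataset[i]


@get_duration
def read_formatted_batch(dataset: datasets.Dataset, length, batch_size, type):
    with dataset.formatted_as(type=type):
        for i in range(0, length, batch_size):
            _ = dataset[i : i + batch_size]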