def test_textline_dataset_exceptions():
    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset(DATA_FILE, num_samples=-1)
    assert "num_samples exceeds the boundary" in str(error_info.value)

    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset("does/not/exist/no.txt")
    assert "The following patterns did not match any files" in str(error_info.value)

    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset("")
    assert "The following patterns did not match any files" in str(error_info.value)

    def exception_func(item):
        raise Exception("Error occur!")

    with pytest.raises(RuntimeError) as error_info:
        data = ds.TextFileDataset(DATA_FILE)
        data = data.map(operations=exception_func, input_columns=["text"], num_parallel_workers=1)
        for _ in data.__iter__():
            pass
    assert "map operation: [PyFunc] failed. The corresponding data files" in str(error_info.value)

def test_shuffle():
    FILES = ["../data/dataset/testTFTestAllTypes/test.data"]
    SCHEMA_FILE = "../data/dataset/testTFTestAllTypes/datasetSchema.json"

    ds.config.set_seed(1)
    data1 = ds.TFRecordDataset(FILES, schema=SCHEMA_FILE, shuffle=ds.Shuffle.GLOBAL)
    data2 = ds.TFRecordDataset(FILES, schema=SCHEMA_FILE, shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(10000)

    for d1, d2 in zip(data1.create_tuple_iterator(output_numpy=True),
                      data2.create_tuple_iterator(output_numpy=True)):
        for t1, t2 in zip(d1, d2):
            np.testing.assert_array_equal(t1, t2)

    ds.config.set_seed(1)
    DATA_ALL_FILE = "../data/dataset/testTextFileDataset/*"
    data1 = ds.TextFileDataset(DATA_ALL_FILE, shuffle=ds.Shuffle.GLOBAL)
    data2 = ds.TextFileDataset(DATA_ALL_FILE, shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(10000)

    for d1, d2 in zip(data1.create_tuple_iterator(output_numpy=True),
                      data2.create_tuple_iterator(output_numpy=True)):
        for t1, t2 in zip(d1, d2):
            np.testing.assert_array_equal(t1, t2)

    ds.config.set_seed(1)
    TRAIN_FILE = '../data/dataset/testCLUE/afqmc/train.json'
    data1 = ds.CLUEDataset(TRAIN_FILE, task='AFQMC', usage='train', shuffle=ds.Shuffle.GLOBAL)
    data2 = ds.CLUEDataset(TRAIN_FILE, task='AFQMC', usage='train', shuffle=ds.Shuffle.FILES)
    data2 = data2.shuffle(10000)

    for d1, d2 in zip(data1.create_tuple_iterator(output_numpy=True),
                      data2.create_tuple_iterator(output_numpy=True)):
        for t1, t2 in zip(d1, d2):
            np.testing.assert_array_equal(t1, t2)

def test_from_list_lookup_empty_string():
    # "" is a valid word in vocab, which can be looked up by LookupOp
    vocab = text.Vocab.from_list("home IS behind the world ahead !".split(" "), ["<pad>", ""], True)
    lookup = text.Lookup(vocab, "")
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    ind = 0
    res = [2, 1, 4, 5, 6, 7]
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert d["text"] == res[ind], ind
        ind += 1

    # When unknown_token of Lookup is None, it is converted to std::nullopt in C++,
    # so it has nothing to do with "" in vocab and C++ will skip looking up unknown_token
    vocab = text.Vocab.from_list("home IS behind the world ahead !".split(" "), ["<pad>", ""], True)
    lookup = text.Lookup(vocab)
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    try:
        for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
            pass
    except RuntimeError as e:
        assert "token: \"is\" doesn't exist in vocab and no unknown token is specified" in str(e)

def test_with_zip_concat():
    data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
    vocab = text.SentencePieceVocab.from_dataset(data, [""], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer, num_parallel_workers=2)
    zip_test(dataset)
    concat_test(dataset)

def test_text_file_dataset_size():
    dataset = ds.TextFileDataset(TEXT_DATA_FILE)
    assert dataset.get_dataset_size() == 3

    dataset_shard_2_0 = ds.TextFileDataset(TEXT_DATA_FILE, num_shards=2, shard_id=0)
    assert dataset_shard_2_0.get_dataset_size() == 2

def test_unmappable_invalid_input():
    d = ds.TextFileDataset(text_file_dataset_path)
    split_with_invalid_inputs(d)

    d = ds.TextFileDataset(text_file_dataset_path, num_shards=2, shard_id=0)
    with pytest.raises(RuntimeError) as info:
        _, _ = d.split([4, 1])
    assert "Dataset should not be sharded before split" in str(info.value)

def test_build_from_dataset():
    data = ds.TextFileDataset(VOCAB_FILE, shuffle=False)
    vocab = text.SentencePieceVocab.from_dataset(data, [""], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]

def test_textline_dataset_exceptions():
    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset(DATA_FILE, num_samples=-1)
    assert "Input num_samples is not within the required interval" in str(error_info.value)

    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset("does/not/exist/no.txt")
    assert "The following patterns did not match any files" in str(error_info.value)

    with pytest.raises(ValueError) as error_info:
        _ = ds.TextFileDataset("")
    assert "The following patterns did not match any files" in str(error_info.value)

def test_unmappable_randomize_repeatable():
    original_num_parallel_workers = config_get_set_num_parallel_workers(4)

    # the labels output by ShuffleOp for seed 53 are [0, 2, 1, 4, 3]
    ds.config.set_seed(53)

    d = ds.TextFileDataset(text_file_dataset_path, shuffle=False)
    s1, s2 = d.split([0.8, 0.2])

    num_epochs = 5
    s1 = s1.repeat(num_epochs)
    s2 = s2.repeat(num_epochs)

    s1_output = []
    for item in s1.create_dict_iterator():
        s1_output.append(item["text"].item().decode("utf8"))

    s2_output = []
    for item in s2.create_dict_iterator():
        s2_output.append(item["text"].item().decode("utf8"))

    # note: no overlap between the two splits
    assert s1_output == [text_file_data[0], text_file_data[2],
                         text_file_data[1], text_file_data[4]] * num_epochs
    assert s2_output == [text_file_data[3]] * num_epochs

    # Restore configuration num_parallel_workers
    ds.config.set_num_parallel_workers(original_num_parallel_workers)

def test_unicode_script_tokenizer_with_offsets2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True and with_offsets=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "!"],
                            ["我喜欢", "English", "!"],
                            [" "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
    dataset = dataset.map(input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'],
                          operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17], [2]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1

    logger.info("The out tokens is : {}".format(tokens))
    assert unicode_script_strs2 == tokens

def test_unmappable_get_dataset_size():
    d = ds.TextFileDataset(text_file_dataset_path, shuffle=False)
    s1, s2 = d.split([0.8, 0.2])

    assert d.get_dataset_size() == 5
    assert s1.get_dataset_size() == 4
    assert s2.get_dataset_size() == 1

def test_whitespace_tokenizer_with_offsets():
    """
    Test WhitespaceTokenizer
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer(with_offsets=True)
    dataset = dataset.map(input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'],
                          operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 8, 11], [0], [0], [0]]
    expected_offsets_limit = [[7, 10, 19], [18], [17], [0]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1

    logger.info("The out tokens is : {}".format(tokens))
    assert whitespace_strs == tokens

def regex_tokenizer(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
                    delim_pattern, keep_delim_pattern):
    dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
    dataset = dataset.map(input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'],
                          operations=tokenizer_op)
    out_text = []
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
        out_text.append(token)
    logger.info("Out: {}".format(out_text))
    logger.info("Exp: {}".format(expect_str))

def test_jieba_with_offsets_5():
    """Test JiebaTokenizer with offsets after add_word with freq"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op, num_parallel_workers=1)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
    expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
    for i in data.create_dict_iterator():
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]

def test_jieba_with_offsets_4():
    """Test add_dict with a user dict file path"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
    DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_dict(DICT_FILE)
    data = data.map(input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op, num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    for i in data.create_dict_iterator():
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]

def test_jieba_with_offsets_3_1():
    """Test add_dict with dict"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {"男默女泪": 10, "江大桥": 20000}

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_dict(user_dict)
    data = data.map(input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op, num_parallel_workers=1)
    expect = ['男默女泪', '市长', '江大桥']
    expected_offsets_start = [0, 12, 18]
    expected_offsets_limit = [12, 18, 27]
    for i in data.create_dict_iterator():
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]

def test_unicode_char_tokenizer_with_offsets():
    """
    Test UnicodeCharTokenizer
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您!", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer(with_offsets=True)
    dataset = dataset.map(input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'],
                          operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
                              [0, 3, 6, 9, 12, 15],
                              [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16],
                              [0, 1]]
    expected_offsets_limit = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                              [3, 6, 9, 12, 15, 18],
                              [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17],
                              [1, 2]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1

    logger.info("The out tokens is : {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens

def test_jieba_with_offsets_2_1():
    """Test add_word with freq"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("男默女泪", 10)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=2)
    expect = ['男默女泪', '市', '长江大桥']
    expected_offsets_start = [0, 12, 15]
    expected_offsets_limit = [12, 15, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]

def check_basic_tokenizer_default(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit,
                                  lower_case=False, keep_whitespace=False,
                                  normalization_form=text.utils.NormalizeForm.NONE,
                                  preserve_unused_token=False):
    dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)

    basic_tokenizer = text.BasicTokenizer(lower_case=lower_case,
                                          keep_whitespace=keep_whitespace,
                                          normalization_form=normalization_form,
                                          preserve_unused_token=preserve_unused_token)

    dataset = dataset.map(operations=basic_tokenizer)
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['text'])
        logger.info("Out: {}".format(token))
        logger.info("Exp: {}".format(expected_tokens[count]))
        np.testing.assert_array_equal(token, expected_tokens[count])
        count = count + 1

def check_wordpiece_tokenizer_with_offsets(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
                                           vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
    dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.WordpieceTokenizer(vocab=vocab, with_offsets=True,
                                           unknown_token=unknown_token,
                                           max_bytes_per_token=max_bytes_per_token)
    dataset = dataset.map(operations=tokenizer_op, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token'])
        logger.info("Out: {}".format(token))
        logger.info("Exp: {}".format(expect_str[count]))
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count = count + 1

def test_textline_dataset_all_file():
    data = ds.TextFileDataset(DATA_ALL_FILE)
    count = 0
    for i in data.create_dict_iterator():
        logger.info("{}".format(i["text"]))
        count += 1
    assert count == 5

def check_bert_tokenizer(first, last, expect_str, vocab_list, suffix_indicator='##', max_bytes_per_token=100,
                         unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
                         normalization_form=nlp.utils.NormalizeForm.NONE, preserve_unused_token=False):
    dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = nlp.Vocab.from_list(vocab_list)
    tokenizer_op = nlp.BertTokenizer(vocab=vocab, suffix_indicator=suffix_indicator,
                                     max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token,
                                     lower_case=lower_case, keep_whitespace=keep_whitespace,
                                     normalization_form=normalization_form,
                                     preserve_unused_token=preserve_unused_token)
    dataset = dataset.map(operations=tokenizer_op)
    count = 0
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text'])
        logger.info("Out: {}".format(text))
        logger.info("Exp: {}".format(expect_str[count]))
        np.testing.assert_array_equal(text, expect_str[count])
        count = count + 1

def test_textline_dataset_num_samples_zero():
    data = ds.TextFileDataset(DATA_FILE, num_samples=0)
    count = 0
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        logger.info("{}".format(i["text"]))
        count += 1
    assert count == 3

def test_config(columns, freq_range, top_k, s):
    try:
        data = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)
        vocab = text.Vocab.from_dataset(data, columns, freq_range, top_k)
        assert isinstance(vocab, text.Vocab)
    except ValueError as e:
        assert s in str(e), str(e)

def test_textline_dataset_num_samples_none():
    # Do not provide a num_samples argument, so it would be None by default
    data = ds.TextFileDataset(DATA_FILE)
    count = 0
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        logger.info("{}".format(i["text"]))
        count += 1
    assert count == 3

def test_textline_dataset_get_datasetsize():
    """
    Test get_dataset_size of TextFileDataset
    """
    TRAIN_FILE = '../data/dataset/testCLUE/afqmc/train.json'

    data = ds.TextFileDataset(TRAIN_FILE)
    size = data.get_dataset_size()
    assert size == 3

def test_demo_basic_from_dataset():
    """This is a tutorial on how from_dataset should be used in a normal use case"""
    data = ds.TextFileDataset("../data/dataset/testVocab/words.txt", shuffle=False)
    vocab = text.Vocab.from_dataset(data, "text", freq_range=None, top_k=None)
    data = data.map(input_columns=["text"], operations=text.Lookup(vocab))
    res = []
    for d in data.create_dict_iterator():
        res.append(d["text"].item())
    assert res == [4, 5, 3, 6, 7, 2]

def test_from_dict_tutorial():
    vocab = text.Vocab.from_dict({"home": 3, "behind": 2, "the": 4, "world": 5, "<unk>": 6})
    lookup = text.Lookup(vocab, "<unk>")  # any unknown token will be mapped to the id of <unk>
    data = ds.TextFileDataset(DATA_FILE, shuffle=False)
    data = data.map(operations=lookup, input_columns=["text"])
    res = [3, 6, 2, 4, 5, 6]
    ind = 0
    for d in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        assert d["text"] == res[ind], ind
        ind += 1

def test_from_vocab_to_int():
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.INT)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = [6, 329, 183, 8, 945, 23, 8, 3783, 4382, 4641, 1405, 4]
    for i in dataset.create_dict_iterator():
        ret = i["text"]
        for key, value in enumerate(ret):
            assert value == expect[key]

def test_from_vocab_to_str_WORD():
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.WORD, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁telescope.']
    for i in dataset.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]