def test_mindrecord():
    data = ds.MindDataset("../data/dataset/testTextMindRecord/test.mindrecord", shuffle=False)

    for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        assert d["english"].shape == line[i].shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], to_str(d["english"]))
        np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))


def test_tfrecord2():
    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False,
                              schema='../data/dataset/testTextTFRecord/datasetSchema.json')
    for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        assert d["line"].shape == line[i].shape
        assert d["words"].shape == words[i].shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], to_str(d["line"]))
        np.testing.assert_array_equal(words[i], to_str(d["words"]))
        np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))


def test_tfrecord1():
    s = ds.Schema()
    s.add_column("line", "string", [])
    s.add_column("words", "string", [-1])
    s.add_column("chinese", "string", [])

    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)

    for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        assert d["line"].shape == line[i].shape
        assert d["words"].shape == words[i].shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], to_str(d["line"]))
        np.testing.assert_array_equal(words[i], to_str(d["words"]))
        np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))


def test_tfrecord3():
    s = ds.Schema()
    s.add_column("line", mstype.string, [])
    s.add_column("words", mstype.string, [-1, 2])
    s.add_column("chinese", mstype.string, [])

    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)

    for i, d in enumerate(data.create_dict_iterator(num_epochs=1, output_numpy=True)):
        assert d["line"].shape == line[i].shape
        assert d["words"].shape == words[i].reshape([2, 2]).shape
        assert d["chinese"].shape == chinese[i].shape
        np.testing.assert_array_equal(line[i], to_str(d["line"]))
        np.testing.assert_array_equal(words[i].reshape([2, 2]), to_str(d["words"]))
        np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))


def test_jieba_with_offsets_2_1():
    """Test add_word with freq"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("男默女泪", 10)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    column_order=["token", "offsets_start", "offsets_limit"],
                    num_parallel_workers=2)
    expect = ['男默女泪', '市', '长江大桥']
    expected_offsets_start = [0, 12, 15]
    expected_offsets_limit = [12, 15, 27]
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_unicode_char_tokenizer_with_offsets():
    """
    Test UnicodeCharTokenizer with with_offsets=True
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您！", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeCharTokenizer(with_offsets=True)
    dataset = dataset.map(input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'],
                          operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
                              [0, 3, 6, 9, 12, 15],
                              [0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16],
                              [0, 1]]
    expected_offsets_limit = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                              [3, 6, 9, 12, 15, 18],
                              [3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17],
                              [1, 2]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1

    logger.info("The out tokens is : {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens


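# split_by_unicode_char is used above but not shown in this section; a minimal sketch of the
# assumed behaviour (split each input string into the list of its Unicode characters) would be:
#
#     def split_by_unicode_char(input_strs):
#         return [list(s) for s in input_strs]

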
def regex_tokenizer(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
                    delim_pattern, keep_delim_pattern):
    """Run RegexTokenizer (with offsets) on rows [first, last] of REGEX_TOKENIZER_FILE and check the outputs."""
    dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
    dataset = dataset.map(input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'],
                          operations=tokenizer_op)
    out_text = []
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
        out_text.append(token)
    logger.info("Out: {}".format(out_text))
    logger.info("Exp: {}".format(expect_str))


def test_unicode_script_tokenizer_with_offsets2():
    """
    Test UnicodeScriptTokenizer with keep_whitespace=True and with_offsets=True
    """
    unicode_script_strs2 = [["Welcome", " ", "to", " ", "Beijing", "!"],
                            ["北京欢迎您", "！"],
                            ["我喜欢", "English", "!"],
                            ["  "]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True, with_offsets=True)
    dataset = dataset.map(input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'],
                          operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 7, 8, 10, 11, 18], [0, 15], [0, 9, 16], [0]]
    expected_offsets_limit = [[7, 8, 10, 11, 18, 19], [15, 18], [9, 16, 17], [2]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is : {}".format(tokens))
    assert unicode_script_strs2 == tokens


def test_whitespace_tokenizer_with_offsets():
    """
    Test WhitespaceTokenizer with with_offsets=True
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您！"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer(with_offsets=True)
    dataset = dataset.map(input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'],
                          operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 8, 11], [0], [0], [0]]
    expected_offsets_limit = [[7, 10, 19], [18], [17], [0]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The out tokens is : {}".format(tokens))
    assert whitespace_strs == tokens


def check_wordpiece_tokenizer_with_offsets(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
                                           vocab_list, unknown_token='[UNK]', max_bytes_per_token=100):
    """Run WordpieceTokenizer (with offsets) on rows [first, last] of WORDPIECE_TOKENIZER_FILE and check the outputs."""
    dataset = ds.TextFileDataset(WORDPIECE_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = text.Vocab.from_list(vocab_list)
    tokenizer_op = text.WordpieceTokenizer(vocab=vocab, with_offsets=True,
                                           unknown_token=unknown_token,
                                           max_bytes_per_token=max_bytes_per_token)
    dataset = dataset.map(operations=tokenizer_op, input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          column_order=['token', 'offsets_start', 'offsets_limit'])
    count = 0
    for i in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        token = text.to_str(i['token'])
        logger.info("Out: {}".format(token))
        logger.info("Exp: {}".format(expect_str[count]))
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count = count + 1


def check_basic_tokenizer_default(first, last, expected_tokens, expected_offsets_start, expected_offsets_limit,
                                  lower_case=False, keep_whitespace=False,
                                  normalization_form=text.utils.NormalizeForm.NONE,
                                  preserve_unused_token=False):
    """Run BasicTokenizer with its default (no offsets) output on rows [first, last] of BASIC_TOKENIZER_FILE."""
    dataset = ds.TextFileDataset(BASIC_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)

    basic_tokenizer = text.BasicTokenizer(lower_case=lower_case,
                                          keep_whitespace=keep_whitespace,
                                          normalization_form=normalization_form,
                                          preserve_unused_token=preserve_unused_token)
    dataset = dataset.map(operations=basic_tokenizer)
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['text'])
        logger.info("Out: {}".format(token))
        logger.info("Exp: {}".format(expected_tokens[count]))
        np.testing.assert_array_equal(token, expected_tokens[count])
        count = count + 1


def check_bert_tokenizer(first, last, expect_str, vocab_list, suffix_indicator='##', max_bytes_per_token=100,
                         unknown_token='[UNK]', lower_case=False, keep_whitespace=False,
                         normalization_form=nlp.utils.NormalizeForm.NONE, preserve_unused_token=False):
    """Run BertTokenizer on rows [first, last] of BERT_TOKENIZER_FILE and check the tokens."""
    dataset = ds.TextFileDataset(BERT_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    vocab = nlp.Vocab.from_list(vocab_list)
    tokenizer_op = nlp.BertTokenizer(vocab=vocab, suffix_indicator=suffix_indicator,
                                     max_bytes_per_token=max_bytes_per_token, unknown_token=unknown_token,
                                     lower_case=lower_case, keep_whitespace=keep_whitespace,
                                     normalization_form=normalization_form,
                                     preserve_unused_token=preserve_unused_token)
    dataset = dataset.map(operations=tokenizer_op)
    count = 0
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text'])
        logger.info("Out: {}".format(text))
        logger.info("Exp: {}".format(expect_str[count]))
        np.testing.assert_array_equal(text, expect_str[count])
        count = count + 1


def pytoken_op(input_data):
    """A Python tokenizer: decode the input bytes, split the string at character
    positions 5 and 10, and return the three slices re-encoded as UTF-8 bytes."""
    te = str(to_str(input_data))
    tokens = []
    tokens.append(te[:5].encode("UTF8"))
    tokens.append(te[5:10].encode("UTF8"))
    tokens.append(te[10:].encode("UTF8"))
    return np.array(tokens, dtype='S')


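# A minimal, hedged sanity check (not part of the original suite): call pytoken_op directly on
# the same sentence that test_jieba_6 streams through a GeneratorDataset, and verify the three
# fixed-width slices. Assumes to_str accepts a 0-d bytes ndarray, as pytoken_op itself does.
def test_pytoken_op_direct():
    sample = np.array("今天天气太好了我们一起去外面玩吧".encode("UTF8"), dtype='S')
    tokens = to_str(pytoken_op(sample)).tolist()
    assert tokens == ['今天天气太', '好了我们一', '起去外面玩吧']

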
def test_jieba_with_offsets_3_1():
    """Test add_dict with dict"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    user_dict = {
        "男默女泪": 10,
        "江大桥": 20000
    }
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_dict(user_dict)
    data = data.map(input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op, num_parallel_workers=1)
    expect = ['男默女泪', '市长', '江大桥']
    expected_offsets_start = [0, 12, 18]
    expected_offsets_limit = [12, 18, 27]
    for i in data.create_dict_iterator():
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_5():
    """Test add_word with freq, where the value of freq affects the result of segmentation"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op, num_parallel_workers=1)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    expected_offsets_start = [0, 6, 12, 21, 27, 30, 42, 45, 51]
    expected_offsets_limit = [6, 12, 21, 27, 30, 42, 45, 51, 57]
    for i in data.create_dict_iterator():
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def test_jieba_with_offsets_4():
    """Test add_dict with a user dict file path"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
    DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP, with_offsets=True)
    jieba_op.add_dict(DICT_FILE)
    data = data.map(input_columns=["text"],
                    output_columns=["token", "offsets_start", "offsets_limit"],
                    columns_order=["token", "offsets_start", "offsets_limit"],
                    operations=jieba_op, num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    expected_offsets_start = [0, 12, 21, 27, 33, 36, 42]
    expected_offsets_limit = [12, 21, 27, 33, 36, 42, 48]
    for i in data.create_dict_iterator():
        ret = to_str(i["token"])
        for index, item in enumerate(ret):
            assert item == expect[index]
        for index, item in enumerate(i["offsets_start"]):
            assert item == expected_offsets_start[index]
        for index, item in enumerate(i["offsets_limit"]):
            assert item == expected_offsets_limit[index]


def concat_test(dataset):
    """Concatenate the dataset with a copy of itself and check the SentencePiece tokens."""
    dataset_1 = copy.deepcopy(dataset)
    dataset = dataset.concat(dataset_1)
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_jieba_6():
    """Test mapping a Python tokenizer (pytoken_op) over a GeneratorDataset."""
    data = ds.GeneratorDataset(gen, column_names=["text"])
    data = data.map(operations=pytoken_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气太', '好了我们一', '起去外面玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def zip_test(dataset):
    """Zip the dataset with a transformed copy of itself and check the SentencePiece tokens."""
    dataset_1 = copy.deepcopy(dataset)
    dataset_2 = copy.deepcopy(dataset)
    dataset_1 = dataset_1.apply(apply_func)
    dataset_zip = ds.zip((dataset_1, dataset_2))
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset_zip.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_from_vocab_to_str_WORD():
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.WORD, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁saw', '▁a', '▁girl', '▁with', '▁a', '▁telescope.']
    for i in dataset.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_jieba_1_2():
    """Test jieba tokenizer with HMM MIX"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MIX)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_from_vocab_to_str_CHAR():
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.CHAR, {})
    tokenizer = text.SentencePieceTokenizer(vocab, out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁', 'I', '▁', 's', 'a', 'w', '▁', 'a', '▁', 'g', 'i', 'r', 'l', '▁', 'w', 'i', 't', 'h',
              '▁', 'a', '▁', 't', 'e', 'l', 'e', 's', 'c', 'o', 'p', 'e', '.']
    for i in dataset.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_from_file_to_str():
    vocab = text.SentencePieceVocab.from_file([VOCAB_FILE], 5000, 0.9995, SentencePieceModel.UNIGRAM, {})
    text.SentencePieceVocab.save_model(vocab, "./", "m.model")
    tokenizer = text.SentencePieceTokenizer("./m.model", out_type=SPieceTokenizerOutType.STRING)
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    dataset = dataset.map(operations=tokenizer)
    expect = ['▁I', '▁sa', 'w', '▁a', '▁girl', '▁with', '▁a', '▁te', 'les', 'co', 'pe', '.']
    for i in dataset.create_dict_iterator():
        ret = to_str(i["text"])
        for key, value in enumerate(ret):
            assert value == expect[key]


def test_jieba_1_1():
    """Test jieba tokenizer with HMM mode"""
    data = ds.TextFileDataset(DATA_FILE)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.HMM)
    data = data.map(input_columns=["text"], operations=jieba_op,
                    num_parallel_workers=1)
    expect = ['今天', '天气', '太', '好', '了', '我们', '一起', '去', '外面', '玩', '吧']
    for i in data.create_dict_iterator():
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def normalize(normalize_form):
    """Run NormalizeUTF8 with the given form over NORMALIZE_FILE and return the raw output bytes."""
    dataset = ds.TextFileDataset(NORMALIZE_FILE, shuffle=False)
    normalize_op = nlp.NormalizeUTF8(normalize_form=normalize_form)
    dataset = dataset.map(operations=normalize_op)
    out_bytes = []
    out_texts = []
    for i in dataset.create_dict_iterator():
        out_bytes.append(i['text'])
        out_texts.append(nlp.to_str(i['text']).tolist())
    logger.info("The out bytes is : {}".format(out_bytes))
    logger.info("The out texts is: {}".format(out_texts))
    return out_bytes


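# A hedged usage sketch (not part of the original tests): the helper above is typically driven
# with the standard Unicode normalization forms. The NormalizeForm members referenced here are
# assumed to be available in this MindSpore version.
def run_normalize_forms_sketch():
    for form in (nlp.utils.NormalizeForm.NFC, nlp.utils.NormalizeForm.NFKC,
                 nlp.utils.NormalizeForm.NFD, nlp.utils.NormalizeForm.NFKD):
        normalize(form)

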
def test_jieba_2_3():
    """Test add_word with freq, the value of freq affects the result of segmentation"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/6.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("江大桥", 20000)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=2)
    expect = ['江州', '市长', '江大桥', '参加', '了', '长江大桥', '的', '通车', '仪式']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_jieba_2():
    """Test add_word"""
    DATA_FILE4 = "../data/dataset/testJiebaDataset/4.txt"
    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_word("男默女泪")
    expect = ['男默女泪', '市', '长江大桥']
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=2)
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_case_fold():
    """
    Test CaseFold
    """
    expect_strs = ["welcome to beijing!", "北京欢迎您！", "我喜欢english!", "  "]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    op = nlp.CaseFold()
    dataset = dataset.map(operations=op)

    lower_strs = []
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text']).tolist()
        lower_strs.append(text)
    assert lower_strs == expect_strs


def test_jieba_4():
    DATA_FILE4 = "../data/dataset/testJiebaDataset/3.txt"
    DICT_FILE = "../data/dataset/testJiebaDataset/user_dict.txt"

    data = ds.TextFileDataset(DATA_FILE4)
    jieba_op = JiebaTokenizer(HMM_FILE, MP_FILE, mode=JiebaMode.MP)
    jieba_op.add_dict(DICT_FILE)
    data = data.map(operations=jieba_op, input_columns=["text"],
                    num_parallel_workers=1)
    expect = ['今天天气', '太好了', '我们', '一起', '去', '外面', '玩吧']
    for i in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        ret = to_str(i["text"])
        for index, item in enumerate(ret):
            assert item == expect[index]


def test_unicode_char_tokenizer():
    """
    Test UnicodeCharTokenizer
    """
    input_strs = ("Welcome to Beijing!", "北京欢迎您！", "我喜欢English!", "  ")
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = nlp.UnicodeCharTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        text = nlp.to_str(i['text']).tolist()
        tokens.append(text)
    logger.info("The out tokens is : {}".format(tokens))
    assert split_by_unicode_char(input_strs) == tokens