def test_whitespace_tokenizer_with_offsets():
    """
    Test WhitespaceTokenizer with with_offsets=True
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer(with_offsets=True)
    dataset = dataset.map(input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'],
                          operations=tokenizer)
    tokens = []
    expected_offsets_start = [[0, 8, 11], [0], [0], [0]]
    expected_offsets_limit = [[7, 10, 19], [18], [17], [0]]
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        tokens.append(token)
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
    logger.info("The output tokens are: {}".format(tokens))
    assert whitespace_strs == tokens
def test_whitespace_tokenizer_default():
    """
    Test WhitespaceTokenizer with default parameters
    """
    whitespace_strs = [["Welcome", "to", "Beijing!"],
                       ["北京欢迎您!"],
                       ["我喜欢English!"],
                       [""]]
    dataset = ds.TextFileDataset(DATA_FILE, shuffle=False)
    tokenizer = text.WhitespaceTokenizer()
    dataset = dataset.map(operations=tokenizer)
    tokens = []
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['text']).tolist()
        tokens.append(token)
    logger.info("The output tokens are: {}".format(tokens))
    assert whitespace_strs == tokens