import numpy as np

import mindspore.dataset as ds
import mindspore.dataset.text as text
from mindspore import log as logger

# Path to the tokenizer test data file; assumed location, adjust to the real test data directory.
REGEX_TOKENIZER_FILE = "../data/dataset/testTokenizerData/regex_tokenizer.txt"


def regex_tokenizer(first, last, expect_str, expected_offsets_start, expected_offsets_limit,
                    delim_pattern, keep_delim_pattern):
    """Tokenize lines [first, last] of the data file with RegexTokenizer (with offsets) and
    compare the tokens and their start/limit offsets against the expected values."""
    dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
    # Restrict the pipeline to the requested 1-based line range [first, last].
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern, with_offsets=True)
    dataset = dataset.map(input_columns=['text'],
                          output_columns=['token', 'offsets_start', 'offsets_limit'],
                          columns_order=['token', 'offsets_start', 'offsets_limit'],
                          operations=tokenizer_op)
    out_text = []
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['token']).tolist()
        np.testing.assert_array_equal(token, expect_str[count])
        np.testing.assert_array_equal(i['offsets_start'], expected_offsets_start[count])
        np.testing.assert_array_equal(i['offsets_limit'], expected_offsets_limit[count])
        count += 1
        out_text.append(token)
    logger.info("Out:", out_text)
    logger.info("Exp:", expect_str)
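
# Usage sketch (assumption): one way a caller could drive the helper above.
# The expected tokens and byte offsets are purely illustrative -- they only hold if
# line 1 of REGEX_TOKENIZER_FILE is literally "Welcome to Beijing!"; replace them
# with values derived from the real test data before using this as a test.
def example_regex_tokenizer_with_offsets():
    """Split line 1 on runs of whitespace and verify tokens plus start/limit offsets."""
    expect_str = [['Welcome', 'to', 'Beijing!']]   # hypothetical tokens of line 1
    expected_offsets_start = [[0, 8, 11]]          # hypothetical UTF-8 byte offsets (token starts)
    expected_offsets_limit = [[7, 10, 19]]         # hypothetical UTF-8 byte offsets (token ends)
    regex_tokenizer(1, 1, expect_str, expected_offsets_start, expected_offsets_limit,
                    delim_pattern="\\s+", keep_delim_pattern="")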

def regex_tokenizer_default(first, last, expect_str, delim_pattern, keep_delim_pattern):
    """Variant of regex_tokenizer above without offsets: only the tokens are checked."""
    dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
    # Restrict the pipeline to the requested 1-based line range [first, last].
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)
    tokenizer_op = text.RegexTokenizer(delim_pattern, keep_delim_pattern)
    dataset = dataset.map(operations=tokenizer_op)
    out_text = []
    count = 0
    for i in dataset.create_dict_iterator():
        token = text.to_str(i['text']).tolist()
        np.testing.assert_array_equal(token, expect_str[count])
        count += 1
        out_text.append(token)
    logger.info("Out:", out_text)
    logger.info("Exp:", expect_str)
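
# Self-contained sketch (assumption): illustrates delim_pattern vs. keep_delim_pattern
# without relying on REGEX_TOKENIZER_FILE, by feeding an in-memory string through a
# NumpySlicesDataset. With keep_delim_pattern="" the matched delimiters are dropped;
# setting it to a sub-pattern of delim_pattern keeps those delimiters as tokens.
def example_keep_delim_pattern():
    """Tokenize a single in-memory sentence, keeping the whitespace delimiters as tokens."""
    data = ds.NumpySlicesDataset(data=["Welcome to Beijing!"], column_names=["text"], shuffle=False)
    data = data.map(operations=text.RegexTokenizer("\\s+", "\\s+"), input_columns=["text"])
    for row in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        # Expected (assumed) output: ['Welcome', ' ', 'to', ' ', 'Beijing!']
        logger.info("tokens: %s", text.to_str(row['text']).tolist())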