def regex_tokenizer(first, last, expect_str, expected_offsets_start,
                    expected_offsets_limit, delim_pattern,
                    keep_delim_pattern):
    """Check RegexTokenizer tokens and offsets against expected values.

    Builds an unshuffled TextFileDataset over REGEX_TOKENIZER_FILE,
    narrows it to the requested rows, tokenizes the 'text' column with
    ``with_offsets=True`` and asserts, row by row, that the tokens and
    both offset columns equal the expected values.

    Args:
        first: 1-based index of the first row to check; when greater
            than 1, the preceding ``first - 1`` rows are skipped.
        last: 1-based index of the last row to check (inclusive). The
            row limit is applied only when ``last >= first``; otherwise
            every remaining row is iterated.
        expect_str: Expected tokens, one entry per iterated row.
        expected_offsets_start: Expected 'offsets_start' values, one
            entry per iterated row.
        expected_offsets_limit: Expected 'offsets_limit' values, one
            entry per iterated row.
        delim_pattern: First positional argument of text.RegexTokenizer.
        keep_delim_pattern: Second positional argument of
            text.RegexTokenizer.

    Raises:
        AssertionError: Raised by np.testing.assert_array_equal when a
            row's tokens or offsets differ from the expected values.
        IndexError: If more rows are iterated than expected entries
            were supplied (when the expected arguments are sequences).
    """
    dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
    if first > 1:
        dataset = dataset.skip(first - 1)
    if last >= first:
        dataset = dataset.take(last - first + 1)

    tokenizer_op = text.RegexTokenizer(delim_pattern,
                                       keep_delim_pattern,
                                       with_offsets=True)
    dataset = dataset.map(
        input_columns=['text'],
        output_columns=['token', 'offsets_start', 'offsets_limit'],
        columns_order=['token', 'offsets_start', 'offsets_limit'],
        operations=tokenizer_op)

    out_text = []
    for row_idx, row in enumerate(dataset.create_dict_iterator()):
        token = text.to_str(row['token']).tolist()
        np.testing.assert_array_equal(token, expect_str[row_idx])
        np.testing.assert_array_equal(row['offsets_start'],
                                      expected_offsets_start[row_idx])
        np.testing.assert_array_equal(row['offsets_limit'],
                                      expected_offsets_limit[row_idx])
        out_text.append(token)

    # The value must go through a %s placeholder: with logging-style
    # lazy formatting, an extra argument without a placeholder cannot
    # be interpolated and the value never reaches the log output.
    logger.info("Out: %s", out_text)
    logger.info("Exp: %s", expect_str)
Example #2
0
 def regex_tokenizer(first, last, expect_str, delim_pattern, keep_delim_pattern):
     """Check RegexTokenizer output against expected tokens.

     Builds an unshuffled TextFileDataset over REGEX_TOKENIZER_FILE,
     narrows it to the requested rows, applies nlp.RegexTokenizer and
     asserts, row by row, that the resulting 'text' column equals the
     expected tokens.

     Args:
         first: 1-based index of the first row to check; when greater
             than 1, the preceding ``first - 1`` rows are skipped.
         last: 1-based index of the last row to check (inclusive). The
             row limit is applied only when ``last >= first``; otherwise
             every remaining row is iterated.
         expect_str: Expected tokens, one entry per iterated row.
         delim_pattern: First positional argument of nlp.RegexTokenizer.
         keep_delim_pattern: Second positional argument of
             nlp.RegexTokenizer.

     Raises:
         AssertionError: Raised by np.testing.assert_array_equal when a
             row's tokens differ from the expected values.
         IndexError: If more rows are iterated than expected entries
             were supplied (when expect_str is a sequence).
     """
     dataset = ds.TextFileDataset(REGEX_TOKENIZER_FILE, shuffle=False)
     if first > 1:
         dataset = dataset.skip(first - 1)
     if last >= first:
         dataset = dataset.take(last - first + 1)

     tokenizer_op = nlp.RegexTokenizer(delim_pattern, keep_delim_pattern)
     dataset = dataset.map(operations=tokenizer_op)

     out_text = []
     for row_idx, row in enumerate(dataset.create_dict_iterator()):
         tokens = nlp.to_str(row['text']).tolist()
         np.testing.assert_array_equal(tokens, expect_str[row_idx])
         out_text.append(tokens)

     # The value must go through a %s placeholder: with logging-style
     # lazy formatting, an extra argument without a placeholder cannot
     # be interpolated and the value never reaches the log output.
     logger.info("Out: %s", out_text)
     logger.info("Exp: %s", expect_str)