def test_extract_regex_tok_len_2(self):
    """Extract all 2-token spans whose covered text matches ``.*y$``.

    Reads the shared dictionary fixture, tokenizes it, then runs
    ``extract_regex_tok`` with ``min_len == max_len == 2`` and checks both
    the output column name and the exact repr of the result DataFrame.
    """
    # NOTE(review): this method was reconstructed from a whitespace-mangled
    # source line. The column alignment inside the expected-repr literal is a
    # best-effort recreation of pandas' right-justified repr output — confirm
    # against an actual run before relying on it.
    file_name = "test_data/io/test_systemt/test.dict"
    file_text = self._read_file(file_name)
    tokenizer = self._make_tokenizer()
    char_span = make_tokens(file_text, tokenizer)
    match_regex = re.compile(r".*y$")
    result_df = extract_regex_tok(
        char_span, match_regex, min_len=2, max_len=2, output_col_name="result"
    )
    self.assertIn("result", result_df.columns)
    self.assertEqual(
        repr(result_df),
        textwrap.dedent(
            """\
                                     result
            0  [0, 16): 'Dictionary Entry'
            1      [11, 22): 'Entry Entry'
            2    [50, 63): 'Haiku factory'
            3      [73, 84): 'before they'"""
        ),
    )
def test_extract_dict(self):
    """Match a dictionary file against its own tokenized text.

    Loads the fixture both as raw text (tokenized into ``char_span``) and as
    a dictionary DataFrame, runs ``extract_dict``, and checks the output
    column name plus the exact repr of the (non-sequentially indexed) result.
    """
    # NOTE(review): this method was reconstructed from a whitespace-mangled
    # source line. The column alignment inside the expected-repr literal is a
    # best-effort recreation of pandas' right-justified repr output — confirm
    # against an actual run before relying on it. The out-of-order index
    # (2, 0, 1, 5, 4, 3) is taken verbatim from the original expected value.
    file_name = "test_data/io/test_systemt/test.dict"
    file_text = self._read_file(file_name)
    tokenizer = self._make_tokenizer()
    char_span = make_tokens(file_text, tokenizer)
    dict_df = load_dict(file_name, tokenizer)
    result_df = extract_dict(char_span, dict_df, "result")
    self.assertIn("result", result_df.columns)
    self.assertEqual(
        repr(result_df),
        textwrap.dedent(
            """\
                                           result
            2        [0, 16): 'Dictionary Entry'
            0                  [11, 16): 'Entry'
            1                  [17, 22): 'Entry'
            5  [23, 44): 'Help me! I am trapped'
            4    [45, 64): 'In a Haiku factory!'
            3    [65, 84): 'Save me before they'"""
        ),
    )
overlap_join, ) # SpaCy tokenizer (only) setup nlp = English() # Create a Tokenizer with the default settings for English # including punctuation rules and exceptions _tokenizer = nlp.tokenizer # Build up some example relations for the tests in this file _TEXT = """ In AD 932, King Arthur and his squire, Patsy, travel throughout Britain searching for men to join the Knights of the Round Table. Along the way, he recruits Sir Bedevere the Wise, Sir Lancelot the Brave, Sir Galahad the Pure... """ _TOKENS_SERIES = make_tokens(_TEXT, _tokenizer) _TOKENS_ARRAY = _TOKENS_SERIES.array # type: SpanArray _TOKEN_SPANS_ARRAY = TokenSpanArray.from_char_offsets(_TOKENS_ARRAY) _CAPS_WORD = extract_regex_tok(_TOKENS_ARRAY, regex.compile("[A-Z][a-z]*")) _CAPS_WORDS = extract_regex_tok( _TOKENS_ARRAY, regex.compile("[A-Z][a-z]*(\\s([A-Z][a-z]*))*"), 1, 2 ) _THE = extract_regex_tok(_TOKENS_ARRAY, regex.compile("[Tt]he")) class JoinTest(TestBase): def setUp(self): # Make it easier to see what's going on with join results self._prev_token_offsets_flag_value = TokenSpan.USE_TOKEN_OFFSETS_IN_REPR TokenSpan.USE_TOKEN_OFFSETS_IN_REPR = True