def test_exact_entry(self):
    # A phrase that occurs verbatim in the text must be reported at the
    # same offset that str.find() gives for it.
    source = ('The Treebank tokenizer uses regular expressions '
              'to tokenize text as in Penn Treebank.')
    wanted = ['regular expressions']

    matches = PhrasePositionFinder.find_phrase_int_source_text(
        source, wanted)
    first_match = matches[0]
    found_at = first_match[1]

    self.assertEqual(source.find(wanted[0]), found_at)
def get_trademark_annotations(text: str) -> \
        Generator[TrademarkAnnotation, None, None]:
    """
    Find trademarks in text.

    The whole text is first tested against ``TRADEMARK_PTN_RE``; when nothing
    matches, the generator finishes immediately. Otherwise every sentence
    span is processed: phrases are taken from ``np_extractor.get_np``,
    located inside the sentence by
    ``PhrasePositionFinder.find_phrase_int_source_text``, and each phrase is
    scanned with ``TRADEMARK_PTN_RE``.

    :param text: source text to scan
    :return: generator of ``TrademarkAnnotation`` objects; ``coords`` is a
        ``(start, end)`` pair built from the regex match span shifted by the
        phrase position and the sentence offset, with ``end`` never larger
        than ``len(text)``
    """
    # Cheap pre-check: skip sentence splitting when the pattern is absent.
    if not TRADEMARK_PTN_RE.search(text):
        return

    text_length = len(text)
    for sentence_span in get_sentence_span(text):
        # Element 0 is used as the sentence offset within ``text`` and
        # element 2 as the sentence string.
        sentence_offset = sentence_span[0]
        sentence = sentence_span[2]

        phrases = list(np_extractor.get_np(sentence))
        tagged_phrases = PhrasePositionFinder.find_phrase_int_source_text(
            sentence, phrases)

        for phrase in tagged_phrases:
            # phrase[0] is the phrase text, phrase[1] its position in the
            # sentence; together with the sentence offset it maps a match
            # inside the phrase back onto the full text.
            shift = sentence_offset + phrase[1]
            for match in TRADEMARK_PTN_RE.finditer(phrase[0]):
                start, end = match.span()
                start += shift
                # Match spans are end-exclusive, so ``len(text)`` is a valid
                # end. Only clamp ends that run past the text; clamping to
                # ``len(text) - 1`` would cut the final character.
                end = min(end + shift, text_length)
                yield TrademarkAnnotation(coords=(start, end),
                                          trademark=match.group())
def test_similar_entries(self):
    # Two phrases where one is a prefix of the other: the finder is
    # expected to report position 3 for the first and 7 for the second.
    source = 'aa aaa aaa aaaaa aa aaa aa'
    located = PhrasePositionFinder.find_phrase_int_source_text(
        source, ['aaa', 'aa'])

    first_position = located[0][1]
    second_position = located[1][1]

    self.assertEqual((3, 7), (first_position, second_position))
def test_corrupted_entry(self):
    # The phrase does not occur verbatim in the text (the words are joined
    # by parentheses instead of a space); the reported position is
    # expected to be 0.
    source = ('The Treebank tokenizer uses regular(expressions) '
              'to tokenize text as in Penn Treebank.')
    wanted = ['regular expressions']

    matches = PhrasePositionFinder.find_phrase_int_source_text(
        source, wanted)
    first_match = matches[0]

    self.assertEqual(0, first_match[1])
def extract_phrases_with_coords(cls, sentence: str) -> List[Tuple[str, int]]:
    """
    Collect the phrases of a sentence together with their positions.

    Phrases come from ``np_extractor.get_np`` and are located in the
    sentence by ``PhrasePositionFinder.find_phrase_int_source_text``.

    :param sentence: sentence to extract phrases from
    :return: list of ``(phrase, position)`` tuples as returned by the
        position finder
    """
    return PhrasePositionFinder.find_phrase_int_source_text(
        sentence, list(np_extractor.get_np(sentence)))