def test_exact_entry(self):
     """A phrase that occurs verbatim in the text is located at its ``str.find`` offset."""
     # The double space inside the phrase is intentional: it matches the source verbatim.
     phrase = 'regular  expressions'
     source = 'The Treebank tokenizer uses regular  expressions to tokenize text as in Penn Treebank.'
     matches = PhrasePositionFinder.find_phrase_int_source_text(
         source, [phrase])
     first_match = matches[0]
     # Element 1 of a tagged entry is the position reported for the phrase.
     self.assertEqual(source.find(phrase), first_match[1])
Exemple #2
0
def get_trademark_annotations(text: str) -> \
        Generator[TrademarkAnnotation, None, None]:
    """
    Find trademarks in text.

    The text is split into sentences, noun phrases are extracted from each
    sentence and located inside it, and every match of ``TRADEMARK_PTN_RE``
    within a located phrase is yielded as a ``TrademarkAnnotation``.

    :param text: source text to scan
    :return: generator of annotations; ``coords`` is a ``(start, end)`` pair
        of offsets into ``text`` (``end`` is exclusive, like ``Match.span()``)
        and ``trademark`` is the matched string
    """
    # Cheap pre-check: skip sentence splitting and noun-phrase extraction
    # entirely when the pattern does not occur anywhere in the text.
    if not TRADEMARK_PTN_RE.search(text):
        return

    text_len = len(text)
    # Iterate through sentences
    for scd in get_sentence_span(text):
        # scd[0] is used as the sentence offset inside ``text`` and scd[2]
        # as the sentence string.
        sentence_start = scd[0]
        sentence = scd[2]
        phrases = list(np_extractor.get_np(sentence))
        tagged_phrases = PhrasePositionFinder.find_phrase_int_source_text(
            sentence, phrases)
        for phrase in tagged_phrases:
            # phrase[0] is the phrase text, phrase[1] its position within the
            # sentence; together with the sentence offset this converts a
            # match position inside the phrase into a position inside ``text``.
            offset = sentence_start + phrase[1]
            for tm in TRADEMARK_PTN_RE.finditer(phrase[0]):
                start, end = tm.span()
                start += offset
                # The end offset is exclusive, so ``text_len`` itself is a
                # valid value (match at the very end of the text). Clamp only
                # when the computed end overshoots the text -- the phrase
                # position may be approximate -- instead of cutting the last
                # character off a valid match.
                end = min(end + offset, text_len)
                yield TrademarkAnnotation(coords=(start, end),
                                          trademark=tm.group())
 def test_similar_entries(self):
     """Overlapping, look-alike phrases get distinct positions (3 for 'aaa', 7 for 'aa')."""
     source = 'aa aaa aaa aaaaa aa aaa aa'
     tagged = PhrasePositionFinder.find_phrase_int_source_text(
         source, ['aaa', 'aa'])
     # Collect the reported position (element 1) of the first two entries.
     first_position = tagged[0][1]
     second_position = tagged[1][1]
     self.assertEqual((3, 7), (first_position, second_position))
 def test_corrupted_entry(self):
     """A phrase that does not occur verbatim in the text is expected at position 0."""
     # The source has 'regular(expressions)', so the phrase below cannot be found as-is.
     source = 'The Treebank tokenizer uses regular(expressions) to tokenize text as in Penn Treebank.'
     phrase = 'regular expressions'
     matches = PhrasePositionFinder.find_phrase_int_source_text(
         source, [phrase])
     first_match = matches[0]
     self.assertEqual(0, first_match[1])
Exemple #5
0
 def extract_phrases_with_coords(cls,
                                 sentence: str) -> List[Tuple[str, int]]:
     """
     Extract noun phrases from a sentence and locate them in it.

     :param sentence: sentence text to process
     :return: the result of ``PhrasePositionFinder.find_phrase_int_source_text``
         for the extracted phrases -- annotated as ``(phrase, position)`` pairs
     """
     # Materialize the extractor output before handing it to the finder.
     noun_phrases = list(np_extractor.get_np(sentence))
     return PhrasePositionFinder.find_phrase_int_source_text(
         sentence, noun_phrases)