def test_tag_partially():
    """Check analysis when a tagger covers only some of the words.

    A ``ConstantTagger`` configured for 'world' is the only tagger, so the
    expected result keeps ``_UNKNOWN`` for 'hello' and carries the
    adjective tag for 'world'.
    """
    adjective = Tag(pos=PartOfSpeech.ADJECTIVE)
    expected_morphs = [
        Morph(word='hello', lemma='hello', tag=_UNKNOWN),
        Morph(word='world', lemma='world', tag=adjective),
    ]
    world_tagger = ConstantTagger(word='world', tag=adjective)
    words = ['hello', 'world']
    _assert_analyzed_equal(
        expected=expected_morphs,
        taggers=[world_tagger],
        text=words,
    )
def test_tag():
    """Check that a single word receives the tag supplied by its tagger."""
    noun = Tag(pos=PartOfSpeech.NOUN)
    expected_morphs = [Morph(word='hello', lemma='hello', tag=noun)]
    hello_tagger = ConstantTagger(word='hello', tag=noun)
    words = ['hello']
    _assert_analyzed_equal(
        expected=expected_morphs,
        taggers=[hello_tagger],
        text=words,
    )
def test_unknown():
    """Check that, with no taggers at all, a word is reported as ``_UNKNOWN``."""
    untagged = Morph(word='hello', lemma='hello', tag=_UNKNOWN)
    _assert_analyzed_equal(expected=[untagged], taggers=[], text=['hello'])
def analyze(self, text: Text) -> Iterable[Morph]:
    """Lazily yield one ``Morph`` per word of ``text``, in order.

    Taggers from ``self._taggers`` are consulted in sequence. Each one is
    handed the full text together with the indices that still have no tag,
    and whatever it returns is merged into the accumulated mapping. The
    chain stops as soon as no untagged index remains. Words that no tagger
    covered get ``_UNKNOWN``. Every word is then passed, with its tag, to
    ``self._lemmatizer.lemmatize`` to obtain the lemma.

    Because this is a generator, no tagger runs until iteration begins.
    """
    resolved: Dict[Index, Tag] = {}
    pending: Sequence[int] = range(len(text))

    for tagger in self._taggers:
        resolved.update(tagger.tag(text, pending))
        # Keep only the positions that are still untagged for the next tagger.
        pending = [position for position in pending if position not in resolved]
        if not pending:
            break

    lemmatizer = self._lemmatizer
    for position, word in enumerate(text):
        tag = resolved.get(position, _UNKNOWN)
        yield Morph(word, lemmatizer.lemmatize(word, tag), tag)