def test():
    """LinearTagger tags the neuter phrase 'чёрное зеркало' as adjective + noun."""
    tagger = LinearTagger()

    adjective = Tag(
        pos=PartOfSpeech.ADJECTIVE,
        case=Case.NOMINATIVE,
        degree=Degree.POSITIVE,
        gender=Gender.NEUTER,
        number=Number.SINGULAR,
        variant=Variant.FULL,
    )
    noun = Tag(
        pos=PartOfSpeech.NOUN,
        animacy=Animacy.INANIMATE,
        case=Case.NOMINATIVE,
        gender=Gender.NEUTER,
        number=Number.SINGULAR,
    )

    # Each expected entry pairs a word index with the tag predicted for it.
    assert_tags_equal(
        tagger=tagger,
        expected=[(0, adjective), (1, noun)],
        words=['чёрное', 'зеркало'],
    )
def test():
    """CRFTagger tags the masculine phrase 'настоящий детектив' as adjective + noun."""
    tagger = CRFTagger()

    adjective = Tag(
        pos=PartOfSpeech.ADJECTIVE,
        case=Case.NOMINATIVE,
        degree=Degree.POSITIVE,
        gender=Gender.MASCULINE,
        number=Number.SINGULAR,
        variant=Variant.FULL,
    )
    noun = Tag(
        pos=PartOfSpeech.NOUN,
        animacy=Animacy.ANIMATE,
        case=Case.NOMINATIVE,
        gender=Gender.MASCULINE,
        number=Number.SINGULAR,
    )

    # Each expected entry pairs a word index with the tag predicted for it.
    assert_tags_equal(
        tagger=tagger,
        expected=[(0, adjective), (1, noun)],
        words=['настоящий', 'детектив'],
    )
def test():
    """RNNTagger tags the plural phrase 'необычные дела' as adjective + noun."""
    tagger = RNNTagger()

    # The expected adjective tag carries no gender grammeme.
    adjective = Tag(
        pos=PartOfSpeech.ADJECTIVE,
        case=Case.NOMINATIVE,
        degree=Degree.POSITIVE,
        number=Number.PLURAL,
        variant=Variant.FULL,
    )
    noun = Tag(
        pos=PartOfSpeech.NOUN,
        animacy=Animacy.INANIMATE,
        case=Case.NOMINATIVE,
        gender=Gender.NEUTER,
        number=Number.PLURAL,
    )

    # Each expected entry pairs a word index with the tag predicted for it.
    assert_tags_equal(
        tagger=tagger,
        expected=[(0, adjective), (1, noun)],
        words=['необычные', 'дела'],
    )
def test_tag():
    """A word known to the only tagger is analyzed with that tagger's tag."""
    noun = Tag(pos=PartOfSpeech.NOUN)

    expected_morphs = [Morph(word='hello', lemma='hello', tag=noun)]
    stub_taggers = [ConstantTagger(word='hello', tag=noun)]

    _assert_analyzed_equal(
        expected=expected_morphs,
        taggers=stub_taggers,
        text=['hello'],
    )
def test_tag_partially():
    """Words the tagger does not cover are expected to get the _UNKNOWN tag."""
    adjective = Tag(pos=PartOfSpeech.ADJECTIVE)

    # Only 'world' is handled by the stub tagger; 'hello' is left untagged.
    untagged = Morph(word='hello', lemma='hello', tag=_UNKNOWN)
    tagged = Morph(word='world', lemma='world', tag=adjective)
    stub_taggers = [ConstantTagger(word='world', tag=adjective)]

    _assert_analyzed_equal(
        expected=[untagged, tagged],
        taggers=stub_taggers,
        text=['hello', 'world'],
    )
def get_tag(parse: pymorphy2.analyzer.Parse) -> Tag:
    """Build a ``Tag`` from a pymorphy2 parse.

    Every grammatical category is extracted from ``parse`` by its dedicated
    ``get_*`` helper and passed to ``Tag`` under the matching keyword.
    """
    # Dict literals evaluate their values top to bottom, so the helpers run
    # in the same order as the keyword arguments they feed.
    grammemes = {
        'pos': get_part_of_speech(parse),
        'animacy': get_animacy(parse),
        'aspect': get_aspect(parse),
        'case': get_case(parse),
        'degree': get_degree(parse),
        'gender': get_gender(parse),
        'mood': get_mood(parse),
        'number': get_number(parse),
        'person': get_person(parse),
        'tense': get_tense(parse),
        'verbform': get_verbform(parse),
        'voice': get_voice(parse),
    }
    return Tag(**grammemes)
import re
from typing import Iterator

from maru.grammeme import PartOfSpeech
from maru.grammeme.numform import NumericalForm
from maru.tag import Tag
from maru.tagger.abstract import ITagger, Tagged
from maru.types import Text, Indices

# Raw f-strings keep ``\d`` as a regex escape; in a plain f-string it is an
# invalid string escape sequence that newer Python versions warn about.
# The group names are whatever the NumericalForm members format to.
_REGEX = re.compile(
    rf'(?P<{NumericalForm.REAL}>\d+[.,]\d+$)|'
    rf'(?P<{NumericalForm.INTEGER}>\d+$)'
)

_INTEGER = Tag(pos=PartOfSpeech.NUMERICAL, numform=NumericalForm.INTEGER)
_REAL = Tag(pos=PartOfSpeech.NUMERICAL, numform=NumericalForm.REAL)


class NumericalTagger(ITagger):
    """Tag tokens made of digits, optionally with a ``.`` or ``,`` separator."""

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for every index whose token is numeric.

        Args:
            text: Indexable collection of tokens.
            indices: Positions in ``text`` to examine.

        Yields:
            The index paired with ``_REAL`` when the token matched the real
            number alternative, otherwise with ``_INTEGER``. Indices whose
            token does not match the pattern are skipped.
        """
        for index in indices:
            match = _REGEX.match(text[index])
            if match is not None:
                # lastgroup names the alternative that produced the match.
                group = match.lastgroup
                tag = _REAL if group == NumericalForm.REAL else _INTEGER
                yield index, tag
import pytest from maru.grammeme import PartOfSpeech from maru.tag import Tag from maru.tagger.punctuation import PunctuationTagger from tests.tagger.base import TaggerTest _PUNCTUATION = Tag(pos=PartOfSpeech.PUNCTUATION) @pytest.fixture(name='tagger', scope='session') def create_tagger(): return PunctuationTagger() @pytest.mark.parametrize( 'test', [ TaggerTest( words=['!', '@', '.....,'], tags=[(0, _PUNCTUATION), (1, _PUNCTUATION), (2, _PUNCTUATION)], ), TaggerTest( words=['?!', '"', ':', ';'], tags=[ (0, _PUNCTUATION), (1, _PUNCTUATION), (2, _PUNCTUATION), (3, _PUNCTUATION), ], ),
def test():
    """PymorphyLemmatizer lemmatizes 'мыло' with a VERB tag to 'мыть'."""
    lemmatizer = PymorphyLemmatizer()
    verb = Tag(pos=PartOfSpeech.VERB)

    lemma = lemmatizer.lemmatize('мыло', verb)

    assert lemma == 'мыть'
@pytest.fixture(name='tagger', scope='session') def create_tagger(): return RNNTagger() @pytest.mark.parametrize( 'test', [ TaggerTest( words=['необычные', 'дела'], tags=[ ( 0, Tag( pos=PartOfSpeech.ADJECTIVE, case=Case.NOMINATIVE, degree=Degree.POSITIVE, number=Number.PLURAL, variant=Variant.FULL, ), ), ( 1, Tag( pos=PartOfSpeech.NOUN, animacy=Animacy.INANIMATE, case=Case.NOMINATIVE, gender=Gender.NEUTER, number=Number.PLURAL, ),
from typing import Sequence from maru.grammeme import PartOfSpeech from maru.lemmatizer import DummyLemmatizer from maru.morph import Morph from maru.analyzer import Analyzer from maru.tag import Tag from maru.tagger import ITagger from maru.types import Text from tests.stubs.tagger import ConstantTagger _UNKNOWN = Tag(pos=PartOfSpeech.UNKNOWN) def _assert_analyzed_equal(expected: Sequence[Morph], taggers: Sequence[ITagger], text: Text, ): analyzer = Analyzer(taggers, lemmatizer=DummyLemmatizer()) assert expected == list(analyzer.analyze(text)) def test_unknown(): _assert_analyzed_equal( expected=[ Morph( word='hello', lemma='hello', tag=_UNKNOWN,
import re
from typing import Iterator

from maru.grammeme import PartOfSpeech
from maru.grammeme.numform import NumericalForm
from maru.tag import Tag
from maru.tagger.abstract import ITagger, Tagged
from maru.types import Indices, Text

# One named group per numerical form; the group names are whatever the
# NumericalForm members format to.
# NOTE(review): unlike REAL and INTEGER, the RANGE alternative has no ``$``
# anchor, so it also accepts tokens that merely start with a range - confirm
# this is intended.
_REGEX = re.compile(
    rf'(?P<{NumericalForm.REAL}>\d+[.,/]\d+$)|'
    rf'(?P<{NumericalForm.INTEGER}>\d+$)|'
    rf'(?P<{NumericalForm.RANGE}>\d+[‑–—−-]\d+)'
)

# One shared Tag per numerical form, looked up by the matched form.
_TAGS = {
    form: Tag(pos=PartOfSpeech.NUMERICAL, numform=form)
    for form in (
        NumericalForm.REAL,
        NumericalForm.INTEGER,
        NumericalForm.RANGE,
    )
}


class NumericalTagger(ITagger):
    """Tag tokens that look like real numbers, integers or numeric ranges."""

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for each index whose token matches ``_REGEX``.

        Indices whose token does not match are skipped.
        """
        for position in indices:
            found = _REGEX.match(text[position])
            if found is None:
                continue
            # lastgroup names the alternative that produced the match.
            yield position, _TAGS[NumericalForm(found.lastgroup)]