Ejemplo n.º 1
0
def test():
    """Check LinearTagger output for a two-word adjective + noun phrase."""
    tagger = LinearTagger()

    # Expected tag of the first word (index 0).
    adjective = Tag(
        pos=PartOfSpeech.ADJECTIVE,
        case=Case.NOMINATIVE,
        degree=Degree.POSITIVE,
        gender=Gender.NEUTER,
        number=Number.SINGULAR,
        variant=Variant.FULL,
    )
    # Expected tag of the second word (index 1).
    noun = Tag(
        pos=PartOfSpeech.NOUN,
        animacy=Animacy.INANIMATE,
        case=Case.NOMINATIVE,
        gender=Gender.NEUTER,
        number=Number.SINGULAR,
    )

    assert_tags_equal(
        tagger=tagger,
        expected=[(0, adjective), (1, noun)],
        words=['чёрное', 'зеркало'],
    )
Ejemplo n.º 2
0
def test():
    """Check CRFTagger output for a two-word adjective + noun phrase."""
    tagger = CRFTagger()

    # Expected tag of the first word (index 0).
    adjective = Tag(
        pos=PartOfSpeech.ADJECTIVE,
        case=Case.NOMINATIVE,
        degree=Degree.POSITIVE,
        gender=Gender.MASCULINE,
        number=Number.SINGULAR,
        variant=Variant.FULL,
    )
    # Expected tag of the second word (index 1).
    noun = Tag(
        pos=PartOfSpeech.NOUN,
        animacy=Animacy.ANIMATE,
        case=Case.NOMINATIVE,
        gender=Gender.MASCULINE,
        number=Number.SINGULAR,
    )

    assert_tags_equal(
        tagger=tagger,
        expected=[(0, adjective), (1, noun)],
        words=['настоящий', 'детектив'],
    )
Ejemplo n.º 3
0
def test():
    """Check RNNTagger output for a two-word plural adjective + noun phrase."""
    tagger = RNNTagger()

    # Expected tag of the first word (index 0); no gender is set on it.
    adjective = Tag(
        pos=PartOfSpeech.ADJECTIVE,
        case=Case.NOMINATIVE,
        degree=Degree.POSITIVE,
        number=Number.PLURAL,
        variant=Variant.FULL,
    )
    # Expected tag of the second word (index 1).
    noun = Tag(
        pos=PartOfSpeech.NOUN,
        animacy=Animacy.INANIMATE,
        case=Case.NOMINATIVE,
        gender=Gender.NEUTER,
        number=Number.PLURAL,
    )

    assert_tags_equal(
        tagger=tagger,
        expected=[(0, adjective), (1, noun)],
        words=['необычные', 'дела'],
    )
Ejemplo n.º 4
0
def test_tag():
    """A word covered by the only tagger is analyzed with that tagger's tag."""
    noun = Tag(pos=PartOfSpeech.NOUN)

    expected_morphs = [Morph(word='hello', lemma='hello', tag=noun)]
    constant_taggers = [ConstantTagger(word='hello', tag=noun)]

    _assert_analyzed_equal(
        expected=expected_morphs,
        taggers=constant_taggers,
        text=['hello'],
    )
Ejemplo n.º 5
0
def test_tag_partially():
    """Only the word the tagger knows gets its tag; the other gets _UNKNOWN."""
    adjective = Tag(pos=PartOfSpeech.ADJECTIVE)

    untagged = Morph(word='hello', lemma='hello', tag=_UNKNOWN)
    tagged = Morph(word='world', lemma='world', tag=adjective)
    constant_taggers = [ConstantTagger(word='world', tag=adjective)]

    _assert_analyzed_equal(
        expected=[untagged, tagged],
        taggers=constant_taggers,
        text=['hello', 'world'],
    )
Ejemplo n.º 6
0
def get_tag(parse: pymorphy2.analyzer.Parse) -> Tag:
    """Assemble a ``Tag`` from a pymorphy2 parse.

    Every grammeme is obtained from its dedicated ``get_*`` helper, each
    called with the same ``parse`` object, and passed to ``Tag`` by keyword.
    """
    grammemes = {
        'pos': get_part_of_speech(parse),
        'animacy': get_animacy(parse),
        'aspect': get_aspect(parse),
        'case': get_case(parse),
        'degree': get_degree(parse),
        'gender': get_gender(parse),
        'mood': get_mood(parse),
        'number': get_number(parse),
        'person': get_person(parse),
        'tense': get_tense(parse),
        'verbform': get_verbform(parse),
        'voice': get_voice(parse),
    }
    return Tag(**grammemes)
Ejemplo n.º 7
0
import re
from typing import Iterator

from maru.grammeme import PartOfSpeech
from maru.grammeme.numform import NumericalForm
from maru.tag import Tag
from maru.tagger.abstract import ITagger, Tagged
from maru.types import Text, Indices

# Two alternatives, each anchored at the end of the string with `$`:
# a real number with '.' or ',' between digit runs, or a plain run of digits.
# The group names are formatted from the NumericalForm members, so
# `match.lastgroup` tells which alternative matched.
# Raw f-strings are required: in a non-raw literal `\d` is an invalid string
# escape (DeprecationWarning, SyntaxWarning on newer Python versions).
_REGEX = re.compile(
    rf'(?P<{NumericalForm.REAL}>\d+[.,]\d+$)|'
    rf'(?P<{NumericalForm.INTEGER}>\d+$)'
)

# Module-level NUMERICAL tags, one per numerical form.
_INTEGER = Tag(pos=PartOfSpeech.NUMERICAL, numform=NumericalForm.INTEGER)
_REAL = Tag(pos=PartOfSpeech.NUMERICAL, numform=NumericalForm.REAL)


class NumericalTagger(ITagger):
    """Tagger that recognises numeric tokens via the module-level ``_REGEX``."""

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for every requested index whose token matches.

        Indices whose tokens do not match ``_REGEX`` produce nothing.
        """
        for position in indices:
            found = _REGEX.match(text[position])
            if found is None:
                continue

            # The name of the matched group selects the tag.
            if found.lastgroup == NumericalForm.REAL:
                yield position, _REAL
            else:
                yield position, _INTEGER
Ejemplo n.º 8
0
import pytest

from maru.grammeme import PartOfSpeech
from maru.tag import Tag
from maru.tagger.punctuation import PunctuationTagger
from tests.tagger.base import TaggerTest

# Tag whose part of speech is PUNCTUATION; shared by the expected results
# in this module.
_PUNCTUATION = Tag(pos=PartOfSpeech.PUNCTUATION)


@pytest.fixture(name='tagger', scope='session')
def create_tagger():
    """Provide a PunctuationTagger under the fixture name ``tagger``."""
    punctuation_tagger = PunctuationTagger()
    return punctuation_tagger


@pytest.mark.parametrize(
    'test',
    [
        TaggerTest(
            words=['!', '@', '.....,'],
            tags=[(0, _PUNCTUATION), (1, _PUNCTUATION), (2, _PUNCTUATION)],
        ),
        TaggerTest(
            words=['?!', '"', ':', ';'],
            tags=[
                (0, _PUNCTUATION),
                (1, _PUNCTUATION),
                (2, _PUNCTUATION),
                (3, _PUNCTUATION),
            ],
        ),
Ejemplo n.º 9
0
def test():
    """Lemmatizing 'мыло' with a VERB tag must give the verb lemma 'мыть'."""
    lemmatizer = PymorphyLemmatizer()
    verb = Tag(pos=PartOfSpeech.VERB)

    lemma = lemmatizer.lemmatize('мыло', verb)

    assert lemma == 'мыть'
Ejemplo n.º 10
0
@pytest.fixture(name='tagger', scope='session')
def create_tagger():
    """Provide an RNNTagger under the fixture name ``tagger``."""
    rnn_tagger = RNNTagger()
    return rnn_tagger


@pytest.mark.parametrize(
    'test',
    [
        TaggerTest(
            words=['необычные', 'дела'],
            tags=[
                (
                    0,
                    Tag(
                        pos=PartOfSpeech.ADJECTIVE,
                        case=Case.NOMINATIVE,
                        degree=Degree.POSITIVE,
                        number=Number.PLURAL,
                        variant=Variant.FULL,
                    ),
                ),
                (
                    1,
                    Tag(
                        pos=PartOfSpeech.NOUN,
                        animacy=Animacy.INANIMATE,
                        case=Case.NOMINATIVE,
                        gender=Gender.NEUTER,
                        number=Number.PLURAL,
                    ),
Ejemplo n.º 11
0
from typing import Sequence

from maru.grammeme import PartOfSpeech
from maru.lemmatizer import DummyLemmatizer
from maru.morph import Morph
from maru.analyzer import Analyzer
from maru.tag import Tag
from maru.tagger import ITagger
from maru.types import Text

from tests.stubs.tagger import ConstantTagger

# Tag whose part of speech is UNKNOWN; shared by the expected results
# in this module.
_UNKNOWN = Tag(pos=PartOfSpeech.UNKNOWN)


def _assert_analyzed_equal(
    expected: Sequence[Morph],
    taggers: Sequence[ITagger],
    text: Text,
):
    """Assert that analyzing ``text`` with ``taggers`` yields ``expected``.

    An ``Analyzer`` is built from the given taggers and a ``DummyLemmatizer``;
    its ``analyze`` output is materialised into a list before comparison.
    """
    lemmatizer = DummyLemmatizer()
    analyzed = list(Analyzer(taggers, lemmatizer=lemmatizer).analyze(text))

    assert expected == analyzed


def test_unknown():
    _assert_analyzed_equal(
        expected=[
            Morph(
                word='hello',
                lemma='hello',
                tag=_UNKNOWN,
Ejemplo n.º 12
0
import re
from typing import Iterator

from maru.grammeme import PartOfSpeech
from maru.grammeme.numform import NumericalForm
from maru.tag import Tag
from maru.tagger.abstract import ITagger, Tagged
from maru.types import Indices, Text

# Three alternatives whose group names are formatted from the NumericalForm
# members, so `match.lastgroup` can be converted back to a NumericalForm:
# a real number ('.', ',' or '/' between digit runs), a plain run of digits,
# and a range (two digit runs joined by one of several dash characters).
# NOTE(review): unlike the first two, the RANGE alternative has no trailing
# `$` anchor - confirm whether that is intentional.
_REGEX = re.compile(
    rf'(?P<{NumericalForm.REAL}>\d+[.,/]\d+$)|'
    rf'(?P<{NumericalForm.INTEGER}>\d+$)|'
    rf'(?P<{NumericalForm.RANGE}>\d+[‑–—−-]\d+)'
)

# One NUMERICAL tag per numerical form, keyed by that form.
_TAGS = {
    form: Tag(pos=PartOfSpeech.NUMERICAL, numform=form)
    for form in (
        NumericalForm.REAL,
        NumericalForm.INTEGER,
        NumericalForm.RANGE,
    )
}


class NumericalTagger(ITagger):
    """Tagger that recognises numeric tokens via the module-level ``_REGEX``."""

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for every requested index whose token matches.

        The name of the matched group is converted to a ``NumericalForm``,
        which is then used to look the tag up in ``_TAGS``. Indices whose
        tokens do not match produce nothing.
        """
        for position in indices:
            found = _REGEX.match(text[position])
            if found is None:
                continue

            yield position, _TAGS[NumericalForm(found.lastgroup)]