コード例 #1
0
def test():
    """Check LinearTagger output for the phrase 'чёрное зеркало'.

    The expected adjective and noun tags are both nominative, neuter and
    singular; the comparison itself is delegated to ``assert_tags_equal``.
    """
    tagger = LinearTagger()
    adjective = Tag(
        pos=PartOfSpeech.ADJECTIVE,
        case=Case.NOMINATIVE,
        degree=Degree.POSITIVE,
        gender=Gender.NEUTER,
        number=Number.SINGULAR,
        variant=Variant.FULL,
    )
    noun = Tag(
        pos=PartOfSpeech.NOUN,
        animacy=Animacy.INANIMATE,
        case=Case.NOMINATIVE,
        gender=Gender.NEUTER,
        number=Number.SINGULAR,
    )
    assert_tags_equal(
        tagger=tagger,
        expected=[(0, adjective), (1, noun)],
        words=['чёрное', 'зеркало'],
    )
コード例 #2
0
def test():
    """Check CRFTagger output for the phrase 'настоящий детектив'.

    The expected adjective and noun tags are both nominative, masculine and
    singular; the comparison itself is delegated to ``assert_tags_equal``.
    """
    tagger = CRFTagger()
    adjective = Tag(
        pos=PartOfSpeech.ADJECTIVE,
        case=Case.NOMINATIVE,
        degree=Degree.POSITIVE,
        gender=Gender.MASCULINE,
        number=Number.SINGULAR,
        variant=Variant.FULL,
    )
    noun = Tag(
        pos=PartOfSpeech.NOUN,
        animacy=Animacy.ANIMATE,
        case=Case.NOMINATIVE,
        gender=Gender.MASCULINE,
        number=Number.SINGULAR,
    )
    assert_tags_equal(
        tagger=tagger,
        expected=[(0, adjective), (1, noun)],
        words=['настоящий', 'детектив'],
    )
コード例 #3
0
def test():
    """Check RNNTagger output for the phrase 'необычные дела'.

    The expected adjective and noun tags are both nominative plural; only the
    noun tag specifies a gender. The comparison itself is delegated to
    ``assert_tags_equal``.
    """
    tagger = RNNTagger()
    adjective = Tag(
        pos=PartOfSpeech.ADJECTIVE,
        case=Case.NOMINATIVE,
        degree=Degree.POSITIVE,
        number=Number.PLURAL,
        variant=Variant.FULL,
    )
    noun = Tag(
        pos=PartOfSpeech.NOUN,
        animacy=Animacy.INANIMATE,
        case=Case.NOMINATIVE,
        gender=Gender.NEUTER,
        number=Number.PLURAL,
    )
    assert_tags_equal(
        tagger=tagger,
        expected=[(0, adjective), (1, noun)],
        words=['необычные', 'дела'],
    )
コード例 #4
0
def test_tag():
    """Analyze a one-word text whose only word is covered by the tagger.

    The expected morph carries the same tag the ConstantTagger is configured
    with, and a lemma equal to the word.
    """
    noun = Tag(pos=PartOfSpeech.NOUN)
    expected = [Morph(word='hello', lemma='hello', tag=noun)]
    taggers = [ConstantTagger(word='hello', tag=noun)]

    _assert_analyzed_equal(expected=expected, taggers=taggers, text=['hello'])
コード例 #5
0
def test_tag_partially():
    """Analyze a two-word text where the tagger is configured for one word.

    'world' is expected to carry the ConstantTagger's tag, while 'hello' is
    expected to come back with the ``_UNKNOWN`` tag.
    """
    adjective = Tag(pos=PartOfSpeech.ADJECTIVE)
    untagged = Morph(word='hello', lemma='hello', tag=_UNKNOWN)
    tagged = Morph(word='world', lemma='world', tag=adjective)
    taggers = [ConstantTagger(word='world', tag=adjective)]

    _assert_analyzed_equal(
        expected=[untagged, tagged],
        taggers=taggers,
        text=['hello', 'world'],
    )
コード例 #6
0
ファイル: tag.py プロジェクト: gilyazutdinov/maru
def get_tag(parse: pymorphy2.analyzer.Parse) -> Tag:
    """Build a ``Tag`` from a pymorphy2 parse.

    Every grammeme is extracted from ``parse`` by its dedicated ``get_*``
    helper and handed to ``Tag`` under the keyword of the same name.
    """
    # The dict literal is evaluated top to bottom, so the helpers run in the
    # same order as the keyword arguments they feed.
    grammemes = {
        'pos': get_part_of_speech(parse),
        'animacy': get_animacy(parse),
        'aspect': get_aspect(parse),
        'case': get_case(parse),
        'degree': get_degree(parse),
        'gender': get_gender(parse),
        'mood': get_mood(parse),
        'number': get_number(parse),
        'person': get_person(parse),
        'tense': get_tense(parse),
        'verbform': get_verbform(parse),
        'voice': get_voice(parse),
    }
    return Tag(**grammemes)
コード例 #7
0
import re
from typing import Iterator

from maru.grammeme import PartOfSpeech
from maru.grammeme.numform import NumericalForm
from maru.tag import Tag
from maru.tagger.abstract import ITagger, Tagged
from maru.types import Text, Indices

# Pattern with one named group per numerical form; the group names are taken
# from the NumericalForm members so ``match.lastgroup`` identifies the form.
# Raw f-strings are required here: in a plain f-string ``\d`` is an invalid
# string escape (DeprecationWarning, SyntaxWarning on newer Pythons).
_REGEX = re.compile(rf'(?P<{NumericalForm.REAL}>\d+[.,]\d+$)|'
                    rf'(?P<{NumericalForm.INTEGER}>\d+$)')

# Prebuilt tags, one per numerical form recognised by _REGEX.
_INTEGER = Tag(pos=PartOfSpeech.NUMERICAL, numform=NumericalForm.INTEGER)
_REAL = Tag(pos=PartOfSpeech.NUMERICAL, numform=NumericalForm.REAL)


class NumericalTagger(ITagger):
    """Tagger that assigns numerical tags to tokens matching ``_REGEX``."""

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for every requested token that matches.

        Only positions listed in ``indices`` are examined; tokens that do not
        match ``_REGEX`` produce nothing.
        """
        for position in indices:
            found = _REGEX.match(text[position])
            if found is None:
                continue

            # The name of the group that matched tells which form was seen.
            if found.lastgroup == NumericalForm.REAL:
                chosen = _REAL
            else:
                chosen = _INTEGER
            yield position, chosen
コード例 #8
0
ファイル: test_punctuation.py プロジェクト: janyfe/maru
import pytest

from maru.grammeme import PartOfSpeech
from maru.tag import Tag
from maru.tagger.punctuation import PunctuationTagger
from tests.tagger.base import TaggerTest

# Tag carrying only the PUNCTUATION part of speech; reused as the expected
# tag in the parametrized cases of this module.
_PUNCTUATION = Tag(pos=PartOfSpeech.PUNCTUATION)


@pytest.fixture(name='tagger', scope='session')
def create_tagger():
    """Provide a PunctuationTagger as the session-scoped ``tagger`` fixture."""
    punctuation_tagger = PunctuationTagger()
    return punctuation_tagger


@pytest.mark.parametrize(
    'test',
    [
        TaggerTest(
            words=['!', '@', '.....,'],
            tags=[(0, _PUNCTUATION), (1, _PUNCTUATION), (2, _PUNCTUATION)],
        ),
        TaggerTest(
            words=['?!', '"', ':', ';'],
            tags=[
                (0, _PUNCTUATION),
                (1, _PUNCTUATION),
                (2, _PUNCTUATION),
                (3, _PUNCTUATION),
            ],
        ),
コード例 #9
0
def test():
    """Check that 'мыло' tagged as a verb is lemmatized to 'мыть'."""
    lemmatizer = PymorphyLemmatizer()
    verb_tag = Tag(pos=PartOfSpeech.VERB)

    lemma = lemmatizer.lemmatize('мыло', verb_tag)

    assert lemma == 'мыть'
コード例 #10
0
ファイル: test_rnn.py プロジェクト: janyfe/maru
@pytest.fixture(name='tagger', scope='session')
def create_tagger():
    """Provide an RNNTagger as the session-scoped ``tagger`` fixture."""
    rnn_tagger = RNNTagger()
    return rnn_tagger


@pytest.mark.parametrize(
    'test',
    [
        TaggerTest(
            words=['необычные', 'дела'],
            tags=[
                (
                    0,
                    Tag(
                        pos=PartOfSpeech.ADJECTIVE,
                        case=Case.NOMINATIVE,
                        degree=Degree.POSITIVE,
                        number=Number.PLURAL,
                        variant=Variant.FULL,
                    ),
                ),
                (
                    1,
                    Tag(
                        pos=PartOfSpeech.NOUN,
                        animacy=Animacy.INANIMATE,
                        case=Case.NOMINATIVE,
                        gender=Gender.NEUTER,
                        number=Number.PLURAL,
                    ),
コード例 #11
0
ファイル: test_analyzer.py プロジェクト: gilyazutdinov/maru
from typing import Sequence

from maru.grammeme import PartOfSpeech
from maru.lemmatizer import DummyLemmatizer
from maru.morph import Morph
from maru.analyzer import Analyzer
from maru.tag import Tag
from maru.tagger import ITagger
from maru.types import Text

from tests.stubs.tagger import ConstantTagger

# Tag carrying only the UNKNOWN part of speech; used as the expected tag for
# words in the tests of this module.
_UNKNOWN = Tag(pos=PartOfSpeech.UNKNOWN)


def _assert_analyzed_equal(
        expected: Sequence[Morph],
        taggers: Sequence[ITagger],
        text: Text,
):
    """Assert that analyzing ``text`` with ``taggers`` yields ``expected``.

    An ``Analyzer`` is built from the given taggers and a ``DummyLemmatizer``;
    its ``analyze`` output is materialized into a list before comparison.
    """
    lemmatizer = DummyLemmatizer()
    analyzer = Analyzer(taggers, lemmatizer=lemmatizer)
    actual = list(analyzer.analyze(text))

    assert expected == actual


def test_unknown():
    _assert_analyzed_equal(
        expected=[
            Morph(
                word='hello',
                lemma='hello',
                tag=_UNKNOWN,
コード例 #12
0
import re
from typing import Iterator

from maru.grammeme import PartOfSpeech
from maru.grammeme.numform import NumericalForm
from maru.tag import Tag
from maru.tagger.abstract import ITagger, Tagged
from maru.types import Indices, Text

# One named alternative per numerical form; group names come from the
# NumericalForm members so ``match.lastgroup`` identifies the matched form.
# NOTE(review): unlike REAL and INTEGER, the RANGE alternative has no trailing
# ``$``, so with ``re.match`` it accepts any token that merely starts with a
# range -- confirm this is intended.
_REGEX = re.compile('|'.join((
    rf'(?P<{NumericalForm.REAL}>\d+[.,/]\d+$)',
    rf'(?P<{NumericalForm.INTEGER}>\d+$)',
    rf'(?P<{NumericalForm.RANGE}>\d+[‑–—−-]\d+)',
)))
# Prebuilt numerical tag for each form, keyed by the form itself.
_TAGS = {
    form: Tag(pos=PartOfSpeech.NUMERICAL, numform=form)
    for form in (
        NumericalForm.REAL,
        NumericalForm.INTEGER,
        NumericalForm.RANGE,
    )
}


class NumericalTagger(ITagger):
    """Tagger that assigns numerical tags to tokens matching ``_REGEX``."""

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for every requested token that matches.

        Only positions listed in ``indices`` are examined; tokens that do not
        match ``_REGEX`` produce nothing.
        """
        for position in indices:
            found = _REGEX.match(text[position])
            if found is None:
                continue

            # The matched group's name is turned back into a NumericalForm,
            # which is the key into the prebuilt tag table.
            matched_form = NumericalForm(found.lastgroup)
            yield position, _TAGS[matched_form]