The examples below show how features.Pattern is used: in real feature-extractor configurations (Examples #1 and #3) and in the library's test suite (the rest).

Example #1
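Feature configuration for a POS tagger: per-token features (bias, the lowercased token, 2- and 3-character suffixes, and thresholded grammeme / grammeme-pair distributions) combined with Pattern features over the neighboring tokens.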
    def __init__(self):
        super(POSFeatureExtractor, self).__init__(
            token_features=[
                features.bias,
                features.token_lower,
                features.suffix2,
                features.suffix3,
                features.Grammeme(threshold=0.01, add_unambig=False, ignore=self.IGNORE),
                features.GrammemePair(threshold=0.01**2, add_unambig=False, ignore=self.IGNORE),
            ],
            global_features=[
                features.sentence_start,
                features.sentence_end,

                # features.the_only_verb,

                features.Pattern([-1, 'token_lower']),
                # features.Pattern([+1, 'token_lower']),

                features.Pattern([-1, 'Grammeme']),
                features.Pattern([+1, 'Grammeme']),

                features.Pattern([-1, 'GrammemePair']),
                features.Pattern([+1, 'GrammemePair']),

                # features.Pattern([-1, 'GrammemePair'], [0, 'GrammemePair']),
            ],
        )
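
The __init__ above is only a method body; in the source project it sits inside a FeatureExtractor subclass that also defines the IGNORE set. A minimal sketch of the assumed surrounding class (the IGNORE value is a placeholder, not the project's real set):

class POSFeatureExtractor(FeatureExtractor):
    # Grammemes excluded from the Grammeme/GrammemePair features.
    # Placeholder value; the real set is defined in the source project.
    IGNORE = frozenset()

    # ... the __init__ shown above goes here ...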
Example #2
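How a Pattern derives its window bounds: index_low is the magnitude of the most negative offset, index_high the largest positive offset, and both can be overridden with keyword arguments.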
def test_pattern_low_high():
    assert features.Pattern([-2, 'foo']).index_low == 2
    assert features.Pattern([-2, 'foo']).index_high == 0
    assert features.Pattern([-2, 'foo'], index_low=5).index_low == 5
    assert features.Pattern([-2, 'foo'], [1, 'bar']).index_high == 1
    assert features.Pattern([-2, 'foo'], [1, 'bar'],
                            index_high=2).index_high == 2
Example #3
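Feature configuration for the full tagger; compared with Example #1 it keeps unambiguous grammemes (add_unambig=True), drops the suffix and GrammemePair features, and adds a token_lower pattern at offset -2.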
    def __init__(self):
        super(TaggerFeatureExtractor, self).__init__(
            token_features=[
                features.bias,
                features.token_lower,
                features.Grammeme(threshold=0.01,
                                  add_unambig=True,
                                  ignore=self.IGNORE),
                # features.GrammemePair(threshold=0.0, add_unambig=True, ignore=self.IGNORE),
            ],
            global_features=[
                features.sentence_start,
                features.sentence_end,
                features.Pattern([-1, 'token_lower']),
                features.Pattern([-2, 'token_lower']),
                features.Pattern([-1, 'Grammeme']),
                features.Pattern([+1, 'Grammeme']),

                # features.Pattern([-2, 'Grammeme'], [-1, 'Grammeme']),
                # features.Pattern([-1, 'Grammeme'], [0, 'Grammeme']),
                # features.Pattern([-1, 'Grammeme'], [0, 'GrammemePair']),
                #
                # features.Pattern([-1, 'GrammemePair']),
                # features.Pattern([+1, 'GrammemePair']),
            ],
        )
Example #4
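A basic Pattern over a four-token Russian sentence ('Летят гуси на юг', 'Geese fly south'): the [-1, 'token_lower'] pattern copies the previous token's value under the token_lower[i-1] key and is skipped at the first token, where there is no previous position.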
def test_pattern(morph):
    fe = FeatureExtractor(
        [features.token_lower],
        [
            features.sentence_start,
            features.sentence_end,
            features.Pattern([-1, 'token_lower']),
        ],
    )
    sent = 'Летят гуси на юг'.split()
    parsed = [morph.parse(t) for t in sent]
    assert fe.transform_single(sent, parsed) == [
        {
            'token_lower': 'летят',
            'sentence_start': 1.0
        },
        {
            'token_lower': 'гуси',
            'token_lower[i-1]': 'летят'
        },
        {
            'token_lower': 'на',
            'token_lower[i-1]': 'гуси'
        },
        {
            'token_lower': 'юг',
            'sentence_end': 1.0,
            'token_lower[i-1]': 'на'
        },
    ]
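
The morph argument in these tests is a pytest fixture providing the morphological analyzer whose parse() output is fed to transform_single. A minimal sketch of the assumed scaffolding (the features / FeatureExtractor import path is an assumption; the pytest and pymorphy2 calls are standard):

import pytest
import pymorphy2

from morphine import features, FeatureExtractor  # assumed import path

@pytest.fixture(scope='session')
def morph():
    # Russian morphological analyzer; morph.parse(token) returns a list of
    # parse hypotheses whose tags the Grammeme features read.
    return pymorphy2.MorphAnalyzer()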
Example #5
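Patterns mixing callables and feature names: a single-argument pattern stores the callable's result under the supplied name, and multi-argument patterns join the per-position values into strings such as 'True/False'. Patterns whose offsets fall outside the sentence produce no feature.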
def test_pattern_callable_complex(morph):
    sent = 'Летят гуси на юг'.split()
    parsed = [morph.parse(t) for t in sent]

    def not_title(token, parses, feature_dict):
        return not feature_dict.get('title', False)

    fe = FeatureExtractor([], [
        features.Pattern([0, lambda token, parses: token.istitle(), 'title']),
        features.Pattern(
            [-1, 'title'],
            [0, 'title'],
        ),
        features.Pattern(
            [-1, not_title],
            [0, not_title],
            [+1, not_title],
        )
    ])
    assert fe.transform_single(sent, parsed) == [
        {
            'title': True
        },
        {
            'title': False,
            'title[i-1]/title[i]': 'True/False',
            'not_title[i-1]/not_title[i]/not_title[i+1]': 'False/True/True'
        },
        {
            'title': False,
            'title[i-1]/title[i]': 'False/False',
            'not_title[i-1]/not_title[i]/not_title[i+1]': 'True/True/True'
        },
        {
            'title': False,
            'title[i-1]/title[i]': 'False/False'
        },
    ]
Example #6
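How Pattern names are built: each argument contributes its feature (or callable) name plus the relative index, joined with '/'; a third tuple element or the name keyword overrides the default.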
def test_pattern_names():
    assert features.Pattern([0, 'foo']).name == 'foo[i]'
    assert features.Pattern([-1, 'foo'],
                            [2, 'bar']).name == 'foo[i-1]/bar[i+2]'

    def baz(token, parses):
        pass

    assert features.Pattern([0, baz]).name == 'baz[i]'
    assert features.Pattern([0, baz], [1, 'spam']).name == 'baz[i]/spam[i+1]'
    assert features.Pattern([1, 'spam'], [0, baz]).name == 'spam[i+1]/baz[i]'

    assert features.Pattern([0, baz, 'egg']).name == 'egg'

    assert features.Pattern([0, baz], name='fuzz').name == 'fuzz'
    assert features.Pattern([0, baz, 'egg'], name='fuzz').name == 'fuzz'
Example #7
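A bigram pattern where one side is a dict-valued feature (Grammeme): the result is a dict keyed by the string-valued side, carrying the neighboring token's grammeme weights.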
def test_pattern_bigram_with_dict(morph):
    sent = 'Летят гуси на юг'.split()
    parsed = [morph.parse(t) for t in sent]
    fe = FeatureExtractor(
        [features.token_lower,
         features.Grammeme(threshold=0.1)],
        [
            features.Pattern([-1, 'Grammeme'], [0, 'token_lower']),
            features.Pattern([-1, 'token_lower'], [0, 'Grammeme']),
        ],
    )
    xseq = fe.transform_single(sent, parsed)
    assert sorted(xseq[1].keys()) == sorted([
        'Grammeme',
        'Grammeme[i-1]/token_lower[i]',
        'token_lower',
        'token_lower[i-1]/Grammeme[i]',
    ])
    assert xseq[1]['Grammeme[i-1]/token_lower[i]'] == {
        'гуси': xseq[0]['Grammeme']
    }
    assert xseq[1]['token_lower[i-1]/Grammeme[i]'] == {
        'летят': xseq[1]['Grammeme']
    }
Example #8
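A bigram pattern over two dict-valued features takes the Cartesian product of their keys (here 'VERB/NOUN', with weight 1.0); Drop('Grammeme') then removes the raw unigram feature from the output.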
def test_pattern_cartesian(morph):
    sent = 'Летят гуси на юг'.split()
    parsed = [morph.parse(t) for t in sent]
    fe = FeatureExtractor(
        [features.token_lower,
         features.Grammeme(threshold=0.1)],
        [
            features.Pattern([-1, 'Grammeme'], [0, 'Grammeme']),
            features.Drop('Grammeme')
        ],
    )
    xseq = fe.transform_single(sent, parsed)
    assert xseq[0] == {'token_lower': 'летят'}
    assert sorted(xseq[1].keys()) == sorted(
        ['Grammeme[i-1]/Grammeme[i]', 'token_lower'])
    assert xseq[1]['Grammeme[i-1]/Grammeme[i]']['VERB/NOUN'] == 1.0
Example #9
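The simplest callable pattern: a lambda over (token, parses) evaluated at offset 0 and stored under an explicit name.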
def test_pattern_callable(morph):
    sent = 'Летят гуси на юг'.split()
    parsed = [morph.parse(t) for t in sent]
    fe = FeatureExtractor([], [
        features.Pattern([0, lambda token, parses: token.istitle(), 'title']),
    ])
    assert fe.transform_single(sent, parsed) == [
        {
            'title': True
        },
        {
            'title': False
        },
        {
            'title': False
        },
        {
            'title': False
        },
    ]
Example #10
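index_low and index_high can force a pattern to fire at the sentence edges: offsets that fall outside the sentence render as '?', while an in-range but absent numeric feature (sentence_start away from the first token) contributes 0.0.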
def test_pattern_kwargs(morph):
    sent = 'Летят гуси на юг'.split()
    parsed = [morph.parse(t) for t in sent]
    fe = FeatureExtractor(
        [features.token_lower],
        [
            features.sentence_start,
            features.sentence_end,
            features.Pattern(
                [+1, 'token_lower'],
                [-1, 'sentence_start'],
                name='low+1 BOS-1',
                index_low=0,
                index_high=0,
            ),
        ],
    )
    assert fe.transform_single(sent, parsed) == [
        {
            'token_lower': 'летят',
            'sentence_start': 1.0,
            'low+1 BOS-1': 'гуси/?'
        },
        {
            'token_lower': 'гуси',
            'low+1 BOS-1': 'на/1.0'
        },
        {
            'token_lower': 'на',
            'low+1 BOS-1': 'юг/0.0'
        },
        {
            'token_lower': 'юг',
            'sentence_end': 1.0,
            'low+1 BOS-1': '?/0.0'
        },
    ]