def __init__(self):
    """Wire up the feature set used for grammatical-number disambiguation.

    Token-level features look at the surface form and its morphological
    analysis; global features add context patterns over neighbouring tokens.
    """
    token_level = [
        features.bias,
        features.token_lower,
        features.Grammeme(threshold=0.01, add_unambig=True, ignore=self.IGNORE),
        features.GrammemePair(threshold=0.0, add_unambig=True, ignore=self.IGNORE),
    ]
    context_level = [
        # NOTE: the commented-out patterns below were tried and disabled.
        # features.sentence_start,
        # features.sentence_end,
        # features.Pattern([-1, 'token_lower']),
        # features.Pattern([-2, 'token_lower']),
        features.Pattern([-1, 'Grammeme']),
        features.Pattern([+1, 'Grammeme']),
        # features.Pattern([-2, 'Grammeme'], [-1, 'Grammeme']),
        features.Pattern([-1, 'Grammeme'], [0, 'Grammeme']),
        features.Pattern([-1, 'Grammeme'], [0, 'GrammemePair']),
        features.Pattern([-1, 'GrammemePair']),
        features.Pattern([+1, 'GrammemePair']),
    ]
    super(NumberFeatureExtractor, self).__init__(
        token_features=token_level,
        global_features=context_level,
    )
def __init__(self):
    """Wire up the feature set used for part-of-speech tagging.

    Token-level features cover the surface form, short suffixes and
    morphological analyses; global features add sentence boundaries and
    context patterns over neighbouring tokens.
    """
    token_level = [
        features.bias,
        features.token_lower,
        features.suffix2,
        features.suffix3,
        features.Grammeme(threshold=0.01, add_unambig=False, ignore=self.IGNORE),
        features.GrammemePair(threshold=0.01**2, add_unambig=False, ignore=self.IGNORE),
    ]
    context_level = [
        features.sentence_start,
        features.sentence_end,
        # NOTE: the commented-out patterns below were tried and disabled.
        # features.the_only_verb,
        features.Pattern([-1, 'token_lower']),
        # features.Pattern([+1, 'token_lower']),
        features.Pattern([-1, 'Grammeme']),
        features.Pattern([+1, 'Grammeme']),
        features.Pattern([-1, 'GrammemePair']),
        features.Pattern([+1, 'GrammemePair']),
        # features.Pattern([-1, 'GrammemePair'], [0, 'GrammemePair']),
    ]
    super(POSFeatureExtractor, self).__init__(
        token_features=token_level,
        global_features=context_level,
    )
def test_Grammeme(morph):
    """Grammeme feature emits per-grammeme weights from pymorphy2 parses."""
    extractor = features.Grammeme()

    # An ambiguous function word: all candidate POS tags appear,
    # with the preposition reading weighted highest.
    result = extractor('на', morph.parse('на'))
    assert list(result.keys()) == ['Grammeme']
    weights = result['Grammeme']
    assert set(weights) == {'INTJ', 'PRCL', 'PREP'}
    assert weights['PREP'] > weights['PRCL']
    assert weights['PREP'] > weights['INTJ']

    # A verb/noun ambiguity: grammemes from both readings are present,
    # and the verb reading dominates.
    result = extractor('стали', morph.parse('стали'))
    weights = result['Grammeme']
    assert 'past' in weights
    assert 'accs' in weights
    assert weights['VERB'] > weights['NOUN']
def test_pattern_cartesian(morph):
    """Pattern over two positions builds the cartesian product of grammemes."""
    tokens = 'Летят гуси на юг'.split()
    analyses = [morph.parse(token) for token in tokens]

    extractor = FeatureExtractor(
        [features.token_lower, features.Grammeme(threshold=0.1)],
        [
            features.Pattern([-1, 'Grammeme'], [0, 'Grammeme']),
            # Drop the raw Grammeme feature so only the pattern survives.
            features.Drop('Grammeme'),
        ],
    )
    sequence = extractor.transform_single(tokens, analyses)

    # First token has no left neighbour, so only the token feature remains.
    assert sequence[0] == {'token_lower': 'летят'}

    expected_keys = sorted(['Grammeme[i-1]/Grammeme[i]', 'token_lower'])
    assert sorted(sequence[1].keys()) == expected_keys
    assert sequence[1]['Grammeme[i-1]/Grammeme[i]']['VERB/NOUN'] == 1.0
def test_pattern_bigram_with_dict(morph):
    """Bigram patterns mixing a dict-valued feature with a scalar one."""
    tokens = 'Летят гуси на юг'.split()
    analyses = [morph.parse(token) for token in tokens]

    extractor = FeatureExtractor(
        [features.token_lower, features.Grammeme(threshold=0.1)],
        [
            features.Pattern([-1, 'Grammeme'], [0, 'token_lower']),
            features.Pattern([-1, 'token_lower'], [0, 'Grammeme']),
        ],
    )
    sequence = extractor.transform_single(tokens, analyses)

    expected_keys = sorted([
        'Grammeme',
        'Grammeme[i-1]/token_lower[i]',
        'token_lower',
        'token_lower[i-1]/Grammeme[i]',
    ])
    assert sorted(sequence[1].keys()) == expected_keys

    # dict/scalar bigram: the scalar value keys the previous token's weights.
    assert sequence[1]['Grammeme[i-1]/token_lower[i]'] == {
        'гуси': sequence[0]['Grammeme'],
    }
    # scalar/dict bigram: the previous token keys the current token's weights.
    assert sequence[1]['token_lower[i-1]/Grammeme[i]'] == {
        'летят': sequence[1]['Grammeme'],
    }
def test_Grammeme_threshold(morph):
    """Low-probability readings are filtered out by the threshold."""
    extractor = features.Grammeme(threshold=0.1)
    result = extractor('на', morph.parse('на'))

    # Only the dominant preposition reading survives the 0.1 cutoff,
    # and it carries almost all of the probability mass.
    weights = result['Grammeme']
    assert set(weights) == {'PREP'}
    assert weights['PREP'] > 0.99