Example #1
    def test(self):
        from highlighter import score_index
        from highlighter import build_stem_index
        from highlighter import tokenize
        from highlighter import english_suffix_stemmer as stemmer
        from highlighter import build_scorecard
        import re

        # The capture group makes re.split keep the delimiters as tokens,
        # so token offsets line up with positions in the document.
        regex = re.compile(r'(\W)')

        doc = "This ham sammy is the bomb!"
        tokens = tokenize(doc, regex)
        stem_lookup = {token: stemmer(token) for token in tokens}
        stem_index = build_stem_index(tokens, stem_lookup)
        scorecard = build_scorecard(doc, 0.0)

        score_index(stem_index, scorecard, lambda x: 3.3)
        assert scorecard == [3.3, 3.3, 3.3, 0.0, 0.0, 3.3, 3.3, 3.3, 0.0, 3.3, 3.3, 3.3, 3.3, 3.3, 0.0, 0.0, 0.0, 0.0, 3.3, 3.3, 3.3, 0.0, 3.3, 3.3, 3.3, 3.3, 3.3]
        score_index(stem_index, scorecard, lambda x: 1.0 if len(x) > 1 else 0.0)
        assert scorecard == [4.3, 4.3, 4.3, 0.0, 0.0, 4.3, 4.3, 4.3, 0.0, 4.3, 4.3, 4.3, 4.3, 4.3, 0.0, 0.0, 0.0, 0.0, 4.3, 4.3, 4.3, 0.0, 4.3, 4.3, 4.3, 4.3, 3.3]
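Taken together, the two expected lists pin down how build_scorecard and score_index behave: build_scorecard allocates one initial score per character of the document, and score_index adds scorer(stem) to the first len(stem) positions of each recorded span, accumulating across calls (hence 'This', stemmed to 'thi', scores only indices 0-2, and the second pass raises every multi-character stem's span by 1.0 while the lone '!' stays at 3.3). A minimal sketch consistent with these assertions, not necessarily the module's actual code:

    def build_scorecard(doc, initial):
        # One score slot per character of the document.
        return [initial] * len(doc or '')

    def score_index(stem_index, scorecard, scorer):
        # Add scorer(stem) to the first len(stem) characters of each
        # span, accumulating onto whatever score is already there.
        for stem, spans in stem_index.items():
            score = scorer(stem)
            for start, _end in spans:
                for i in range(start, start + len(stem)):
                    scorecard[i] += score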
Example #2
    def test(self):
        from highlighter import build_stem_index
        from highlighter import tokenize
        from highlighter import english_suffix_stemmer as stemmer
        import re

        regex = re.compile(' ')

        doc = "This ham sammy is the bomb!"
        tokens = tokenize(doc, regex)
        stem_lookup = {token: stemmer(token) for token in tokens}

        assert build_stem_index(None, None) == {}
        assert build_stem_index('', None) == {}
        assert build_stem_index(tokens, None) == {}
        assert build_stem_index(tokens, {'This': 'dis'}) == {'dis': [(0, 4)]}
        assert build_stem_index(tokens, stem_lookup) == {'the': [(14, 17)],
                'sammy': [(7, 12)],
                'bomb!': [(17, 22)],
                'ham': [(4, 7)],
                'thi': [(0, 4)]}
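The assertions fix build_stem_index's contract: falsy tokens or a falsy lookup yield {}, and otherwise each token's stem maps to a list of (start, end) spans computed from a running character offset over the tokens (so with the space-splitting regex here, spans index into the concatenation of the tokens rather than the original document). Tokens absent from the lookup are skipped, and the missing 'is' entry implies english_suffix_stemmer('is') returns a falsy stem that the index drops. A sketch along those lines, not necessarily the module's implementation:

    def build_stem_index(tokens, stem_lookup):
        index = {}
        if not tokens or not stem_lookup:
            return index
        offset = 0
        for token in tokens:
            stem = stem_lookup.get(token)
            if stem:  # skip unknown tokens and falsy stems
                index.setdefault(stem, []).append((offset, offset + len(token)))
            offset += len(token)  # advance past the token either way
        return index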
Example #3
    def test(self):
        from highlighter import tokenize
        import re

        regex = re.compile(' ')

        assert tokenize(None, None) == []
        assert tokenize(None, '') == []
        assert tokenize('', None) == []
        assert tokenize('', '') == []
        assert tokenize('hello world', None) == []
        assert tokenize(None, regex) == []
        assert tokenize('hello world', regex) == ['hello', 'world']

        # With re.UNICODE, \W uses Unicode word rules, so CJK
        # characters count as word characters and are not split on.
        regex = re.compile(r'\W', flags=re.UNICODE)
        assert tokenize('hello,world', regex) == ['hello', 'world']
        assert tokenize('hello!world', regex) == ['hello', 'world']
        assert tokenize('hello world   ', regex) == ['hello', 'world', '', '', '']
        assert tokenize(u'goodbye sweet ☃!', regex) == [u'goodbye', u'sweet', '', '', '']
        assert tokenize(u'Yelp是涼爽', regex) == [u'Yelp是涼爽']
        assert tokenize(u'Yelp 是涼爽', regex) == [u'Yelp', u'是涼爽']
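These cases pin tokenize down completely: anything falsy on either side returns [], and otherwise the call simply delegates to the pattern's split, keeping the empty strings that trailing delimiters produce. A minimal sketch matching the assertions (the real highlighter code may differ):

    def tokenize(doc, regex):
        # The tests expect [] for a None/'' doc and for a regex that
        # is None or not a compiled pattern.
        if not doc or not hasattr(regex, 'split'):
            return []
        # re.split keeps empty strings produced by adjacent or
        # trailing delimiters, as the assertions require.
        return regex.split(doc)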