import pytest

from zhtools.tokenize import get_tokenizer, Token  # Token's module assumed


def test_space_tokenization():
    tokenizer = get_tokenizer('space')
    text = 'hello world'
    tokens = [Token('hello', 0, 5), Token('world', 6, 11)]
    words = ['hello', 'world']
    assert list(tokenizer.tokenize(text)) == tokens
    assert list(tokenizer.cut(text)) == words
    assert tokenizer.lcut(text) == words
    assert list(tokenizer(text)) == tokens
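# The tests here compare Token objects by value. A plausible shape for Token
# (a word plus its [start, end) character offsets), assumed for illustration
# rather than taken from zhtools itself:
#
#     from typing import NamedTuple
#
#     class Token(NamedTuple):
#         word: str
#         start: int
#         end: int
#
# With value semantics like these, Token('hello', 0, 5) == Token('hello', 0, 5),
# which is what the equality assertions rely on.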
def test_ngram_tokenization():
    tokenizer = get_tokenizer('ngram', level=3)
    text = '你好啊!吃过了没有?'
    tokens = [
        Token('你好啊', 0, 3),
        Token('吃过了', 4, 7),
        Token('过了没', 5, 8),
        Token('了没有', 6, 9),
    ]
    words = ['你好啊', '吃过了', '过了没', '了没有']
    assert list(tokenizer.tokenize(text)) == tokens
    assert list(tokenizer.cut(text)) == words
    assert tokenizer.lcut(text) == words
    assert list(tokenizer(text)) == tokens
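# Note the expected offsets above: no trigram spans the '!' at index 3, so the
# tokenizer evidently slides its window within punctuation-delimited runs.
# A minimal sketch reproducing that behavior (an illustration only, not the
# actual ngram tokenizer implementation):

import re

def char_ngrams(text, level=3):
    # yield (word, start, end) windows within each run of word characters
    for match in re.finditer(r'\w+', text):
        run, offset = match.group(), match.start()
        for i in range(len(run) - level + 1):
            yield run[i:i + level], offset + i, offset + i + level

# list(char_ngrams('你好啊!吃过了没有?')) yields the same (word, start, end)
# triples as the Token list in the test above.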
def test_jieba_tokenizer():
    tokenizer = get_tokenizer('jieba', lazy_load=False)
    text = '今天的天气真好'
    tokens = [
        Token('今天', 0, 2),
        Token('的', 2, 3),
        Token('天气', 3, 5),
        Token('真', 5, 6),
        Token('好', 6, 7),
    ]
    words = ['今天', '的', '天气', '真', '好']
    assert list(tokenizer.tokenize(text)) == tokens
    assert list(tokenizer.cut(text)) == words
    assert tokenizer.lcut(text) == words
    assert list(tokenizer(text)) == tokens
def test_corenlp_tokenizer():
    tokenizer = get_tokenizer('corenlp', url="http://127.0.0.1:8000")
    text = '今天的天气真好'
    tokens = [
        Token('今天', 0, 2),
        Token('的', 2, 3),
        Token('天气', 3, 5),
        Token('真', 5, 6),
        Token('好', 6, 7),
    ]
    words = ['今天', '的', '天气', '真', '好']
    assert list(tokenizer.tokenize(text)) == tokens
    assert list(tokenizer.cut(text)) == words
    assert tokenizer.lcut(text) == words
    assert list(tokenizer(text)) == tokens
def test_pku_tokenizer():
    tokenizer = get_tokenizer('pku')
    text = '今天的天气真好'
    tokens = [
        Token('今天', 0, 2),
        Token('的', 2, 3),
        Token('天气', 3, 5),
        Token('真', 5, 6),
        Token('好', 6, 7),
    ]
    words = ['今天', '的', '天气', '真', '好']
    assert list(tokenizer.tokenize(text)) == tokens
    assert list(tokenizer.cut(text)) == words
    assert tokenizer.lcut(text) == words
    assert list(tokenizer(text)) == tokens
def compute_similarity(first, second, method='jaccard', tokenizer=None,
                       partial=False, ngram_range=None, ngram_weights=None):
    assert isinstance(first, str) and isinstance(second, str)
    # default: plain unigram comparison with weight 1.0
    if not ngram_range:
        ngram_range = [1]
        ngram_weights = [1.0]

    metric_func = {
        'lcs': lcs,
        'jaccard': jaccard,
        'cosine': cosine,
        'dice': dice,
    }.get(method)
    if not metric_func:
        raise ValueError("unsupported method `{}`".format(method))

    tokenizer = tokenizer or get_tokenizer("ngram", level=1)
    first_terms = tokenizer.lcut(first)
    second_terms = tokenizer.lcut(second)

    similarity = 0.0
    # expand [lo, hi] into every n-gram level in between, e.g. [1, 3] -> 1, 2, 3
    ngram_levels = list(range(ngram_range[0], ngram_range[-1] + 1))
    if not ngram_weights:
        ngram_weights = [1 for _ in range(len(ngram_levels))]
    for ngram_level, weight in zip(ngram_levels, ngram_weights):
        first_ngrams = list(windowed(first_terms, ngram_level))
        second_ngrams = list(windowed(second_terms, ngram_level))
        similarity += weight * metric_func(
            first_ngrams, second_ngrams, partial=partial)

    # weighted average of the per-level scores
    return similarity / sum(ngram_weights)
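# How the weighting combines per-level scores, using the default jaccard
# metric and the default character unigram tokenizer (the expected values
# below match the parametrized similarity tests). For 'abcde' vs 'bcd':
#   unigrams: |{a,b,c,d,e} & {b,c,d}| / |{a,b,c,d,e} | {b,c,d}| = 3/5 = 0.6
#   bigrams:  |{ab,bc,cd,de} & {bc,cd}| / |{ab,bc,cd,de}|       = 2/4 = 0.5
# so with ngram_range=[1, 2] and ngram_weights=[0.1, 0.4]:
#   (0.1 * 0.6 + 0.4 * 0.5) / (0.1 + 0.4) = 0.26 / 0.5 = 0.52
#
#     compute_similarity('abcde', 'bcd')                          # 0.6
#     compute_similarity('abcde', 'bcd', ngram_range=[1, 2])      # 0.55
#     compute_similarity('abcde', 'bcd', ngram_range=[1, 2],
#                        ngram_weights=[0.1, 0.4])                # 0.52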
def __init__(self, schema):
    self.schema = IndexSchema(schema)
    self.fields = sorted(self.schema.index_fields)
    self.term_dict = dict()
    # one posting table per indexed field: term -> set of matching document ids
    self.index = [defaultdict(set) for _ in self.fields]
    self.tokenizer = get_tokenizer("ngram", level=2)
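# A minimal standalone sketch of the posting-list idea behind `self.index`:
# each indexed field maps a bigram term to the set of document ids containing
# it. The helper names below are hypothetical illustrations, not the actual
# InvertedIndex API.

from collections import defaultdict

postings = defaultdict(set)  # bigram term -> ids of documents containing it
bigram_tokenizer = get_tokenizer("ngram", level=2)

def index_document(doc_id, text):
    for term in bigram_tokenizer.lcut(text):
        postings[term].add(doc_id)

def candidate_documents(query):
    # documents sharing at least one bigram with the query
    hits = set()
    for term in bigram_tokenizer.lcut(query):
        hits |= postings.get(term, set())
    return hits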
import pytest

from zhtools.tokenize import get_tokenizer
from zhtools.similarity import compute_similarity


@pytest.mark.parametrize(
    'first, second, tokenizer, ngram_range, ngram_weights, sim',
    [
        ('abcde', 'bcd', None, None, None, 0.6),
        ('abcde', 'bcd', None, [1, 2], None, 0.55),
        ('abcde', 'bcd', None, [1, 2], [0.1, 0.4], 0.52),
        ('abcde', 'bcd', get_tokenizer('ngram', level=2), None, None, 0.5),
    ])
def test_similarity(first, second, tokenizer, ngram_range, ngram_weights, sim):
    # use jaccard default
    similarity = compute_similarity(first, second,
                                    tokenizer=tokenizer,
                                    ngram_range=ngram_range,
                                    ngram_weights=ngram_weights)
    assert similarity == sim


@pytest.mark.parametrize('first, second, method',
                         [('abcde', 'bcd', 'some_unknown_method')])
def test_similarity_error(first, second, method):
    with pytest.raises(ValueError):
        compute_similarity(first, second, method=method)
def test_get_tokenizer_error():
    with pytest.raises(ValueError):
        get_tokenizer('some_unknown_tokenizer')
def test_get_tokenizer():
    first = get_tokenizer('ngram', level=3)
    second = get_tokenizer('ngram', level=3)
    assert first is second
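# The `is` assertion shows that get_tokenizer reuses one instance per distinct
# set of arguments. A minimal sketch of one way to get that behavior with
# functools.lru_cache (an assumption for illustration, not necessarily how
# zhtools implements it):

from functools import lru_cache

@lru_cache(maxsize=None)
def make_tokenizer(name, level=1):
    # one instance is constructed and cached per distinct (name, level) pair
    return object()  # stand-in for the real tokenizer construction

assert make_tokenizer('ngram', level=3) is make_tokenizer('ngram', level=3)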
from contextlib import contextmanager
from os.path import join
import shutil
import tempfile

import pytest

from zhtools.utils.storage import MemoryDocumentStorage
from zhtools.utils.inverted_index import (
    IndexSchema, InvertedIndex, FieldNotExistsError)
from zhtools.similarity import compute_similarity
from zhtools.tokenize import get_tokenizer

TOKENIZER = get_tokenizer('ngram', level=2)


@contextmanager
def tempdir():
    path = tempfile.mkdtemp()
    try:
        yield path
    finally:
        try:
            shutil.rmtree(path)
        except IOError:
            pass


class TestIndexSchema:

    def setup(self):