Python Cache.add_docの例

プログラミング言語: Python

名前空間/パッケージ名: lsh.cache

クラス/型: Cache

メソッド/関数: add_doc

hotexamples.comのコード掲載数: 5

Python Cache.add_doc - 5件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのlsh.cache.Cache.add_docの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

Cache(5)

add_doc(3)

get_all_duplicates(2)

get_duplicates_of(2)

from_json(1)

is_duplicate(1)

コード例 #1

ファイルを表示

ファイル: test_cache.py プロジェクト: mattilyra/LSH

def test_invalid_settings(num_bands, default_hasher, default_cache):
    """Check that invalid cache settings and unknown doc ids are rejected.

    First, a ``Cache`` built from ``default_hasher`` with the given
    ``num_bands`` must raise ``AssertionError`` somewhere during
    construction, ``add_doc`` or ``get_duplicates_of``; the first of those
    calls to raise ends the ``pytest.raises`` block.

    Second, asking ``default_cache`` for duplicates of ``doc_id=123`` must
    raise ``ValueError`` when only doc id 0 has been added by this test.
    """
    with pytest.raises(AssertionError):
        misconfigured = Cache(default_hasher, num_bands=num_bands)
        misconfigured.add_doc('Hi', 1)
        misconfigured.get_duplicates_of('Hello')

    # Register a single document, then query an id that was never added here.
    default_cache.add_doc('Hi', 0)
    with pytest.raises(ValueError):
        default_cache.get_duplicates_of(doc_id=123)

コード例 #2

ファイルを表示

def test_invalid_settings(num_bands, default_hasher, default_cache):
    """Verify error handling for a bad ``num_bands`` and an unknown doc id.

    The first block expects an ``AssertionError`` from constructing or using
    a ``Cache`` configured with ``num_bands``. The second block expects a
    ``ValueError`` when ``default_cache`` is queried with ``doc_id=123``
    after this test has added only doc id 0.
    """
    with pytest.raises(AssertionError):
        # Any of these three calls may be the one that raises.
        broken_cache = Cache(default_hasher, num_bands=num_bands)
        broken_cache.add_doc('Hi', 1)
        broken_cache.get_duplicates_of('Hello')

    default_cache.add_doc('Hi', 0)
    with pytest.raises(ValueError):
        default_cache.get_duplicates_of(doc_id=123)

コード例 #3

ファイルを表示

def test_cache(char_ngram, hashbytes, num_bands, seed):
    """Exercise ``Cache`` duplicate detection with a 200-seed ``MinHasher``.

    Documents are added one at a time and the cache is queried after each
    step, so the order of the statements below is significant.
    """
    min_hasher = MinHasher(
        seeds=200, char_ngram=char_ngram, hashbytes=hashbytes,
        random_state=seed)
    cache = Cache(min_hasher, num_bands=num_bands)
    # very small band width => always find duplicates

    short_doc = 'This is a simple document'
    another_doc = 'Some text about animals.'
    # NOTE: the backslash continues the literal, so the indentation of the
    # following line is part of the string value.
    long_doc = 'A much longer document that contains lots of information\
       different words. The document produces many more shingles.'

    # An empty cache reports no duplicates; after adding, the doc matches
    # itself both with and without an explicit id.
    assert not cache.is_duplicate(short_doc)
    cache.add_doc(short_doc, 0)
    assert cache.get_duplicates_of(short_doc) == {0}
    assert cache.is_duplicate(short_doc, doc_id=0)
    assert cache.is_duplicate(short_doc)

    assert not cache.is_duplicate(long_doc)
    cache.add_doc(long_doc, 1)
    cache.add_doc(another_doc, 2)
    assert cache.is_duplicate(another_doc)

    assert cache.is_duplicate(long_doc, doc_id=1)
    assert cache.is_duplicate(long_doc)

    # Drop the second whitespace-separated token to make a near-duplicate.
    tokens = long_doc.split()
    long_doc_missing_word = ' '.join(tokens[:1] + tokens[2:])

    assert cache.get_duplicates_of(long_doc_missing_word) == {1}
    assert cache.is_duplicate(long_doc_missing_word)
    assert cache.is_duplicate(long_doc + ' Word.')

    # No duplicate pairs until the near-duplicate itself is stored.
    assert cache.get_all_duplicates() == set()
    cache.add_doc(long_doc_missing_word, 3)
    assert cache.get_all_duplicates() == {(1, 3)}

    # Storing the same text under a second id yields every pairing.
    cache.add_doc(long_doc_missing_word, 4)
    assert cache.get_all_duplicates() == {(1, 3), (1, 4), (3, 4)}

コード例 #4

ファイルを表示

ファイル: test_cache.py プロジェクト: mattilyra/LSH

def test_cache(char_ngram, hashbytes, num_bands, seed):
    """Walk a ``Cache`` through add/query steps and check duplicate results.

    The hasher uses 200 seeds and the parametrized ``char_ngram``,
    ``hashbytes`` and ``seed``; the cache uses the parametrized
    ``num_bands``. Each assertion depends on the documents added before it.
    """
    fingerprinter = MinHasher(
        seeds=200,
        char_ngram=char_ngram,
        hashbytes=hashbytes,
        random_state=seed,
    )
    index = Cache(fingerprinter, num_bands=num_bands)
    # very small band width => always find duplicates

    short_doc = 'This is a simple document'
    another_doc = 'Some text about animals.'
    # The line continuation sits inside the literal, so the leading spaces
    # of the second line belong to the string.
    long_doc = 'A much longer document that contains lots of information\
       different words. The document produces many more shingles.'

    # Step 1: short document, id 0.
    assert not index.is_duplicate(short_doc)
    index.add_doc(short_doc, 0)
    assert index.get_duplicates_of(short_doc) == {0}
    assert index.is_duplicate(short_doc, doc_id=0)
    assert index.is_duplicate(short_doc)

    # Step 2: long document (id 1) and an unrelated one (id 2).
    assert not index.is_duplicate(long_doc)
    index.add_doc(long_doc, 1)
    index.add_doc(another_doc, 2)
    assert index.is_duplicate(another_doc)

    assert index.is_duplicate(long_doc, doc_id=1)
    assert index.is_duplicate(long_doc)

    # Step 3: near-duplicate of the long document with its second token
    # removed.
    pieces = long_doc.split()
    long_doc_missing_word = ' '.join(pieces[:1] + pieces[2:])

    assert index.get_duplicates_of(long_doc_missing_word) == {1}
    assert index.is_duplicate(long_doc_missing_word)
    assert index.is_duplicate(long_doc + ' Word.')

    # Step 4: store the near-duplicate under ids 3 and 4 and check pairs.
    assert index.get_all_duplicates() == set()
    index.add_doc(long_doc_missing_word, 3)
    assert index.get_all_duplicates() == {(1, 3)}

    index.add_doc(long_doc_missing_word, 4)
    assert index.get_all_duplicates() == {(1, 3), (1, 4), (3, 4)}

コード例 #5

ファイルを表示

 def dedup(self):
     """Return the duplicate pairs found among the entries of ``self.data``.

     The first element of each entry is added to a ``Cache`` backed by
     ``MinHasher(100)``, using the entry's position in ``self.data`` as its
     document id. The result is whatever ``get_all_duplicates`` returns
     for ``min_jaccard=0.80``.
     """
     cache = Cache(MinHasher(100))
     for position, entry in enumerate(self.data):
         # entry[0] is what gets fingerprinted; presumably the text field.
         cache.add_doc(entry[0], position)
     return cache.get_all_duplicates(min_jaccard=0.80)