コード例 #1
0
ファイル: Cluster.py プロジェクト: Daybreak2019/SLTCS
class LshCluster:
    """Index a collection of strings with MinHash + LSH and query similar items.

    ``MinHash`` and ``LSH`` are taken from the enclosing module's namespace.
    Each input string is labelled with its position in the input sequence.
    """

    def __init__(self, Content, nGram=3, PrenutNum=16, BandNum=8, MinJaccard=0.2):
        """Record the tuning parameters and build the index over ``Content``.

        Args:
            Content: Sized collection of strings to index.
            nGram: Forwarded to ``MinHash`` as ``n_gram``.
            PrenutNum: Forwarded to ``MinHash`` as ``permutations``.
            BandNum: Forwarded to ``LSH`` as ``no_of_bands``.
            MinJaccard: Forwarded as ``min_jaccard`` to the adjacency-list
                and query calls.
        """
        self.Seed = 3  # fixed seed handed to MinHash
        self.nGram, self.PrenutNum = nGram, PrenutNum
        self.BandNum, self.MinJaccard = BandNum, MinJaccard
        self.CreateLsh(Content)

    def Transform(self, Contexts):
        """Return a new list with every string normalised.

        Each string is stripped of surrounding whitespace, then has its
        underscores removed, then is lower-cased (in that order).
        """
        return [text.strip().replace("_", "").lower() for text in Contexts]

    def CreateLsh(self, InContext):
        """Build ``self.Lsh`` from ``InContext`` and set ``self.MaxIndex``.

        Labels are the positions ``0 .. len(InContext) - 1``. ``MaxIndex`` is
        the number of entries in the model's adjacency list.
        """
        positions = range(len(InContext))
        normalised = self.Transform(InContext)
        signatures = MinHash(
            normalised,
            n_gram=self.nGram,
            permutations=self.PrenutNum,
            hash_bits=64,
            seed=self.Seed,
        )
        self.Lsh = LSH(signatures, positions, no_of_bands=self.BandNum)
        adjacency = self.Lsh.adjacency_list(min_jaccard=self.MinJaccard)
        self.MaxIndex = len(adjacency)

    def QuerySimilars(self, Index):
        """Return the model's query result for label ``Index``.

        An ``Index`` at or beyond ``self.MaxIndex`` yields an empty list
        without consulting the model.
        """
        return (
            []
            if Index >= self.MaxIndex
            else self.Lsh.query(Index, min_jaccard=self.MinJaccard)
        )
コード例 #2
0
def find_near_duplicate(dataset, query, targets, labels, min_jaccard_value, no_of_bands, n_permutations, n_gram, n_gram_type='char'):
    """Find near duplicates of the first labelled text using MinHash and LSH.

    A MinHash signature is built for every text in ``targets``, the
    signatures are indexed in an LSH model under ``labels``, and the model is
    queried for ``labels[0]``.

    Args:
        dataset: Identifier copied unchanged into the returned dict.
        query: Not used by this function; kept so existing callers still work.
        targets (list): Texts to hash. Texts shorter than ``n_gram`` are
            printed with ``pp`` before hashing.
        labels (list): Labels for the LSH model (presumably one per text in
            ``targets`` - confirm against callers). ``labels[0]`` is queried.
        min_jaccard_value (float): Passed to ``lsh.query`` as ``min_jaccard``.
        no_of_bands (int): Number of bands to break minhash signature into
            before hashing into buckets.
        n_permutations (int): Number of permutations used to create minhash
            signatures used in LSH model.
        n_gram (int): Size of each overlapping text shingle to break text into
            prior to hashing.
        n_gram_type (str): Passed to ``MinHash`` as ``n_gram_type``.

    Returns:
        dict: ``{"dataset": dataset, "query": labels[0], "duplicates": ...}``
        where ``duplicates`` is the query result joined with single spaces.
    """
    # Surface inputs that are shorter than one shingle.
    for text in targets:
        if len(text) < n_gram:
            pp(text)

    # Create MinHash object.
    minhash = MinHash(targets, n_gram=n_gram, n_gram_type=n_gram_type,
                      permutations=n_permutations, hash_bits=64, seed=SEED)

    # Create LSH model.
    lsh = LSH(minhash, labels, no_of_bands=no_of_bands)

    # Query the model for near duplicates of the first label.
    closest_results = lsh.query(labels[0], min_jaccard=min_jaccard_value)

    # Labels need not be strings (e.g. integer ids); str() keeps the join from
    # raising TypeError while leaving string labels unchanged.
    duplicates = ' '.join(str(label) for label in closest_results)

    return {"dataset": dataset, "query": labels[0], "duplicates": duplicates}
コード例 #3
0
def test_lsh_query():
    """Check LSH.query error cases and its sensitivity / min_jaccard filters."""
    model = LSH(minhash, labels)

    # Both of these labels are expected to be rejected with KeyError.
    for rejected_label in (10, 0):
        with pytest.raises(KeyError):
            model.query(rejected_label)

    # A sensitivity of 100 is expected to be rejected with ValueError.
    with pytest.raises(ValueError):
        model.query(2, sensitivity=100)

    # Unfiltered query, then the same query narrowed by each filter.
    assert model.query(1) == [8, 4]
    assert model.query(1, sensitivity=29) == [4]
    assert model.query(1, min_jaccard=0.55) == [4]
コード例 #4
0
# Hash value size, in bits (32, 64 or 128), used to generate minhash
# signatures from shingles.
# NOTE: choose it based on text length, trading performance against accuracy.
hash_bits = 64

# Build the MinHash signatures for the corpus.
minhash = MinHash(content, n_gram=n_gram, permutations=permutations, hash_bits=hash_bits, seed=seed)

# Build the LSH model over those signatures, using 50 bands.
lsh = LSH(minhash, labels, no_of_bands=50)

# Query the model for near duplicates of the text labelled 1.
print(lsh.query(1, min_jaccard=0.5))

# Updating the model: new texts need their own labels and minhash signatures
# before they can be added to the LSH model.
new_labels = ['new_doc1', 'new_doc2']

new_text = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
]

# 1. Create minhash signatures for the new text.
new_minhash = MinHash(new_text,
                      n_gram=n_gram,
                      permutations=permutations,
                      hash_bits=hash_bits,
コード例 #5
0
from snapy import MinHash, LSH
import numpy as np
from fasta_parser import parse_to_list

# Input file of contigs (presumably FASTA, given the parser module - confirm).
contigs_file = "../contigs-outputs/basic_k-mer24/basic_try_k-mer24.contigs.fa"
print("parsing contigs to list...")
contigs_list, num_contigs = parse_to_list(contigs_file)
print("number of contigs (shorter then 1,00bp):", num_contigs)

# Label every contig with its position as a plain Python int.
labels = np.arange(num_contigs).tolist()

# Create MinHash object.
print("creating minhash object...")
minhash = MinHash(contigs_list, n_gram=24)

# Create LSH model.
print("creating LSH object...")
lsh = LSH(minhash, labels)

print("query object:")
# Query at most the first six labels. Slicing the label list (instead of
# hard-coding 0..5) keeps the script from querying labels that do not exist
# when the input holds fewer than six contigs.
for label in labels[:6]:
    print("similar strings to the string in index {}:".format(label), lsh.query(label))
print(lsh.contains())