def find_near_duplicate(dataset, query, targets, labels, min_jaccard_value, no_of_bands,
                        n_permutations, n_gram, n_gram_type='char'):
    """Using an LSH object, finds the near-duplicate strings.

    Args:
        dataset (str): Name of the dataset being searched.
        query (str): Query string to find near duplicates for; the query is run
            against the first entry in `labels`.
        targets (list): Target strings to search for near duplicates.
        labels (list): Labels identifying each target string.
        min_jaccard_value (float): Minimum value for the Jaccard similarity.
        no_of_bands (int): Number of bands to break minhash signature into before hashing into buckets.
        n_permutations (int): Number of permutations used to create minhash signatures used in LSH model.
        n_gram (int): Size of each overlapping text shingle to break text into prior to hashing.
        n_gram_type (str): Type of shingle ('char' or 'term').
    """
    # Report any target shorter than the shingle size, as it cannot be hashed.
    for target in targets:
        if len(target) < n_gram:
            pp(target)

    # Create MinHash object.
    minhash = MinHash(targets, n_gram=n_gram, n_gram_type=n_gram_type,
                      permutations=n_permutations, hash_bits=64, seed=SEED)

    # Create LSH model.
    lsh = LSH(minhash, labels, no_of_bands=no_of_bands)

    # Query to find near duplicates of the first labelled string.
    closest_results = lsh.query(labels[0], min_jaccard=min_jaccard_value)

    return {"dataset": dataset, "query": labels[0], "duplicates": ' '.join(closest_results)}
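# A minimal sketch of calling find_near_duplicate. The dataset name, sample
# strings, and parameter values below are illustrative assumptions; SEED and
# pp are module-level dependencies of the function above.
from pprint import pprint as pp

from snapy import LSH, MinHash

SEED = 3

targets = [
    "the cat sat on the mat",
    "the cat sat on a mat",
    "an entirely different sentence",
]
labels = ["query_doc", "doc_b", "doc_c"]

result = find_near_duplicate(
    dataset="demo", query=targets[0], targets=targets, labels=labels,
    min_jaccard_value=0.5, no_of_bands=25, n_permutations=100, n_gram=5,
)
print(result)  # e.g. {'dataset': 'demo', 'query': 'query_doc', 'duplicates': 'doc_b'}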
def test_lsh_errors():
    with pytest.raises(ValueError):
        LSH(content)
    with pytest.raises(ValueError):
        LSH(labels=labels)
    with pytest.raises(ValueError):
        LSH(minhash, labels, no_of_bands=49)
def find_adjacency(draws):
    draws_idx = list(draws)
    draws_nos = list(draws.values())

    # Build parallel label/content lists, serialising each draw's numbers
    # into a space-separated string for shingling.
    new_labels = []
    new_content = []
    for i in range(len(draws)):
        new_labels.append(draws_idx[i])
        new_content.append(" ".join(map(str, draws_nos[i])))

    # Append a copy of the first draw under a sentinel label so it is
    # guaranteed to have at least one near duplicate.
    new_labels.append(99999)
    new_content.append(" ".join(map(str, draws_nos[0])))

    minhash = MinHash(new_content, n_gram=9, permutations=500, hash_bits=64, seed=3)
    lsh = LSH(minhash, new_labels, no_of_bands=100)

    adjacency_list = lsh.adjacency_list(min_jaccard=0.5)
    for key, value in adjacency_list.items():
        if len(value) > 0:
            print(key, value)
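# An illustrative call to find_adjacency: `draws` maps a draw id to its drawn
# numbers (the sample data is a made-up assumption). The sentinel copy of
# draw 1 under label 99999 should be reported as adjacent to it.
from snapy import LSH, MinHash

draws = {
    1: [4, 8, 15, 16, 23, 42],
    2: [4, 8, 15, 16, 23, 43],
    3: [1, 2, 3, 5, 7, 11],
}
find_adjacency(draws)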
class LshCluster():
    def __init__(self, Content, nGram=3, PrenutNum=16, BandNum=8, MinJaccard=0.2):
        self.nGram = nGram
        self.BandNum = BandNum
        self.PrenutNum = PrenutNum
        self.MinJaccard = MinJaccard
        self.Seed = 3
        self.CreateLsh(Content)

    def Transform(self, Contexts):
        # Normalise the input strings before shingling.
        NewContexts = []
        for ctx in Contexts:
            ctx = ctx.strip()
            ctx = ctx.replace("_", "")
            ctx = ctx.lower()
            NewContexts.append(ctx)
        return NewContexts

    def CreateLsh(self, InContext):
        Labels = range(len(InContext))
        InContext = self.Transform(InContext)
        Hash = MinHash(InContext, n_gram=self.nGram, permutations=self.PrenutNum,
                       hash_bits=64, seed=self.Seed)
        self.Lsh = LSH(Hash, Labels, no_of_bands=self.BandNum)
        self.MaxIndex = len(self.Lsh.adjacency_list(min_jaccard=self.MinJaccard))

    def QuerySimilars(self, Index):
        if Index >= self.MaxIndex:
            return []
        Results = self.Lsh.query(Index, min_jaccard=self.MinJaccard)
        return Results
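# A minimal usage sketch for LshCluster (the sample strings and parameter
# values are illustrative assumptions). Note that Transform strips
# underscores, so the first two documents normalise to near-identical text.
from snapy import LSH, MinHash

docs = [
    "error: failed to open config_file",
    "error: failed to open configfile",
    "warning: disk space low",
]
cluster = LshCluster(docs, nGram=3, PrenutNum=16, BandNum=8, MinJaccard=0.2)

# Query near duplicates of the first document by its integer index.
print(cluster.QuerySimilars(0))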
def test_initialize_from_empty_lsh():
    lsh = LSH()
    assert lsh.no_of_bands is None
    assert lsh._buckets == defaultdict(list)
    assert lsh._i_bucket == defaultdict(list)
    assert lsh.permutations is None
    lsh.update(minhash, labels)
    assert list(lsh._i_bucket) == labels
    assert lsh.permutations == 100
    assert lsh.no_of_bands == 50
def test_initialize_from_empty_lsh():
    lsh = LSH()
    assert lsh.no_of_bands is None
    assert lsh._buckets == defaultdict(list)
    assert lsh._i_bucket == defaultdict(list)
    assert lsh.permutations is None
    lsh.update(minhash, labels)
    assert list(lsh._i_bucket) == labels
    buckets = lsh._buckets
    assert buckets[4466445138223010106] == [1, 8]
    assert buckets[-3939654010681976230] == [1, 4, 8]
    assert lsh.permutations == 100
    assert lsh.no_of_bands == 50
def create_lsh(self, content, no_of_bands, n_permutations, n_gram):
    """Create MinHash and Locality-Sensitive Hashing (LSH) objects to detect near-duplicate texts.

    Args:
        content (list): List of strings to build the LSH model from.
        no_of_bands (int): Number of bands to break minhash signature into before hashing into buckets.
        n_permutations (int): Number of permutations used to create minhash signatures used in LSH model.
        n_gram (int): Size of each overlapping text shingle to break text into prior to hashing.

    Returns:
        snapy.lsh.LSH: Snapy LSH object.
    """
    labels = range(len(content))

    # Create MinHash object.
    minhash = MinHash(content, n_gram=n_gram, permutations=n_permutations, hash_bits=64, seed=SEED)

    # Create LSH model.
    lsh = LSH(minhash, labels, no_of_bands=no_of_bands)
    return lsh
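# A sketch of calling create_lsh. It is defined as a method but never uses
# `self`, so it can be exercised here by passing None; the sample texts and
# SEED value are assumptions.
from snapy import LSH, MinHash

SEED = 3
content = [
    "Jupiter is the fifth planet from the Sun",
    "Jupiter is the fifth planet from our Sun",
    "Mars is the fourth planet from the Sun",
]
lsh = create_lsh(None, content, no_of_bands=50, n_permutations=100, n_gram=9)
print(lsh.query(0, min_jaccard=0.5))  # likely [1], the near-identical text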
def test_lsh_edge_list():
    lsh = LSH(minhash, labels)
    with pytest.raises(ValueError):
        lsh.edge_list(sensitivity=101)
    assert lsh.edge_list() == [(8, 1), (8, 4), (5, 3), (4, 1)]
    assert lsh.edge_list(sensitivity=20) == [(8, 1), (5, 3), (4, 1)]
    assert lsh.edge_list(min_jaccard=0.7) == []
    assert lsh.edge_list(min_jaccard=0.6) == [(5, 3)]
    assert lsh.edge_list(jaccard_weighted=True, min_jaccard=0.55) == [(5, 3, 0.6), (4, 1, 0.58)]
def get_lsh_model(documents, seed_int):
    # Label each document by its positional index.
    labels = range(len(documents))

    # Create MinHash object.
    minhash = MinHash(documents, n_gram=9, permutations=100, hash_bits=64, seed=seed_int)

    # Create LSH model.
    lsh = LSH(minhash, labels, no_of_bands=50)
    return lsh
def test_update_lsh():
    lsh = LSH(minhash, labels)
    with pytest.raises(ValueError):
        lsh.update(minhash, labels)
    new_content = [
        'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
        'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.'
    ]
    new_labels = [11, 12]
    incorrect_minhash = MinHash(new_content, permutations=10)
    with pytest.raises(ValueError):
        lsh.update(incorrect_minhash, new_labels)
    correct_minhash = MinHash(new_content)
    lsh.update(correct_minhash, new_labels)
    assert lsh.permutations == 100
    assert list(lsh._i_bucket) == labels + [11, 12]
def _get_duplicate_ids(text: List[str], lsh: LSH, min_jaccard: float) -> Iterable[str]:
    """Uses the given `lsh` object to find near-duplicate texts in `text`.
    Returns a list of indices into `text` which point to duplicate texts.
    """
    duplicate_ids = set()
    adjacency_list = lsh.adjacency_list(min_jaccard=min_jaccard)
    with typer.progressbar(adjacency_list.items(), label="Deduplicating text") as progress:
        for query_id, similar_ids in progress:
            # If query_id exists in duplicate_ids, we have already accounted for it.
            if query_id in duplicate_ids:
                continue
            duplicate_ids.update(similar_ids)

    typer.secho(
        f"{SEARCH} Found a total of {len(duplicate_ids)} duplicate texts.",
        bold=True,
    )
    return list(duplicate_ids)
def test_lsh_adjacency_list():
    lsh = LSH(minhash, labels)
    with pytest.raises(ValueError):
        lsh.adjacency_list(sensitivity=1000)
    sensitivity_list = lsh.adjacency_list(sensitivity=2)
    assert sensitivity_list == {
        1: [8, 4], 2: [], 3: [5], 4: [1, 8], 5: [3], 6: [], 7: [], 8: [1, 4], 9: []
    }
    jaccard_list = lsh.adjacency_list(min_jaccard=0.6)
    assert jaccard_list == {
        1: [], 2: [], 3: [5], 4: [], 5: [3], 6: [], 7: [], 8: [], 9: []
    }
    default_list = lsh.adjacency_list()
    assert default_list == {
        1: [8, 4], 2: [], 3: [5], 4: [1, 8], 5: [3], 6: [], 7: [], 8: [1, 4], 9: []
    }
def _create_lsh(
    text: List[str],
    labels: List[int],
    n_gram: int,
    n_permutations: int,
    hash_bits: int,
    no_of_bands: int,
) -> LSH:
    """Returns a `snapy.lsh.LSH` object constructed from `text` to detect near-duplicate texts."""
    minhash = MinHash(text, n_gram=n_gram, permutations=n_permutations, hash_bits=hash_bits, seed=SEED)
    lsh = LSH(minhash, labels, no_of_bands=no_of_bands)
    typer.secho(
        f"{HASHING} Hashed the normalized text using Locality-Sensitive Hashing (LSH).",
        bold=True,
    )
    return lsh
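# A sketch chaining _create_lsh and _get_duplicate_ids. The sample texts and
# the SEED, HASHING, and SEARCH constants are assumptions standing in for the
# module-level values the two functions expect.
from typing import Iterable, List

import typer
from snapy import LSH, MinHash

SEED = 42
HASHING = "[hash]"
SEARCH = "[search]"

texts = [
    "The quick brown fox jumps over the lazy dog",
    "The quick brown fox jumped over the lazy dog",
    "An entirely unrelated sentence about planets",
]
lsh = _create_lsh(texts, labels=list(range(len(texts))), n_gram=5,
                  n_permutations=100, hash_bits=64, no_of_bands=50)
duplicate_ids = _get_duplicate_ids(texts, lsh, min_jaccard=0.5)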
def identify_dublicates(self, ctnt_to_dedup):
    """Return an adjacency list of near-duplicate candidates in `ctnt_to_dedup`."""
    _ix = [i for i in range(len(ctnt_to_dedup))]
    _mn_hash = MinHash(ctnt_to_dedup, n_gram=self.n_gram, seed=self.seed)
    _lsh = LSH(_mn_hash, _ix, no_of_bands=self.lsh_bands)
    candidates = _lsh.adjacency_list(min_jaccard=self.j_thresh)
    return candidates
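# Hypothetical wiring for identify_dublicates: a SimpleNamespace stands in
# for the owning object, supplying the attribute names the method reads
# (n_gram, seed, lsh_bands, j_thresh); the values chosen are assumptions.
from types import SimpleNamespace

from snapy import LSH, MinHash

cfg = SimpleNamespace(n_gram=9, seed=3, lsh_bands=50, j_thresh=0.5)
docs = [
    "to be or not to be that is the question",
    "to be or not to be, that is the question",
]
print(identify_dublicates(cfg, docs))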
def test_lsh_remove():
    lsh = LSH(minhash, labels)
    lsh.remove(5)
    assert list(lsh._i_bucket) == [1, 2, 3, 4, 6, 7, 8, 9]
    with pytest.raises(KeyError):
        lsh.remove(11)
def test_lsh_contains():
    lsh = LSH(minhash, labels)
    assert lsh.contains() == labels
def test_lsh_query():
    lsh = LSH(minhash, labels)
    with pytest.raises(KeyError):
        lsh.query(10)
    with pytest.raises(KeyError):
        lsh.query(0)
    with pytest.raises(ValueError):
        lsh.query(2, sensitivity=100)
    result = lsh.query(1)
    assert result == [8, 4]
    result = lsh.query(1, sensitivity=29)
    assert result == [4]
    result = lsh.query(1, min_jaccard=0.55)
    assert result == [4]
from snapy import MinHash, LSH
import numpy as np
from fasta_parser import parse_to_list

contigs_file = "../contigs-outputs/basic_k-mer24/basic_try_k-mer24.contigs.fa"

print("parsing contigs to list...")
contigs_list, num_contigs = parse_to_list(contigs_file)
print("number of contigs (shorter than 1,00bp):", num_contigs)

labels = np.arange(num_contigs).tolist()

# Create MinHash object.
print("creating minhash object...")
minhash = MinHash(contigs_list, n_gram=24)

# Create LSH model.
print("creating LSH object...")
lsh = LSH(minhash, labels)

print("query object:")
for i in range(6):
    print("similar strings to the string in index {}:".format(i), lsh.query(i))
print(lsh.contains())
# Number of randomly sampled hash values to use for generating each text's
# minhash signature (larger = more accurate & slower).
permutations = 100

# Hash value size to be used to generate minhash signatures from shingles
# (32, 64, or 128 bit).
# NOTE: should be chosen based on text length and a trade-off between
# performance and accuracy.
hash_bits = 64

# Create MinHash object.
minhash = MinHash(content, n_gram=n_gram, permutations=permutations, hash_bits=hash_bits, seed=seed)

# Create LSH model.
lsh = LSH(minhash, labels, no_of_bands=50)

# Query to find near duplicates for text 1.
print(lsh.query(1, min_jaccard=.5))

# Update model: generate minhash signatures for the new texts and add them to
# the LSH model.
new_text = [
    'Jupiter is primarily composed of hydrogen with a quarter of its mass being helium',
    'Jupiter moving out of the inner Solar System would have allowed the formation of inner planets.',
]
new_labels = ['new_doc1', 'new_doc2']

# 1. Create minhash signatures for the new texts, reusing the parameters above.
new_minhash = MinHash(new_text, n_gram=n_gram, permutations=permutations,
                      hash_bits=hash_bits, seed=seed)

# 2. Update the existing LSH model with the new signatures and labels.
lsh.update(new_minhash, new_labels)
def test_initialize_lsh_with_params():
    lsh = LSH(minhash, labels, no_of_bands=20)
    assert lsh.no_of_bands == 20
    assert lsh.permutations == 100
    assert list(lsh._i_bucket) == labels