def __duplicate_pairs(self) -> Iterable[Tuple[int, int]]:
    # First pass: cheap candidate generation with set-based Jaccard
    # similarity over the documents' element sets.
    similar_pairs = all_pairs(
        self.__document_elements,
        similarity_func_name='jaccard',
        similarity_threshold=self.__set_similarity_threshold)
    # Second pass: confirm each candidate with the stricter multiset
    # Jaccard similarity before reporting it as a duplicate pair.
    for idx1, idx2, _ in similar_pairs:
        if (self.__multiset_jaccard_similarity(idx1, idx2)
                >= self.__multiset_similarity_threshold):
            yield idx1, idx2
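# A minimal, self-contained sketch of the same two-pass idea, assuming
# `all_pairs` comes from the SetSimilaritySearch package. The helper
# `multiset_jaccard` below is hypothetical and stands in for the class's
# private __multiset_jaccard_similarity method.
from collections import Counter
from SetSimilaritySearch import all_pairs

def multiset_jaccard(a, b):
    # Multiset Jaccard: intersection over union of element *counts*,
    # not just distinct elements, so repeated tokens matter.
    ca, cb = Counter(a), Counter(b)
    union = sum((ca | cb).values())
    return sum((ca & cb).values()) / union if union else 0.0

docs = [["to", "be", "or", "not", "to", "be"],
        ["be", "or", "not", "to", "be", "to"],
        ["to", "be", "fair"]]
# First pass on deduplicated element sets, second pass on full multisets.
candidates = all_pairs([set(d) for d in docs],
                       similarity_func_name="jaccard",
                       similarity_threshold=0.5)
dupes = [(x, y) for x, y, _ in candidates
         if multiset_jaccard(docs[x], docs[y]) >= 0.9]
# Docs 0 and 1 are permutations of each other, so dupes == [(1, 0)].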
def mapster(columns_list, similarity_func_name="jaccard", threshold=0.5,
            ignore_case=False):
    """
    Utility to cluster different datasets based on their columns.

    :param columns_list: a list of column-name collections, one per dataset.
    :param similarity_func_name: the name of the similarity function used;
        this function currently supports `"jaccard"` and `"cosine"`.
    :param threshold: the threshold used, must be a float between 0 and 1.0.
    :param ignore_case: ignore case in column names.
    :return: one summary per cluster, as produced by `__get_summary`.
    """
    if ignore_case:
        columns_list = [[col.lower() if isinstance(col, str) else col
                         for col in cols]
                        for cols in columns_list]
    pairs = all_pairs(columns_list,
                      similarity_func_name=similarity_func_name,
                      similarity_threshold=threshold)
    # Seed clusters: one per similar pair, plus a singleton per dataset.
    metadata_clusters = ([{pair[0], pair[1]} for pair in pairs]
                         + [{i} for i in range(len(columns_list))])
    # Repeatedly merge clusters that share at least one dataset index.
    metadata_final_clusters = []
    for cluster in metadata_clusters:
        i, n = 0, len(metadata_final_clusters)
        while i < n:
            if cluster & metadata_final_clusters[i]:
                cluster.update(metadata_final_clusters[i])
                del metadata_final_clusters[i]
                n -= 1
            else:
                i += 1
        metadata_final_clusters.append(cluster)
    return [__get_summary(columns_list, idx_set)
            for idx_set in metadata_final_clusters]
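# Hedged usage sketch: the call below exercises the same all_pairs API that
# mapster builds its candidate pairs from (assumed here to come from the
# SetSimilaritySearch package), on hypothetical lowercased column lists.
from SetSimilaritySearch import all_pairs

datasets = [["id", "name", "email"],
            ["id", "name", "email"],   # identical columns after lowercasing
            ["price", "quantity", "sku"]]
pairs = list(all_pairs(datasets, similarity_func_name="jaccard",
                       similarity_threshold=0.5))
# Each pair is (larger_index, smaller_index, similarity); only datasets
# 0 and 1 match here, so pairs == [(1, 0, 1.0)], and mapster would merge
# them into one cluster while dataset 2 stays in its own singleton.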
def self_all_pairs(set_IDs, sets, similarity_func_name, similarity_threshold):
    count = 0
    logging.info(
        "Find pairs with similarity >= {}.".format(similarity_threshold))
    for x, y, sim in all_pairs(sets,
                               similarity_func_name=similarity_func_name,
                               similarity_threshold=similarity_threshold):
        # Translate positional indices back to the caller's set IDs and
        # report both set sizes alongside the similarity.
        yield (set_IDs[x], set_IDs[y], len(sets[x]), len(sets[y]), sim)
        count += 1
    logging.info("Found {} pairs.".format(count))
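# Hedged usage sketch for self_all_pairs with hypothetical IDs and sets,
# assuming `all_pairs` comes from the SetSimilaritySearch package:
import logging
from SetSimilaritySearch import all_pairs

logging.basicConfig(level=logging.INFO)
ids = ["doc-a", "doc-b", "doc-c"]
sets = [{1, 2, 3}, {2, 3, 4}, {7, 8, 9}]
for id_x, id_y, size_x, size_y, sim in self_all_pairs(ids, sets, "jaccard", 0.4):
    print(id_x, id_y, size_x, size_y, sim)
# Only doc-a and doc-b overlap enough: prints "doc-b doc-a 3 3 0.5".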
def test_jaccard(self):
    sets = [[1, 2, 3], [3, 4, 5], [2, 3, 4], [5, 6, 7]]
    correct_pairs = {(1, 0, 0.2), (2, 0, 0.5), (2, 1, 0.5), (3, 1, 0.2)}
    pairs = list(all_pairs(sets, similarity_func_name='jaccard',
                           similarity_threshold=0.1))
    for pair in pairs:
        self.assertIn(pair, correct_pairs)
    self.assertEqual(len(pairs), len(correct_pairs))
def findSimilarity(self, datasize, setsize, dictsets):
    sets = []
    tcount = 0
    for i in range(int(datasize / setsize)):
        # Form one set per node from its aggregated readings for this window.
        for j in range(len(self.wsn)):
            s = self.wsn[j].aggregatedReadings[i]
            sets.append(CHaggregator.formSet(self, s))
        pairs = all_pairs(sets, similarity_func_name="jaccard",
                          similarity_threshold=self.wsn[0].similarityThreshold)
        similarity_pairs = list(pairs)
        # For every similar pair, mark the smaller-index set for pruning so
        # that near-duplicate readings are kept only once.
        prune_list = []
        for pair in similarity_pairs:
            if sets[pair[1]] not in prune_list:
                prune_list.append(sets[pair[1]])
        for pruned in prune_list:
            sets.remove(pruned)
        # Accumulate the variance of the first reading of each pruned set.
        temp = [pruned[0] for pruned in prune_list]
        self.outputlen = len(temp)
        self.totalvar += np.var(temp)
        # Flatten the surviving sets into this window's aggregation output.
        temp = [el for e in sets for el in e]
        self.a_totalavge += sum(temp) / len(temp)
        self.totalaggregationOutput.append(temp)
        tcount += len(temp)
        sets = []
    print("Accuracy:",
          (abs(self.b_totalavge - self.a_totalavge) / self.b_totalavge) * 100)
    print("Energy Efficiency:", tcount / datasize / self.totalEnergyCon)
def similarity_comparison(scripts, threshold):
    """Computes Jaccard similarity for all pairs of scripts and returns the
    pairs of scripts with similarity >= threshold."""
    shingles = []
    print("similarity search")
    for script in scripts:
        data = script.replace("\n", " ")
        words = data.split()
        shingles_in_doc = set()
        for word in words:
            # Single-word shingles; multi-word shingles could be built by
            # joining consecutive words instead.
            # Hash each shingle to a 32-bit integer; the set keeps each
            # hash value only once per document.
            crc = binascii.crc32(word.encode('utf-8')) & 0xffffffff
            shingles_in_doc.add(crc)
        shingles.append(shingles_in_doc)
    pairs = all_pairs(shingles, similarity_func_name="jaccard",
                      similarity_threshold=threshold)
    try:
        return list(pairs)
    except ValueError:
        # all_pairs rejects invalid input (e.g. fewer than two sets).
        return []
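# Hedged usage sketch for similarity_comparison on hypothetical scripts;
# assumes `binascii` and `all_pairs` (SetSimilaritySearch package) are
# imported at module level, as the function requires.
scripts = ["print hello world",
           "print hello there world",
           "import os"]
for x, y, sim in similarity_comparison(scripts, threshold=0.5):
    print("scripts {} and {}: Jaccard {:.2f}".format(x, y, sim))
# Scripts 0 and 1 share 3 of 4 distinct word shingles, so this prints
# "scripts 1 and 0: Jaccard 0.75".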
def test_identity_matrix(self):
    # Use all_pairs to fill a lower-triangular similarity matrix.
    nsets = 10
    population = list(range(100))
    sets = [set(population) - set(random.choices(population, k=10))
            for i in range(nsets)]
    coords = all_pairs(sets, similarity_threshold=0)
    arr = np.nan * np.empty((nsets, nsets))
    x, y, z = zip(*coords)
    arr[x, y] = z
    # Verify that arr is strictly lower-triangular: every entry below the
    # diagonal is set, while the diagonal and upper triangle stay NaN.
    for i in range(nsets):
        for j in range(nsets):
            if i > j:
                self.assertFalse(np.isnan(arr[i, j]))
            else:
                self.assertTrue(np.isnan(arr[i, j]))