Example #1
    def __duplicate_pairs(self) -> Iterable[Tuple[int, int]]:
        # Prefilter candidate pairs by set Jaccard, then confirm each pair
        # with the stricter multiset Jaccard check before yielding it.
        similar_pairs = all_pairs(self.__document_elements,
                                  similarity_func_name='jaccard',
                                  similarity_threshold=self.__set_similarity_threshold)
        for idx1, idx2, _ in similar_pairs:
            if self.__multiset_jaccard_similarity(idx1, idx2) >= self.__multiset_similarity_threshold:
                yield idx1, idx2
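The method prefilters candidate pairs with set Jaccard via all_pairs, then confirms each one against a stricter multiset threshold. The project's __multiset_jaccard_similarity is not shown; a minimal sketch of that metric, assuming Counter-based multisets (the helper name and representation are illustrative, not the project's code):

from collections import Counter

def multiset_jaccard(a, b):
    # Multiset Jaccard: per-element min counts over per-element max counts.
    ca, cb = Counter(a), Counter(b)
    keys = set(ca) | set(cb)
    inter = sum(min(ca[k], cb[k]) for k in keys)
    union = sum(max(ca[k], cb[k]) for k in keys)
    return inter / union if union else 0.0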
Example #2
def mapster(columns_list, similarity_func_name="jaccard", threshold=0.5, ignore_case=False):
    """
    Utility to cluster different datasets based on their columns.
    :param columns_list: one list of column names per dataset
    :param similarity_func_name: the name of the similarity function used;
        this function currently supports `"jaccard"` and `"cosine"`.
    :param threshold: the similarity threshold, a float between 0 and 1.0.
    :param ignore_case: ignore case in column names
    :return: one summary (from `__get_summary`) per cluster of similar datasets
    """
    if ignore_case:
        columns_list = [[col.lower() if isinstance(col, str) else col for col in cols]
                        for cols in columns_list]
    pairs = all_pairs(columns_list, similarity_func_name=similarity_func_name,
                      similarity_threshold=threshold)
    # Seed clusters: one per similar pair, plus a singleton per dataset.
    metadata_clusters = [{pair[0], pair[1]} for pair in pairs] + [{i} for i in range(len(columns_list))]

    # Merge clusters that share an index until all clusters are disjoint.
    metadata_final_clusters = []
    for cluster in metadata_clusters:
        i, n = 0, len(metadata_final_clusters)
        while i < n:
            if cluster & metadata_final_clusters[i]:
                cluster.update(metadata_final_clusters[i])
                del metadata_final_clusters[i]
                n -= 1
            else:
                i += 1
        metadata_final_clusters.append(cluster)

    return [__get_summary(columns_list, idx_set) for idx_set in metadata_final_clusters]
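Assuming the undefined __get_summary helper returns one summary per cluster, a call could look like this (the input data is made up for illustration):

# With ignore_case=True, datasets 0 and 1 share {"id", "name"}
# (Jaccard 2/3 >= 0.5) and cluster together; dataset 2 stays alone.
clusters = mapster([["id", "name"], ["ID", "Name", "email"], ["lat", "lon"]],
                   threshold=0.5, ignore_case=True)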
Example #3
def self_all_pairs(set_IDs, sets, similarity_func_name, similarity_threshold):
    """Yield (id_x, id_y, |set_x|, |set_y|, similarity) for each similar pair."""
    count = 0
    logging.info(
        "Find pairs with similarity >= {}.".format(similarity_threshold))
    for x, y, sim in all_pairs(sets,
                               similarity_func_name=similarity_func_name,
                               similarity_threshold=similarity_threshold):
        yield (set_IDs[x], set_IDs[y], len(sets[x]), len(sets[y]), sim)
        count += 1
    logging.info("Found {} pairs.".format(count))
Example #4
    def test_jaccard(self):
        sets = [[1, 2, 3], [3, 4, 5], [2, 3, 4], [5, 6, 7]]
        correct_pairs = set([(1, 0, 0.2), (2, 0, 0.5), (2, 1, 0.5),
                             (3, 1, 0.2)])
        pairs = list(
            all_pairs(sets,
                      similarity_func_name='jaccard',
                      similarity_threshold=0.1))
        for pair in pairs:
            self.assertIn(pair, correct_pairs)
        self.assertEqual(len(pairs), len(correct_pairs))
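Note the convention the expected tuples encode: each pair is reported as (larger_index, smaller_index, similarity), which is what the lower-triangular test in Example #7 also verifies. A quick standalone check:

from SetSimilaritySearch import all_pairs  # assumed source of all_pairs

sets = [[1, 2, 3], [3, 4, 5], [2, 3, 4], [5, 6, 7]]
for x, y, sim in all_pairs(sets, similarity_func_name='jaccard',
                           similarity_threshold=0.1):
    assert x > y  # the larger index always comes first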
Example #5
    def findSimilarity(self, datasize, setsize, dictsets):
        sets = []
        tcount = 0
        # Form sets from the aggregated readings of every sensor node.
        for i in range(int(datasize / setsize)):
            for j in range(len(self.wsn)):
                s = self.wsn[j].aggregatedReadings[i]
                sets.append(CHaggregator.formSet(self, s))

            pairs = all_pairs(
                sets,
                similarity_func_name="jaccard",
                similarity_threshold=self.wsn[0].similarityThreshold)
            similarityPairs = list(pairs)

            # For each similar pair (x, y, sim), mark the set at the second
            # index y for pruning so one representative per group remains.
            pruneList = []
            for k in range(len(similarityPairs)):
                if sets[similarityPairs[k][1]] not in pruneList:
                    pruneList.append(sets[similarityPairs[k][1]])

            for l in pruneList:
                sets.remove(l)

            # Track the variance of the first reading of each pruned set.
            temp = [pruned[0] for pruned in pruneList]
            self.outputlen = len(temp)
            self.totalvar += np.var(temp)

            # Flatten the surviving sets into one list of readings.
            temp = [el for e in sets for el in e]
            self.a_totalavge += sum(temp) / len(temp)
            self.totalaggregationOutput.append(temp)
            tcount += len(temp)
            sets = []

        print("Accuracy:",
              (abs(self.b_totalavge - self.a_totalavge) / self.b_totalavge) * 100)
        print("Energy Efficiency:", tcount / datasize / self.totalEnergyCon)
Example #6
import binascii  # for CRC32 shingle hashing

def similarity_comparison(scripts, threshold):
    """Computes Jaccard similarity for all pairs of scripts and returns pairs of scripts with similarity >= threshold."""
    shingles = []

    print("similarity search")
    for script in scripts:
        # Flatten each script to one line and split it into words.
        data = script.replace("\n", " ")
        words = data.split()

        shingles_in_doc = set()
        for word in words:
            # Hash each single-word shingle to a 32-bit integer; the set
            # keeps only distinct hashes. (A commented-out variant in the
            # original combined three words per shingle instead.)
            crc = binascii.crc32(word.encode('utf-8')) & 0xffffffff
            shingles_in_doc.add(crc)

        shingles.append(shingles_in_doc)

    pairs = all_pairs(shingles,
                      similarity_func_name="jaccard",
                      similarity_threshold=threshold)

    try:
        list_pairs = list(pairs)
    except ValueError:
        # all_pairs rejects degenerate input (e.g. fewer than two sets);
        # treat that as "no similar pairs".
        return []

    return list_pairs
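Single-word shingles reduce the comparison to bag-of-words overlap; the three-word variant mentioned in the comment above preserves local word order. A sketch of it, reusing the same CRC32 hashing (the helper name is illustrative):

import binascii

def three_word_shingles(text):
    words = text.split()
    return {
        binascii.crc32(" ".join(words[i:i + 3]).encode("utf-8")) & 0xffffffff
        for i in range(len(words) - 2)
    }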
Example #7
    def test_identity_matrix(self):
        # Use all_pairs to fill the lower triangle of a similarity matrix.
        nsets = 10
        population = list(range(100))
        sets = [
            set(population) - set(random.choices(population, k=10))
            for _ in range(nsets)
        ]
        coords = all_pairs(sets, similarity_threshold=0)
        arr = np.nan * np.empty((nsets, nsets))
        x, y, z = zip(*coords)
        arr[x, y] = z
        # Verify that arr is a strictly lower-triangular matrix.
        for i in range(nsets):
            for j in range(nsets):
                if i > j:
                    self.assertFalse(np.isnan(arr[i, j]))
                else:
                    self.assertTrue(np.isnan(arr[i, j]))
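Since all_pairs reports each pair once, only the lower triangle is filled; mirroring it and setting the diagonal to 1.0 (every set is identical to itself) yields the full symmetric similarity matrix:

full = np.where(np.isnan(arr), 0.0, arr)
full = full + full.T + np.eye(nsets)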