def __computeSimilarity(self, keyHashesInput,keyHashesBase, outputFile, lsh_band):
     """Score every input row against every base row and write the matches.

     Each row is indexed as ``row[0]`` (its key) and ``row[1:]`` (the values
     that get compared).  A pair whose value slices are equal is scored 1.0
     directly; any other pair is scored by ``util.compute_list_similarity``.

     A pair is written to ``outputFile`` as
     ``<input key><separator><base key><separator><score>`` plus a newline
     when its score reaches ``self.scorethreshold`` and either the score is
     below 1.0 or ``lsh_band`` equals ``"000"``.

     Returns the number of lines written.
     """
     # NOTE(review): writing scores of 1.0 only for band "000" presumably keeps
     # exact matches from being repeated once per LSH band -- confirm with caller.
     written = 0
     for input_row in keyHashesInput:
         input_key, input_hashes = input_row[0], input_row[1:]
         for base_row in keyHashesBase:
             base_key, base_hashes = base_row[0], base_row[1:]
             # Only call the similarity helper when the slices actually differ.
             score = (
                 util.compute_list_similarity(input_hashes, base_hashes)
                 if input_hashes != base_hashes
                 else 1.0
             )
             if score >= self.scorethreshold and (score < 1.0 or lsh_band == "000"):
                 record = self.separator.join((input_key, base_key, str(score)))
                 outputFile.write(record + "\n")
                 written += 1
     return written
Ejemplo n.º 2
0
 def __compute_similarity(self, key_hashes_array, output_file, lsh_band):
     """Score the rows of ``key_hashes_array`` against each other and write the matches.

     Each row is indexed as ``row[0]`` (its key) and ``row[1:]`` (the values
     that get compared).  Only ordered pairs with ``left key < right key``
     are considered, so a row is never paired with itself or with another
     row carrying an equal key.  A pair whose value slices are equal is
     scored 1.0 directly; any other pair is scored by
     ``util.compute_list_similarity``.

     A pair is written to ``output_file`` as
     ``<left key><separator><right key><separator><score>`` plus a newline
     when its score reaches ``self.score_threshold`` and either the score is
     below 1.0 or ``lsh_band`` equals ``"000"``.

     Returns the number of lines written.
     """
     # NOTE(review): writing scores of 1.0 only for band "000" presumably keeps
     # exact matches from being repeated once per LSH band -- confirm with caller.
     written = 0
     for left_row in key_hashes_array:
         left_key, left_hashes = left_row[0], left_row[1:]
         for right_row in key_hashes_array:
             right_key = right_row[0]
             if not left_key < right_key:
                 continue
             right_hashes = right_row[1:]
             # Only call the similarity helper when the slices actually differ.
             score = (
                 util.compute_list_similarity(left_hashes, right_hashes)
                 if left_hashes != right_hashes
                 else 1.0
             )
             if score >= self.score_threshold and (score < 1.0 or lsh_band == "000"):
                 record = self.separator.join((left_key, right_key, str(score)))
                 output_file.write(record + "\n")
                 written += 1
     return written