def get_clusters_falconn(self):
    """Group the stored vectors into similarity clusters using a FALCONN LSH index.

    Every row of ``self.vector_matrix`` is unit-normalised and the data set is
    centred before it is indexed. Each preprocessed row is then used as a
    query; the returned neighbour indices (plus the query's own index) form a
    candidate cluster whose first index seeds a ``SimilarityCluster``.

    Returns:
        list: one ``to_serializable_object()`` result per query that has at
        least two distinct members. Empty when there are no vectors.
    """
    serializable_list = []

    # len() instead of truthiness: vector_matrix may be a numpy array, whose
    # truth value is ambiguous.
    if len(self.vector_matrix) == 0:
        return serializable_list

    # Force a float dtype so the in-place division below also works when the
    # stored vectors are integer valued.
    dataset = np.array(self.vector_matrix, dtype=np.float64)

    # Normalise each ROW to unit length. Without axis=1 the whole matrix is
    # divided by a single Frobenius norm, which is not a per-vector
    # normalisation (the reshape(-1, 1) only makes sense for per-row norms).
    row_norms = np.linalg.norm(dataset, axis=1).reshape(-1, 1)
    # Leave all-zero rows untouched instead of turning them into NaN.
    row_norms[row_norms == 0] = 1.0
    dataset /= row_norms

    # Centre per dimension (axis=0); np.mean without an axis is a scalar.
    center = np.mean(dataset, axis=0)
    dataset -= center

    falconn_params = falconn.get_default_parameters(dataset.shape[0], dataset.shape[1])
    falconn_params.distance_function = "euclidean_squared"
    lsh_index = falconn.LSHIndex(falconn_params)
    lsh_index.setup(dataset)

    # NOTE(review): self.similarity_threshold is passed to FALCONN as a
    # squared-Euclidean distance bound and to SimilarityCluster as its
    # threshold - confirm both consumers expect the same scale.
    for i, query in enumerate(dataset):
        # Query with the preprocessed row so that the query lives in the same
        # coordinate space as the indexed points.
        neighbors = lsh_index.find_near_neighbors(query, self.similarity_threshold)

        # Make sure the query itself is a member, but only once, so it is not
        # appended/averaged twice when the index already returned it.
        cluster = list(neighbors)
        if i not in cluster:
            cluster.append(i)
        if len(cluster) < 2:
            continue

        seed = cluster[0]
        similarity_cluster = SimilarityCluster(self.similarity_threshold,
                                               self.vector_id_list[seed],
                                               self.vector_matrix[seed],
                                               self.start_time_ms,
                                               self.end_time_ms)
        for index in cluster:
            if index == seed:
                continue
            similarity_cluster.similar_image_ids.append(self.vector_id_list[index])
            similarity_cluster.apply_vector_to_average(self.vector_matrix[index])
        serializable_list.append(similarity_cluster.to_serializable_object())

    return serializable_list
def process_vector_custom(self, vector_id, post_id, vector, image_url=None):
    """Assign ``vector`` to an existing similarity cluster or start a new one.

    The cluster tiers in ``self.similarity_clusters`` are tried in the order
    "high", "medium", "low". The first tier for which
    ``self.process_cluster_set`` returns a non-``None`` id is re-organised via
    ``self.organize_cluster`` and processing stops. When no tier matches, a
    new ``SimilarityCluster`` is created and stored in the "low" tier under
    its ``id``.

    Args:
        vector_id: identifier forwarded to the cluster-set / cluster.
        post_id: identifier forwarded to the cluster-set / cluster.
        vector: numeric sequence; skipped when its Euclidean norm is 0.
        image_url: optional value forwarded unchanged.

    Returns:
        None. The method works through side effects on
        ``self.similarity_clusters``.
    """
    vector_norm = np.linalg.norm(vector)
    if vector_norm == 0:
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("normalized vector returned 0, skipping.")
        return

    for tier in ("high", "medium", "low"):
        match_id = self.process_cluster_set(self.similarity_clusters[tier],
                                            vector_id, post_id, vector,
                                            vector_norm, image_url)
        if match_id is not None:
            self.organize_cluster(match_id, self.similarity_clusters[tier])
            return

    # found no matches, just add a new cluster to the low group
    new_cluster = SimilarityCluster(self.similarity_threshold, vector_id,
                                    post_id, vector, self.start_time_ms,
                                    self.end_time_ms, image_url)
    self.similarity_clusters["low"][new_cluster.id] = new_cluster
def test_positive_similarity_state(self):
    """A vector identical to the seed vector must join a valid cluster.

    After processing one identical vector the cluster is expected to be
    flagged valid and to hold two ids in ``similar_ids``.
    """
    cluster = SimilarityCluster(.9, 0, 0, [1, 1, 1], 0, 0)
    cluster.process_similarity(1, 1, [1, 1, 1], np.linalg.norm([1, 1, 1]))
    self.assertTrue(cluster.valid_cluster)
    # assertEqual reports the actual length on failure, unlike
    # assertTrue(len(...) == 2) which only says "False is not true".
    self.assertEqual(len(cluster.similar_ids), 2)
def test_empty_starting_vector(self):
    """A cluster constructed from an empty vector must not be flagged valid."""
    empty_vector = []
    cluster = SimilarityCluster(.9, 0, 0, empty_vector, 0, 0)
    is_valid = cluster.valid_cluster
    self.assertFalse(is_valid)