Ejemplo n.º 1
0
 def consume(self, f="jaccard"):
     """Absorb the buffered temp points and re-elect the clusteroid if needed.

     Every point in ``self.temp`` has its id (``point[0]``) appended to
     ``self.membership``. Each point is also given a total score: its score
     against the current clusteroid plus its pairwise scores against every
     buffered point (itself included). If the highest total is strictly
     greater than the current clusteroid's summed score against the buffer,
     that point's feature (``point[1]``) becomes the new clusteroid.
     The buffer is emptied afterwards and a timing line is printed.

     Returns immediately, without printing, when the buffer is empty.

     Args:
         f: ``"jaccard"`` scores pairs with ``get_jaccard``; any other
             value scores them with ``cosine_similarity``.
     """
     if not self.temp:
         return
     starting_tm = time.time()

     def pair_score(a, b):
         # Single place that selects the metric, so both call sites agree.
         if f == "jaccard":
             return get_jaccard(a, b)
         # The cosine result is indexed as a 2-D array; always reduce it to
         # its single scalar entry so totals stay plain numbers.
         return cosine_similarity(a, b)[0][0]

     ex_roid_sum = 0  # current clusteroid's summed score against each point
     sum_dists = []   # one total per buffered point, in buffer order
     for p1 in self.temp:
         to_roid = pair_score(p1[1], self.clusteroid)
         ex_roid_sum += to_roid
         total = to_roid
         for p2 in self.temp:
             total += pair_score(p1[1], p2[1])
         sum_dists.append(total)
         # also update membership
         self.membership.append(p1[0])

     # The candidate is the point with the largest total (first one on ties).
     best_idx = sum_dists.index(max(sum_dists))
     if sum_dists[best_idx] > ex_roid_sum:
         self.clusteroid = self.temp[best_idx][1]
     self.temp = []
     print("Consuming took {:.3f}".format(time.time() - starting_tm),
           ":: Key ~>", self.key, "membership ", len(self.membership))
Ejemplo n.º 2
0
    def consume(self):
        """Absorb the buffered temp points and re-elect the clusteroid if needed.

        Each buffered point is a mapping with ``'movie_id'``, ``'genres'``,
        ``'tags'`` and ``'ratings'`` keys. Its ``movie_id`` is appended to
        ``self.membership``. It is scored by a weighted blend of two
        ``get_jaccard`` values and one ``cosine_similarity`` value, first
        against the current clusteroid and then against every buffered point
        (itself included). If the highest total strictly exceeds the current
        clusteroid's summed score, that point's genres, tags and ratings
        replace the clusteroid's. The buffer is emptied and a timing line is
        printed. Returns immediately when the buffer is empty.
        """
        if not self.temp:
            return
        started = time.time()

        def blended(genres, tags, ratings, point):
            # NOTE(review): the weights add up to 1.03, not 1.0 - confirm intended.
            by_genres = get_jaccard(genres, point['genres'])
            by_tags = get_jaccard(tags, point['tags'])
            by_ratings = cosine_similarity(ratings, point['ratings'])[0][0]
            return 0.33 * by_genres + 0.25 * by_tags + 0.45 * by_ratings

        roid_total = 0
        totals = []
        for candidate in self.temp:
            against_roid = blended(self.clusteroid_genres,
                                   self.clusteroid_tags,
                                   self.clusteroid_ratings,
                                   candidate)
            roid_total += against_roid
            running = against_roid
            for other in self.temp:
                running += blended(other['genres'], other['tags'],
                                   other['ratings'], candidate)
            totals.append(running)
            self.membership.append(candidate['movie_id'])

        # First point holding the largest total wins.
        winner_idx = totals.index(max(totals))
        if totals[winner_idx] > roid_total:
            winner = self.temp[winner_idx]
            self.clusteroid_genres = winner['genres']
            self.clusteroid_tags = winner['tags']
            self.clusteroid_ratings = winner['ratings']
        self.temp = []
        elapsed = time.time() - started
        print("Consuming took {:.3f}".format(elapsed),
              ":: Key ~>", self.key, "membership ", len(self.membership))
Ejemplo n.º 3
0
 def fit_with_all(self):
     """Stream the dataset in chunks and route every movie to a cluster or to the remaining set.

     The CSV at ``self.data_path`` is read ``self.chunk_size`` rows at a
     time. On the first chunk, ``self.k`` rows are sampled at random and
     turned into the initial ``ComplexCluster`` objects stored in
     ``self.discard``. Every other movie is scored against each cluster's
     clusteroid with a weighted blend of genre/tag ``get_jaccard`` values
     and a ratings ``cosine_similarity`` value. A movie whose best score is
     at least ``self.threshold`` is buffered on that cluster via
     ``add_temp_point``; otherwise it is appended to ``self.remaining``.
     After each chunk every cluster's ``consume()`` is called.

     Progress and timing lines are printed along the way.
     """
     starting_tm = time.time()
     seed_ids = set()  # movie ids used as initial clusteroids; skipped below
     iteration = 0
     user_ratings = MoviesRatings(self.ratings_path)
     for chunk in pd.read_csv(self.data_path, chunksize=self.chunk_size):
         loop_tm = time.time()
         chunk_vectors = {
             movie_id: user_ratings.get_vector(movie_id)
             for movie_id in chunk['movieId'].tolist()
         }
         print("All vector created in: {:.3f}".format(time.time() - loop_tm))
         if iteration == 0:
             # Sample among the rows actually read: the first chunk is shorter
             # than chunk_size when the file holds fewer rows, and sampling
             # from range(chunk_size) would then pick row labels that do not exist.
             rows_id = random.sample(range(len(chunk)), self.k)
             seeds = [(row_id, chunk['movieId'][row_id]) for row_id in rows_id]
             seed_ids = {movie_id for _, movie_id in seeds}
             self.discard = [
                 ComplexCluster(
                     i,
                     movie_id,
                     chunk['genres'][row_id],
                     chunk['tags'][row_id],
                     chunk_vectors[movie_id],
                 )
                 for i, (row_id, movie_id) in enumerate(seeds)
             ]
         clustering_tm = time.time()
         for movie_id, genres, tags in zip(chunk['movieId'], chunk['genres'], chunk['tags']):
             if movie_id in seed_ids:
                 continue
             ratings = chunk_vectors[movie_id]
             dists = []
             for cluster in self.discard:
                 d1 = get_jaccard(cluster.clusteroid_genres, genres)
                 d2 = get_jaccard(cluster.clusteroid_tags, tags)
                 d3 = cosine_similarity(cluster.clusteroid_ratings, ratings)[0][0]
                 # NOTE(review): the weights add up to 1.03, not 1.0 - confirm intended.
                 dists.append(0.33*d1 + 0.25*d2 + 0.45*d3)
             point = {
                 "movie_id": movie_id,
                 "genres": genres,
                 "tags": tags,
                 "ratings": ratings,
             }
             best = max(dists)
             if best >= self.threshold:
                 self.discard[dists.index(best)].add_temp_point(point)
             else:
                 self.remaining.append(point)
         print("Clustering part took {:.3f}".format(time.time() - clustering_tm))
         for cluster in self.discard:
             cluster.consume()
         print("chunk ", iteration, " in: {:.3f}".format(time.time() - loop_tm))
         iteration += 1
     print("Total Iterations:", iteration, " Chunk Size: ", self.chunk_size)
     print("Fit duration(s): {:.3f}".format(time.time() - starting_tm))
Ejemplo n.º 4
0
 def complex_absorb(self):
     """Attach every point in ``self.remaining`` to its best-scoring cluster.

     Each remaining point is scored against every cluster in
     ``self.discard`` with the weighted genre/tag ``get_jaccard`` and
     ratings ``cosine_similarity`` blend. Its ``movie_id`` is appended to
     the membership of the cluster with the highest score (the first one on
     ties). ``self.remaining`` itself is not modified. Start and duration
     lines are printed.
     """
     print("Absorb starts")
     began = time.time()
     for leftover in self.remaining:
         scores = [
             0.33 * get_jaccard(candidate.clusteroid_genres, leftover['genres'])
             + 0.25 * get_jaccard(candidate.clusteroid_tags, leftover['tags'])
             + 0.45 * cosine_similarity(candidate.clusteroid_ratings,
                                        leftover['ratings'])[0][0]
             for candidate in self.discard
         ]
         target = self.discard[scores.index(max(scores))]
         target.membership.append(leftover['movie_id'])
     elapsed = time.time() - began
     print("Absorb duration(s): {:.3f}".format(elapsed))
Ejemplo n.º 5
0
 def refresh(self):
     """Recompute ``self.clusteroid`` from the current members.

     For every member, the ``get_jaccard`` values between its feature
     (``member[1]``) and every member's feature (itself included) are
     summed. The feature of the member with the smallest sum becomes the
     clusteroid (the first such member on ties).

     When there are no members the current clusteroid is left untouched,
     instead of failing on ``min()`` of an empty sequence.
     """
     if not self.members:
         return
     # NOTE(review): choosing the smallest sum treats get_jaccard as a
     # distance (smaller = closer) - confirm against get_jaccard's definition.
     sum_dists = []
     for member in self.members:
         total = 0
         for point in self.members:
             total += get_jaccard(member[1], point[1])
         sum_dists.append(total)
     self.clusteroid = self.members[sum_dists.index(min(sum_dists))][1]
Ejemplo n.º 6
0
 def simple_absorb(self):
     """Merge every entity in ``self.remaining`` into its best-scoring cluster.

     Each remaining entity's clusteroid is scored against every cluster
     clusteroid in ``self.discard``: with ``cosine_similarity`` when
     ``self.distance_f`` is ``"d3"``, with ``get_jaccard`` otherwise. The
     ids (``member[0]``) of all the entity's members are appended to the
     membership of the cluster with the highest score (the first one on
     ties). Start and duration lines are printed.
     """
     print("Absorb starts")
     began = time.time()
     for entity in self.remaining:
         scores = [
             cosine_similarity(entity.clusteroid, candidate.clusteroid)[0][0]
             if self.distance_f == "d3"
             else get_jaccard(entity.clusteroid, candidate.clusteroid)
             for candidate in self.discard
         ]
         target = self.discard[scores.index(max(scores))]
         member_ids = [member[0] for member in entity.members]
         target.membership.extend(member_ids)
     elapsed = time.time() - began
     print("Absorb duration(s): {:.3f}".format(elapsed))
Ejemplo n.º 7
0
 def fit_with_new(self):
     """Stream the dataset in chunks and route every record to a cluster or to the remaining set.

     The CSV at ``self.data_path`` is read ``self.chunk_size`` rows at a
     time. On the first chunk, ``self.k`` rows are sampled at random and
     turned into the initial ``SimpleCluster`` objects stored in
     ``self.discard``, built from the ``self.target`` column. Every other
     record is scored with ``get_jaccard`` against each cluster's
     clusteroid. A record whose best score is at least ``self.threshold``
     is buffered on that cluster via ``add_temp_point``; otherwise it is
     wrapped in a ``RemainEntity`` and appended to ``self.remaining``.
     After each chunk every cluster's ``consume()`` is called.

     Progress and timing lines are printed along the way.
     """
     starting_tm = time.time()
     seed_ids = set()  # movie ids used as initial clusteroids; skipped below
     iteration = 0
     for chunk in pd.read_csv(self.data_path, chunksize=self.chunk_size):
         loop_tm = time.time()
         # If it is the first time, initialize the first k clusters
         if iteration == 0:
             # Sample among the rows actually read: the first chunk is shorter
             # than chunk_size when the file holds fewer rows, and sampling
             # from range(chunk_size) would then pick row labels that do not exist.
             rows_id = random.sample(range(len(chunk)), self.k)
             seeds = [(row_id, chunk['movieId'][row_id]) for row_id in rows_id]
             seed_ids = {movie_id for _, movie_id in seeds}
             self.discard = [
                 SimpleCluster(i, movie_id, chunk[self.target][row_id])
                 for i, (row_id, movie_id) in enumerate(seeds)
             ]
         for movie_id, record in zip(chunk['movieId'], chunk[self.target]):
             if movie_id in seed_ids:
                 continue
             # score the record against each cluster
             dists = [get_jaccard(cluster.clusteroid, record) for cluster in self.discard]
             best = max(dists)
             if best >= self.threshold:
                 # at or above the threshold: buffer it on the best cluster
                 self.discard[dists.index(best)].add_temp_point(movie_id, record)
             else:
                 # add it to the retained set
                 self.remaining.append(RemainEntity((movie_id, record)))
         # calculate the new clusteroids in the discard set
         for c in self.discard:
             c.consume()
         # The retained set is not re-clustered per chunk; the hierarchical
         # pass that used to run here was disabled as too slow.
         print("chunk ", iteration, " in: {:.3f}".format(time.time() - loop_tm))
         iteration += 1
     # end of dataset parse
     print("Total Iterations:", iteration, " Chunk Size: ", self.chunk_size)
     print("Fit duration(s): {:.3f}".format(time.time()-starting_tm))