def reduce(self, uid, values):
    """Recompute one cluster's center and emit it with its within-cluster distance.

    The center's ``tfidf`` map becomes the per-token mean over all member
    documents. The distance is the sum of ``MathUtil.compute_distance``
    between that center and each member.

    Args:
        uid: Identifier stored on the new ``Cluster`` and appended to both
            emit keys.
        values: Iterable of raw records; each one is passed to ``Document``.

    Emits (via ``self.emit``):
        ``"cluster" + str(uid)``  -> ``str(cluster)``
        ``"distance" + str(uid)`` -> ``"<uid>|<distance sum>"``
    """
    c = Cluster()
    c.uid = uid

    # Parse every member once and accumulate per-token sums into the center.
    instances = []
    for value in values:
        doc = Document(value)
        instances.append(doc)
        for token, weight in doc.tfidf.items():
            # Direct membership test on the map; avoids building a key list
            # per token (Python 2) and no longer shadows the builtin `bool`.
            if token in c.tfidf:
                c.tfidf[token] += weight
            else:
                c.tfidf[token] = weight

    # Count the parsed members instead of len(values) so that one-shot
    # iterables (which have no len) are handled as well as lists.
    size = float(len(instances))
    for token in c.tfidf:
        c.tfidf[token] /= size

    # Sum the distance from the new center to every member.
    sqdist = 0.0
    for instance in instances:
        sqdist += MathUtil.compute_distance(map1=c.tfidf, map2=instance.tfidf)

    # Output the cluster center into file: clusteri
    self.emit("cluster" + str(c.uid), str(c))
    # Output the within distance into file: distancei
    self.emit("distance" + str(c.uid), str(c.uid) + "|" + str(sqdist))
def reduce(self, uid, values):
    """Recompute one cluster's center and emit it with its within-cluster distance.

    Each incoming value is a text representation of a token -> weight
    mapping. The center's ``tfidf`` map becomes the per-token mean of those
    mappings. The distance is the sum of
    ``MathUtil.compute_distance(..., squared=True)`` between the center and
    each member.

    Args:
        uid: Identifier stored on the new ``Cluster`` and appended to both
            emit keys.
        values: Iterable of strings, each evaluating to a mapping.

    Emits (via ``self.emit``):
        ``"cluster" + str(uid)``  -> ``str(cluster)``
        ``"distance" + str(uid)`` -> ``"<uid>|<distance sum>"``
    """
    # NOTE(review): eval() executes arbitrary code found in the record text.
    # If these records can come from an untrusted source, switch to
    # ast.literal_eval (or a real parser) -- confirm the record format first.
    values = [eval(text_dict) for text_dict in values]

    c = Cluster()
    c.uid = uid
    c_total = len(values)

    # Set the cluster center to the per-token sum over all members.
    for doc in values:
        for tokenid in doc:
            # `in` replaces dict.has_key(), which no longer exists in Python 3.
            if tokenid in c.tfidf:
                c.tfidf[tokenid] += doc[tokenid]
            else:
                c.tfidf[tokenid] = doc[tokenid]

    # Turn the sums into means.
    for tokenid in c.tfidf:
        c.tfidf[tokenid] = c.tfidf[tokenid] / float(c_total)

    # Accumulate the squared distance of every member from the mean.
    sqdist = 0.0
    for doc in values:
        sqdist += MathUtil.compute_distance(c.tfidf, doc, squared=True)

    # Output the cluster center into file: clusteri
    self.emit("cluster" + str(c.uid), str(c))
    # Output the within distance into file: distancei
    self.emit("distance" + str(c.uid), str(c.uid) + "|" + str(sqdist))
def reduce(self, uid, values):
    """Build the updated center for cluster ``uid`` and report its spread.

    Every record in ``values`` is parsed with ``Document``. The new center
    holds, for each token seen, the mean weight over all parsed documents.
    The reported spread is the sum of ``MathUtil.compute_distance`` from the
    center to each document.

    Args:
        uid: Cluster identifier; stored on the ``Cluster`` and used in the
            emit keys.
        values: Iterable of raw records belonging to this cluster.

    Emits (via ``self.emit``):
        ``"cluster" + str(uid)``  -> ``str(cluster)``
        ``"distance" + str(uid)`` -> ``"<uid>|<distance sum>"``
    """
    c = Cluster()
    c.uid = uid

    instances = []
    for value in values:
        doc = Document(value)
        instances.append(doc)
        for token, weight in doc.tfidf.items():
            # Test membership on the map itself: constant-time, and avoids
            # the previous local named `bool` that hid the builtin.
            if token in c.tfidf:
                c.tfidf[token] += weight
            else:
                c.tfidf[token] = weight

    # len(instances) is always available, whereas len(values) raises for
    # generators and other one-shot iterables.
    size = float(len(instances))
    for token in c.tfidf:
        c.tfidf[token] /= size

    # Compute the summed distance between the center and its members.
    sqdist = 0.0
    for instance in instances:
        sqdist += MathUtil.compute_distance(map1=c.tfidf, map2=instance.tfidf)

    # Output the cluster center into file: clusteri
    self.emit("cluster" + str(c.uid), str(c))
    # Output the within distance into file: distancei
    self.emit("distance" + str(c.uid), str(c.uid) + "|" + str(sqdist))
def reduce(self, uid, values):
    """Recompute one cluster's center and emit it with its within-cluster distance.

    Args:
        uid: Cluster identifier; converted with ``int`` before use.
        values: Iterable of raw records; each one is passed to ``Document``.

    Emits (via ``self.emit``):
        ``"cluster" + str(uid)``  -> ``str(cluster)``
        ``"distance" + str(uid)`` -> ``"<uid>|<distance sum>"``
    """
    c = Cluster()
    c.uid = int(uid)

    # Parse each record exactly once. The parsed list is reused for the
    # distance pass below, so `values` may be a one-shot iterable and no
    # record is parsed twice.
    docs = [Document(value) for value in values]
    count = len(docs)

    # Compute the new center: per-token sum, then divide by the member count.
    for doc in docs:
        for key, v in doc.tfidf.items():
            c.tfidf[key] = c.tfidf.get(key, 0.0) + v
    for key in c.tfidf:
        # Never reached when count == 0, because the map is then empty.
        c.tfidf[key] /= count

    # Get the within-cluster distance.
    sqdist = 0.0
    for doc in docs:
        sqdist += MathUtil.compute_distance(c.tfidf, doc.tfidf)

    # Output the cluster center into file: clusteri
    self.emit("cluster" + str(c.uid), str(c))
    # Output the within distance into file: distancei
    self.emit("distance" + str(c.uid), str(c.uid) + "|" + str(sqdist))
def map(self, line):
    """Emit ``line`` keyed by the uid of its closest cluster.

    The line is parsed with ``Document`` and compared against every cluster
    in ``self.clusters`` using ``MathUtil.compute_distance``. The first
    cluster with the strictly smallest distance wins.

    Args:
        line: Raw input record; emitted unchanged as the value.

    Emits (via ``self.emit``):
        ``(uid of the closest cluster, line)``. The key is ``-1`` when no
        cluster compares closer than the initial bound (e.g. no clusters).
    """
    instance = Document(line)

    # Use a float infinity as the starting bound: unlike sys.maxsize it is
    # larger than every finite distance and needs no `sys` import.
    min_dist = float("inf")
    key = -1
    for cluster in self.clusters:
        dist = MathUtil.compute_distance(map1=cluster.tfidf, map2=instance.tfidf)
        if dist < min_dist:
            key = cluster.uid
            min_dist = dist

    self.emit(key, line)
def map(self, line):
    """Emit the parsed document keyed by the uid of its nearest cluster.

    The nearest cluster is found by brute force: the document is compared
    against every cluster in ``self.clusters`` with
    ``MathUtil.compute_distance(..., squared=True)``.

    Args:
        line: Raw input record; parsed with ``Document``.

    Emits (via ``self.emit``):
        ``key`` = uid of the nearest cluster (``None`` if there are no
        clusters), ``value`` = the parsed ``Document``.
    """
    doc = Document(line)
    cluster_uid = None
    sqdist_to_nearest = float('inf')
    for cluster_k in self.clusters:
        sqdist_k = MathUtil.compute_distance(
            map1=cluster_k.tfidf, map2=doc.tfidf, squared=True
        )
        if sqdist_k <= sqdist_to_nearest:
            cluster_uid = cluster_k.uid
            # Track the best distance so far. Without this update every
            # comparison succeeds and the last cluster always wins.
            sqdist_to_nearest = sqdist_k

    self.emit(key=cluster_uid, value=doc)
def map(self, line):
    """Assign one input line to its nearest cluster and emit it.

    Parses ``line`` into a ``Document``, measures it against each entry of
    ``self.clusters`` with ``MathUtil.compute_distance``, and keeps the uid
    of the first cluster with the strictly smallest distance.

    Args:
        line: Raw input record; passed through unchanged as the emitted value.

    Emits (via ``self.emit``):
        ``(nearest cluster uid, line)``. The key stays ``-1`` if no cluster
        yields a distance below the initial bound.
    """
    instance = Document(line)

    key = -1
    # Start from float infinity, not sys.maxsize: it is a true upper bound
    # for any finite distance and keeps the comparison float-to-float.
    min_dist = float("inf")
    for cluster in self.clusters:
        dist = MathUtil.compute_distance(map1=cluster.tfidf, map2=instance.tfidf)
        if dist < min_dist:
            key = cluster.uid
            min_dist = dist

    self.emit(key, line)
def map(self, line):
    """Emit the parsed document under the uid of its closest cluster.

    The key is the uid of the cluster in ``self.clusters`` with the strictly
    smallest ``MathUtil.compute_distance`` to the document. If no cluster
    beats the initial infinite bound, the document's own uid is the key.
    Key and value are both emitted as strings.
    """
    document = Document(line)
    best_uid = document.uid
    best_dist = float("inf")

    for cluster in self.clusters:
        candidate = MathUtil.compute_distance(document.tfidf, cluster.tfidf)
        if not candidate < best_dist:
            continue
        best_dist = candidate
        best_uid = cluster.uid

    self.emit(str(best_uid), str(document))