def __call__(self, key, values_gen):
    """Reduce step: cluster the opinion messages grouped under *key*.

    ``key`` is a ``(version, site, platform, type)`` tuple and
    ``values_gen`` yields ``(message_id, message)`` pairs.  Every opinion
    is emitted twice -- once under its own type and once under the
    ``None`` ("all types") bucket -- with a sortkey of
    ``MAX_SIZE - cluster_size`` so that larger clusters sort first.

    Yields:
        ``((sortkey, version, site, platform, s_type, c_index, op_type,
        c_size), (m_id, message, score))`` tuples.
    """
    values = list(values_gen)
    # 'op_type' (was 'type') avoids shadowing the builtin type().
    version, site, platform, op_type = key

    def result(s_type, c_index, c_size, m_id, message, score):
        # Invert the size so bigger clusters get smaller sortkeys.
        sortkey = MAX_SIZE - c_size
        # NOTE(review): incremented once per emitted record, i.e. twice
        # per opinion (typed + untyped bucket) -- confirm that's intended.
        self.cluster_count += 1
        return ((sortkey, version, site, platform, s_type, c_index,
                 op_type, c_size),
                (m_id, message, score))

    c_index = 1
    if len(values) == 1:
        # A lone opinion is its own cluster of size 1.
        m_id, message = values[0]
        for s_type in (op_type, None):
            yield result(s_type, c_index, 1, m_id, message, 1.0)
    else:
        corpus = Corpus()
        unclustered_opinions = {}
        for m_id, message in values:
            unclustered_opinions[m_id] = (m_id, message)
            corpus.add((m_id, message), str=message, key=m_id)
        clusters = corpus.cluster()
        for c in clusters:
            # NOTE(review): pre-increment makes the first cluster index 2
            # while the single-opinion branch above uses index 1 -- verify
            # downstream consumers do not expect contiguous 1-based indexes.
            c_index += 1
            rest = [(s["object"], s["similarity"]) for s in c.similars]
            c_size = len(rest) + 1
            for (m_id, message), score in [(c.primary, 1.0)] + rest:
                # A clustered opinion must not also appear as a singleton.
                del unclustered_opinions[m_id]
                for s_type in (op_type, None):
                    yield result(s_type, c_index, c_size, m_id, message,
                                 score)
        # Opinions the clusterer did not absorb become singletons.
        for m_id, message in unclustered_opinions.values():
            c_index += 1
            for s_type in (op_type, None):
                yield result(s_type, c_index, 1, m_id, message, 1.0)
def __call__(self, key, values_gen):
    """Cluster the ``(message_id, message)`` pairs grouped under *key*.

    For each opinion two records are produced -- one for its own type
    and one for the type-independent (``None``) bucket.  Each record is
    ``((sortkey, version, site, platform, s_type, index, type, size),
    (message_id, message, score))`` where the sortkey places larger
    clusters first.
    """
    opinions = list(values_gen)
    version, site, platform, opinion_type = key

    def emit(s_type, index, size, message_id, message, score):
        # One count per emitted record; bigger clusters sort earlier
        # because the size is subtracted from MAX_SIZE.
        self.cluster_count += 1
        return ((MAX_SIZE - size, version, site, platform, s_type,
                 index, opinion_type, size),
                (message_id, message, score))

    index = 1
    if len(opinions) == 1:
        # Trivial group: a single opinion forms its own cluster.
        message_id, message = opinions[0]
        yield emit(opinion_type, index, 1, message_id, message, 1.0)
        yield emit(None, index, 1, message_id, message, 1.0)
        return

    corpus = Corpus()
    leftovers = {}
    for message_id, message in opinions:
        leftovers[message_id] = (message_id, message)
        corpus.add((message_id, message), str=message, key=message_id)

    for found in corpus.cluster():
        index += 1
        similar = [(s["object"], s["similarity"]) for s in found.similars]
        members = [(found.primary, 1.0)] + similar
        size = len(members)
        for (message_id, message), score in members:
            # Remove so the opinion is not emitted again as a singleton.
            del leftovers[message_id]
            yield emit(opinion_type, index, size, message_id, message, score)
            yield emit(None, index, size, message_id, message, score)

    # Whatever the clusterer left behind becomes a singleton cluster.
    for message_id, message in leftovers.values():
        index += 1
        yield emit(opinion_type, index, 1, message_id, message, 1.0)
        yield emit(None, index, 1, message_id, message, 1.0)
def generate_clusters_for(self, err, storage, group):
    """Create the SiteSummary plus its Cluster/Comment rows for *group*.

    Parameters:
        err: unused in this method; kept for caller compatibility.
        storage: sink exposing ``save(model)`` for every generated row.
        group: grouped opinions -- exposes ``opinion_pks``,
            ``positive_counts`` (issues, praise) and the summary
            ``key`` dict (which includes a ``"positive"`` entry).
    """
    # (removed dead local 'num_clusters = 0': assigned but never read)
    site_summary = SiteSummary(
        pk=self.site_summary_id.next(),
        size=len(group.opinion_pks),
        issues_count=group.positive_counts[0],
        praise_count=group.positive_counts[1],
        **group.key
    )
    storage.save(site_summary)
    group_positive = group.key["positive"]

    # Single-comment case: one singleton cluster, no corpus needed.
    if site_summary.size == 1:
        opinion = Opinion.objects.get(pk=group.opinion_pks[0])
        self.add_singleton_cluster(storage, site_summary, opinion)
        return

    opinions = Opinion.objects.filter(pk__in=group.opinion_pks)
    # Cluster case: one corpus for negative (0) and one for positive (1)
    # opinions, unless the group is already restricted to one polarity.
    for positive in (0, 1):
        if group_positive is not None and positive != group_positive:
            continue
        corpus = Corpus()
        remaining_opinions = {}
        for opinion in opinions:
            if opinion.positive != positive:
                continue
            remaining_opinions[opinion.id] = opinion
            corpus.add(opinion, str=unicode(opinion.description))
        clusters = corpus.cluster()
        # 'doc_cluster' (was 'next') avoids shadowing the builtin next().
        for doc_cluster in clusters:
            primary = {"object": doc_cluster.primary, "similarity": 1.0}
            comments = [
                Comment(
                    pk=self.comment_id.next(),
                    description=doc["object"].description,
                    opinion_id=doc["object"].id,
                    score=doc["similarity"],
                )
                for doc in [primary] + doc_cluster.similars
            ]
            cluster = Cluster(
                pk=self.cluster_id.next(),
                site_summary=site_summary,
                primary_description=comments[0].description,
                primary_comment=None,
                positive=positive,
                size=len(comments),
            )
            storage.save(cluster)
            for comment in comments:
                # Clustered opinions must not also become singletons.
                del remaining_opinions[comment.opinion_id]
                comment.cluster = cluster
                storage.save(comment)
            cluster.primary_comment = comments[0]
            # NOTE(review): saved directly rather than via storage.save()
            # as everywhere else in this method -- confirm the bypass of
            # the storage layer is intentional.
            cluster.save()
        # Add singletons for the opinions the clusterer did not absorb.
        for opinion in remaining_opinions.values():
            self.add_singleton_cluster(storage, site_summary, opinion)