Ejemplo n.º 1
0
    def __call__(self, key, values_gen):
        """Reduce step: cluster the messages gathered under *key*.

        *key* is ``(version, site, platform, type)``; *values_gen* yields
        ``(m_id, message)`` pairs.  Every message is emitted twice, once
        under its own type and once under ``None`` (the "all types" bucket).
        """
        values = list(values_gen)
        version, site, platform, type = key

        def result(s_type, c_index, c_size, m_id, message, score):
            # Bigger clusters get a smaller sortkey so they come first.
            self.cluster_count += 1
            return ((MAX_SIZE - c_size, version, site, platform, s_type,
                     c_index, type, c_size),
                    (m_id, message, score))

        c_index = 1
        if len(values) == 1:
            # A lone message forms its own perfect-score cluster of size 1.
            m_id, message = values[0]
            for s_type in (type, None):
                yield result(s_type, c_index, 1, m_id, message, 1.0)
        else:
            corpus = Corpus()
            pending = {}
            for m_id, message in values:
                pending[m_id] = (m_id, message)
                corpus.add((m_id, message), str=message, key=m_id)

            for c in corpus.cluster():
                c_index += 1
                # Primary first (score 1.0), then the similar documents.
                members = [(c.primary, 1.0)]
                members.extend((s["object"], s["similarity"])
                               for s in c.similars)
                c_size = len(members)
                for (m_id, message), score in members:
                    del pending[m_id]
                    for s_type in (type, None):
                        yield result(s_type, c_index, c_size,
                                     m_id, message, score)

            # Whatever never joined a cluster becomes its own singleton.
            for m_id, message in pending.values():
                c_index += 1
                for s_type in (type, None):
                    yield result(s_type, c_index, 1, m_id, message, 1.0)
Ejemplo n.º 2
0
    def __call__(self, key, values_gen):
        # Reduce step: cluster all messages grouped under one
        # (version, site, platform, type) key.  Each message is emitted
        # twice — once under its concrete type and once under None
        # (presumably an "all types" rollup; confirm against the consumer).
        values = list(values_gen)
        version, site, platform, type = key

        def result(s_type, c_index, c_size, m_id, message, score):
            # Invert the size so that larger clusters sort first.
            sortkey = MAX_SIZE - c_size
            # Side effect: tallies every emitted row on the instance.
            self.cluster_count += 1
            return \
                (sortkey, version, site, platform, s_type, c_index, type, c_size), \
                (m_id, message, score)

        c_index = 1
        if len(values) == 1:
            # Single message: it is its own cluster with a perfect score.
            m_id, message = values[0]
            for s_type in (type, None):
                yield result(s_type, c_index, 1, m_id, message, 1.0)
        else:
            corpus = Corpus()
            # Tracks messages not yet claimed by any cluster, keyed by id.
            unclustered_opinions = {}
            for m_id, message in values:
                unclustered_opinions[m_id] = (m_id, message)
                corpus.add((m_id, message), str=message, key=m_id)

            clusters = corpus.cluster()
            for c in clusters:
                # NOTE: incremented before use, so the first cluster gets
                # index 2 while the singleton branch above uses index 1.
                c_index += 1
                rest = [(s["object"], s["similarity"]) for s in c.similars]
                c_size = len(rest) + 1
                # Emit the primary (score 1.0) followed by its similars.
                for (m_id, message), score in [(c.primary, 1.0)] + rest:
                    del unclustered_opinions[m_id]
                    for s_type in (type, None):
                        yield result(s_type, c_index, c_size, m_id, message,
                                     score)

            # Leftover messages become singleton clusters of size 1.
            for m_id, message in unclustered_opinions.values():
                c_index += 1
                for s_type in (type, None):
                    yield result(s_type, c_index, 1, m_id, message, 1.0)
Ejemplo n.º 3
0
    def generate_clusters_for(self, err, storage, group):
        """Create a SiteSummary for *group* and populate it with clusters.

        A single-opinion group becomes one singleton cluster.  Larger
        groups are clustered separately for negative (0) and positive (1)
        opinions; any opinion that joins no cluster becomes a singleton.

        Fixes over the original: the cluster loop variable no longer
        shadows the builtin ``next``, and the unused ``num_clusters``
        local has been removed.
        """
        site_summary = SiteSummary(
            pk=self.site_summary_id.next(),
            size=len(group.opinion_pks),
            issues_count=group.positive_counts[0],
            praise_count=group.positive_counts[1],
            **group.key
        )
        storage.save(site_summary)
        group_positive = group.key["positive"]

        # Handle single-comment case: nothing to compare against.
        if site_summary.size == 1:
            opinion = Opinion.objects.get(pk=group.opinion_pks[0])
            self.add_singleton_cluster(storage, site_summary, opinion)
            return

        opinions = Opinion.objects.filter(pk__in=group.opinion_pks)

        # Handle cluster case, make one corpus for positive, one for negative.
        for positive in (0, 1):
            # If the group is already restricted to one polarity, skip the other.
            if group_positive is not None and positive != group_positive:
                continue
            corpus = Corpus()
            remaining_opinions = {}
            for opinion in opinions:
                if opinion.positive != positive:
                    continue
                remaining_opinions[opinion.id] = opinion
                corpus.add(opinion, str=unicode(opinion.description))
            clusters = corpus.cluster()
            for doc_cluster in clusters:  # was `next`, which shadowed the builtin
                # The primary document leads with a perfect similarity score.
                primary = {"object": doc_cluster.primary, "similarity": 1.0}
                comments = [
                    Comment(
                        pk=self.comment_id.next(),
                        description=doc["object"].description,
                        opinion_id=doc["object"].id,
                        score=doc["similarity"],
                    )
                    for doc in [primary] + doc_cluster.similars
                ]
                cluster = Cluster(
                    pk=self.cluster_id.next(),
                    site_summary=site_summary,
                    primary_description=comments[0].description,
                    primary_comment=None,  # filled in after comments are saved
                    positive=positive,
                    size=len(comments),
                )
                storage.save(cluster)
                for comment in comments:
                    del remaining_opinions[comment.opinion_id]
                    comment.cluster = cluster
                    storage.save(comment)
                cluster.primary_comment = comments[0]
                # NOTE(review): saved via the model, not `storage.save(cluster)`
                # like the first save — presumably deliberate to persist the
                # primary_comment update; confirm against the storage backend.
                cluster.save()

            # Add singletons for remaining opinions
            for opinion in remaining_opinions.values():
                self.add_singleton_cluster(storage, site_summary, opinion)