Example 1
    def test_find_or_create(self):
        docs = self.make_docs()

        cluster = Cluster.find_or_create(docs=docs)
        self.assertEqual(cluster.fingerprint, '202cb962ac59075b964b07152d234b70')
        self.assertEqual(sorted(cluster.documents), sorted(docs))

        db.session.add(cluster)
        db.session.flush()

        cluster2 = Cluster.find_or_create(docs=docs)
        self.assertIsNotNone(cluster2.id)
        self.assertEqual(cluster.id, cluster2.id)
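The fingerprint asserted above, 202cb962ac59075b964b07152d234b70, is the MD5 digest of the string '123', which suggests make_docs() yields documents with ids 1, 2 and 3 and that the fingerprint is an MD5 over the sorted document ids. A minimal sketch of find_or_create under that assumption (the fingerprint scheme and the Cluster.query lookup are inferred here, not taken from the project's actual implementation):

    @classmethod
    def find_or_create(cls, docs):
        """Hypothetical sketch: return the existing Cluster with this
        fingerprint, or build a new, unsaved one over these documents."""
        import hashlib

        # assumed fingerprint scheme: MD5 over the sorted document ids,
        # so docs with ids 1, 2, 3 hash to 202cb962ac59075b964b07152d234b70
        ids = sorted(d.id for d in docs)
        fingerprint = hashlib.md5(''.join(str(i) for i in ids).encode('utf-8')).hexdigest()

        # reuse an existing cluster with the same fingerprint if there is one;
        # after the first cluster is flushed, a second call finds it and
        # returns the same (id-bearing) row, as the test asserts
        cluster = cls.query.filter(cls.fingerprint == fingerprint).first()
        if cluster is None:
            cluster = cls(fingerprint=fingerprint)
            cluster.documents = list(docs)
        return cluster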
Example 2
    def test_delete_cascades(self):
        docs = self.make_docs()

        cluster = Cluster.find_or_create(docs=docs)
        db.session.add(cluster)
        db.session.flush()

        deleted = docs[0]
        rest = docs[1:]

        db.session.delete(deleted)
        db.session.flush()
        db.session.commit()

        cluster = db.session.query(Cluster).filter(Cluster.id == cluster.id).one()
        self.assertEqual(sorted(rest), sorted(cluster.documents))
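The expectation here is that deleting a Document removes it from cluster.documents while leaving the cluster itself, and its other documents, intact. With SQLAlchemy that is typically handled by a many-to-many association table whose foreign keys cascade on delete; a minimal sketch under that assumption, with illustrative (not actual) table and column names:

# Hypothetical sketch of the cluster<->document link. Deleting a Document
# only deletes its rows in the association table, so the Cluster survives
# with the remaining documents, which is what the test above asserts.
cluster_documents = db.Table(
    'cluster_documents',
    db.Column('cluster_id', db.Integer,
              db.ForeignKey('clusters.id', ondelete='CASCADE'), primary_key=True),
    db.Column('document_id', db.Integer,
              db.ForeignKey('documents.id', ondelete='CASCADE'), primary_key=True))

class Cluster(db.Model):
    __tablename__ = 'clusters'

    id = db.Column(db.Integer, primary_key=True)
    fingerprint = db.Column(db.String(32), index=True, unique=True)
    documents = db.relationship('Document', secondary=cluster_documents,
                                passive_deletes=True, backref='clusters')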
Example 3
    def find_topics(self):
        """
        Run clustering on these documents and identify common topics.

        We use latent Dirichlet allocation (LDA) to cluster the documents
        into an arbitrary number of clusters. We then find the strongest
        clusters and pull representative documents for each cluster.

        Clustering is based on the people and entities mentioned in the
        documents, rather than on the raw text. The assumption is that
        OpenCalais and AlchemyAPI have already done the work of identifying
        the pertinent things and concepts in each document, so we rely on
        those rather than on arbitrary words.

        The results are stored in `clustered_topics`.

        See also: https://github.com/ariddell/lda
        """
        from sklearn.feature_extraction import DictVectorizer
        import numpy

        # TODO: factor people into cluster calcs

        self.clustered_topics = []

        # load documents and their entities
        docs = Document.query\
            .options(subqueryload('entities'),
                     subqueryload('medium'))\
            .filter(Document.id.in_(self.doc_ids))\
            .all()

        if not docs:
            return

        # guess at the number of topics, between 1 and 50
        n_topics = max(min(50, len(docs)/5), 1)

        # list of entity maps for each document, from entity name to occurrence count
        entities = [dict(('%s-%s' % (de.entity.group, de.entity.name), de.count or 1)
                         for de in d.entities) for d in docs]
        vec = DictVectorizer(sparse=True)

        # TODO: we should ideally use sparse, but it causes the lda library to fail
        entity_vector = vec.fit_transform(entities)
        features = numpy.array(vec.feature_names_)

        clusters, lda_model = self._run_lda(entity_vector, n_topics)
        del entity_vector
        del vec

        # for normalising histograms
        day_counts = self.date_histogram(d.published_at for d in docs)

        # generate topic info
        for i, clustering in clusters.iteritems():
            # clustering is a list of (doc-index, score) pairs

            # sort each cluster to put top-scoring docs first
            # TODO: this isn't great, because scores for each document
            # for the same cluster can't really be compared. We
            # need a better way of doing this.
            clustering.sort(key=lambda p: p[1], reverse=True)
            cluster_docs = [docs[p[0]] for p in clustering]

            cluster = Cluster.find_or_create(docs=cluster_docs)

            # strongest features for this cluster as (feature, weight) pairs
            indexes = numpy.argsort(lda_model.components_[i])[:-8:-1]
            cluster.features = zip(features[indexes], lda_model.components_[i][indexes])

            # top 20 of each cluster are used to characterize the cluster
            best = clustering[0:20]
            cluster.score = numpy.median([p[1] for p in best])

            # keep only the clusters with a score > self.topic_score_threshold
            if cluster.score <= self.topic_score_threshold:
                continue

            # score for this cluster as stars, from 0 to 3
            cluster.stars = math.ceil((cluster.score - self.topic_score_threshold) / ((1.0 - self.topic_score_threshold) / 3.0))

            # media counts
            media = dict(collections.Counter([d.medium for d in cluster_docs]))
            cluster.media_counts = sorted(media.items(), key=lambda p: p[1], reverse=True)

            # publication dates
            cluster.histogram = self.date_histogram((d.published_at for d in cluster_docs))
            cluster.trend = moving_weighted_avg_zscore(cluster.histogram)
            cluster.histogram = self.normalise_histogram(cluster.histogram, day_counts)

            self.clustered_topics.append(cluster)

        # sort clusters by score, best first
        self.clustered_topics.sort(key=lambda t: t.score, reverse=True)
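The _run_lda helper is not part of this listing. From the way its result is consumed above, a mapping from topic index to (doc-index, score) pairs plus a model exposing components_, a plausible sketch using the lda package linked in the docstring could look like the following; the iteration count, the densification of the entity matrix and the hard assignment of each document to its strongest topic are assumptions, not the project's actual code:

    def _run_lda(self, entity_vector, n_topics):
        """Hypothetical sketch of _run_lda: fit LDA over the entity-count
        matrix and group each document under its most probable topic."""
        import lda
        import numpy
        from collections import defaultdict

        # the lda package wants an integer count matrix; densify the
        # DictVectorizer output (see the TODO about sparse input above)
        counts = entity_vector.toarray().astype(int)

        model = lda.LDA(n_topics=int(n_topics), n_iter=500, random_state=1)
        # doc_topic[d, t] is the probability of topic t for document d
        doc_topic = model.fit_transform(counts)

        # assign each document to its strongest topic, keeping the topic
        # probability as the (doc-index, score) pair consumed by find_topics
        clusters = defaultdict(list)
        for doc_index, topic_dist in enumerate(doc_topic):
            topic = int(numpy.argmax(topic_dist))
            clusters[topic].append((doc_index, float(topic_dist[topic])))

        return clusters, model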