def test_find_or_create(self):
    docs = self.make_docs()

    cluster = Cluster.find_or_create(docs=docs)
    self.assertEqual(cluster.fingerprint, '202cb962ac59075b964b07152d234b70')
    self.assertEqual(sorted(cluster.documents), sorted(docs))

    db.session.add(cluster)
    db.session.flush()

    # a second call with the same documents must return the cluster that was
    # just persisted, not a new one
    cluster2 = Cluster.find_or_create(docs=docs)
    self.assertIsNotNone(cluster2.id)
    self.assertEqual(cluster.id, cluster2.id)
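
# The test above expects a deterministic fingerprint for a given set of documents
# (202cb962ac59075b964b07152d234b70 is md5('123')) and that a second
# find_or_create with the same documents returns the already-saved cluster.
# A minimal sketch of such a method, assuming the fingerprint is an md5 over the
# sorted document ids -- this is a guess at the behaviour, not the project's code.
from hashlib import md5

@classmethod
def find_or_create(cls, docs):
    # hypothetical: hash the sorted document ids into a stable fingerprint
    fingerprint = md5(''.join(str(d.id) for d in sorted(docs, key=lambda d: d.id))).hexdigest()

    cluster = cls.query.filter(cls.fingerprint == fingerprint).first()
    if not cluster:
        cluster = cls(fingerprint=fingerprint)
        cluster.documents = list(docs)

    return cluster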
def test_delete_cascades(self):
    docs = self.make_docs()

    cluster = Cluster.find_or_create(docs=docs)
    db.session.add(cluster)
    db.session.flush()

    deleted = docs[0]
    rest = docs[1:]

    db.session.delete(deleted)
    db.session.flush()
    db.session.commit()

    # deleting a document must remove it from the cluster's document list
    cluster = db.session.query(Cluster).filter(Cluster.id == cluster.id).one()
    self.assertEqual(sorted(rest), sorted(cluster.documents))
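
# The test above relies on a Document delete propagating to the cluster's
# document list. A minimal sketch (an assumption, not the project's actual
# schema) of a Cluster <-> Document association that behaves this way: the
# secondary table's foreign keys cascade on delete, and passive_deletes lets
# the database handle the cleanup. Table and column names here are illustrative.
cluster_documents = db.Table(
    'cluster_documents',
    db.Column('cluster_id', db.Integer,
              db.ForeignKey('clusters.id', ondelete='CASCADE'), primary_key=True),
    db.Column('doc_id', db.Integer,
              db.ForeignKey('documents.id', ondelete='CASCADE'), primary_key=True))


class Cluster(db.Model):
    __tablename__ = 'clusters'

    id = db.Column(db.Integer, primary_key=True)
    fingerprint = db.Column(db.String(32), index=True, nullable=False, unique=True)

    documents = db.relationship('Document', secondary=cluster_documents,
                                passive_deletes=True)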
def find_topics(self):
    """ Run clustering on these documents and identify common topics.

    We use latent Dirichlet allocation (LDA) to cluster the documents into
    an arbitrary number of clusters. We then find the strongest clusters
    and pull representative documents for each cluster.

    Clustering is based on the people and entities mentioned in the
    documents, rather than raw text. This is based on the assumption that
    OpenCalais and AlchemyAPI have already done the work of identifying
    pertinent things and concepts in the documents, so we rely on those
    rather than on arbitrary words.

    The results are stored in `clustered_topics`.

    See also: https://github.com/ariddell/lda
    """
    from sklearn.feature_extraction import DictVectorizer
    import numpy

    # TODO: factor people into cluster calcs
    self.clustered_topics = []

    # load documents and their entities
    docs = Document.query\
        .options(subqueryload('entities'), subqueryload('medium'))\
        .filter(Document.id.in_(self.doc_ids))\
        .all()
    if not docs:
        return

    # guess at the number of topics, between 1 and 50
    n_topics = max(min(50, len(docs) / 5), 1)

    # list of entity maps for each document, from entity name to occurrence count
    entities = [dict(('%s-%s' % (de.entity.group, de.entity.name), de.count or 1)
                     for de in d.entities)
                for d in docs]

    vec = DictVectorizer(sparse=True)
    # TODO: we should ideally use sparse, but it causes the lda library to fail
    entity_vector = vec.fit_transform(entities)
    features = numpy.array(vec.feature_names_)

    clusters, lda_model = self._run_lda(entity_vector, n_topics)
    del entity_vector
    del vec

    # for normalising histograms
    day_counts = self.date_histogram(d.published_at for d in docs)

    # generate topic info
    for i, clustering in clusters.iteritems():
        # clustering is a list of (doc-index, score) pairs;
        # sort it to put top-scoring docs first.
        # TODO: this isn't great, because scores for documents
        # in different clusters can't really be compared. We
        # need a better way of doing this.
        clustering.sort(key=lambda p: p[1], reverse=True)

        cluster_docs = [docs[p[0]] for p in clustering]
        cluster = Cluster.find_or_create(docs=cluster_docs)

        # top 8 features for this cluster as (feature, weight) pairs
        indexes = numpy.argsort(lda_model.components_[i])[:-8:-1]
        cluster.features = zip(features[indexes], lda_model.components_[i][indexes])

        # the top 20 documents of each cluster are used to characterise the cluster
        best = clustering[0:20]
        cluster.score = numpy.median([p[1] for p in best])

        # keep only the clusters with a score > self.topic_score_threshold
        if cluster.score <= self.topic_score_threshold:
            continue

        # score for this cluster as stars, from 0 to 3
        cluster.stars = math.ceil((cluster.score - self.topic_score_threshold)
                                  / ((1.0 - self.topic_score_threshold) / 3.0))

        # media counts
        media = dict(collections.Counter([d.medium for d in cluster_docs]))
        cluster.media_counts = sorted(media.items(), key=lambda p: p[1], reverse=True)

        # publication dates
        cluster.histogram = self.date_histogram(d.published_at for d in cluster_docs)
        cluster.trend = moving_weighted_avg_zscore(cluster.histogram)
        cluster.histogram = self.normalise_histogram(cluster.histogram, day_counts)

        self.clustered_topics.append(cluster)

    # sort clusters by score, strongest first
    self.clustered_topics.sort(key=lambda t: t.score, reverse=True)
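
# The docstring above points at https://github.com/ariddell/lda, but _run_lda is
# not shown here. The following is a minimal sketch (an assumption, not the
# project's implementation) of what it could look like: fit an lda.LDA model on
# the entity count matrix and group each document under its strongest topic,
# returning {topic-index: [(doc-index, score), ...]} plus the fitted model, which
# is the shape of result that find_topics consumes.
def _run_lda(self, entity_vector, n_topics):
    import lda
    import numpy
    from collections import defaultdict

    # lda.LDA expects a dense integer count matrix
    X = numpy.asarray(entity_vector.todense(), dtype=int)

    model = lda.LDA(n_topics=n_topics, n_iter=500, random_state=1)
    doc_topic = model.fit_transform(X)  # rows: documents, cols: topic weights

    clusters = defaultdict(list)
    for doc_index, weights in enumerate(doc_topic):
        # assign each document to its strongest topic, keeping the weight as its score
        topic = int(numpy.argmax(weights))
        clusters[topic].append((doc_index, float(weights[topic])))

    return dict(clusters), model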