Example #1
    def analyse_people_sources(self):
        """
        Do trend analysis on people.
        """
        utterance_count = self.count_utterances(self.people.keys())
        source_counts = self.source_frequencies(self.people.keys())

        self.analysed_people = {}
        for pid, person in self.people.iteritems():
            src = AnalysedSource()
            src.person = person

            src.utterance_count = utterance_count.get(src.person.id, 0)
            src.source_counts = source_counts[src.person.id]
            src.source_counts_total = sum(src.source_counts)

            self.analysed_people[pid] = src

        # normalize by total counts per day
        totals = [0] * (self.days+1)

        # first count per-day totals
        for src in self.analysed_people.itervalues():
            for i, n in enumerate(src.source_counts):
                totals[i] += n

        # normalize
        for src in self.analysed_people.itervalues():
            for i, n in enumerate(src.source_counts):
                if totals[i] == 0:
                    src.source_counts[i] = 0
                else:
                    src.source_counts[i] = 100.0 * n / totals[i]

        # calculate trends and normalise source count totals
        if self.analysed_people:
            biggest = max(src.source_counts_total for src in self.analysed_people.itervalues())
            for src in self.analysed_people.itervalues():
                src.source_counts_trend = moving_weighted_avg_zscore(src.source_counts, 0.8)
                src.source_counts_normalised = src.source_counts_total * 1.0 / biggest

        # top 20 sources
        self.top_people = sorted(
                self.analysed_people.itervalues(),
                key=lambda s: s.source_counts_total, reverse=True)[:20]

        # trends
        trending = sorted(
                self.analysed_people.itervalues(),
                key=lambda s: s.source_counts_trend)

        # top 10 trending up, most trending first
        self.people_trending_up = [s for s in trending[-10:] if s.source_counts_trend > self.TREND_UP]
        self.people_trending_up.reverse()

        # top 10 trending down, most trending first
        self.people_trending_down = [s for s in trending[:10] if s.source_counts_trend < self.TREND_DOWN]
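The normalisation loop above turns each person's raw per-day counts into that person's share of the day's total across everyone, leaving 0 for days with no activity. A minimal standalone sketch of the same step (the function name and data shapes are illustrative, not part of the original class):

    def normalise_per_day(counts_by_person):
        """Convert raw per-day counts into percentages of each day's total.

        counts_by_person: dict mapping person id -> list of per-day counts.
        Returns a dict of the same shape where each entry is that person's
        share (0-100) of the day's total, or 0 on days with no activity.
        """
        n_days = len(next(iter(counts_by_person.values())))

        # per-day totals across all people
        totals = [0] * n_days
        for counts in counts_by_person.values():
            for i, n in enumerate(counts):
                totals[i] += n

        normalised = {}
        for pid, counts in counts_by_person.items():
            normalised[pid] = [
                0 if totals[i] == 0 else 100.0 * n / totals[i]
                for i, n in enumerate(counts)
            ]
        return normalised

    # e.g. {'a': [1, 0], 'b': [3, 0]} -> {'a': [25.0, 0], 'b': [75.0, 0]}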
Example #2
    def _analyse_people_mentions(self):
        """
        Do trend analysis on people mentions.
        """
        mention_counts = self.mention_frequencies(self.people.keys())

        self.analysed_people = {}
        for pid, person in self.people.iteritems():
            mention = AnalysedMention()
            mention.person = person
            mention.mention_counts = mention_counts[pid]
            mention.mention_counts_total = sum(mention.mention_counts)
            self.analysed_people[pid] = mention

        # normalize by total counts per day
        totals = [0] * (self.days+1)

        # first count per-day totals
        for topic in self.analysed_people.itervalues():
            for i, n in enumerate(topic.mention_counts):
                totals[i] += n

        # normalize
        for topic in self.analysed_people.itervalues():
            for i, n in enumerate(topic.mention_counts):
                if totals[i] == 0:
                    topic.mention_counts[i] = 0
                else:
                    topic.mention_counts[i] = 100.0 * n / totals[i]

        # calculate trends
        for topic in self.analysed_people.itervalues():
            topic.mention_counts_trend = moving_weighted_avg_zscore(topic.mention_counts, 0.8)


        # top 20 people by mention count
        self.top_people = sorted(
                self.analysed_people.itervalues(),
                key=lambda s: s.mention_counts_total, reverse=True)[:20]

        # trends
        trending = sorted(
                self.analysed_people.itervalues(),
                key=lambda s: s.mention_counts_trend)

        # top 10 trending up, most trending first
        self.people_trending_up = [s for s in trending[-10:] if s.mention_counts_trend > self.TREND_UP]
        self.people_trending_up.reverse()

        # top 10 trending down, most trending first
        self.people_trending_down = [s for s in trending[:10] if s.mention_counts_trend < self.TREND_DOWN]
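`moving_weighted_avg_zscore` is a project helper that isn't shown in these examples. Judging only by how it is called (a list of per-day values plus a decay factor of 0.8, returning a single trend score that gets compared to TREND_UP / TREND_DOWN), it presumably scores the most recent value against an exponentially weighted moving average. The sketch below is an assumption about that idea, not the project's actual implementation:

    import math

    def moving_weighted_avg_zscore_sketch(values, decay):
        """Z-score of the most recent value against an exponentially
        weighted moving average of the earlier values. Illustrative only;
        the real moving_weighted_avg_zscore helper may well differ.
        """
        history, latest = values[:-1], values[-1]
        if not history:
            return 0.0

        # exponentially decayed weights: most recent history value gets 1.0,
        # older values decay by the given factor
        weights = []
        w = 1.0
        for _ in history:
            weights.append(w)
            w *= decay
        weights.reverse()  # align weights with chronological order (oldest first)

        total_weight = sum(weights)
        mean = sum(w * v for w, v in zip(weights, history)) / total_weight
        var = sum(w * (v - mean) ** 2 for w, v in zip(weights, history)) / total_weight

        std = math.sqrt(var)
        return 0.0 if std == 0 else (latest - mean) / std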
Example #3
    def _analyse_people_mentions(self):
        """
        Do trend analysis on people mentions.
        """
        mention_counts = self.mention_frequencies(self.people.keys())

        self.analysed_people = {}
        for pid, person in self.people.iteritems():
            mention = AnalysedMention()
            mention.person = person
            mention.mention_counts = mention_counts[pid]
            mention.mention_counts_total = sum(mention.mention_counts)
            self.analysed_people[pid] = mention

        # normalize by total counts per day
        totals = [0] * (self.days + 1)

        # first count per-day totals
        for topic in self.analysed_people.itervalues():
            for i, n in enumerate(topic.mention_counts):
                totals[i] += n

        # normalize
        for topic in self.analysed_people.itervalues():
            for i, n in enumerate(topic.mention_counts):
                if totals[i] == 0:
                    topic.mention_counts[i] = 0
                else:
                    topic.mention_counts[i] = 100.0 * n / totals[i]

        # calculate trends
        for topic in self.analysed_people.itervalues():
            topic.mention_counts_trend = moving_weighted_avg_zscore(topic.mention_counts, 0.8)

        # top 20 people by mention count
        self.top_people = sorted(self.analysed_people.itervalues(), key=lambda s: s.mention_counts_total, reverse=True)[
            :20
        ]

        # trends
        trending = sorted(self.analysed_people.itervalues(), key=lambda s: s.mention_counts_trend)

        # top 10 trending up, most trending first
        self.people_trending_up = [s for s in trending[-10:] if s.mention_counts_trend > self.TREND_UP]
        self.people_trending_up.reverse()

        # top 10 trending down, most trending first
        self.people_trending_down = [s for s in trending[:10] if s.mention_counts_trend < self.TREND_DOWN]
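The trending selection at the end is the same in every example: sort by trend score, take the ten highest and ten lowest, and keep only those past the TREND_UP / TREND_DOWN thresholds. A small sketch with made-up scores and hypothetical threshold values (the real constants are defined elsewhere on the analysis class):

    # hypothetical thresholds; the real TREND_UP / TREND_DOWN live on the class
    TREND_UP = 1.0
    TREND_DOWN = -1.0

    scores = {'alice': 2.4, 'bob': 0.3, 'carol': -1.7, 'dave': -0.2}
    trending = sorted(scores, key=scores.get)  # ascending by trend score

    trending_up = [p for p in trending[-10:] if scores[p] > TREND_UP]
    trending_up.reverse()  # strongest trend first
    trending_down = [p for p in trending[:10] if scores[p] < TREND_DOWN]

    # trending_up == ['alice'], trending_down == ['carol']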
Example #4
    def find_topics(self):
        """
        Run clustering on these documents and identify common topics.

        We use latent Dirichlet allocation (LDA) to cluster the documents
        into an arbitrary number of clusters. We then find the strongest
        clusters and pull representative documents for each cluster.

        Clustering is based on the people and entities mentioned in the documents,
        rather than raw text. This is based on the assumption that OpenCalais and
        AlchemyAPI have already done the work of identifying the pertinent things
        and concepts in the documents, so we rely on those rather than on
        arbitrary words.

        The results are stored in `clustered_topics`.

        See also: https://github.com/ariddell/lda
        """
        from sklearn.feature_extraction import DictVectorizer
        import numpy

        # TODO: factor people into cluster calcs

        self.clustered_topics = []

        # load documents and their entities
        docs = Document.query\
            .options(subqueryload('entities'),
                     subqueryload('medium'))\
            .filter(Document.id.in_(self.doc_ids))\
            .all()

        if not docs:
            return

        # guess at the number of topics, between 1 and 50
        n_topics = max(min(50, len(docs)/5), 1)

        # list of entity maps for each document, from entity name to occurrence count
        entities = [dict(('%s-%s' % (de.entity.group, de.entity.name), de.count or 1)
                         for de in d.entities) for d in docs]
        vec = DictVectorizer(sparse=True)

        # TODO: we should ideally use sparse, but it causes the lda library to fail
        entity_vector = vec.fit_transform(entities)
        features = numpy.array(vec.feature_names_)

        clusters, lda_model = self._run_lda(entity_vector, n_topics)
        del entity_vector
        del vec

        # for normalising histograms
        day_counts = self.date_histogram(d.published_at for d in docs)

        # generate topic info
        for i, clustering in clusters.iteritems():
            # clustering is a list of (doc-index, score) pairs

            # sort each cluster to put top-scoring docs first
            # TODO: this isn't great, because scores for different documents
            # in the same cluster aren't really comparable. We need a better
            # way of doing this.
            clustering.sort(key=lambda p: p[1], reverse=True)
            cluster_docs = [docs[p[0]] for p in clustering]

            cluster = Cluster.find_or_create(docs=cluster_docs)

            # top 7 features for this cluster as (feature, weight) pairs
            indexes = numpy.argsort(lda_model.components_[i])[:-8:-1]
            cluster.features = zip(features[indexes], lda_model.components_[i][indexes])

            # top 20 of each cluster are used to characterize the cluster
            best = clustering[0:20]
            cluster.score = numpy.median([p[1] for p in best])

            # keep only the clusters with a score > self.topic_score_threshold
            if cluster.score <= self.topic_score_threshold:
                continue

            # score for this cluster as stars, from 0 to 3
            cluster.stars = math.ceil((cluster.score - self.topic_score_threshold) / ((1.0 - self.topic_score_threshold) / 3.0))

            # media counts
            media = dict(collections.Counter([d.medium for d in cluster_docs]))
            cluster.media_counts = sorted(media.items(), key=lambda p: p[1], reverse=True)

            # publication dates
            cluster.histogram = self.date_histogram((d.published_at for d in cluster_docs))
            cluster.trend = moving_weighted_avg_zscore(cluster.histogram)
            cluster.histogram = self.normalise_histogram(cluster.histogram, day_counts)

            self.clustered_topics.append(cluster)

        # sort clusters by score
        self.clustered_topics.sort(key=lambda t: t.score, reverse=True)
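`_run_lda` isn't included in these examples. The docstring points at https://github.com/ariddell/lda, and the calling code expects a dict mapping topic index to (doc-index, score) pairs plus a fitted model exposing `components_`, so it is presumably a thin wrapper around that library. The sketch below is an assumption about what such a wrapper could look like; the iteration count, random seed, and the argmax assignment rule are all illustrative:

    def _run_lda(self, entity_vector, n_topics):
        """Fit an LDA model and group documents under their strongest topic.

        Illustrative sketch only: the real helper may pick topics and
        scores differently. Returns ({topic index: [(doc index, score), ...]},
        fitted model).
        """
        import lda
        import numpy

        # the lda package wants a dense integer document-term matrix
        X = numpy.asarray(entity_vector.todense()).astype(int)

        model = lda.LDA(n_topics=n_topics, n_iter=500, random_state=1)
        doc_topic = model.fit_transform(X)

        # assign each document to its highest-probability topic,
        # using that probability as the document's score
        clusters = {}
        for doc_index, weights in enumerate(doc_topic):
            topic = int(weights.argmax())
            clusters.setdefault(topic, []).append((doc_index, float(weights[topic])))

        return clusters, model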
Example #5
    def analyse_people_sources(self):
        """
        Do trend analysis on people.
        """
        utterance_count = self.count_utterances(self.people.keys())
        source_counts = self.source_frequencies(self.people.keys())

        self.analysed_people = {}
        for pid, person in self.people.iteritems():
            src = AnalysedSource()
            src.person = person

            src.utterance_count = utterance_count.get(src.person.id, 0)
            src.source_counts = source_counts[src.person.id]
            src.source_counts_total = sum(src.source_counts)

            self.analysed_people[pid] = src

        # normalize by total counts per day
        totals = [0] * (self.days + 1)

        # first count per-day totals
        for src in self.analysed_people.itervalues():
            for i, n in enumerate(src.source_counts):
                totals[i] += n

        # normalize
        for src in self.analysed_people.itervalues():
            for i, n in enumerate(src.source_counts):
                if totals[i] == 0:
                    src.source_counts[i] = 0
                else:
                    src.source_counts[i] = 100.0 * n / totals[i]

        # calculate trends and normalise source count totals
        if self.analysed_people:
            biggest = max(src.source_counts_total
                          for src in self.analysed_people.itervalues())
            for src in self.analysed_people.itervalues():
                src.source_counts_trend = moving_weighted_avg_zscore(
                    src.source_counts, 0.8)
                src.source_counts_normalised = src.source_counts_total * 1.0 / biggest

        # top 20 sources
        self.top_people = sorted(self.analysed_people.itervalues(),
                                 key=lambda s: s.source_counts_total,
                                 reverse=True)[:20]

        # trends
        trending = sorted(self.analysed_people.itervalues(),
                          key=lambda s: s.source_counts_trend)

        # top 10 trending up, most trending first
        self.people_trending_up = [
            s for s in trending[-10:] if s.source_counts_trend > self.TREND_UP
        ]
        self.people_trending_up.reverse()

        # top 10 trending down, most trending first
        self.people_trending_down = [
            s for s in trending[:10] if s.source_counts_trend < self.TREND_DOWN
        ]
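The `source_counts_normalised` figure computed above is simply each person's total scaled against the largest total, giving a 0-1 value (handy for sizing bars or labels). For example, with made-up totals:

    totals = {'alice': 40, 'bob': 10, 'carol': 25}
    biggest = max(totals.values())

    normalised = dict((name, n * 1.0 / biggest) for name, n in totals.items())
    # {'alice': 1.0, 'bob': 0.25, 'carol': 0.625}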
Example #6
    def find_topics(self):
        """
        Run clustering on these documents and identify common topics.

        We use latent Dirichlet allocation (LDA) to cluster the documents
        into an arbitrary number of clusters. We then find the strongest
        clusters and pull representative documents for each cluster.

        Clustering is based on the people and entities mentioned in the documents,
        rather than raw text. This is based on the assumption that OpenCalais and
        AlchemyAPI have already done the work of identifying the pertinent things
        and concepts in the documents, so we rely on those rather than on
        arbitrary words.

        The results are stored in `clustered_topics`.

        See also: https://github.com/ariddell/lda
        """
        from sklearn.feature_extraction import DictVectorizer
        import numpy

        # TODO: factor people into cluster calcs

        self.clustered_topics = []

        # load documents and their entities
        docs = (
            Document.query.options(subqueryload("entities"), subqueryload("medium"))
            .filter(Document.id.in_(self.doc_ids))
            .all()
        )

        if not docs:
            return

        # guess at the number of topics, between 1 and 50
        n_topics = max(min(50, len(docs) / 5), 1)

        # list of entity maps for each document, from entity name to occurrence count
        entities = [
            dict(("%s-%s" % (de.entity.group, de.entity.name), de.count or 1) for de in d.entities) for d in docs
        ]
        vec = DictVectorizer(sparse=True)

        # TODO: we should ideally use sparse, but it causes the lda library to fail
        entity_vector = vec.fit_transform(entities)
        features = numpy.array(vec.feature_names_)

        clusters, lda_model = self._run_lda(entity_vector, n_topics)
        del entity_vector
        del vec

        # for normalising histograms
        day_counts = self.date_histogram(d.published_at for d in docs)

        # generate topic info
        for i, clustering in clusters.iteritems():
            # clustering is a list of (doc-index, score) pairs

            # sort each cluster to put top-scoring docs first
            # TODO: this isn't great, because scores for different documents
            # in the same cluster aren't really comparable. We need a better
            # way of doing this.
            clustering.sort(key=lambda p: p[1], reverse=True)
            cluster_docs = [docs[p[0]] for p in clustering]

            cluster = Cluster.find_or_create(docs=cluster_docs)

            # top 7 features for this cluster as (feature, weight) pairs
            indexes = numpy.argsort(lda_model.components_[i])[:-8:-1]
            cluster.features = zip(features[indexes], lda_model.components_[i][indexes])

            # top 20 of each cluster are used to characterize the cluster
            best = clustering[0:20]
            cluster.score = numpy.median([p[1] for p in best])

            # keep only the clusters with a score > self.topic_score_threshold
            if cluster.score <= self.topic_score_threshold:
                continue

            # score for this cluster as stars, from 0 to 3
            cluster.stars = math.ceil(
                (cluster.score - self.topic_score_threshold) / ((1.0 - self.topic_score_threshold) / 3.0)
            )

            # media counts
            media = dict(collections.Counter([d.medium for d in cluster_docs]))
            cluster.media_counts = sorted(media.items(), key=lambda p: p[1], reverse=True)

            # publication dates
            cluster.histogram = self.date_histogram((d.published_at for d in cluster_docs))
            cluster.trend = moving_weighted_avg_zscore(cluster.histogram)
            cluster.histogram = self.normalise_histogram(cluster.histogram, day_counts)

            self.clustered_topics.append(cluster)

        # sort clusters by score
        self.clustered_topics.sort(key=lambda t: t.score, reverse=True)
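The star rating in `find_topics` splits the interval between `topic_score_threshold` and 1.0 into three equal bands and rounds up. With a hypothetical threshold of 0.7 (the real value is configured elsewhere):

    import math

    topic_score_threshold = 0.7  # hypothetical value; configured elsewhere
    band = (1.0 - topic_score_threshold) / 3.0

    stars = [int(math.ceil((score - topic_score_threshold) / band))
             for score in (0.75, 0.85, 0.99)]
    # stars == [1, 2, 3]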