def _top_entities(collection, n=10, urls=True, images=True, hts=True, mentions=True, geolocation_names=True,
    user_locations=True, ngrams=(1,2), ngram_stopwords=[], ngram_hashtags=True, ngram_mentions=True,
    ngram_rts=False, ngram_mts=False, ngram_https=False):
    counters = defaultdict(Counter)
    for tweet in collection:
        if urls:
            for url in get_urls(tweet):
                counters['urls'][url] += 1
        if images:
            for url in get_image_urls(tweet):
                counters['images'][url] += 1
        if hts:
            for ht in get_hashtags(tweet):
                counters['hts'][ht] += 1
        if mentions:
            for um in get_users_mentioned(tweet):
                counters['mentions'][um] += 1
        if geolocation_names:
            counters['geolocation_names'][tweet['place']['full_name'] if 'place' in tweet and tweet['place'] is not None else None] += 1
        if user_locations:
            counters['user_locations'][tweet['user'].get('location', None)] += 1
        if ngrams:
            tokens = get_cleaned_tokens(tweet["text"],
                                keep_hashtags=ngram_hashtags,
                                keep_mentions=ngram_mentions,
                                rts=ngram_rts,
                                mts=ngram_mts,
                                https=ngram_https,
                                stopwords=ngram_stopwords)
            for ngram in ngrams:
                grams = get_ngrams(tokens, ngram)
                counters['{}-grams'.format(ngram)].update(' '.join(e) for e in grams)
    return { key: _counter_to_series(counters[key], n) for key in counters }
def _top_ngrams(collection, ngram, n, hashtags, mentions, rts, mts, https, stopwords):
    """Count the n-grams of size ``ngram`` over the text of every tweet.

    Each tweet's ``"text"`` is tokenized with ``get_cleaned_tokens`` using
    the given cleaning flags, split into n-grams with ``get_ngrams``, and
    every n-gram is tallied as a space-joined string. The tally is returned
    through ``_counter_to_series(tally, n)``.
    """
    # NOTE(review): `_top_ngrams` is defined again right after this function;
    # the later definition rebinds the name -- confirm which copy should stay.
    cleaning_options = {
        'keep_hashtags': hashtags,
        'keep_mentions': mentions,
        'rts': rts,
        'mts': mts,
        'https': https,
        'stopwords': stopwords,
    }
    tally = Counter()
    for tweet in collection:
        words = get_cleaned_tokens(tweet["text"], **cleaning_options)
        for gram in get_ngrams(words, ngram):
            tally[' '.join(gram)] += 1
    return _counter_to_series(tally, n)
def _top_ngrams(collection, ngram, n, hashtags, mentions, rts, mts, https,
                stopwords):
    """Tally space-joined n-grams of size ``ngram`` across ``collection``.

    For every tweet, ``tweet["text"]`` is cleaned and tokenized by
    ``get_cleaned_tokens`` (with ``hashtags``/``mentions`` forwarded as
    ``keep_hashtags``/``keep_mentions`` and the remaining flags forwarded
    under their own names), then ``get_ngrams`` produces the n-grams.
    The accumulated counts are handed to ``_counter_to_series`` with ``n``.
    """

    def joined_grams(tweet):
        # Lazily yield each n-gram of one tweet as a single string.
        words = get_cleaned_tokens(tweet["text"],
                                   keep_hashtags=hashtags,
                                   keep_mentions=mentions,
                                   rts=rts,
                                   mts=mts,
                                   https=https,
                                   stopwords=stopwords)
        return (' '.join(parts) for parts in get_ngrams(words, ngram))

    tally = Counter()
    for tweet in collection:
        tally.update(joined_grams(tweet))
    return _counter_to_series(tally, n)
def _top_entities(collection,
                  n=10,
                  urls=True,
                  images=True,
                  hts=True,
                  mentions=True,
                  geolocation_names=True,
                  user_locations=True,
                  ngrams=(1, 2),
                  ngram_stopwords=[],
                  ngram_hashtags=True,
                  ngram_mentions=True,
                  ngram_rts=False,
                  ngram_mts=False,
                  ngram_https=False):
    counters = defaultdict(Counter)
    for tweet in collection:
        if urls:
            for url in get_urls(tweet):
                counters['urls'][url] += 1
        if images:
            for url in get_image_urls(tweet):
                counters['images'][url] += 1
        if hts:
            for ht in get_hashtags(tweet):
                counters['hts'][ht] += 1
        if mentions:
            for um in get_users_mentioned(tweet):
                counters['mentions'][um] += 1
        if geolocation_names:
            counters['geolocation_names'][
                tweet['place']['full_name'] if 'place' in
                tweet and tweet['place'] is not None else None] += 1
        if user_locations:
            counters['user_locations'][tweet['user'].get('location',
                                                         None)] += 1
        if ngrams:
            tokens = get_cleaned_tokens(tweet["text"],
                                        keep_hashtags=ngram_hashtags,
                                        keep_mentions=ngram_mentions,
                                        rts=ngram_rts,
                                        mts=ngram_mts,
                                        https=ngram_https,
                                        stopwords=ngram_stopwords)
            for ngram in ngrams:
                grams = get_ngrams(tokens, ngram)
                counters['{}-grams'.format(ngram)].update(' '.join(e)
                                                          for e in grams)
    return {key: _counter_to_series(counters[key], n) for key in counters}