def _get_all_ngrams(self, language):
     """Yield the n-grams found in the stored tweets of one language.

     Tweets whose ``lang`` field equals ``language`` are first counted and
     then scrolled through in pages of 1000. Tweets flagged as retweets are
     skipped. Every other tweet is cleaned, tokenized and expanded into
     n-grams with ``sanitize.gramify`` (bounded by ``MAX_NGRAM_LENGTH``).

     An n-gram is yielded only when none of its elements satisfies
     ``isdigit()`` and its ``len()`` is at least ``MINIMUM_GRAM_LENGTH``.

     Progress is printed to stdout while the generator is being consumed.
     """
     print("Processing {}".format(language))
     language_filter = {'term': {'lang': language}}
     body = {'query': {'constant_score': {'filter': language_filter}}}
     # The total hit count is only used for the progress percentage below.
     total = es_tweets.n_hits(index=TWEETS_INDEX, doc_type='tweet', body=body)
     hits = es_tweets.scroll_through(index=TWEETS_INDEX, body=body, size=1000, source=True)
     for position, tweet in enumerate(hits):
         # Refresh the in-place progress line once every 10000 tweets.
         if not position % 10000:
             percent = round((position + 1) / total * 100, 1)
             print("Finding most_common_names: {}%".format(percent), end="\r")
         if not tweet['retweet']:
             cleaned = sanitize.clean_text(tweet['text'])
             tokens = sanitize.tokenize(cleaned, tweet['lang'])
             for candidate in sanitize.gramify(tokens, MAX_NGRAM_LENGTH):
                 # Drop any n-gram that contains a digit.
                 if any(item.isdigit() for item in candidate):
                     continue
                 if len(candidate) >= MINIMUM_GRAM_LENGTH:
                     yield candidate
     # Terminate the carriage-return progress line with a newline.
     print()
Example #2
0
 def get_ngrams_space_separable(self, tokens):
     """Build n-grams from already-tokenized text.

     Delegates to ``sanitize.gramify(tokens, 1, 3)`` and returns its result
     unchanged (presumably n-grams of length 1 through 3 -- confirm against
     ``sanitize.gramify``).

     NOTE(review): a second definition with this same name, taking
     ``clean_text`` instead of ``tokens``, appears later in this file. If
     both live in the same class the later one replaces this one -- verify.
     """
     grams = sanitize.gramify(tokens, 1, 3)
     return grams
 def get_ngrams_space_separable(self, clean_text):
     """Tokenize cleaned text and return its digit-free n-grams.

     The text is tokenized with ``remove_punctuation=True``, expanded via
     ``sanitize.gramify(tokens, 1, 3)``, and the result is passed through
     ``sanitize.discard_ngrams_with_digits`` before being returned.
     """
     words = sanitize.tokenize(clean_text, remove_punctuation=True)
     return sanitize.discard_ngrams_with_digits(sanitize.gramify(words, 1, 3))