def _get_all_ngrams(self, language):
    """Yield every qualifying ngram from stored tweets in *language*.

    Scrolls through all tweets for the language, skips retweets,
    tokenizes the text, and yields ngrams of up to MAX_NGRAM_LENGTH
    tokens that contain no digit characters and are at least
    MINIMUM_GRAM_LENGTH characters long.

    Fix: returns early when no tweets exist for the language, which
    previously raised ZeroDivisionError in the progress computation.
    """
    print("Processing {}".format(language))
    body = {
        'query': {
            'constant_score': {
                'filter': {
                    'term': {
                        'lang': language
                    }
                }
            }
        }
    }
    n_tweets = es_tweets.n_hits(index=TWEETS_INDEX, doc_type='tweet', body=body)
    # Guard: no tweets for this language -> nothing to yield, and the
    # progress percentage below would otherwise divide by zero.
    if not n_tweets:
        print()
        return
    tweets = es_tweets.scroll_through(index=TWEETS_INDEX, body=body, size=1000, source=True)
    for i, tweet in enumerate(tweets):
        if i % 10000 == 0:
            # \r keeps the progress on one terminal line.
            print("Finding most_common_names: {}%".format(round((i + 1) / n_tweets * 100, 1)), end="\r")
        if tweet['retweet']:
            continue
        # Tokenize tweet
        clean_text = sanitize.clean_text(tweet['text'])
        tokens = sanitize.tokenize(clean_text, tweet['lang'])
        # Create ngrams from tweet up to MAX_NGRAM_LENGTH tokens
        # (the previous comment claimed a hard-coded limit of 3).
        ngrams = sanitize.gramify(tokens, MAX_NGRAM_LENGTH)
        # Drop any ngram containing a digit.
        # NOTE(review): this duplicates sanitize.discard_ngrams_with_digits
        # used elsewhere in this file — presumably interchangeable; verify.
        ngrams = (ngram for ngram in ngrams if not any(char.isdigit() for char in ngram))
        for n_gram in ngrams:
            if len(n_gram) >= MINIMUM_GRAM_LENGTH:
                yield n_gram
    # Move past the \r progress line before returning.
    print()
def get_ngrams_space_separable(self, tokens):
    """Return 1- to 3-token ngrams built from already-tokenized input.

    NOTE(review): this method is shadowed by a later definition of the
    same name (which takes raw text instead of tokens), so this version
    is dead code at class-creation time — confirm and remove or rename.
    """
    return sanitize.gramify(tokens, 1, 3)
def get_ngrams_space_separable(self, clean_text):
    """Tokenize *clean_text* and return its 1- to 3-token ngrams,
    excluding any ngram that contains a digit."""
    words = sanitize.tokenize(clean_text, remove_punctuation=True)
    all_grams = sanitize.gramify(words, 1, 3)
    return sanitize.discard_ngrams_with_digits(all_grams)