def build_tweet_dictionary():
    """Build a token dictionary from stored tweets and save it to the database.

    Steps, in order:
      1. Configure root logging at DEBUG level with a timestamped format.
      2. Wrap the ``Tweet`` model in ``DbTexts`` and tokenize it with
         ``Tokenizer``, passing the English stopword list as ``stoplist``.
      3. Build a ``corpora.Dictionary`` from the tokenized texts.
      4. Filter the dictionary with ``filter_extremes`` and compactify it.
      5. Persist it via ``Dictionary.create_from_gensim_dictionary`` under
         the name ``"tweets dictionary"``.

    Returns:
        Whatever ``Dictionary.create_from_gensim_dictionary`` returns
        (presumably the saved ``Dictionary`` model instance -- confirm
        against ``models.Dictionary``).
    """
    import logging
    # basicConfig only takes effect if the root logger has no handlers yet.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.DEBUG)

    # Imported lazily inside the function, as in the original code.
    from twitter_stream.models import Tweet

    stoplist = stopwords.words('english')
    texts = DbTexts(Tweet)
    tokenized = Tokenizer(texts, stoplist=stoplist)

    # Build a dictionary.
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("Building a dictionary")
    dictionary = corpora.Dictionary(tokenized)

    # Drop tokens at both extremes: per the gensim docs, no_below=2 removes
    # tokens found in fewer than 2 documents and no_above=0.5 removes tokens
    # found in more than half of all documents. keep_n=None puts no cap on
    # the number of tokens kept afterwards.
    dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=None)
    # Reassign token ids so that there are no gaps left by the filtering.
    dictionary.compactify()

    # Save it in the database.
    print("Saving the tweet dict")
    # NOTE(review): implicit relative import (Python 2 style); left as-is
    # because the enclosing package layout is not visible from here.
    from models import Dictionary
    return Dictionary.create_from_gensim_dictionary(dictionary, "tweets dictionary")