def hashdictionary_corpus(dataframe, id_range=32000):
    """Returns a HashDictionary mapping words to ids.

    Precomputed HashDictionaries are read from file if previously cached,
    or generated then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    id_range : int
        The maximum number of ids available.

    Returns
    -------
    dictionary : Gensim HashDictionary
        HashDictionary mapping words to ids.
    """
    filename = 'caches/models/dictionary_{}.model'.format(id_range)
    if not os.path.isfile(filename):
        corpus = text_corpus(dataframe)
        dictionary = HashDictionary(corpus, id_range=id_range)
        dictionary.save(filename)
    else:
        dictionary = HashDictionary.load(filename)
    return dictionary
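# Usage sketch (not from the original project): a self-contained illustration of the same
# cache-or-build pattern, with inline token lists standing in for text_corpus(dataframe)
# and a hypothetical cache path.
import os
from gensim.corpora import HashDictionary

docs = [["great", "fast", "shipping"], ["slow", "shipping", "poor", "packaging"]]
cache_file = 'dictionary_32000.model'  # hypothetical cache location
if os.path.isfile(cache_file):
    hd = HashDictionary.load(cache_file)
else:
    hd = HashDictionary(docs, id_range=32000)
    hd.save(cache_file)
print(hd.doc2bow(docs[0]))  # [(hash_id, count), ...]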
def apply_tfidf(reviews_text: list):
    dictionary = HashDictionary()
    review_tokens = []
    for result in reviews_text:
        # Tokenize the reviews
        review_tokens.append(tokenize(result))
    # Build the dictionary
    dictionary.add_documents(review_tokens)
    # Convert to vector corpus
    vectors = [dictionary.doc2bow(token) for token in review_tokens]
    # Build TF-IDF model
    tfidf = TfidfModel(vectors)
    # Get TF-IDF weights
    weights = tfidf[vectors]
    # Get terms from the dictionary and pair with weights
    freq = dict()
    for doc in weights:
        for pair in doc:
            # With debug=True (the default), dictionary[id] is the set of words hashed to that id
            list_of_words = list(dictionary[pair[0]])
            for word in list_of_words:
                if word in freq:
                    freq[word] += pair[1]
                else:
                    freq[word] = pair[1]
    return freq
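# Usage sketch (assumption): apply_tfidf relies on a tokenize() helper that is not shown above;
# a simple whitespace split stands in for it here to show the shape of the returned weights.
from gensim.corpora import HashDictionary
from gensim.models import TfidfModel

reviews = ["great phone great battery", "battery died fast"]
tokens = [r.split() for r in reviews]            # stand-in for tokenize()
hd = HashDictionary(tokens)
bow = [hd.doc2bow(t) for t in tokens]
tfidf = TfidfModel(bow)
for doc in tfidf[bow]:
    for hash_id, weight in doc:
        print(hd[hash_id], round(weight, 3))     # words sharing this hash id, and their TF-IDF weight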
# check and process input arguments
if len(sys.argv) < 3:
    print(globals()['__doc__'] % locals())
    sys.exit(1)
inp, outp = sys.argv[1:3]

if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE

# behaviour flags are encoded in the script name (see the context sketch below)
online = 'online' in program
lemmatize = 'lemma' in program
debug = 'nodebug' not in program

if online:
    dictionary = HashDictionary(id_range=keep_words, debug=debug)
    dictionary.allow_update = True  # start collecting document frequencies
    wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
    # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    dictionary.allow_update = False
else:
    # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
    wiki = WikiCorpus(inp, lemmatize=lemmatize)
    # myWikiCorpus is a project-specific WikiCorpus subclass defined elsewhere
    mywiki = myWikiCorpus(inp, lemmatize=lemmatize)
    # only keep the most frequent words (out of total ~8.2m unique tokens)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
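# Context sketch (assumption): this fragment follows gensim's make_wikicorpus-style scripts,
# where `program` is derived from the script name so that renaming the script toggles the
# flags parsed above, e.g.:
#
#   program = os.path.basename(sys.argv[0])   # e.g. 'make_wiki_online_lemma.py'
#   # -> online=True, lemmatize=True, debug=True
#
# Typical invocation: python make_wiki_online.py <wiki_dump.xml.bz2> <output_prefix> [keep_words]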
    # (tail of a tokenization helper whose definition is truncated above)
    tokens = list(filter(None, tokens))
    return tokens


class Corpus(object):
    def __iter__(self):
        for file in glob.glob("*.txt"):
            print(file)
            paper = Path(file).read_text(encoding='utf8')
            yield paper


corpus_memory_friendly = Corpus()
papers = list(corpus_memory_friendly)
texts = [list(preprocess(t)) for t in papers]

# define the dictionary:
dictionary = Dictionary(texts)
dictionary.save('reasoning_corpura.dict')
corpus = [dictionary.doc2bow(text) for text in texts]
MmCorpus.serialize('reasoning_bow.mm', corpus)

hash_dictionary = HashDictionary(texts)
hash_dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
hash_dictionary.save_as_text('reasoning_wordids.txt.bz2')
hash_dictionary.save('reasoning_corpura_hash.dict')
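# Follow-up sketch (assumption): reloading the artifacts saved above in a later session.
from gensim.corpora import Dictionary, HashDictionary, MmCorpus

loaded_dict = Dictionary.load('reasoning_corpura.dict')
loaded_bow = MmCorpus('reasoning_bow.mm')                       # streamed bag-of-words reader
loaded_hash_dict = HashDictionary.load('reasoning_corpura_hash.dict')
print(len(loaded_dict), len(loaded_bow))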
# Notebook-style fragment: imports grouped up front; `texts`, `corpus` and `dictionary`
# are assumed to come from earlier preprocessing cells.
import logging
import gensim
from gensim import corpora, models, similarities
from gensim.corpora import HashDictionary
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
from afinn import Afinn
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

corpora.MmCorpus.serialize('deerwester.mm', corpus)  # store to disk, for later use
a = corpus
print(dictionary.token2id)

lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=4,
                                      update_every=0, chunksize=1000, passes=10)
lda.print_topics(4)
print(dictionary[2])
print(lda[corpus[0]])

dct = HashDictionary(texts)

# sentiment-analysis resources used later in the notebook
nltk.download('vader_lexicon')
nltk.download('sentiwordnet')
afn = Afinn(emoticons=True)
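# Contrast sketch (assumption): Dictionary maps each id to exactly one token, while the
# HashDictionary built above (`dct`) hashes tokens into a fixed id range, so one id may map
# to several words; with debug=True (the default) that reverse mapping is kept.
hash_bow = dct.doc2bow(texts[0])
first_id = hash_bow[0][0]
print(first_id, dct[first_id])   # a set of words that hash to this id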
    def _new_model(self, X=None, y=None):
        return HashDictionary(id_range=self.id_range, myhash=self.myhash, debug=self.debug)
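# Parameter sketch (assumption about the surrounding wrapper class): the method above simply
# forwards its hyper-parameters to gensim's HashDictionary; constructed directly it looks like this.
import zlib
from gensim.corpora import HashDictionary

hd = HashDictionary(id_range=32000, myhash=zlib.adler32, debug=True)  # zlib.adler32 is gensim's default hash
hd.add_documents([["hello", "world"], ["hello", "again"]])
print(hd.doc2bow(["hello", "world"]))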
Download it from: https://www.kaggle.com/benhamner/nips-papers/downloads/papers.csv/2
"""
papers = pd.read_csv('papers.csv')
corpus = list(papers['paper_text'])
print("corpus size: ", len(corpus))

# ToDo: check performance with lemmatization: gensim.utils.lemmatize
tokenized_corpus = [
    [utils.to_unicode(token) for token in utils.tokenize(corpus_item, lower=True, errors='ignore')]
    for corpus_item in corpus
]

hash_dictionary = HashDictionary(tokenized_corpus)
# build the bag-of-words vectors from the tokenized documents (not the raw strings)
bow_corpus = [hash_dictionary.doc2bow(text) for text in tokenized_corpus]
MmCorpus.serialize('nips_bow.mm', bow_corpus, progress_cnt=10000)

hash_dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
hash_dictionary.save_as_text('nips_wordids.txt.bz2')
hash_dictionary.save('nips_corpura_hash.dict')

dictionary = Dictionary(tokenized_corpus)
dictionary.save('nips_corpura.dict')
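# Follow-up sketch (assumption): reloading the serialized NIPS artifacts in a later session
# and fitting a TF-IDF model on the streamed bag-of-words corpus.
from gensim.corpora import HashDictionary, MmCorpus
from gensim.models import TfidfModel

nips_bow = MmCorpus('nips_bow.mm')                            # streamed reader, memory friendly
nips_hash_dict = HashDictionary.load('nips_corpura_hash.dict')
nips_tfidf = TfidfModel(nips_bow, normalize=True)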
def process_corpus(input_filename=WIKI_CORPUS, output_dir=GENSIM_DIR, online=False,
                   to_lemmatize=LEMMING, debug=True):
    program = 'GensimWikiCorpus'
    logger = logging.getLogger(program)

    inp = input_filename
    # the prefix appears twice because outputs are saved as directory/prefix_filename
    outp = os.path.join(output_dir, WIKI_STATS + '/' + WIKI_STATS)

    if not os.path.isdir(os.path.dirname(outp)):
        os.makedirs(os.path.dirname(outp))

    keep_words = DEFAULT_DICT_SIZE

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True  # start collecting document frequencies
        wiki = JsonWikiCorpus(inp, to_lemmatize=to_lemmatize, dictionary=dictionary)
        # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        wiki = JsonWikiCorpus(inp, to_lemmatize=to_lemmatize)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')

    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
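# Usage sketch (assumption): WIKI_CORPUS, GENSIM_DIR, WIKI_STATS, LEMMING, DEFAULT_DICT_SIZE and
# JsonWikiCorpus are project-level names not shown here; the input file name below is hypothetical.
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    process_corpus(input_filename='wiki_articles.json.bz2', output_dir='gensim_models',
                   online=True, to_lemmatize=False, debug=True)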