Example #1
def hashdictionary_corpus(dataframe, id_range=32000):
    """Returns a HashDictionary mapping words to ids.

    The HashDictionary is loaded from its cache file if one exists; otherwise it is built from the corpus and then cached.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    id_range : int
        The maximum number of ids available.

    Returns
    -------
    dictionary : Gensim HashDictionary
        HashDictionary mapping words to ids.
    """
    filename = 'caches/models/dictionary_{}.model'.format(id_range)

    if not os.path.isfile(filename):
        corpus = text_corpus(dataframe)
        dictionary = HashDictionary(corpus, id_range=id_range)
        dictionary.save(filename)
    else:
        dictionary = HashDictionary.load(filename)

    return dictionary
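# A hedged usage sketch for the helper above. The expected DataFrame layout
# depends on text_corpus(), which is not shown in this excerpt, so the 'text'
# column used here is only an assumption.
import os
import pandas as pd

df = pd.DataFrame({'text': ['first sample document', 'second sample document']})
os.makedirs('caches/models', exist_ok=True)   # the cache directory must exist before save()
dictionary = hashdictionary_corpus(df, id_range=32000)
print(dictionary)                             # e.g. HashDictionary(32000 id range)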
Example #2
def apply_tfidf(reviews_text: list):
    dictionary = HashDictionary()
    review_tokens = []
    for result in reviews_text:
        # Tokenize the reviews
        review_tokens.append(tokenize(result))

    # Build the dictionary
    dictionary.add_documents(review_tokens)
    # Convert to vector corpus
    vectors = [dictionary.doc2bow(token) for token in review_tokens]
    # Build TF-IDF model
    tfidf = TfidfModel(vectors)
    # Get TF-IDF weights
    weights = tfidf[vectors]
    # Accumulate each word's TF-IDF weight across all reviews; HashDictionary
    # maps an id back to the set of words that hashed to it
    freq = dict()
    for doc in weights:
        for pair in doc:
            list_of_words = list(dictionary[pair[0]])
            for word in list_of_words:
                if word in freq:
                    freq[word] += pair[1]
                else:
                    freq[word] = pair[1]
    return freq
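# A hedged usage sketch for apply_tfidf(). The original `tokenize` helper is
# not shown in this excerpt, so gensim.utils.simple_preprocess is substituted
# here purely as an assumption; HashDictionary and TfidfModel come from gensim.
from gensim.corpora import HashDictionary
from gensim.models import TfidfModel
from gensim.utils import simple_preprocess as tokenize

reviews = [
    "Great phone, the battery lasts for days.",
    "Terrible battery, it died after a week.",
]
weights_by_word = apply_tfidf(reviews)
# print the highest-weighted terms across all reviews
for word, weight in sorted(weights_by_word.items(), key=lambda kv: kv[1], reverse=True)[:5]:
    print(word, round(weight, 3))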
Example #3
    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        mywiki = myWikiCorpus(inp, lemmatize=lemmatize)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
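    # A short follow-up sketch (not part of the original script): the serialized
    # bag-of-words corpus can be streamed straight back from disk without
    # rebuilding the wiki corpus; `outp` is the output prefix read from sys.argv above.
    mm = MmCorpus(outp + '_bow.mm')
    print(mm)  # reports document count, vocabulary size and non-zero entries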
Example #4
    tokens = list(filter(None, tokens))
    return tokens


class Corpus(object):
    def __iter__(self):
        for file in glob.glob("*.txt"):
            print(file)
            paper = Path(file).read_text(encoding='utf8')
            yield paper


corpus_memory_friendly = Corpus()
papers = list(corpus_memory_friendly)

texts = [list(preprocess(t)) for t in papers]

# define the dictionary:
dictionary = Dictionary(texts)
dictionary.save('reasoning_corpura.dict')

corpus = [dictionary.doc2bow(text) for text in texts]
MmCorpus.serialize('reasoning_bow.mm', corpus)


hash_dictionary = HashDictionary(texts)
hash_dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
hash_dictionary.save_as_text('reasoning_wordids.txt.bz2')
hash_dictionary.save('reasoning_corpura_hash.dict')

from gensim import corpora, models, similarities

corpora.MmCorpus.serialize('deerwester.mm', corpus)  # store to disk, for later use

a = corpus
print(dictionary.token2id)

import logging, gensim

lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=4, update_every=0, chunksize=1000,
                                      passes=10)
lda.print_topics(4)
print(dictionary[2])
print(lda[corpus[0]])
from gensim.corpora import HashDictionary

dct = HashDictionary(texts)
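# A short illustrative sketch of what the HashDictionary above provides: token
# ids are obtained by hashing the token string modulo `id_range`, and since
# `debug` defaults to True the reverse id -> {words} mapping is retained.
example_bow = dct.doc2bow(texts[0])   # same doc2bow API as the plain Dictionary
first_id = example_bow[0][0]
print(first_id, dct[first_id])        # the set of words that hashed to this id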
import pandas as pd
import numpy as np

import nltk

nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from afinn import Afinn
afn = Afinn(emoticons=True)
nltk.download('sentiwordnet')
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

Example #6
    def _new_model(self, X=None, y=None):
        return HashDictionary(id_range=self.id_range,
                              myhash=self.myhash,
                              debug=self.debug)
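# `_new_model` above looks like part of a larger estimator-style wrapper. The
# sketch below shows one plausible surrounding class; the class name and the
# fit/transform surface are assumptions, not taken from the source.
import zlib
from gensim.corpora import HashDictionary

class HashDictionaryVectorizer:
    def __init__(self, id_range=32000, myhash=zlib.adler32, debug=True):
        self.id_range = id_range
        self.myhash = myhash
        self.debug = debug

    def _new_model(self, X=None, y=None):
        return HashDictionary(id_range=self.id_range,
                              myhash=self.myhash,
                              debug=self.debug)

    def fit(self, X, y=None):
        # X: iterable of tokenized documents (lists of token strings)
        self.model_ = self._new_model(X, y)
        self.model_.add_documents(X)
        return self

    def transform(self, X):
        return [self.model_.doc2bow(doc) for doc in X]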
Example #7
Download it from:

    https://www.kaggle.com/benhamner/nips-papers/downloads/papers.csv/2

"""
papers = pd.read_csv('papers.csv')
corpus = list(papers['paper_text'])

print("corpus size: ", len(corpus))

# ToDo: check performance with lemmatization: gensim.utils.lemmatize

tokenized_corpus = [[
    utils.to_unicode(token)
    for token in utils.tokenize(corpus_item, lower=True, errors='ignore')
] for corpus_item in corpus]

hash_dictionary = HashDictionary(tokenized_corpus)

bow_corpus = [hash_dictionary.doc2bow(tokens) for tokens in tokenized_corpus]
MmCorpus.serialize('nips_bow.mm', bow_corpus, progress_cnt=10000)

hash_dictionary.filter_extremes(no_below=20,
                                no_above=0.1,
                                keep_n=DEFAULT_DICT_SIZE)
hash_dictionary.save_as_text('nips_wordids.txt.bz2')
hash_dictionary.save('nips_corpura_hash.dict')

dictionary = Dictionary(tokenized_corpus)
dictionary.save('nips_corpura.dict')
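# A hedged follow-up sketch (the filename 'nips.tfidf_model' is an assumption;
# MmCorpus and TfidfModel are assumed to be imported at the top of the module):
# the serialized NIPS bag-of-words corpus can be streamed back and weighted
# with TF-IDF, mirroring what process_corpus() does for the wiki dump below.
nips_bow = MmCorpus('nips_bow.mm')
nips_tfidf = TfidfModel(nips_bow, normalize=True)
nips_tfidf.save('nips.tfidf_model')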
def process_corpus(input_filename=WIKI_CORPUS,
                   output_dir=GENSIM_DIR,
                   online=False,
                   to_lemmatize=LEMMING,
                   debug=True):
    program = 'GensimWikiCorpus'
    logger = logging.getLogger(program)

    inp = input_filename
    # WIKI_STATS appears twice because the models are saved under
    # directory/prefix-style filenames (prefix + suffix per artifact)
    outp = os.path.join(output_dir, WIKI_STATS + '/' + WIKI_STATS)

    if not os.path.isdir(os.path.dirname(outp)):
        os.makedirs(os.path.dirname(outp))

    keep_words = DEFAULT_DICT_SIZE

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True  # start collecting document frequencies
        wiki = JsonWikiCorpus(inp,
                              to_lemmatize=to_lemmatize,
                              dictionary=dictionary)

        MmCorpus.serialize(
            outp + '_bow.mm', wiki, progress_cnt=10000
        )  # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20,
                                   no_above=0.1,
                                   keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = JsonWikiCorpus(
            inp, to_lemmatize=to_lemmatize
        )  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20,
                                        no_above=0.1,
                                        keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki,
                           progress_cnt=10000)  # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
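# A minimal invocation sketch; WIKI_CORPUS, GENSIM_DIR, WIKI_STATS and LEMMING
# are assumed to be module-level constants in the original script, with the
# required imports (os, logging, gensim corpora/models) at the top of the file.
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    process_corpus(online=True, debug=True)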