Example #1
0
# logging is important to get the state of the functions
import logging

if __name__ == '__main__':
    multiprocessing.freeze_support()

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    #wiki = WikiCorpus('C:\\Wiki\\enwiki-latest-pages-articles.xml.bz2', lemmatize=False)
    #tfidf = TfidfModel(wiki)
    # save for persistence
    #wiki.save('C:\\Wiki\\wiki.corpus')
    #tfidf.save('C:\\Wiki\\wiki.tfidf.model')

    wiki = WikiCorpus.load('C:\\Wiki\\wiki.corpus')
    tfidf = TfidfModel(wiki)
    tfidf.load('C:\\Wiki\\wiki.tfidf.model')

    # word2vec


    class MySentences(object):
        def __iter__(self):
            for text in wiki.get_texts():
                yield [word.decode() for word in text]

    sentences = MySentences()
    params = {
        'size': 300,
        'window': 10,