Example #1
from gensim import corpora

# create a toy corpus of 2 documents, as a plain python list
corpus = [[(1, 0.5)], []]
# serialise to disk
corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)

# read from disk
corpus = corpora.MmCorpus('/tmp/corpus.mm')
print(corpus)
print(list(corpus))
# or
for doc in corpus:
    print(doc)

# other ways to serialise to disk
# corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
# corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
# corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)
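# Added note (not in the original snippet): a corpus serialized with
# MmCorpus.serialize() also writes a small .index file, so the loaded corpus
# supports len() and random access in addition to plain streaming.
# print(len(corpus))   # number of documents
# print(corpus[0])     # read a single document from disk on demand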
Example #2
# compile the corpus (vectors of the number of times each element appears)
raw_corpus = [dictionary.doc2bow(t) for t in tweets]
print("Then convert tokenized documents to vectors: %s" %
      type(raw_corpus))
corpora.MmCorpus.serialize('/tmp/tweets.mm', raw_corpus)  # store to disk
print("Save the vectorized corpus as a .mm file")
print()

# STEP 2 : similarity between corpora
print("STEP 2 : Transform and compute similarity between corpora")
print('-' * 10)
dictionary = corpora.Dictionary.load('/tmp/tweets.dict')
print("We load our dictionary : %s" % type(dictionary))

corpus = corpora.MmCorpus('/tmp/tweets.mm')
print("We load our vector corpus : %s " % type(corpus))

# Transform Text with TF-IDF
tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
print("We initialize our TF-IDF transformation tool : %s" % type(tfidf))

# corpus tf-idf
corpus_tfidf = tfidf[corpus]
print("We convert our vectors corpus to TF-IDF space : %s" %
      type(corpus_tfidf))
print()
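# Hedged aside, not part of the original example: the fitted TfidfModel can also
# be applied to a single bag-of-words vector, e.g. an unseen tweet (the token
# string below is made up for illustration).
# new_bow = dictionary.doc2bow("just an example tweet".lower().split())
# print(tfidf[new_bow])  # sparse list of (token_id, tfidf_weight) pairs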

# STEP 3 : Create similarity matrix of all files
print("STEP 3 : Create similarity matrix of all docs")
print('-' * 10)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import corpora, models, similarities
dictionary = corpora.Dictionary.load('./tmp/deerwester.dict')
corpus = corpora.MmCorpus("./tmp/deerwester.mm")

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]

# print(vec_lsi)

index = similarities.MatrixSimilarity(lsi[corpus])
index.save('./tmp/deerwester.index')

sims = index[vec_lsi]
# print(list(enumerate(sims)))

sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims)
Example #4
def load_corpus(corpus_file):
    corpus_ = corpora.MmCorpus(corpus_file)
    return corpus_
Example #5
import sqlite3
import pandas as pd
import numpy as np
import time
import pickle
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize.moses import MosesDetokenizer
from stop_words import get_stop_words
from collections import defaultdict
import gensim
from gensim import corpora, models, similarities
from gensim.models.doc2vec import TaggedDocument, TaggedLineDocument
from gensim.models import Doc2Vec
import gensim.models.doc2vec

print('loading...')
corpus = corpora.MmCorpus('/volume/models/corpus.mm')
dictionary = corpora.Dictionary.load('/volume/models/dictionary.dict')

print('training lsi...')
start_time = time.time()
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)
lsi.save('/volume/models/tax_model.lsi')
print("--- %s seconds ---" % (time.time() - start_time))

print('training lda...')
start_time = time.time()
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10)
lda.save('/volume/models/tax_model.lda')
print("--- %s seconds ---" % (time.time() - start_time))
Example #6
dictionary.save(os.path.join(
    __location__, 'data/KeyVis.dict'))  #store dictionary for future reference
#dictionary = corpora.Dictionary.load(os.path.join(__location__, 'data/KeyVis.dict'))
"""Initialize corpus"""


class MyCorpus(object):
    def __iter__(self):
        for line in open(
                os.path.join(__location__, 'KeyVisCorpora', 'abstracts.txt'),
                'rU'):
            line = unicode(line, errors='ignore')
            lowers = line.lower()
            tokenList = lowers.split()
            output = [stem(word, stemmer=LEMMA) for word in tokenList]
            #Assume there's one document per line, tokens separated by space
            yield dictionary.doc2bow([x.strip() for x in output])


corpus = MyCorpus()
"""tf-idf transformation; tfidf is a read-only object that converts any vector 
from the old representation to the new representation"""
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

#Save in Matrix Market format
corpora.MmCorpus.serialize(os.path.join(__location__, 'data/KeyVis_tfidf.mm'),
                           corpus_tfidf)
mm = corpora.MmCorpus(os.path.join(__location__, 'data/KeyVis_tfidf.mm'))
print "DONE:", mm
Example #7
def loadCorpus(p):
    c_path = os.path.join(PROJECT_ROOT, "data/{0}_corpus.mm".format(p))
    c = corpora.MmCorpus(c_path)
    return c
Example #8
def lda(input_file=sys.argv[1]):
    # List of stop words to remove.
    slothlib_stopwords = []
    with open("./slothlib.txt", "r") as f:
        slothlib_stopwords = [line.strip() for line in f]

    separated_document_list = documents_wakati(input_file)
    separated_document_list_temp = []

    # Remove stop words.
    for l in separated_document_list:
        b_pivot = 0
        for i in range(len(l)):
            if l[b_pivot] in slothlib_stopwords:
                l.pop(b_pivot)
                continue
            b_pivot += 1
        separated_document_list_temp.append(l)

    separated_document_list = separated_document_list_temp
    separated_document_list_temp = None

    # Generate a dictionary.
    dictionary = corpora.Dictionary(separated_document_list)
    dictionary.filter_extremes(no_below=2, no_above=0.3)  # (Provisional)
    dictionary.save_as_text('dict.txt')

    # Generate a corpus.
    corpus = [dictionary.doc2bow(text) for text in separated_document_list]
    corpora.MmCorpus.serialize('cop.mm', corpus)
    dictionary = gensim.corpora.Dictionary.load_from_text('dict.txt')
    corpus = corpora.MmCorpus('cop.mm')

    # Create a model by Hierarchical Dirichlet Process.
    #topic_N = 150
    #model = gensim.models.hdpmodel.HdpModel(corpus=corpus, id2word=dictionary)

    # Create a model by Latent Dirichlet Allocation.
    topic_N = 20
    model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            num_topics=topic_N,
                                            id2word=dictionary)

    # Topics (max. 150) and the words that make up each topic.
    #topics_list = model.print_topics(num_topics=-1, num_words=999999)
    topics_list = model.print_topics(num_topics=topic_N, num_words=999999)

    # Count the topics estimated above.
    estimated_topicnum_list = []
    topics = [model[c] for c in corpus]
    json_data = {}

    #with open("models/lda20_2_30per.json", "r") as f:
    #    lda_result = json.load(f)
    with open("lda20_2_30per.json", "w") as f:
        for i in range(len(topics)):
            if len(topics[i]) == 0:
                continue
            else:
                print(i, u"番目の文書のトピックは, ",
                      topics[i])  # [(topic_index, topic_weight), ...]
                for topic_and_prob_tuple in topics[i]:
                    estimated_topicnum_list.append(topic_and_prob_tuple[0])

                index = [j[0] for j in topics[i]]
                c = [j[1] for j in topics[i]]

                top_n_word = []
                # Top N words related to the topic.
                top_n = 10
                counter = 0
                words_list = topics_list[index[np.argmax(c)]][1].split("+")
                for w in words_list:
                    # 0.000*"hoge" -> hoge
                    w = w.replace("*", "").replace(" ", "").replace('"',
                                                                    '')[5:]
                    if w in separated_document_list[i]:
                        if counter == top_n:
                            break
                        #if w not in lda_result[str(i)]:
                        #    top_n_word.append(w)
                        #    counter += 1
                        top_n_word.append(w)
                        counter += 1

                print(i, top_n_word)
                json_data[i] = top_n_word

        json.dump(json_data,
                  f,
                  indent=4,
                  sort_keys=True,
                  separators=(',', ': '))

    estimated_topicnum_dict = collections.Counter(estimated_topicnum_list)
    print(u"推定されたトピックの数: ", len(estimated_topicnum_dict))

    # Calculate the occurrence probability of each word in the topics that best represent the input document.
    input_text_topics = topics[len(separated_document_list) -
                               1]  # [(topic's index, occProb), ...]
    #word_prob_in_topic = model.print_topics(num_topics=-1, num_words=len(dictionary))  # [(topic's index, u"'occProb*word', ..."), ...]
    word_prob_in_topic = model.print_topics(
        num_topics=topic_N, num_words=len(
            dictionary))  # [(topic's index, u"'occProb*word', ..."), ...]
    word_prob_in_topic_dic = {}  # {"word": "prob", ...}

    for input_text_topic in input_text_topics:
        word_prob_in_topic_list = word_prob_in_topic[
            input_text_topic[0]][1].split(",")
        for factors in word_prob_in_topic_list:
            factor = factors.split("*")

            if len(factor) == 2:
                if factor[1] in word_prob_in_topic_dic:
                    word_prob_in_topic_dic[factor[1]] = str(
                        float(word_prob_in_topic_dic[factor[1]]) +
                        float(factor[0]))
                else:
                    word_prob_in_topic_dic[factor[1]] = factor[0]

    return word_prob_in_topic_dic
Example #9
    def load_corpus(self):
        """
        Load self.corpus from a file it was saved to earlier.

        """
        self.corpus = corpora.MmCorpus(self.corpus_filepath)
Example #10
class EmotionShot(object):
    """
    Compute a shot's emotion according to the algorithm in the paper.
    """
    """
    Class attributes
    """
    print "*****************************Loading class attributes**********************************"
    print 'Loading TopicInfo'

    ldaParameter = {'topicNum': 20, 'iteration': 500}
    print 'ldaParameter:', ldaParameter

    topicFile = GLOBAL_generatedFiles + '/txtall_t' + str(
        ldaParameter['topicNum']) + '_it' + str(
            ldaParameter['iteration']) + '.txt'
    et = EmotionTopic(topicFile)
    _TopicsInfo = et.topicsInfo()

    print '\nLoading the Dictionary, LDA and list_corpora, index ...'
    # load the corpus and the LDA model
    myDictLocation = GLOBAL_generatedFiles + '/' + GLOBAL_dictionaryName
    myCorporaLocation = GLOBAL_generatedFiles + '/' + GLOBAL_corporaTfidfName
    myLDALocation = GLOBAL_generatedFiles + '/' + 'topics' + str(
        ldaParameter['topicNum']) + '___iterations' + str(
            ldaParameter['iteration']) + '___.lda'
    _dictionary = corpora.Dictionary.load(myDictLocation)
    _list_corpus = corpora.MmCorpus(myCorporaLocation)
    _lda = models.LdaModel.load(myLDALocation)

    print '\nCommunicating the index...'
    # _index is used later to retrieve similar shots; not needed when analysing a single shot
    _index = similarities.MatrixSimilarity(_lda[_list_corpus])

    print "\nCalculating all Movie Emotion Vectors...."
    em = EmotionMovie()
    em.calculateEmoMovieVector()
    MoviesVectors = em.allEmoMovie  # dict: allEmoMovie:{movieName1:<Counter>, movieName2:<Counter>}

    print "*****************************Loading class attributes END********************************"
    """
    Class attributes
    """
    def __init__(self, shotLocation):
        """
        See SimilarityUtil.py.
        Initialise the dictionary and LDA.
        :return:
        """

        # location of the shot (window)
        self.shotLocation = shotLocation
        # the movie this shot belongs to
        self.belongedMovie = extractMovieName(shotLocation=shotLocation)
        self.wordsCount = 0  # number of words in the shot

        self.numofMaxTopic = -1
        self.maxTopicWeight = 0

        # shot Emotion Vector
        self.shotVector = Counter({
            'surprise': 0,
            'sorrow': 0,
            'love': 0,
            'joy': 0,
            'hate': 0,
            'expect': 0,
            'anxiety': 0,
            'anger': 0,
        })

    def emoCalculate4OneShot(self):
        """
        Compute the emotion vector of shot_i.
        :return:
        """
        if self.shotLocation.find('.txt') == -1:
            print 'It is not a windowTXT.'
            return
        fr = open(self.shotLocation, 'r')
        windowContent = fr.read().decode('utf-8',
                                         'ignore').encode('utf-8')  # ignore traditional Chinese characters

        windowCounter = Counter()
        fill_windowCounter(
            windowContent,
            windowCounter)  # InitialCorporaUtil.fill_windowCounter
        listWindowContent = []  # words in this shot
        # print 'The words in this Shot:'
        for key, count in windowCounter.items():
            for x in xrange(count):
                # print key,
                listWindowContent.append(key)
        # print the number of words in the shot
        # print '\nnumber of shotwords:',len(listWindowContent)
        self.wordsCount = len(listWindowContent)
        vec_bow = EmotionShot._dictionary.doc2bow(listWindowContent)
        vec_lda = EmotionShot._lda[vec_bow]  # topic distribution of this document (may be an unseen document)

        self.maxTopicWeight = 0
        maxtopic = -1  # id of the topic with the largest weight in this shot
        for tuple_topic in vec_lda:
            if tuple_topic[1] > self.maxTopicWeight:
                self.maxTopicWeight = tuple_topic[1]
                maxtopic = tuple_topic[0]
        self.numofMaxTopic = maxtopic
        # print '\nMaxTopic:', self.numofMaxTopic, self.maxTopicWeight

        # print this shot's maxTopicInfo
        # print '***************maxTopicInfo*****************'
        # EmotionShot.et.show_topic(self.numofMaxTopic)

        # compute the shot's emotion vector
        self.calculateShotVector(shotWordsList=listWindowContent)

    def calculateShotVector(self, shotWordsList):
        """
        called by emoCalculate4OneShot
        Compute the shot's emotion vector.
        :param shotWordsList: the words of this shot: list<string>
        :return:
        """
        # print '*********************calculating shotVector**********************'

        maxtopicInfo = EmotionShot._TopicsInfo[self.numofMaxTopic]
        count4shotDictWord = 0  # words found in the dictionary
        count4shotTopicWord = 0  # words not in the dictionary but found in the max topic
        count4for = 0
        # two counters: the total vector from dictionary words and the total vector from topic words
        shotVector_dict = Counter()
        shotVector_topic = Counter()
        for shotword in shotWordsList:  # iterate over the words in the shot
            count4for += 1
            flagfind = False
            for emoword in EmotionShot.et.Emodictionary:
                if emoword.word == shotword:
                    flagfind = True
                    count4shotDictWord += 1
                    # self.shotVector.update(emoword.emotionVector)
                    # print 'indict:',emoword.word, emoword.emotionVector
                    shotVector_dict.update(emoword.emotionVector)
                    break
            if not flagfind:  # not found in the dictionary  # formula (4) in the paper
                for topicItem in maxtopicInfo['topicWords']:
                    if shotword in topicItem:
                        count4shotTopicWord += 1
                        alpha, topicword = topicItem.split('*')
                        alpha = float(alpha)
                        # formula (4) in the paper, otherwise case: vector for a word not found in the dictionary
                        wordVector = calculateWordVectorInMaxTopic(
                            maxTopicVector=maxtopicInfo['topicVector'],
                            maxTopicWeight=self.maxTopicWeight,
                            wordWeightInMaxTopic=alpha)
                        # print '          intopic:',shotword, wordVector
                        # self.shotVector.update(wordVector)
                        shotVector_topic.update(wordVector)
                        break
        # print info for both the inDict and inTopic word types
        # print 'count4shotDictWord:', count4shotDictWord
        # print 'count4shotTopicWord:', count4shotTopicWord
        # print 'shotVector_dict:', shotVector_dict
        # print 'shotVector_topic', shotVector_topic

        self.shotVector.update(shotVector_dict)
        self.shotVector.update(shotVector_topic)
        # print 'shotBelongedMovie:', self.belongedMovie
        # print 'shotVector:', self.shotVector

    @property
    def list_corpus(self):
        return self._list_corpus
Example #11
    corpus)  # store to disk, for later use
print corpus[:10]

# In[5]:

from gensim import models, similarities
import os

if (os.path.exists(
        '/Users/jordanchisam/Desktop/ProgrammingTextAnalysis/corpora/practicenovel.dict'
)):
    dictionary = corpora.Dictionary.load(
        '/Users/jordanchisam/Desktop/ProgrammingTextAnalysis/corpora/practicenovel.dict'
    )
    corpus = corpora.MmCorpus(
        "/Users/jordanchisam/Desktop/ProgrammingTextAnalysis/corpora/corpusnovel.mm"
    )
    print("Lets get to work")
else:
    print("Invalid data set provided")

# ## Model
#
# Decided to continue with the TF-IDF model as opposed to measuring raw word counts or other frequency weighting methods, because TF-IDF works well for measuring the significance of words. Additionally, it properly shows the similarities and differences between texts.

# In[6]:

tfidf = models.TfidfModel(corpus)

# In[7]:
Example #12
def load(dic_path, cor_path):
    dic = corpora.Dictionary.load_from_text(dic_path)
    cor = corpora.MmCorpus(cor_path)
    return dic,cor
Example #13
    groupdic = {
        d[0]: d[1]
        for d in [
            i.split(' ')
            for i in open('texts/' + prefix + '/metadata_extra.txt',
                          'r').read().strip().split('\n')
        ]
    }
    groups = [groupdic[i] for i in labels]
else:
    groups = [l.split('.')[0] for l in labels]

#step 1 prepare corpus
#prepare_corpus([open('texts/'+section,'r').read() for section in book])
dictionary = corpora.Dictionary.load('texts/' + prefix + '/dictionary.dict')
corpus = corpora.MmCorpus('texts/' + prefix + "/corpus.mm")

#step 2 create tf-idf model
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]  # convert vector space to tfidf space

#step 2.1 create LDA model
numTopics = 50
lda = models.ldamodel.LdaModel(corpus_tfidf,
                               id2word=dictionary,
                               num_topics=numTopics)
topics = lda.show_topics(num_topics=numTopics)
#print topics
for text in corpus:
    for id, freq in lda[text]:
        print(dictionary[id], freq)
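# Added note: lda[text] yields (topic_id, weight) pairs rather than word ids, so a
# hedged variant that prints them without mixing them up with dictionary ids is:
# for text in corpus:
#     for topic_id, weight in lda[text]:
#         print(topic_id, weight)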
Example #14
    # Creating the term dictionary of our corpus, where every unique term is assigned an index.
    dictionary = corpora.Dictionary(doc_clean)
    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    dictionary.save('./models/dictionary.dict')
else:
    dictionary = corpora.Dictionary.load('./models/dictionary.dict')
    print("preprocessed ictionary loaded...")

if not os.path.exists('./models/corpus.mm'):
    # Converting the list of documents (corpus) into a document-term matrix using the dictionary prepared above [bag-of-words].
    corpus = [dictionary.doc2bow(doc) for doc in doc_clean]
    # Save the matrix in Matrix Market format.
    corpora.MmCorpus.serialize('./models/corpus.mm', corpus)
else:
    corpus = corpora.MmCorpus('./models/corpus.mm')
    print("document to term matrix loaded...")

# Use TF-IDF model
tfidf = gensim.models.TfidfModel(corpus, normalize=True)
corpus_tfidf = tfidf[corpus]

print("de-normaliza tf-idf corpus...")
corpus_tfidf = map(lambda x: map(lambda y: (y[0], round(y[1] * 200, 1)), x),
                   corpus_tfidf)

# pprint(dictionary[237])
print("tfidf weights of the first document after de-normalization")
# print("BOW of the first document")
# pprint(map(lambda x: (dictionary[x[0]], x[1]), corpus[0]))
# pprint(len(corpus[0]))
Example #15
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
                      num_topics=2)  # initialize an LSI transformation
corpus_lsi = lsi[
    corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

lsi.print_topics(2)

todas = []
for doc in corpus_lsi:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    todas.append(doc)
todas

from gensim import corpora, models, similarities
dictionary = corpora.Dictionary.load('/tmp/deerwester4.dict')
corpus = corpora.MmCorpus(
    '/tmp/deerwester4.mm'
)  # comes from the first tutorial, "From strings to vectors"
print(corpus)

np.array(corpus).shape

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

p = []
for i in range(0, len(documents)):
    doc1 = documents[i]
    vec_bow2 = dictionary.doc2bow(doc1.lower().split())
    vec_lsi2 = lsi[vec_bow2]  # convert the query to LSI space
    p.append(vec_lsi2)

index = similarities.MatrixSimilarity(
Example #16
        raw_corpus)
    corpora.MmCorpus.serialize('/tmp/' + str(i) + '.mm',
                               raw_corpus)  # store to disk
    print "Save the vectorized corpus as a .mm file"
    print

# STEP 2 : similarity between corpora
print "STEP 2 : Transform and compute similarity between corpora"
print '-' * 10
t0 = time()
for i, t in enumerate(to_vectorize):

    dictionary = corpora.Dictionary.load('/tmp/' + str(i) + '.dict')
    print "Load our dictionary : %s" % type(dictionary)

    corpus = corpora.MmCorpus('/tmp/' + str(i) + '.mm')
    print "Load our vector corpus : %s " % type(corpus)

    # Train the TF-IDF model
    tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
    print "Initialize our TF-IDF transformation tool : %s" % type(tfidf)

    # corpus tf-idf
    corpus_tfidf = tfidf[corpus]
    print "Convert our vectors corpus to TF-IDF space : %s" % type(
        corpus_tfidf)

    print "Save the tranformed corpus"
    corpus_tfidf.save('/tmp/' + str(i) + '.trans')

    print
Example #17
                # Following other students, filter out meaningless punctuation, particles, and the like
                word = words[i]
                if '/w' not in word and '/y' not in word and '/u' not in word \
                        and '/c' not in word:
                    doc.append(word)
    documents.append(doc)

# Keep keywords with a frequency greater than 1 as the bag of words, vectorize the documents on that basis, and use TF-IDF as the word weights
fre = {}
for doc in documents:
    for word in doc:
        if word in fre:
            fre[word] += 1
        else:
            fre[word] = 1
documents = [[word for word in doc if fre[word] > 1] for doc in documents]
bag = corpora.Dictionary(documents)  # bag of words
corpus = [bag.doc2bow(doc) for doc in documents]  # list of frequency-based document vectors
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]  # convert to a list of TF-IDF document vectors and persist it
corpora.MmCorpus.serialize('tmp/corpus_tfidf.mm', corpus_tfidf)
corpus_tfidf = corpora.MmCorpus('tmp/corpus_tfidf.mm')

# Build a document similarity matrix index for querying, then query it with the document list itself (cosine similarity by default)
index = similarities.SparseMatrixSimilarity(corpus_tfidf)
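# Added note: when the corpus is a plain Python list of vectors rather than an
# MmCorpus, passing the vocabulary size explicitly avoids an extra pass over the
# corpus to infer it, e.g. (assuming the `bag` dictionary from above):
# index = similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=len(bag))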
with open('result.csv', 'w') as f_out:
    for sims in index[corpus_tfidf]:
        f_out.write(','.join(map(str, sims)) + '\n')

print("总共耗时(秒):" + str(time.time() - start_time))
Example #18
all_tokens = sum(cleaned_comment, [])
token_set = set(all_tokens)
tokens_once = set(word for word in token_set if all_tokens.count(word) == 1)
comment_tokens = [[word for word in text if word not in tokens_once]
                  for text in cleaned_comment]
pickle.dump(comment_tokens, open('../data/comment_tokens.pkl', 'wb'))

print "making dict"
dictionary = corpora.Dictionary(comment_tokens)
dictionary.save('../model/comments_' + sys.argv[1] +
                '.dict')  # store the dictionary, for future reference
print(dictionary)

print "making corpus"
corpus = [dictionary.doc2bow(text) for text in comment_tokens]
corpora.MmCorpus.serialize('../model/comments' + sys.argv[1] + '.mm', corpus)

print "loading corpus"
mm = corpora.MmCorpus('../model/comments' + sys.argv[1] + '.mm')
print mm

#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#print "training LDA model"
#lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dictionary, num_topics=int(sys.argv[1]), update_every=1, chunksize=10000, passes=5)
#lda.print_topics()

#save_name = '../model/lda_model_' + sys.argv[1] + '.pkl'
#print 'saving model ', save_name
#pickle.dump(lda, open(save_name, 'wd'))
Example #19
    def calc_similarity(self, prefix: str, sysno: int, text: str):
        """计算相似度
        返回索引和余弦值

        Arguments:
            prefix {str} -- 模型前缀
            text {str} -- 文本数据
            value {float} -- 设定的阈值,返回大于这个值的数据
        """
        dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load the dictionary
        corpus = corpora.MmCorpus('./models/{}_corpuse.mm'.format(prefix))  # load the corpus
        tfidf_model = models.TfidfModel.load("./models/{}_tfidf_model.model".format(prefix))  # load the Tfidf model
        corpus_tfidf = tfidf_model[corpus]

        lsi = models.LsiModel(corpus_tfidf)
        corpus_lsi = lsi[corpus_tfidf]
        similarity_lsi = similarities.Similarity('./models/similarity-lsi-index',
                                                 corpus_lsi,
                                                 num_features=400,
                                                 num_best=1)
        cut_raw = self.segment(text)  # 1. segment the text
        corpus = dictionary.doc2bow(cut_raw)  # 2. convert to a bow vector
        corpus_tfidf = tfidf_model[corpus]  # 3. compute the tfidf values
        corpus_lsi = lsi[corpus_tfidf]  # 4. compute the lsi values
        sims = similarity_lsi[corpus_lsi]

        def find_idx(x):
            dtc = self.mongo_db.find_one("{}_idx".format(prefix), {"_id": int(x)})
            val = None
            if dtc is not None:
                val = dtc["data"]
            return val

        ids_dic = []
        if sims is not None:
            # get the indices
            index_dic = [(idx + 1) for idx, val in sims if val > self.keep_val]
            # get the ids
            for x in index_dic:
                tt = find_idx(x)
                if tt is not None:
                    ids_dic.append(tt)

        idxs = self.mongo_db.find("{}_idx".format(prefix), {"data": {"$in": ids_dic}}).sort([("_id", -1)])
        # check whether the id already exists
        ids = self.mongo_db.find("{}_idx".format(prefix), {"data": sysno})
        if len(ids_dic) > 0:
            # most recent record
            _id = idxs[0]["_id"]
            if ids.count() > 0:
                # the id exists
                is_update = False
                if _id not in index_dic:
                    # the latest index is not among the returned indices; it duplicates previously edited content, update the model
                    ids_dic = []
                    is_update = True
            else:
                # the id does not exist; it duplicates previous content
                is_update = False
        else:
            # id exists and the content is not a duplicate: it is an edit, update the model
            # id does not exist and the content is not a duplicate: it is new, update the model
            is_update = True

        return ids_dic, is_update
Example #20
    # remove stop words and words that appear only once
    dictionary.filter_tokens(once_ids)

    # remove gaps in id sequence after words that were removed
    dictionary.compactify()
    print(dictionary)

    dictionary.save(
        os.path.join('C:\\Users\\soapk\\Desktop', saved_filename + ".dict"))

# print(len(dictionary))

if os.path.exists(
        os.path.join('C:\\Users\\soapk\\Desktop', saved_filename + ".mm")):
    corpus = corpora.MmCorpus(
        os.path.join('C:\\Users\\soapk\\Desktop', saved_filename + ".mm"))
else:
    corpus = [dictionary.doc2bow(title) for title in so_titles]
    corpora.MmCorpus.serialize(
        os.path.join('C:\\Users\\soapk\\Desktop', saved_filename + ".mm"),
        corpus)

tfidf = models.TfidfModel(corpus)

if os.path.exists(
        os.path.join('C:\\Users\\soapk\\Desktop', saved_filename + ".index")):
    index = similarities.SparseMatrixSimilarity.load(
        os.path.join('C:\\Users\\soapk\\Desktop', saved_filename + ".index"))
else:
    corpus_tfidf = tfidf[corpus]
Example #21
from nltk.stem import SnowballStemmer
from gensim import corpora, models, similarities
import logging
import glob
import errno
import re
import os
'''show logging info that would otherwise not be printed'''
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
'''read generated files'''
if (os.path.exists(
        "D:/HOME/TCSS456/NLP-TCSS-456-A-Winter-2018/2017/tmp/dictionary.dic")):
    dictionary = corpora.Dictionary.load(
        'D:/HOME/TCSS456/NLP-TCSS-456-A-Winter-2018/2017/tmp/dictionary.dic')
    corpus = corpora.MmCorpus(
        'D:/HOME/TCSS456/NLP-TCSS-456-A-Winter-2018/2017/tmp/corpus.cop')
    print("Used files generated")
else:
    print("Please generate data set")

##print(list(corpus))
##print(dictionary)
##print(dictionary.token2id)
'''TF-IDF'''
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
##for doc in corpus_tfidf:
##    print(doc)
print(
    '10----------------------------------------------------------------------------------------------------------------'
)
"""
  train baseline LDA model.
"""
import UbuntuCorpus as UC
from gensim import corpora, models, similarities
import logging

num_topics = 100

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

dictionary = corpora.Dictionary.load_from_text('tmp/dialogs4.dict')
corpus = corpora.MmCorpus('tmp/dialogs4-corpus.mm')

# compute tfidf
tfidf = models.TfidfModel(corpus)

# convert the corpus to tfidf representation
corpus_tfidf = tfidf[corpus]

lda = models.ldamodel.LdaModel(corpus=corpus_tfidf,
                               id2word=dictionary,
                               num_topics=num_topics,
                               update_every=1,
                               eta=0.02,
                               chunksize=10000,
                               passes=10)

print("****TOP TOPICS****")
lda.print_topics(10)
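# Added note: print_topics() returns a list of (topic_id, topic_string) pairs and
# logs them at INFO level; to print them explicitly on stdout you can iterate:
# for topic_id, topic in lda.print_topics(10):
#     print(topic_id, topic)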
Example #23
    # Creating the term dictionary of our corpus, where every unique term is assigned an index.
    dictionary = corpora.Dictionary(doc_clean)
    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    dictionary.save('models/3/new-rc-lda.dict')
else:
    dictionary = corpora.Dictionary.load('models/3/new-rc-lda.dict')


if not os.path.exists('models/3/new-doc-term.mm'):
    # Converting the list of documents (corpus) into a document-term matrix using the dictionary prepared above [bag-of-words].
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # Save the matrix in Matrix Market format.
    corpora.MmCorpus.serialize('models/3/new-doc-term.mm', doc_term_matrix)
else:
    doc_term_matrix = corpora.MmCorpus('models/3/new-doc-term.mm')


# pprint(doc_term_matrix)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Save LDA model
if not os.path.exists('models/3/new-model.lda'):
    # Creating the object for LDA model using gensim library
    Lda = gensim.models.ldamodel.LdaModel
    # Running and training the LDA model on the document-term matrix.
    ldamodel = Lda(doc_term_matrix, num_topics=100, id2word = dictionary, passes=50)
    ldamodel.save('models/3/new-model.lda')
else:
Example #24
# store to disk, for later use
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)

class MyCorpus(object):
    def __iter__(self):
        for line in open('mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())

corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
print(corpus_memory_friendly)

dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm')
print(corpus)

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

# convert the query to LSI space
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
print(vec_lsi)

# transform corpus to LSI space and index it
index = similarities.MatrixSimilarity(lsi[corpus])

# Saving and loading index
index.save('/tmp/deerwester.index')
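# A short continuation (added, not in the original snippet): load the index back
# and query it with the LSI vector, sorting documents by cosine similarity.
# index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')
# sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])
# print(sims)  # [(doc_id, similarity), ...], most similar first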
Example #25
File: lda.py Project: pmantica1/lda
from gensim.models.ldaseqmodel import LdaSeqModel
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from nltk import word_tokenize
from tqdm import tqdm
from csv import DictReader
from collections import defaultdict
import pprint

pp = pprint.PrettyPrinter(indent=4)

id2word = corpora.Dictionary.load('tokens.dict')
mm = corpora.MmCorpus('messages.mm')
ldaseq = LdaModel(corpus=mm, id2word=id2word, num_topics=15)
pp.pprint(ldaseq.print_topics())
ldaseq.save("lda_model")
Example #26
# GLOBAL calculation for weights (same for all recommendations)
# Weights: reciprocal ranks of the 500 items in each list (hd2v and bm25)
hybrid_weights = [1 / (i + 1) for i in range(500)]
hybrid_weights.extend(hybrid_weights)
hybrid_weights = np.array(hybrid_weights)
# Convert to probabilities
hybrid_weights = hybrid_weights / hybrid_weights.sum()

# GLOBAL num_items_to_pick (with replacement) -- high number: one million
num_picks = 1000000
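# Hedged illustration (added): assuming a combined candidate list of 1000 item ids
# (the 500 hd2v items followed by the 500 bm25 items, in the order matching
# hybrid_weights), the weighted picks with replacement could be drawn like this.
# `combined_candidates` is a hypothetical name, not from the original code.
# picks = np.random.choice(len(hybrid_weights), size=num_picks, p=hybrid_weights)
# picked_ids = [combined_candidates[i] for i in picks]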

# LOAD MODELS
loadmodstart = time()
id2word_dictionary = corpora.Dictionary.load(
    '/home/ashwath/Programs/ACLAAn/LDA/aclmag.dict')
corpus = corpora.MmCorpus(
    '/home/ashwath/Programs/ACLAAn/LDA/aclmag_bow_corpus.mm')
try:
    ldamallet = LdaMallet.load(
        '/home/ashwath/Programs/ACLAAn/LDA/lda_model.model')
    vec_bow_test = id2word_dictionary.doc2bow(['test'])
    vec_ldamallet = ldamallet[vec_bow_test]
except subprocess.CalledProcessError:
    print("LDA MALLET COULDN'T READ INSTANCE FILE. USING NORMAL LDA INSTEAD")
    ldamallet = LdaModel.load(
        '/home/ashwath/Programs/ACLAAn/LDA/ldanormal_acl.model')

#index = similarities.MatrixSimilarity(ldamallet[corpus])
#index.save("simIndex.index")
malletindex = similarities.MatrixSimilarity.load(
    '/home/ashwath/Programs/ACLAAn/LDA/simIndexAcl.index')
with open(
Example #27
corr_phi_tot = np.zeros([1, nbrun])
cosine_theta_tot = np.zeros([1, nbrun])
cosine_phi_tot = np.zeros([1, nbrun])
KL_theta_tot = np.zeros([1, nbrun])
KL_phi_tot = np.zeros([1, nbrun])
num_topics = 3
num_docs = 100
term_per_doc = 100
voc_size = 1000
beta = [0.01 for i in range(voc_size)]
alpha = [1 for i in range(num_topics)]
for i in range(nbrun):
    X, p_generate, theta_generate, phi_generate, data = generate_data(
        num_topics, num_docs, term_per_doc, voc_size, alpha, beta)
    dct = corpora.Dictionary.load('dct.dict')
    corpus = corpora.MmCorpus('corpus.mm')
    num_words = len(dct)
    print("Corpus actuel : ", i)
    if nbrun == 1:
        phi_gensim, corr_theta_gensim, corr_phi_gensim, cosine_theta_gensim, cosine_phi_gensim, KL_theta_gensim, KL_phi_gensim = lda_train(
            p_generate, theta_generate, phi_generate, num_topics, num_docs)
        phi_cgs, corr_theta, cosine_theta, KL_theta, corr_phi, cosine_phi, KL_phi = gibbs_vanilla(
            X, p_generate, theta_generate, phi_generate, num_topics, num_docs)
        words_id = np.arange(num_words, dtype=float)
        ymax = max(np.max(phi_generate), np.max(phi_gensim), np.max(phi_cgs))
        fig1 = plt.figure()
        for i in range(num_topics):
            plt.subplot(1, num_topics, i + 1)
            plt.bar(words_id, phi_generate[i, :], label="Généré", color="r")
            #plt.subplot(3,num_topics,num_topics+i+1)
            plt.bar(words_id, phi_gensim[i, :], label="Gensim", color="g")
Example #28
from gensim.corpora import dictionary
from gensim import models

if len(sys.argv) > 1:
    fname_suffix = sys.argv[1]
else:
    fname_suffix = ''

# In[6]:

corpus_fname = 'corpus' + fname_suffix + '.mm'
tfidf_corpus_fname = 'tfidf_corpus' + fname_suffix + '.mm'

my_dict = dictionary.Dictionary.load(
    os.path.join(settings.PERSIST_DIR, 'my_dict'))
corpus = corpora.MmCorpus(os.path.join(settings.PERSIST_DIR, corpus_fname))

# In[8]:

tfidf = models.TfidfModel(corpus)

# In[10]:

tfidf_corpus = tfidf[corpus]

tfidf.save(os.path.join(settings.PERSIST_DIR, 'tfidf_model' + fname_suffix))

# In[11]:

corpora.MmCorpus.serialize(
    os.path.join(settings.PERSIST_DIR, tfidf_corpus_fname), tfidf_corpus)
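# Added sketch (not in the original notebook cell): the serialized tf-idf corpus
# can be streamed back later without recomputing the model.
# tfidf_corpus = corpora.MmCorpus(os.path.join(settings.PERSIST_DIR, tfidf_corpus_fname))
# print(tfidf_corpus)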
Example #29
## import some packages
#################################################################################################
print('Loading modules ... ')
from gensim import corpora, models, similarities
import logging
import os.path
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

#################################################################################################
## load dictionary, corpus - bag of words representation
#################################################################################################
print('Loading dictionary and corpus ... ')
root = os.path.dirname(os.getcwd()) + '\\OBJ\\LSI\\'
dictionary = corpora.Dictionary.load(root + 'fullDictionary.dict')
corpus = corpora.MmCorpus(root + 'fullCorpus.mm')
print(corpus)

#################################################################################################
## Tfidf model (term frequency, inverse document frequency)
#################################################################################################
print('\nGenerating Tfidf model ... ')
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

#################################################################################################
## LSI model with Nfeat features
#################################################################################################
Nfeat = 100
print('\nGenerating LSI model with {} features ... '.format(Nfeat))
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
Example #30
    def classify_stock_news(self,
                            unseen_raw_document,
                            database_name,
                            collection_name,
                            label_name="60DaysLabel",
                            topic_model_type="lda",
                            classifier_model="svm",
                            ori_dict_path=None,
                            bowvec_save_path=None,
                            is_saved_bow_vector=False):
        historical_raw_documents_list = []
        Y = []
        for row in self.database.get_collection(database_name,
                                                collection_name).find():
            if label_name in row.keys():
                if row[label_name] != "":
                    historical_raw_documents_list.append(row["Article"])
                    Y.append(row[label_name])
        logging.info(
            "fetch symbol '{}' historical news with label '{}' from [DB:'{}' - COL:'{}'] ... "
            .format(collection_name, label_name, database_name,
                    collection_name))

        le = preprocessing.LabelEncoder()
        Y = le.fit_transform(Y)
        logging.info(
            "encode historical label list by sklearn preprocessing for training ... "
        )
        label_name_list = le.classes_  # e.g. ['中性' '利好' '利空'] (neutral, bullish, bearish) -> [0, 1, 2]

        # Build a dictionary from the historical news database and compute the bag-of-words vector of each
        # historical news item; if that dictionary already exists, load it into memory,
        # then update the dictionary with the tokens of the unseen news.
        if not os.path.exists(ori_dict_path):
            if not os.path.exists(bowvec_save_path):
                _, _, historical_bow_vec = self.create_bag_of_word_representation(
                    historical_raw_documents_list,
                    new_dict_path=ori_dict_path,
                    bow_vector_save_path=bowvec_save_path,
                    is_saved_dict=True)
                logging.info(
                    "create dictionary of historical news, and serialized in path -> {} ... "
                    .format(ori_dict_path))
                logging.info(
                    "create bow-vector of historical news, and serialized in path -> {} ... "
                    .format(bowvec_save_path))
            else:
                _, _, _ = self.create_bag_of_word_representation(
                    historical_raw_documents_list,
                    new_dict_path=ori_dict_path,
                    is_saved_dict=True)
                logging.info(
                    "create dictionary of historical news, and serialized in path -> {} ... "
                    .format(ori_dict_path))
        else:
            if not os.path.exists(bowvec_save_path):
                _, _, historical_bow_vec = self.create_bag_of_word_representation(
                    historical_raw_documents_list,
                    new_dict_path=ori_dict_path,
                    bow_vector_save_path=bowvec_save_path,
                    is_saved_dict=True)
                logging.info(
                    "historical news dictionary existed, which saved in path -> {}, but not the historical bow-vector"
                    " ... ".format(ori_dict_path))
            else:
                historical_bow_vec_mmcorpus = corpora.MmCorpus(
                    bowvec_save_path
                )  # type -> <gensim.corpora.mmcorpus.MmCorpus>
                historical_bow_vec = []
                for _bow in historical_bow_vec_mmcorpus:
                    historical_bow_vec.append(_bow)
                logging.info(
                    "both historical news dictionary and bow-vector existed, load historical bow-vector to memory ... "
                )

        start_time = time.time()
        updated_dictionary_with_old_and_unseen_news, unseen_documents_token_list = self.renew_dictionary(
            ori_dict_path, [unseen_raw_document], is_saved=True)
        end_time = time.time()
        logging.info(
            "renew dictionary with unseen news tokens, and serialized in path -> {}, "
            "which took {} mins ... ".format(ori_dict_path,
                                             (end_time - start_time) / 60))

        unseen_bow_vector = [
            updated_dictionary_with_old_and_unseen_news.doc2bow(doc_token)
            for doc_token in unseen_documents_token_list
        ]
        updated_bow_vector_with_old_and_unseen_news = []
        updated_bow_vector_with_old_and_unseen_news.extend(historical_bow_vec)
        updated_bow_vector_with_old_and_unseen_news.extend(unseen_bow_vector)
        # updated_bow_vector_with_old_and_unseen_news is originally of type list,
        # but after the serialization below it reloads as gensim.corpora.mmcorpus.MmCorpus
        if is_saved_bow_vector and bowvec_save_path:
            corpora.MmCorpus.serialize(
                bowvec_save_path, updated_bow_vector_with_old_and_unseen_news
            )  # save the updated bow vectors, i.e. the bow vectors of both the old and the new news
        logging.info(
            "combined bow vector(type -> 'list') generated by historical news with unseen bow "
            "vector to create a new one ... ")

        if topic_model_type == "lsi":
            start_time = time.time()
            updated_tfidf_model_vector = self.transform_vectorized_corpus(
                updated_dictionary_with_old_and_unseen_news,
                updated_bow_vector_with_old_and_unseen_news,
                model_type="tfidf"
            )  # type -> <gensim.interfaces.TransformedCorpus object>
            end_time = time.time()
            logging.info(
                "regenerated TF-IDF model vector by updated dictionary and updated bow-vector, "
                "which took {} mins ... ".format((end_time - start_time) / 60))

            start_time = time.time()
            model = models.LsiModel(
                updated_tfidf_model_vector,
                id2word=updated_dictionary_with_old_and_unseen_news,
                num_topics=config.TOPIC_NUMBER)  # initialize the model
            model_vector = model[
                updated_tfidf_model_vector]  # type -> <gensim.interfaces.TransformedCorpus object>
            end_time = time.time()
            logging.info(
                "regenerated LSI model vector space by updated TF-IDF model vector space, "
                "which took {} mins ... ".format((end_time - start_time) / 60))
        elif topic_model_type == "lda":
            start_time = time.time()
            model_vector = self.transform_vectorized_corpus(
                updated_dictionary_with_old_and_unseen_news,
                updated_bow_vector_with_old_and_unseen_news,
                model_type="lda")
            end_time = time.time()
            logging.info(
                "regenerated LDA model vector space by updated dictionary and bow-vector, "
                "which took {} mins ... ".format((end_time - start_time) / 60))

        # convert the topic-model vectors (gensim.interfaces.TransformedCorpus) to a numpy matrix
        start_time = time.time()
        latest_matrix = corpus2dense(model_vector,
                                     num_terms=model_vector.obj.num_terms).T
        end_time = time.time()
        logging.info(
            "transform {} model vector space to numpy.adarray, "
            "which took {} mins ... ".format(topic_model_type.upper(),
                                             (end_time - start_time) / 60))

        # use the topic-model vectors (features) of the historical data to further train the news classifier
        start_time = time.time()
        train_x, train_y, test_x, test_y = utils.generate_training_set(
            latest_matrix[:-1, :], Y)
        clf = self.classifier.train(train_x,
                                    train_y,
                                    test_x,
                                    test_y,
                                    model_type=classifier_model)
        end_time = time.time()
        logging.info(
            "finished training by sklearn {} using latest {} model vector space, which took {} mins ... "
            .format(classifier_model.upper(), topic_model_type.upper(),
                    (end_time - start_time) / 60))

        label_id = clf.predict(latest_matrix[-1, :].reshape(1, -1))[0]

        return label_name_list[label_id]