Example #1
#coding:utf-8
import logging
import jieba
from gensim import corpora, models
import myConfig
import preprocess

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

dictionary = corpora.Dictionary.load(myConfig.dict_file)
corpus = corpora.MmCorpus(myConfig.corpora_file)

if myConfig.useTFIDF:
    tfidf = models.TfidfModel(corpus)
    corpus = tfidf[corpus]

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=myConfig.num_topics)
corpus_lsi = lsi[corpus]
lsi.save(myConfig.topic_model_file)
lsi.print_topics(myConfig.num_topics)
lsi.print_debug(myConfig.num_topics, num_words = 20 )
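
# A minimal reuse sketch (hypothetical, not part of the original snippet): load the
# saved model back and fold a new jieba-tokenized document into the LSI topic space.
loaded_lsi = models.LsiModel.load(myConfig.topic_model_file)
new_doc_bow = dictionary.doc2bow(jieba.lcut(u"这是一篇新的示例文档"))
if myConfig.useTFIDF:
    new_doc_bow = tfidf[new_doc_bow]
print(loaded_lsi[new_doc_bow])  # coordinates of the new document in LSI space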


import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


def gensim_feature(corpus=None):

    # Sample data for the corpus argument:
    # corpus = [["我", "来到", "成都", "春熙路"],
    #           ["今天", "在", "宽窄巷子", "耍", "了", "一天"],
    #           ["成都", "整体", "来说", "还是", "挺", "安逸", "的"],
    #           ["成都", "的", "美食", "真", "巴适", "惨", "了"]]
    dictionary = corpora.Dictionary(corpus)  # build the corpus dictionary

    # # Collect the ids of stop words and of words that occur only once
    # stop_ids = [dictionary.token2id[stopword] for stopword in user_stop_word_list if stopword in dictionary.token2id]
    # once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    # dictionary.filter_tokens(stop_ids + once_ids)  # remove the stop words and the words that occur only once
    # dictionary.compactify()  # close the id gaps left after removing words
    # dictionary.save('mycorpus.dict')  # save the dictionary for later reuse

    # Document-frequency statistics
    dfs = dictionary.dfs  # token id -> document frequency
    for key_id, c in dfs.items():
        print(dictionary[key_id], c)

    # Convert each document to a bag-of-words vector
    doc_bow_corpus = [dictionary.doc2bow(doc_cut) for doc_cut in corpus]

    # Generate TF-IDF features
    tfidf_model = models.TfidfModel(dictionary=dictionary)  # build the tfidf model
    tfidf_corpus = [tfidf_model[doc_bow] for doc_bow in doc_bow_corpus]  # convert each doc_bow into its tfidf vector

    # Generate LSI features (Latent Semantic Indexing)
    lsi_model = models.LsiModel(corpus=tfidf_corpus,
                                id2word=dictionary,
                                num_topics=100)  # build the lsi model
    # Generate the corpus of LSI vectors
    lsi_corpus = [lsi_model[tfidf_doc]
                  for tfidf_doc in tfidf_corpus]  # convert to lsi vectors

    # Generate LDA features (topic model)
    lda_model = models.LdaModel(corpus=tfidf_corpus,
                                id2word=dictionary,
                                num_topics=100)  # build the lda model
    # Generate the corpus of LDA vectors
    lda_corpus = [lda_model[tfidf_doc]
                  for tfidf_doc in tfidf_corpus]  # convert to lda vectors

    # Random Projections (RP; advantages: lower dimensionality, CPU- and memory-friendly)
    rp_model = models.RpModel(tfidf_corpus, num_topics=500)
    rp_corpus = [rp_model[tfidf_doc]
                 for tfidf_doc in tfidf_corpus]  # convert to random-projection vectors

    # Hierarchical Dirichlet Process (HDP, a non-parametric Bayesian method)
    hdp_model = models.HdpModel(doc_bow_corpus, id2word=dictionary)
    hdp_corpus = [hdp_model[doc_bow] for doc_bow in doc_bow_corpus]  # convert to HDP vectors

    # Document vectors and word vectors (Doc2Vec and Word2Vec)
    tld_list = []
    for ind, line_list in enumerate(corpus):
        tld_list.append(TaggedDocument(line_list, tags=[str(ind)]))
    d2v_model = Doc2Vec(tld_list,
                        min_count=5,
                        window=3,
                        size=100,
                        sample=1e-3,
                        negative=5,
                        iter=15)
    # Since training Doc2Vec also trains Word2Vec, both models are available here and can be saved:
    # model.save(save_model_d2v_file_path)
    # model.save_word2vec_format(save_model_w2v_file_path, binary=True)
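    # A hypothetical load-back sketch (APIs vary across gensim versions):
    # from gensim.models import KeyedVectors
    # d2v_model = Doc2Vec.load(save_model_d2v_file_path)
    # w2v = KeyedVectors.load_word2vec_format(save_model_w2v_file_path, binary=True)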

    # Convert the documents into a matrix of document vectors
    docvecs = d2v_model.docvecs
    docvecs_matrix = np.asarray(docvecs)
Example #3
from gensim import corpora, models, similarities
texts = ["きのう", "も", "私", "は", "その", "料理", "を", "食べました"]

# texts is prepared beforehand (a list of tokenized sentences)
num_topics = 3
dictionary = corpora.Dictionary(texts)  # convert the input texts into a dictionary
corpus = [dictionary.doc2bow(text) for text in texts]  # build the corpus
tfidf = models.TfidfModel(corpus)  # build the TF-IDF model
corpus_tfidf = tfidf[corpus]  # re-weight the corpus with TF-IDF
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
                      num_topics=num_topics)  # build the LSI model from corpus_tfidf

# Display the topics
print(lsi.show_topics(num_topics, formatted=True))  # print the topics
corpus_lsi = lsi[corpus_tfidf]  # map every sentence in corpus_tfidf into LSI space
for doc in corpus_lsi:
    x = sorted(doc, key=lambda u: u[1], reverse=True)  # sort (topic, weight) pairs by weight
    print(x)
Example #4
print(final)
dictionary = corpora.Dictionary(final)
dictionary.save('dictionary/mydict.dic')
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('corpus/corpus.mm', corpus)
# Reload dictionary and corpus
dictionary = corpora.Dictionary.load('dictionary/mydict.dic')
corpus = corpora.MmCorpus('corpus/corpus.mm')
tfidf = models.TfidfModel(corpus=corpus)
tfidf.save('model/model.tfidf')
# Serialize corpus
tfidf_corpus = tfidf[corpus]
corpora.MmCorpus.serialize('model/tfidf_corpus.mm', tfidf_corpus)

#lsi
lsi = models.LsiModel(corpus=tfidf_corpus, id2word=dictionary,
                      num_topics=120)  #TODO
lsi_corpus = lsi[tfidf_corpus]
lsi.save('model/lsi/model.lsi')
corpora.MmCorpus.serialize('model/lsi/lsi_corpus.mm', lsi_corpus)
print('LSI Topics:')
print(lsi.print_topics(120))

#lda
lda = models.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=120)
lda_corpus = lda[tfidf_corpus]
lda.save('model/lda/model.lda')
corpora.MmCorpus.serialize('model/lda/lda_corpus.mm', lda_corpus)

index = similarities.MatrixSimilarity(lsi_corpus)  # index the corpus in LSI space, matching the saved lsi_similarity index
index.save('similarity/lsi_similarity.sim')
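
# A hypothetical query sketch: reload the saved artifacts and rank documents
# against a new query in LSI space (the query text is illustrative).
dictionary = corpora.Dictionary.load('dictionary/mydict.dic')
tfidf = models.TfidfModel.load('model/model.tfidf')
lsi = models.LsiModel.load('model/lsi/model.lsi')
index = similarities.MatrixSimilarity.load('similarity/lsi_similarity.sim')
query_bow = dictionary.doc2bow("example query tokens".split())
query_lsi = lsi[tfidf[query_bow]]
print(sorted(enumerate(index[query_lsi]), key=lambda item: -item[1])[:5])
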
def lsi_model(corpus_tfidf, dictionary, lsi_save_path):
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)
    lsi.print_topics(50)
    lsi.save(lsi_save_path)
Example #6
from gensim import corpora, models, similarities
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

dictionary = corpora.Dictionary.load('resources/today.dict')
corpus = corpora.MmCorpus('resources/today.mm')

tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus]  # apply the trained model to a corpus
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
                      num_topics=10)  # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
lsi.print_topics(5)
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

# Have a look at what the 20th document looks like: [(word_id, count), ...]
#print(corpus[20])
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...

# Build the LDA model
lda_model = models.LdaModel(corpus=corpus,
                            num_topics=NUM_TOPICS,
                            id2word=dictionary)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus,
                            num_topics=NUM_TOPICS,
                            id2word=dictionary)

print("LDA Model:")

for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))

print("=" * 20)

print("LSI Model:")

for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))
Example #8
#dictionary = corpora.Dictionary(sentTokens[0])
#dictionary.save(os.path.join(TEMP_FOLDER, 'execsSententceTokens.dict')) 
#print(dictionary)


# create and save corpus
#corpus = [dictionary.doc2bow(sentTk) for sentTk in sentTokens[0]]
#corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'execsSententceTokens.mm'), corpus)  # store to disk, for later use

# load saved dictionary and corpus
dictionary = corpora.Dictionary.load(TEMP_FOLDER + '\execsAnnotatedtext.dict')
corpus = corpora.MmCorpus(TEMP_FOLDER + '\execsSententceTokens.mm')


# create LSI model with 250 topics
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=250)

# transform corpus to LSI space and index it
index = similarities.MatrixSimilarity(lsi[corpus])#, num_features=208) 

#save the index
#index.save(TEMP_FOLDER + '\execsSententceTokensIndex.index')

#load the saved index
#index = similarities.MatrixSimilarity.load(TEMP_FOLDER + '\execsSententceTokensIndex.index')


#lsi.show_topic(1, topn=15)
docSentLkup = {}
docSentAll = {}
sentNdoc = []
Example #9
# print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored

corpus = [dictionary.doc2bow(text) for text in texts]
# for c in corpus:
#     print(c)

from gensim import models
from gensim import similarities

##
##
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
##
##
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary)

doc = 'What worries me about AI'#documents[0]
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space
# print(vec_lsi)

index = similarities.MatrixSimilarity(lsi[corpus_tfidf]) # transform corpus to LSI space and index it

sims = index[vec_lsi] # perform a similarity query against the corpus
# for i, sim in enumerate(sims):
#     print('{} - {}'.format(sim, documents[i]))

sims_s = sorted(list(enumerate(sims)), key=lambda tup: tup[1], reverse=True)
for item in sims_s:
    i = item[0]
Example #10
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

#texts = [[token for token in text if frequency[token] > 1] for text in texts]
#pprint(texts)

dictionary = corpora.Dictionary(texts)

corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(corpus)

#lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)
lsi = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=10)

#index = similarities.MatrixSimilarity(lsi[corpus]) #transform corpus to LSI and index it

#print(corpus)


@app.route('/_analyse')
def _analyse():
    return_message = ""

    doc = request.args.get('usrinp')

    if doc.lower() == "yes":
        session.clear()
        return jsonify(
Example #11
def calcLSI(tfidfList, dictionary):
    global corpus_lsi
    topics = 4  # this is arbitrary; we should play around with this
    lsi = models.LsiModel(tfidfList, id2word=dictionary, num_topics=topics)
    corpus_lsi = lsi[tfidfList]
    lsi.print_topics(topics)
Example #12
p_ans_dictionary.save('/tmp/perfect_answer.dict')  # store the dictionary, for future reference
print(p_ans_dictionary)


###########################################################

print(p_ans_dictionary.token2id)

###########################################################

corpus = p_ans_dictionary.doc2bow(recorded_answer.lower().split())

corpora.MmCorpus.serialize('/tmp/p_ans.mm', [corpus])

##############################################################

from gensim import models
lsi = models.LsiModel([corpus], id2word=p_ans_dictionary, num_topics=2)

vec_bow = p_ans_dictionary.doc2bow(r_ans.lower().split())
vec_lsi = lsi[vec_bow]

from gensim import similarities
index = similarities.MatrixSimilarity(lsi[[corpus]])  # transform corpus to LSI space and index it

###############################################################

sims = index[vec_lsi]

print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples
Example #13
import jieba
from gensim import corpora
from gensim import models
from gensim import similarities

# l1 = ["你的名字是什么", "你今年几岁了", "你有多高你胸多大", "你胸多大"]
# a = "你今年多大了"
from Config import MongoDB

l1 = list(MongoDB.Content.find({}))  # fetch the question bank from the database
all_doc_list = []
for doc in l1:
    doc_list = list(jieba.cut_for_search(doc.get("title")))  # tokenize each title in the Content collection with jieba and collect the results
    all_doc_list.append(doc_list)


# Build the corpus
dictionary = corpora.Dictionary(all_doc_list)  # build the dictionary (bag of words)
corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
lsi = models.LsiModel(corpus)  # fairly accurate on small data (up to roughly 5 million docs), much less so beyond that; LsiModel extracts the shared latent search dimensions
index = similarities.SparseMatrixSimilarity(lsi[corpus], num_features=len(dictionary.keys()))


def my_gensim_nlp(a):
    doc_test_list = list(jieba.cut_for_search(a))
    doc_test_vec = dictionary.doc2bow(doc_test_list)
    sim = index[lsi[doc_test_vec]]
    cc = sorted(enumerate(sim), key=lambda item: -item[1])
    if cc[0][1] >= 0.55:
        text = l1[cc[0][0]]

        return text
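
# A hypothetical call sketch, using the sample query from the commented-out test data above:
print(my_gensim_nlp("你今年多大了"))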
Example #14
File: asn3.py Project: dddd999/asn3
def TFIDFH():
    document1 = []
    for w in brown.words(categories='mystery'):
        document1.append(w.lower())

    B = document1[:len(document1) // 2]

    doc = ""
    for w in brown.words(categories='mystery'):
        doc += str(w.lower())

    C, D = doc[:int(len(doc) / 2)], doc[int(len(doc) / 2):]

    stoplist = set('for a of the and to in - , is'.split())
    texts = [[
        word for word in document.lower().split() if word not in stoplist
    ] for document in B]

    texts[0] = [text.replace(',', '') for text in texts[0]]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

    vec_bow = dictionary.doc2bow(D.lower().split())

    vec_lsi = lsi[vec_bow]
    index = similarities.MatrixSimilarity(lsi[corpus])

    sims = index[vec_lsi]  # perform a similarity query against the corpus
    print("results: ", sims)

    bow1 = document1
    bow2 = doc

    wordSet = set(bow1).union(set(bow2))

    wordDict1 = dict.fromkeys(wordSet, 0)
    wordDict2 = dict.fromkeys(wordSet, 0)

    for word in bow1:
        wordDict1[word] += 1

    for word in bow2:
        wordDict2[word] += 1

    def computeTF(wordDict, bow):
        tfDict = {}
        bowCount = len(bow)
        for word, count in wordDict.items():
            tfDict[word] = count / float(bowCount)
        return tfDict

    tfBow1 = computeTF(wordDict1, bow1)
    tfBow2 = computeTF(wordDict2, bow2)

    def computeIDF(docList):
        import math
        idfDict = {}
        N = len(docList)

        idfDict = dict.fromkeys(docList[0].keys(), 0)
        for doc in docList:
            for word, val in doc.items():
                if val > 0:
                    idfDict[word] += 1

        for word, val in idfDict.items():
            idfDict[word] = math.log10(N / float(val))

        return idfDict

    idfs = computeIDF([wordDict1, wordDict2])
    print("IDF")
    print(idfs)

    def computeTFIDF(tfBow, idfs):
        tfidf = {}
        for word, val in tfBow.items():
            tfidf[word] = val * idfs[word]
        return tfidf

    tfidfBow1 = computeTFIDF(tfBow1, idfs)
    tfidfBow2 = computeTFIDF(tfBow2, idfs)

    print("TF-IDF Document1: ")
    print(tfidfBow1)
    print("TF-IDF Document2: ")
    print(tfidfBow2)
Example #15
    #Create a dictionary of the words
    dictionary = corpora.Dictionary(feature_none)
    #print( dictionary.token2id)

    #Transform the document to a BOW
    corpus = [dictionary.doc2bow(text) for text in feature_none]
    #print(corpus[:2])

    #Transform to TFIDF
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    #Extract top topics using Latent Semantic Indexing
    lsi = models.LsiModel(corpus_tfidf,
                          id2word=dictionary,
                          num_topics=total_topics)

    #Print the top topics
    print_topics_gensim(topic_model=lsi,
                        total_topics=total_topics,
                        num_terms=15,
                        display_weights=False)

# In[ ]:
'''
Observations of the topics by MBTI type:
- Across the MBTI types some common theme occur:
    Personality
    Relationship
    Music
Example #16
# Step 4: Creating bag-of-words model and generate corpus sparse vector
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# print(dictionary.token2id)

# Step 5: Generating TF-IDF
tfidf = models.TfidfModel(corpus, normalize=True)
corpus_tfidf = tfidf[corpus]
# for document in corpus_tfidf:
#     print(document)

#TODO: learn how to read this and annotate the topic and observe the stuff
num_topic = [100, 200]
#### LSI ##########
for num_topics in num_topic:
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics, power_iters=300)
    corpus_lsi = lsi_model[corpus_tfidf]

    #writing doc classification to file
    count = 0
    filename = 'lsi_list_' + str(num_topics) + '.csv'
    with open(filename, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter = ',')
        for document in corpus_lsi:
            count += 1
            try:
                writer.writerow(max(document, key = lambda item:item[1]))
            except ValueError:
                print(count)

    name_of_file = 'lsi_topics_' + str(num_topics) + '.csv'
Example #17
def LSI_analysis(texts, nTopics):
    # Build a bag-of-words dictionary that maps each document token to an id
    dictionary = corpora.Dictionary(texts)

    # Convert the string documents into id-based document vectors
    corpus = [dictionary.doc2bow(text) for text in texts]
    #print 'Corpus: '
    #print corpus

    # Train a TF-IDF model on these "training documents"
    tfidf = models.TfidfModel(corpus)
    #print 'tfidf model:'
    #print tfidf.dfs
    #print tfidf.idfs

    # Re-express the term-frequency document vectors as tf-idf weighted vectors
    corpus_tfidf = tfidf[corpus]
    print 'Text vector formed by tf-idf'
    #for doc in corpus_tfidf:
    #print doc

    # Train an LSI model
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=nTopics)
    print 'Top 2 topics of LSI model:'
    print lsi.print_topics(2)

    # Map the documents into an n-dimensional topic space
    print 'The text projection in the n-dim topic space:'
    corpus_lsi = lsi[corpus_tfidf]  # n of doc * n of topic
    for doc in corpus_lsi:
        print doc

    # LDA model: each topic's word weights are probabilities that sum to 1, so larger values mean larger weights and the interpretation is clearer
    #lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=nTopics)
    #print 'Top 2 topics of LDA model:'
    #print lda.print_topics(2)

    # Map the documents into the topic space
    #print 'The text projection in the n-dim topic space:'
    #corpus_lda = lda[corpus_tfidf]
    #for doc in corpus_lda:
    ##print doc

    # Compute document similarities, or, given a query, find the most relevant documents: first build an index
    index = similarities.MatrixSimilarity(lsi[corpus])

    # Vectorize the query
    query = "shipment of silver arrived"
    query_bow = dictionary.doc2bow(query.lower().split())
    print "query: " + query + "; the bow vector: "
    #print query_bow

    # Use the trained LSI model to map the query into the n-dimensional topic space
    query_lsi = lsi[query_bow]

    print 'The projection of query in the n-dim topic space:'
    print query_lsi

    # Compute the cosine similarity between the query and the indexed docs
    sims = index[query_lsi]

    print 'The cos similarity between query and doc:'
    print sims
    print list(enumerate(sims))

    # The results can also be sorted by similarity
    sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print sort_sims

    query_tfidf = tfidf[query_bow]
    query_tfidf_lsi = lsi[query_tfidf]
    index_tfidf = similarities.MatrixSimilarity(lsi[corpus_tfidf])
    sims_tfid = index_tfidf[query_tfidf_lsi]
    print 'The cos similarity between query and doc:'
    print list(enumerate(sims_tfid))
    return
Example #18
def suggest_next_video(original_id, input_chunks, search_term):
    if(search_term == ''):
        global last_search
        search_term = last_search
    
	# This video_id is just a test case
    #if (original_id == 'R9npBuS9AsE'):
    #    output_id_list = get_canned_search_results()
    #else:

    output_video_list = query_video_ids(search_term)
    
    output_name_map = dict(output_video_list)
    output_id_list = [video[0] for video in output_video_list]
    
    #Truncate possible video list to 20 for performance reasons
    try:
        output_id_list.remove(original_id)
    except:
        pass
    output_id_list = output_id_list[:40]
        
    chunk_lookup_dict = {}
    
    start = time.time()
    
    chunk_counter = 0
    output_chunks = []
    myq = queue.Queue()
    threads = list()
    for video_id in output_id_list:
        thread = threading.Thread(target=queueTranscript,args=(video_id,myq))
        threads.append(thread)
        thread.start()
    
    for thread in threads:
        thread.join()
    
    for transcript in list(myq.queue):
        transcript_counter = 0
        #try:
        #    output_video_list = yttapi.get_transcript(str(video_id))
        #except yttapi.CouldNotRetrieveTranscript:
        #    continue
        output_video_list = transcript[1]
        video_length = len(transcript[1])
        video_id = transcript[0]
    
        for i in range(video_length//10):
            chunk_text_list = []
            for j in range(10):
                try:
                    chunk_text_list.append(output_video_list[transcript_counter]['text'])
                except Exception:
                    break
                chunk_text = ' '.join(chunk_text_list)
                transcript_counter += 1
                
            output_chunks.append(chunk_text)
            chunk_lookup_dict[chunk_counter] = video_id
            chunk_counter += 1
            
    print ("After chunking output: " + str(time.time() - start))
    
    start = time.time()
	# Exclude common stop words and those used frequently in YouTube transcripts
    my_stop_words = STOPWORDS.union(set(['[Music]', '[music]', '[Applause]', 'subscribe', 'channel', 'youtube']))
    #stoplist = set('for a of the and to in [music]'.split())
    texts = [
        [word for word in document.lower().split() if word not in my_stop_words]
        for document in output_chunks
    ]
    
    dictionary = corpora.Dictionary(texts)
    
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)
    
    # generates an index of the corpus, need only do this once 
    index = similarities.MatrixSimilarity(lsi[corpus])
    
    print ("After building index: " + str(time.time() - start))
    
    video_average_score = {}
    for video_id in output_id_list:
        video_average_score[video_id] = []
    
    start = time.time()
    
    # Go through each input chunk and get an average score for each video
    for i in range(len(input_chunks)):
        
        # Skip over chunks the user didn't watch
        watched_score = input_chunks[i][1]
        if (watched_score == 0):
            continue
        
        doc=input_chunks[i][0]
        #doc=input_chunks[0][0]
        vec_bow = dictionary.doc2bow(doc.lower().split())
        vec_lsi = lsi[vec_bow]
        similarity_score = index[vec_lsi]
    
    
        # sorts based on descending relevance (earlier sort order = more useful)
        similarity_scores = sorted(enumerate(similarity_score), key=lambda item: -item[1])
        
        #chunk_ranking = [(documents[x],y) for (x,y) in similarity_scores]
        
        video_total_score = {}
        video_chunk_counts = {}
    
        
        for video_id in output_id_list:
            video_total_score[video_id] = 0
            video_chunk_counts[video_id] = 0  
            
            
        for chunk_id, score in similarity_scores:
            video_total_score[chunk_lookup_dict[chunk_id]] += score
            video_chunk_counts[chunk_lookup_dict[chunk_id]] += 1
            
        # Multiply the similarity ranking by the 'score' given to us that represents how slowly they 
        # watched the video chunk and how many times they repeated it
        # We append this to a list of average scores for the video
        for video_id in output_id_list:
            if (video_chunk_counts[video_id] == 0):
                video_average_score[video_id].append(0)
            else:
                avg_score = video_total_score[video_id]/video_chunk_counts[video_id]
                video_average_score[video_id].append(avg_score)
    
    print ("After looping through input chunks: " + str(time.time() - start))
        
    video_sum = {}
    for idx, video_id in enumerate(video_average_score.keys()):
        total_score = sum(x for x in video_average_score[video_id])
        #video_sum[video_id] = (total_score * (1 + RL_WEIGHT_FACTOR * rl_network.weights['param_' + str(idx)]), output_name_map[video_id])
        video_sum[video_id] = (total_score, output_name_map[video_id])

    sorted_videos = list(sorted(video_sum.items(), key=lambda kv: -kv[1][0]))

    # now apply geva
    return_videos = []
    for i in range(0,10):
        return_videos.append((sorted_videos[i][0], (sorted_videos[i][1][0]* (1.0+RL_WEIGHT_FACTOR * rl_network.weights['param_'+str(i)]),sorted_videos[i][1][1])))

    return return_videos
Example #19
import jieba
import numpy as np
import pandas as pd
from gensim import corpora, models


def data():

    data = []
    pos = pd.read_excel("E:/dataset/words_classification/dataset/pos.xls",
                        header=None,
                        index=None)
    neg = pd.read_excel("E:/dataset/words_classification/dataset/neg.xls",
                        header=None,
                        index=None)

    stop_words = []
    with open('E:/dataset/NLP/stopwords/stopwords_TUH.txt',
              'r',
              encoding='gbk') as f:
        line = f.readline()
        while line:
            stop_words.append(line[:-1])
            line = f.readline()
    stop_words = set(stop_words)

    pos_split = []
    for index, seq in enumerate(pos[0]):
        pos_seq = list(jieba.cut(seq, cut_all=False))
        pos_line = []
        for word in pos_seq:
            if word not in stop_words:
                pos_line.append(word)
        pos_split.append(pos_line)
    neg_split = []
    for index, seq in enumerate(neg[0]):
        neg_seq = list(jieba.cut(seq, cut_all=False))
        neg_line = []
        for word in neg_seq:
            if word not in stop_words:
                neg_line.append(word)
        neg_split.append(neg_line)

    data = np.concatenate((pos_split, neg_split))

    # Dictionary over the tokenized documents
    data_dic = corpora.Dictionary(data)
    data_dic.save(
        'E:/dataset/words_classification/dataset/tf-idf_model/data_dict')

    # Convert to sparse term-frequency vectors
    corpus = [data_dic.doc2bow(text) for text in data]
    corpora.MmCorpus.serialize(
        'E:/dataset/words_classification/dataset/tf-idf_model/data_corpus',
        corpus)  # save the generated corpus

    # tf-idf
    tfidf_model = models.TfidfModel(corpus=corpus, dictionary=data_dic)
    # corpus_tfidf = tfidf_model[corpus]
    corpus_tfidf = [tfidf_model[doc] for doc in corpus]
    tfidf_model.save(
        'E:/dataset/words_classification/dataset/tf-idf_model/data_tf-idf.tfidf'
    )

    # lsi
    lsi_model = models.LsiModel(corpus=corpus,
                                id2word=data_dic,
                                num_topics=100)
    # corpus_lsi = lsi_model[tfidf_corpus]
    corpus_lsi = [lsi_model[doc] for doc in corpus]
    lsi_model.save(
        "E:/dataset/words_classification/dataset/tf-idf_model/data_lsi")

    return corpus_lsi
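
# A hypothetical follow-up sketch: densify the sparse LSI vectors returned by
# data() into a feature matrix (one 100-dimensional row per document).
from gensim import matutils

corpus_lsi = data()
feature_matrix = matutils.corpus2dense(corpus_lsi, num_terms=100).T
print(feature_matrix.shape)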
Example #20
    def get_recmd(self, id, type):
        texts1 = self.get_behaviors(id, type)  #get user's behaviors
        if texts1 == []:
            print "user's behavior is null"
            if type == 'Home':
                return_articles = Article.objects.all().order_by('?')[:20]
            else:
                return_articles = Article.objects.filter(
                    articleType1=type).order_by('?')[:20]
            return return_articles
        else:
            print "user has behavior"
            if type == 'Home':
                num = len(texts1)
                study = 0
                news = 0
                fun = 0
                for i in texts1:
                    # count how many of the user's behaviors fall in each article type
                    t = Article.objects.get(articleId=i).articleType1
                    if t == 'Study':
                        study += 1
                    else:
                        if t == 'News':
                            news += 1
                        else:
                            fun += 1
                s_num = (int)(1.0 * study / num * 20)
                n_num = (int)(1.0 * news / num * 20)
                f_num = (int)(1.0 * fun / num * 20)
                return_articles = []
                return_articles.extend(
                    Article.objects.filter(
                        articleType1='Study').order_by('?')[0:s_num])
                return_articles.extend(
                    Article.objects.filter(
                        articleType1='News').order_by('?')[0:n_num])
                return_articles.extend(
                    Article.objects.filter(
                        articleType1='Fun').order_by('?')[0:f_num])
                return return_articles

            else:
                texts = []
                articles = Article.objects.filter(articleType1=type)
                dict = {}
                i = 0
                for article in articles:
                    dict[i] = article.articleId
                    i += 1
                    l = article.articleEnglishText.lower().replace(
                        ',', " ").replace('.', " ").replace('"', " ").split()
                    new_l = [word for word in l if word not in stopwords]
                    texts.append(new_l)
                dictionary = corpora.Dictionary(texts)
                corpus = [dictionary.doc2bow(text) for text in texts]
                tfidf = models.TfidfModel(corpus)
                corpus_tfidf = tfidf[corpus]
                lsi = models.LsiModel(corpus_tfidf,
                                      id2word=dictionary,
                                      num_topics=10)
                corpus_lsi = lsi[corpus_tfidf]
                index = similarities.MatrixSimilarity(lsi[corpus])
                simi = zeros((len(texts1), len(texts)))
                for i in range(len(texts1)):
                    query = texts[i]
                    query_bow = dictionary.doc2bow(query)
                    query_lsi = lsi[query_bow]
                    sims = index[query_lsi]
                    l = list(enumerate(sims))
                    for j in range(len(l)):
                        simi[i][l[j][0]] = l[j][1]
                simi1 = zeros(len(texts))
                s1 = numpy.array(simi1)
                for i in range(len(simi)):
                    s1 += numpy.array(simi[i])
                s1 = s1 / len(texts1)  # average the similarity over the user's behaviors
                simi1 = s1
                d1 = {}
                for i in range(len(simi1)):
                    d1[dict[i]] = simi1[i]
                sorted_dict = sorted(d1.iteritems(),
                                     key=lambda x: x[1],
                                     reverse=True)
                return_articles = []
                for i in range(20):
                    return_articles.append(
                        Article.objects.get(articleId=sorted_dict[i][0]))
                return return_articles
Example #21
 def train_lsi(self):
     lsi = models.LsiModel(self.corpus_tfidf,
                           id2word=self.dictionary,
                           num_topics=self.num_topics)
     return lsi
Example #22
file_names = [
    file_name for file_name in file_names
    if filter_file_by_content(file_name, unanalyzed_senders) is not None
]
logging.debug('after filtering we are using %d files', len(file_names))

corpus = MyCorpus([file_name for file_name in file_names])

corpus.dictionary.save(dictionary_file_name)

corpora.MmCorpus.save_corpus(corpus_file_name, corpus)

model = models.LsiModel(corpus,
                        num_topics=topics_count,
                        id2word=corpus.dictionary,
                        chunksize=20000,
                        distributed=False,
                        onepass=True)

logging.debug('built LSI model')

model.save(model_file_name)
logging.debug('saved LSI model as %s' % model_file_name)
model.show_topics(num_topics=topics_count)

topics_matrix = model.show_topics(formatted=False, num_words=top_words_count)
logging.debug(topics_matrix)

model.print_topics(-1)
Example #23
def LSI(request):
    query = ""
    query_response = None
    file_list = None
    file_list_dictionary = None
    search_result_dictionary = None
    documents = []
    for counter in range(1033):
        temp = open("IR/" + str(counter + 1) + ".txt", 'r')
        documents.append(temp.read())
        temp.close()
    stop_words = stopwords.words('english')
    texts = [[
        word for word in document.lower().split() if word not in stop_words
    ] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('/tmp/ir.mm', corpus)
    lsi = models.LsiModel(corpus, num_topics=43, id2word=dictionary)
    if request.method == "POST":
        form = SearchForm(request.POST)
        if form.is_valid():
            query_response = list()
            user_query = form.save()
            query = user_query.query
            user_query.save()
            index = similarities.MatrixSimilarity(lsi[corpus])
            doc = user_query.query
            vec_bow = dictionary.doc2bow(doc.lower().split())
            vec_lsi = lsi[vec_bow]
            sims = index[vec_lsi]
            sims = sorted(enumerate(sims, 1), key=lambda item: -item[1])
            file_list = list()
            for element in sims[0:5]:
                file_list.append(element[0])
            temp = None
            for text in file_list:
                temp = open("IR/" + str(text) + ".txt", 'r')
                query_response.append(temp.read())
                temp.close()
            #print(query_response)
            file_list_dictionary = dict()
            file_list_dictionary = {
                i: file_list[i - 1]
                for i in range(1,
                               len(file_list) + 1)
            }
            search_result_dictionary = {
                i: query_response[i - 1]
                for i in range(1,
                               len(query_response) + 1)
            }
    else:
        form = SearchForm()
    return render(
        request, "lsi.html", {
            'form': form,
            'query': query,
            'answer': file_list,
            'search_results': query_response,
            'file_dictionary': file_list_dictionary,
            'search_result_dictionary': search_result_dictionary
        })
def create_lsi_model(texts):
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    return dictionary, corpus, lsi
Example #25
    print('Text = ')
    pprint(texts)

    dictionary = corpora.Dictionary(texts)
    print(dictionary)
    V = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpus_tfidf = models.TfidfModel(corpus)[corpus]
    corpus_tfidf = corpus

    print('TF-IDF:')
    for c in corpus_tfidf:
        print(c)

    print('\nLSI Model:')
    lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary)
    topic_result = [a for a in lsi[corpus_tfidf]]
    pprint(topic_result)
    print('LSI Topics:')
    pprint(lsi.print_topics(num_topics=2, num_words=5))
    similarity = similarities.MatrixSimilarity(lsi[corpus_tfidf])   # similarities.Similarity()
    print('Similarity:')
    pprint(list(similarity))

    print('\nLDA Model:')
    num_topics = 2
    lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                          alpha='auto', eta='auto', minimum_probability=0.001, passes=10)
    doc_topic = [doc_t for doc_t in lda[corpus_tfidf]]
    print('Document-Topic:\n')
    pprint(doc_topic)
    cleaned_text = [p_stemmer.stem(i) for i in tokens]
    
    all_sentences.append(cleaned_text)
    for token in cleaned_text:
        all_words.append(token)

# Generate corpus and dictionary.
dictionary = gensim.corpora.Dictionary(all_sentences)
corpus = [dictionary.doc2bow(word) for word in all_sentences]


### Step 3: Train LSA and t-SNE model.
# Here I train an LSA model and reduce it to a 2D t-SNE space.

# Train.
lsa_model = models.LsiModel(corpus, id2word=dictionary, num_topics=1000)

# Get most frequent unique words.
all_words_unique = nltk.FreqDist(all_words).most_common(4100)

# Prepare input for t-SNE
all_words_unique = nltk.FreqDist(all_words).most_common(4100)
all_words_unique_vec = []
all_words_unique_word = []
for index2, item2 in enumerate(all_words_unique):
    all_words_unique_vec.append(list(model.wv[item2[0]]))
    all_words_unique_word.append(item2[0])
    
# Set up t-SNE model.
tsne_model = TSNE(n_components=2, random_state=10, perplexity=50.0)
X = np.array(all_words_unique_vec)
Example #27
thunderbird_rss_list = list()

for row in cursor_1:
    thunderbird_rss_list.append(row[4])

tokenizer = RegexpTokenizer(r'\w+')
en_stop = set(stopwords.words('english'))
p_stemmer = PorterStemmer()

doc_set = thunderbird_rss_list

texts = []

for i in doc_set:
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [i for i in tokens if not i in en_stop]
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    texts.append(stemmed_tokens)

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lsimodel = models.LsiModel(corpus, num_topics=2, id2word = dictionary)

print(lsimodel.print_topics(2))

end_time = time()
time_taken = end_time - start_time

print("Total time taken in seconds: ", time_taken)
 def CallTransformationModel(self, Dict, Bowvec, **kwarg):
     '''Invoke specific transformation models of the Gensim module.
     # Arguments:
         Dict: Dictionary built from all tokenized news (articles/documents).
         Bowvec: Bow-vectors built from all tokenized news (articles/documents).
         modelType: Transformation model type: 'lsi', 'lda' or 'None'; 'None' means the TF-IDF model.
         tfDim: The number of topics to extract from the news (articles/documents).
         renewModel: Whether to re-train the transformation models (bool).
         modelPath: The path where trained transformation models are saved.
     '''
     if kwarg['renewModel']:
         tfidf = models.TfidfModel(Bowvec)  # initialize tfidf model
         tfidfVec = tfidf[Bowvec]  # use the model to transform whole corpus
         tfidf.save(kwarg['modelPath'] + "tfidf_model.tfidf")
         if kwarg['modelType'] == 'lsi':
             model = models.LsiModel(tfidfVec,
                                     id2word=Dict,
                                     num_topics=kwarg['tfDim']
                                     )  # initialize an LSI transformation
             modelVec = model[
                 tfidfVec]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
             model.save(kwarg['modelPath'])  # same for tfidf, lda, ...
         elif kwarg['modelType'] == 'lda':
             model = models.LdaModel(tfidfVec,
                                     id2word=Dict,
                                     num_topics=kwarg['tfDim'])
             modelVec = model[tfidfVec]  # the LDA vector for each document: sparse, each entry is the membership weight for the corresponding topic
             model.save(kwarg['modelPath'])  # same for tfidf, lda, ...
         elif kwarg['modelType'] == 'None':
             model = tfidf
             modelVec = tfidfVec
     else:
         if not os.path.exists(kwarg['modelPath'] + "tfidf_model.tfidf"):
             tfidf = models.TfidfModel(Bowvec)  # initialize tfidf model
             tfidfVec = tfidf[Bowvec]  #
             tfidf.save(kwarg['modelPath'] + "tfidf_model.tfidf")
         else:
             tfidf = models.TfidfModel.load(kwarg['modelPath'] +
                                            "tfidf_model.tfidf")
             tfidfVec = tfidf[
                 Bowvec]  # use the model to transform whole corpus
         if kwarg['modelType'] == 'lsi':
             if not os.path.exists(kwarg['modelPath'] + "lsi_model.lsi"):
                 tfidf = models.TfidfModel.load(kwarg['modelPath'] +
                                                "tfidf_model.tfidf")
                 tfidfVec = tfidf[
                     Bowvec]  # use the model to transform whole corpus
                 model = models.LsiModel(
                     tfidfVec, id2word=Dict, num_topics=kwarg['tfDim']
                 )  # initialize an LSI transformation
                 modelVec = model[
                     tfidfVec]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
                 model.save(kwarg['modelPath'] +
                            "lsi_model.lsi")  # same for tfidf, lda, ...
             else:
                 model = models.LsiModel.load(kwarg['modelPath'] +
                                              "lsi_model.lsi")
                 modelVec = model[tfidfVec]
         elif kwarg['modelType'] == 'lda':
             if not os.path.exists(kwarg['modelPath'] + "lda_model.lda"):
                 tfidf = models.TfidfModel.load(kwarg['modelPath'] +
                                                "tfidf_model.tfidf")
                 tfidfVec = tfidf[
                     Bowvec]  # use the model to transform whole corpus
                 model = models.LdaModel(tfidfVec,
                                         id2word=Dict,
                                         num_topics=kwarg['tfDim'])
                 modelVec = model[tfidfVec]  # the LDA vector for each document: sparse, each entry is the membership weight for the corresponding topic
                 model.save(kwarg['modelPath'] +
                            "lda_model.lda")  # same for tfidf, lda, ...
             else:
                 model = models.LdaModel.load(kwarg['modelPath'] +
                                              "lda_model.lda")
                 modelVec = model[tfidfVec]
         elif kwarg['modelType'] == 'None':
             model = tfidf
             modelVec = tfidfVec
     return tfidfVec, modelVec
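
 # A hypothetical call sketch (object and variable names are illustrative, not from the original):
 # tfidfVec, modelVec = indexer.CallTransformationModel(
 #     Dict=dictionary, Bowvec=bow_corpus,
 #     modelType='lsi', tfDim=100,
 #     renewModel=True, modelPath='./models/')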
Example #29
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('lyrics.mm', corpus)

##################

if (os.path.exists("lyrics.dict")):
    dictionary = corpora.Dictionary.load('lyrics.dict')
    corpus = corpora.MmCorpus('lyrics.mm')
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsi = lsi[corpus_tfidf]

lsi.save('lyrics.lsi')
lsi = models.LsiModel.load('lyrics.lsi')

##################

dictionary = corpora.Dictionary.load('lyrics.dict')
corpus = corpora.MmCorpus('lyrics.mm')

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)

doc = "望著 滿天星斗 的 塗鴉 好像 看見 自己 童年 的 模樣 總是 說 著 淘氣 浪漫 的 願望 夢想 能夠 飛往 燦爛 的 天堂 而 那天 真的 心願 正 溫柔 地 對 我 說 當你 陷入 絕望 中請 記得 我 用 美麗 的 幻想 讓 真心 永遠 純真 而 不變 當你 寂寞 的 時候 請 想念 我 用 單純 的 信仰 給 自己 溫暖 的 回答 閉上 雙眼 靜靜地 徜徉 彷彿 穿越時空 回到 了 過往 以為 銀河 就 在 不遠 的 前方 星星 月亮 都 在 我 面前 玩耍 而 那 微小 的 喜悅 正 溫柔 地 對 我 說 當你 陷入 絕望 中請 記得 我 用 美麗 的 幻想 讓 真心 永遠 純真 而 不變 當你 寂寞 的 時候 請 想念 我 用 單純 的 信仰 給 自己 溫暖 的 回答 ( 和 童 年時 無邪 的 希望 ) 親愛 的 我 親愛 的 我 願 你 永遠 像 我 一樣 帶著 勇氣 和 倔強 歲月 改變 你 的 模樣 無法 改變 你 的 去向"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
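
# A hypothetical next step: index the lyrics corpus and rank songs against this one.
from gensim import similarities

index = similarities.MatrixSimilarity(lsi[corpus])
sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])
print(sims[:5])
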
# for idx, f in enumerate(sys.argv[3:]):
#     sample_map[idx] = f

# # A magic number from the original program
num_topics = round(len(sys.argv[3:]) * 0.8)

# set it to at least 5 (for testing)
if num_topics < 5:
    num_topics = 5

# from paper
READING_CHUNK_SIZE = 200000

lsi = models.LsiModel(
    corpus,
    num_topics=num_topics,
    #id2word=sample_map,
    distributed=False,
    chunksize=READING_CHUNK_SIZE)
# lsi.save(outdir + '/kmer_lsi.gensim')

# Done training the lsi now we cluster kmers

# map the kmer_docs into topic space
corpus_transform = lsi[corpus]

num_kmer_docs = len(corpus_transform)
num_kmer_docs_to_sample = num_kmer_docs * PERCENTAGE_OF_KMER_DOCS_FOR_SEEDING

NUM_CHUNKS = int(
    round((num_kmer_docs * PERCENTAGE_OF_KMER_DOCS_FOR_SEEDING) /
          KMER_DOC_CHUNK_SIZE))