Code example #1
    def Modeling(self, Modelpath):
        '''
        Build an LSI model over the corpus and write each document's
        dominant topic word to eventTopics.csv.
        :param Modelpath: directory holding the saved dictionary and corpus
        :return: None
        '''
        if os.path.exists(os.path.join(Modelpath, 'coursera_corpus.dict')):
            # Load the saved dictionary and corpus
            dictionary = corpora.Dictionary.load(os.path.join(Modelpath, 'coursera_corpus.dict'))
            corpus = corpora.MmCorpus(os.path.join(Modelpath, 'coursera_corpus.mm'))
        else:
            corpus, dictionary = self.Preprocessing(Modelpath)

        # Weight the corpus with TF-IDF, then fit a 10-topic LSI model
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        modellsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
        print("========================================================")
        print("The Docs Topic Dims:")
        for item in modellsi.print_topics(num_topics=10, num_words=1):
            print("\t\t", item)
        print("========================================================")

        # Fit a one-topic LSI model per document and keep its top word
        topics = []
        for count, item in enumerate(corpus):
            modellsi = models.LsiModel([item], id2word=dictionary, num_topics=1)
            topicstr = modellsi.print_topics(num_topics=1, num_words=1)
            topics.append([count, topicstr[0][1].split("*")[1].strip('"')])

        topics.insert(0, ["eventorder", "eventTopic"])
        IO.csv_writer(os.path.join(Modelpath, 'eventTopics.csv'), topics)
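For reference, gensim's print_topics returns (topic_id, formatted_string) pairs such as (0, '0.703*"word"'), which is why the code extracts the word with split and strip. The same word can be read without string parsing via show_topic, which returns (word, weight) pairs; a minimal sketch:

# Equivalent extraction without string parsing: show_topic returns
# (word, weight) pairs directly, so no split/strip is needed.
top_word = modellsi.show_topic(0, topn=1)[0][0]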
Code example #2
def readTopics(groupTopicspath, eventTopicspath):
    '''Read the group- and event-topic CSVs, keeping only the needed columns.'''
    groupTopics = IO.csv_reader(groupTopicspath)
    gtopics = [[i[0], i[2], i[3]] for i in groupTopics]

    eventTopics = IO.csv_reader(eventTopicspath)
    etopics = [[i[0], i[1]] for i in eventTopics]
    return gtopics, etopics
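A hedged usage sketch; IO.csv_reader comes from the project's ToolClasses module, and the two CSV paths below are placeholders:

import os
from ToolClasses import IO

# Placeholder paths; substitute the real groupTopics/eventTopics files.
gtopics, etopics = readTopics(os.path.join("Data", "groupTopics.csv"),
                              os.path.join("Data", "eventTopics.csv"))
print(len(gtopics), len(etopics))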
Code example #3
def EventTopicCount(path):
    '''Count how often each topic word occurs in an eventTopics CSV.'''
    eventTopic = IO.csv_reader(path)

    # Column 1 holds the topic word of each event
    topics = [i[1] for i in eventTopic]

    # Tally the occurrences of each topic word
    topicdic = {}
    for t in topics:
        topicdic[t] = topicdic.get(t, 0) + 1
    return topicdic
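The tally loop is equivalent to collections.Counter from the standard library; a minimal sketch of the same function, assuming the same IO.csv_reader helper:

from collections import Counter
from ToolClasses import IO

def EventTopicCount(path):
    # Counter performs the per-word tallying that the manual loop does.
    eventTopic = IO.csv_reader(path)
    return dict(Counter(i[1] for i in eventTopic))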
Code example #4
    def __init__(self, Sourpath):
        '''
        Initialize the preprocessing class: read the source CSV and keep
        columns 3 and 10 of every row.
        :param Sourpath: path to the source CSV file
        '''
        self.SourPath = Sourpath
        rows = IO.csv_reader(Sourpath)
        self.content = [[item[3], item[10]] for item in rows]
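A hedged usage sketch; the enclosing class name is not shown in the snippet, so Preprocesser is an assumption, and the path is taken from code example #6:

# Hypothetical class name; the snippet only shows __init__.
p = Preprocesser("Data/GroupEvents/Group_45494/Group_45494_events.csv")
print(len(p.content))  # number of rows kept
print(p.content[0])    # [column 3, column 10] of the first row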
Code example #5
    def LDAModeling(self, Modelpath):
        '''
        Build (or load) an LDA model and write each document's dominant
        topic word to group45494EventsTopics.csv.
        :param Modelpath: directory holding the saved dictionary/corpus/model
        :return: None
        '''
        if os.path.exists(os.path.join(Modelpath, 'group45494lda.mdl')):
            # Load the saved dictionary, corpus, and model
            dictionary = corpora.Dictionary.load(
                os.path.join(Modelpath, 'group45494.dict'))
            corpus = corpora.MmCorpus(os.path.join(Modelpath, 'group45494.mm'))
            ldamodel = models.LdaModel.load(
                os.path.join(Modelpath, 'group45494lda.mdl'))
        else:
            corpus, dictionary = self.Preprocessing(Modelpath)
            ldamodel = models.ldamodel.LdaModel(corpus,
                                                num_topics=1,
                                                id2word=dictionary,
                                                passes=2000)
            ldamodel.save(os.path.join(Modelpath, 'group45494lda.mdl'))

        # Update the model one document at a time and record the current
        # dominant topic word after each update
        topics = []
        for count, item in enumerate(corpus):
            ldamodel.update([item])
            topicword = (ldamodel.print_topics(
                num_topics=1, num_words=1)[0][1]).split("*")[1].strip('"')
            topics.append([count, topicword])
            print(count, topicword)

        topics.insert(0, ["eventorder", "eventTopic"])
        IO.csv_writer(os.path.join(Modelpath, 'group45494EventsTopics.csv'),
                      topics)
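As with the LSI example, a single document's dominant word can be read without parsing print_topics output; get_document_topics and show_topic are standard gensim LdaModel methods. A minimal sketch, assuming item is one bag-of-words document from corpus:

# Pick the document's most probable topic, then that topic's top word.
topic_id, prob = max(ldamodel.get_document_topics(item), key=lambda t: t[1])
top_word = ldamodel.show_topic(topic_id, topn=1)[0][0]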
Code example #6
# -*- coding:utf-8 -*-
import os
from ToolClasses import IO
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models

if __name__ == "__main__":

    corpus = []  # holds the documents
    tokens = []  # holds the words of each document
    # Read in the documents

    path = os.path.join(os.getcwd(), "Data", "GroupEvents", "Group_45494",
                        "Group_45494_events.csv")
    corpus = IO.csv_reader(path)
    # Keep only columns 0, 1, and 10 of each row
    corpus = [[i[0], i[1], i[10]] for i in corpus]
    print(len(corpus))

    # Remove punctuation and stop words
    file = os.path.join(os.getcwd(), "Data", "stopwords.txt")
    with open(file, "r") as f:
        mystopwords = f.read().split('\n')
    en_stop = set(mystopwords)
    # en_stop = get_stop_words('en')   # alternative: the PyPI stop_words package
    #
    # # Stemming
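The script is cut off at the stemming step. A minimal sketch of how the preprocessing might continue, using the RegexpTokenizer already imported above; PorterStemmer and the choice of column 2 as the text field are assumptions, not shown in the original:

from nltk.stem.porter import PorterStemmer  # assumption: the original stemmer is not shown

tokenizer = RegexpTokenizer(r'\w+')  # keeps word characters, dropping punctuation
stemmer = PorterStemmer()

texts = []
for row in corpus:
    raw = str(row[2]).lower()  # assumption: column 2 holds the event text
    words = tokenizer.tokenize(raw)
    texts.append([stemmer.stem(w) for w in words if w not in en_stop])

# Build the gensim dictionary and bag-of-words corpus from the token lists
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(t) for t in texts]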