예제 #1
0
 def SaveAllLDAMode(self, FilePath):
     """Dump every model held in ``self.LDAModelLst`` to its own file.

     ``InitialFilter.CreateDir(FilePath)`` is called first; per the
     original comment it creates the directory when it does not exist.
     Each model is then written with ``joblib.dump`` to
     ``FilePath + 'LDA-model-<i>.model'``, where ``<i>`` is the model's
     zero-based position in the list.

     ``FilePath`` is used as a plain string prefix: no path separator is
     inserted between it and the file name.
     """
     InitialFilter.CreateDir(FilePath)
     for position, model in enumerate(self.LDAModelLst):
         target = FilePath + 'LDA-model-' + str(position) + '.model'
         joblib.dump(model, target)
def Ex():
    """Run the extraction step over the 27 k-means cluster files.

    Loads ``FilterData.txt`` into an ``Extract`` instance, makes sure the
    ``ClusterResult/`` directory is prepared via ``InitialFilter.CreateDir``,
    and then, for each cluster index 0..26, passes
    ``KMeans/k-27/C<i>.txt`` to ``ExtractTwitter`` and hands the result to
    ``SaveExtractFile`` together with ``ClusterResult/C<i>.txt``.
    All paths are relative to ``Data/DataProcessing/Boston/``.
    """
    base_dir = 'Data/DataProcessing/Boston/'
    result_dir = base_dir + 'ClusterResult/'

    extractor = Extract()
    extractor.ReadTwitter(base_dir + 'FilterData.txt')
    InitialFilter.CreateDir(result_dir)

    for cluster_no in range(27):
        file_name = 'C' + str(cluster_no) + '.txt'
        extracted = extractor.ExtractTwitter(
            base_dir + 'KMeans/k-27/' + file_name)
        SaveExtractFile(extracted, result_dir + file_name)
예제 #3
0
    def __init__(self, tweets, k, MaxIterations, FilePath):
        """Store the clustering inputs and prepare empty result lists.

        Per the original comments: ``tweets`` is a list of tweets, ``k`` is
        not a single int here but a list of ints (the candidate cluster
        counts), and ``MaxIterations`` is the maximum number of iterations.
        ``FilePath`` is stored and passed to ``InitialFilter.CreateDir``.
        """
        # --- inputs ---
        self.tweets = tweets
        self.k = k
        # NOTE(review): the attribute is spelled "MaIterations" (no "x").
        # It is kept as-is because code outside this view may read it.
        self.MaIterations = MaxIterations
        self.FilePath = FilePath

        # --- per-k results (original comments: SSE and DI computed for
        # each k, and the iteration count of each run) ---
        self.SSE, self.DI, self.IterationNumList = [], [], []

        InitialFilter.CreateDir(FilePath)
예제 #4
0
def TestLDATesy():
    """Load the saved best LDA model and write its outputs for k=7 data.

    Reads the document list from ``KMeans/k-7/`` via ``ReadDocList``,
    builds an ``LDATest`` with a feature cap of 2500, loads
    ``LDATrainResult/BestModel.model`` into it, and then saves the
    document-topic distribution and the top 15 words per topic into
    ``LDATest/``. All paths are relative to ``Data/DataProcessing/Boston/``.
    """
    base_dir = 'Data/DataProcessing/Boston/'
    output_dir = base_dir + 'LDATest/'
    documents = ReadDocList(base_dir + 'KMeans/k-7/', 7)
    top_word_count = 15
    feature_cap = 2500

    tester = LDATest(documents, feature_cap)
    tester.LoadModel(base_dir + 'LDATrainResult/BestModel.model')

    InitialFilter.CreateDir(output_dir)
    tester.SaveDocTopicDist(output_dir)
    tester.SaveTopicWords(top_word_count, output_dir)
예제 #5
0
 def SaveClusterFile(self, FilePath):
     """Write each cluster in ``self.Clusters`` to its own text file.

     ``InitialFilter.CreateDir(FilePath)`` is called first; per the
     original comment it creates the path when it does not exist. For every
     key ``c`` yielded by iterating ``self.Clusters``, one line per ID in
     ``self.Clusters[c]`` is built as ``"<ID>|<tokens joined by spaces>\\n"``
     from ``self.tweets[ID]``, and the lines are handed to
     ``Classifier.WriteFileLine`` with the target ``FilePath + 'C<c>.txt'``
     and mode ``'w'``.
     """
     InitialFilter.CreateDir(FilePath)
     for cluster_key in self.Clusters:
         # Buffer all lines of the current cluster before writing.
         lines = [
             str(tweet_id) + '|' + ' '.join(self.tweets[tweet_id]) + '\n'
             for tweet_id in self.Clusters[cluster_key]
         ]
         destination = FilePath + 'C' + str(cluster_key) + '.txt'
         Classifier.WriteFileLine(destination, lines, 'w')
예제 #6
0
def TestTrain():
    """Train LDA models on the k=27 cluster documents and save the results.

    Reads the document list from ``KMeans/k-27/``, uses the size of
    ``GetWordNum(documents)`` as the feature cap, and trains with topic
    counts 1..26 and a maximum of 8000 iterations. Afterwards it prints the
    best model/perplexity and the document-topic distribution, and saves
    topic words (top 15), the document-topic distribution, all models, the
    best model, the perplexity curve/text and the config file into
    ``LDATrainResult/``. All paths are relative to
    ``Data/DataProcessing/Boston/``.
    """
    base_dir = 'Data/DataProcessing/Boston/'
    result_dir = base_dir + 'LDATrainResult/'

    documents = ReadDocList(base_dir + 'KMeans/k-27/', 27)
    vocabulary = GetWordNum(documents)
    # Original comment: the topic count is chosen according to perplexity.
    topic_counts = range(1, 27)
    max_iterations = 8000
    top_word_count = 15
    feature_cap = len(vocabulary)

    trainer = LDATrain(documents, topic_counts, max_iterations, feature_cap)
    trainer.LDACountVectorizer()
    trainer.IterationLDATrain()
    trainer.PrintBestModelAndPerplexity(top_word_count)

    InitialFilter.CreateDir(result_dir)
    trainer.SaveTopicWords(top_word_count, result_dir)
    trainer.PrintDocTopicDist()
    trainer.SaveDocTopicDist(result_dir)
    trainer.SaveAllLDAMode(result_dir)
    trainer.SaveBestModel(result_dir)
    trainer.SavePerplexityCurveAndText(result_dir)
    trainer.SaveConfigFile(result_dir)
예제 #7
0
 def SavePerplexityCurveAndText(self, FilePath):
     """Save the perplexity values as text and as a plotted curve.

     ``InitialFilter.CreateDir(FilePath)`` is called first; per the
     original comment it creates the directory when it does not exist.

     ``FilePath + 'Perplexity.txt'`` receives one line per entry of
     ``self.PerplexityLst`` in the form
     ``"<index>|<self.NumTopics[index]>|<perplexity>\\n"``.

     A plot of ``self.PerplexityLst`` against ``self.NumTopics`` is then
     saved to ``FilePath + 'PerplexityTrend.png'`` and shown with
     ``plt.show()``.
     """
     InitialFilter.CreateDir(FilePath)

     # Text dump: the rows are assembled after the file is opened, and
     # written with a single write call.
     with open(FilePath + 'Perplexity.txt', 'w') as out:
         rows = ''.join(
             str(pos) + '|' + str(self.NumTopics[pos]) + '|' +
             str(value) + '\n'
             for pos, value in enumerate(self.PerplexityLst)
         )
         out.write(rows)

     # Curve: topic count on x, perplexity on y.
     fig = plt.figure()
     axes = fig.add_subplot(1, 1, 1)
     axes.plot(self.NumTopics, self.PerplexityLst)
     axes.set_xlabel("# of topics")
     axes.set_ylabel("Approximate Perplexity")
     plt.grid(True)
     plt.savefig(FilePath + 'PerplexityTrend.png')
     plt.show()