def SaveAllLDAMode(self, FilePath):
    """Save every trained LDA model held in self.LDAModelLst.

    Each model is serialized with joblib to
    ``FilePath + 'LDA-model-<index>.model'``, where <index> is the model's
    position in the list. Creates FilePath first if it does not exist.

    :param FilePath: output directory path (expected to end with '/')
    """
    # Check that the directory exists; create it if it does not.
    InitialFilter.CreateDir(FilePath)
    # enumerate replaces the original's manual index counter (idiom fix)
    for index, model in enumerate(self.LDAModelLst):
        joblib.dump(model, FilePath + 'LDA-model-' + str(index) + '.model')
def Ex():
    """Run extraction over the 27 Boston KMeans clusters.

    Reads the filtered tweets, then for each cluster file under
    'KMeans/k-27/' extracts tweets and writes the result to a matching
    file under 'ClusterResult/'.
    """
    FilePath = 'Data/DataProcessing/Boston/'
    extractor = Extract()
    extractor.ReadTwitter(FilePath + 'FilterData.txt')
    InitialFilter.CreateDir(FilePath + 'ClusterResult/')
    for cluster_id in range(27):
        ReadPath = FilePath + 'KMeans/k-27/C' + str(cluster_id) + '.txt'
        WritePath = FilePath + 'ClusterResult/C' + str(cluster_id) + '.txt'
        SaveExtractFile(extractor.ExtractTwitter(ReadPath), WritePath)
def __init__(self, tweets, k, MaxIterations, FilePath):
    """Initialize; here k is no longer an int but a list of int values.

    :param tweets: list of tweets (tweet texts)
    :param k: candidate values for the cluster count k, a list of ints
    :param MaxIterations: maximum number of iterations per run
    :param FilePath: output directory (created if it does not exist)
    """
    self.tweets = tweets  # list of tweet texts
    self.k = k  # range of candidate k values, list data type
    # NOTE(review): attribute name 'MaIterations' looks like a typo for
    # 'MaxIterations' — other code may read this exact name, confirm before renaming
    self.MaIterations = MaxIterations  # maximum iteration count
    self.FilePath = FilePath
    self.SSE = []  # list: SSE computed for each candidate k
    self.DI = []  # list: DI (Dunn Index) computed for each candidate k
    self.IterationNumList = []  # list: number of iterations used per run
    InitialFilter.CreateDir(FilePath)
def TestLDATesy():
    """Load the best saved LDA model and write test-set results.

    Reads the 7 cluster documents, loads 'BestModel.model', then saves
    per-document topic distributions and top topic words under 'LDATest/'.
    """
    FilePath = 'Data/DataProcessing/Boston/'
    DocLst = ReadDocList(FilePath + 'KMeans/k-7/', 7)
    n_top_words = 15
    MaxFeatures = 2500
    lda = LDATest(DocLst, MaxFeatures)
    lda.LoadModel(FilePath + 'LDATrainResult/BestModel.model')
    ResultDir = FilePath + 'LDATest/'
    InitialFilter.CreateDir(ResultDir)
    lda.SaveDocTopicDist(ResultDir)
    lda.SaveTopicWords(n_top_words, ResultDir)
def SaveClusterFile(self, FilePath):
    """Save the clustering result as one text file per cluster.

    For each cluster i, writes 'C<i>.txt' under FilePath where each line
    has the form '<ID>|<tweet tokens joined by spaces>'. Creates FilePath
    first if it does not exist.

    :param FilePath: output directory path (expected to end with '/')
    """
    # Create the directory if it does not exist.
    InitialFilter.CreateDir(FilePath)
    # Iterate over every cluster.
    for i in self.Clusters:
        # self.tweets[ID] is a token list — join back into one string per line.
        # (Comprehension replaces the original append loop; the original's
        # commented-out single-string variant has been removed as dead code.)
        lines = [
            str(ID) + '|' + ' '.join(self.tweets[ID]) + '\n'
            for ID in self.Clusters[i]
        ]
        Classifier.WriteFileLine(FilePath + 'C' + str(i) + '.txt', lines, 'w')
def TestTrain():
    """Train LDA over a range of topic counts and persist every artifact.

    Builds the vocabulary from the 27 cluster documents, trains one model
    per candidate topic count, then saves topic words, document-topic
    distributions, all models, the best model, the perplexity curve, and
    the configuration under 'LDATrainResult/'.
    """
    FilePath = 'Data/DataProcessing/Boston/'
    DocLst = ReadDocList(FilePath + 'KMeans/k-27/', 27)
    DocSet = GetWordNum(DocLst)
    n_topics = range(1, 27)  # candidate topic counts; chosen via perplexity
    MaxIter = 8000
    n_top_words = 15
    lda = LDATrain(DocLst, n_topics, MaxIter, len(DocSet))
    lda.LDACountVectorizer()
    lda.IterationLDATrain()
    lda.PrintBestModelAndPerplexity(n_top_words)
    ResultDir = FilePath + 'LDATrainResult/'
    InitialFilter.CreateDir(ResultDir)
    lda.SaveTopicWords(n_top_words, ResultDir)
    lda.PrintDocTopicDist()
    lda.SaveDocTopicDist(ResultDir)
    lda.SaveAllLDAMode(ResultDir)
    lda.SaveBestModel(ResultDir)
    lda.SavePerplexityCurveAndText(ResultDir)
    lda.SaveConfigFile(ResultDir)
def SavePerplexityCurveAndText(self, FilePath):
    """Save all perplexity values as text plus the corresponding curve plot.

    Writes 'Perplexity.txt' (one '<index>|<n_topics>|<perplexity>' line per
    trained model) and 'PerplexityTrend.png' under FilePath, creating the
    directory first if needed. Ends with plt.show(), which blocks under
    interactive matplotlib backends.

    :param FilePath: output directory path (expected to end with '/')
    """
    # Check that the directory exists; create it if it does not.
    InitialFilter.CreateDir(FilePath)
    # Build all lines then join once — the original's str += in a loop is
    # quadratic. Indexing NumTopics[i] preserves the original's exact
    # behavior (IndexError if NumTopics is shorter than PerplexityLst).
    lines = [
        str(i) + '|' + str(self.NumTopics[i]) + '|' + str(p) + '\n'
        for i, p in enumerate(self.PerplexityLst)
    ]
    # Save the perplexity results.
    with open(FilePath + 'Perplexity.txt', 'w') as f:
        f.write(''.join(lines))
    # Draw the curve and save it.
    Figure = plt.figure()
    ax = Figure.add_subplot(1, 1, 1)
    ax.plot(self.NumTopics, self.PerplexityLst)
    ax.set_xlabel("# of topics")
    ax.set_ylabel("Approximate Perplexity")
    plt.grid(True)
    plt.savefig(FilePath + 'PerplexityTrend.png')
    plt.show()