def ModelInit(self, filename):
    """Load the corpus in *filename* and initialise the model state.

    The file is read relative to the current working directory. Each
    document is preprocessed with the stop-word list, a dictionary is
    built and saved, every kept document is mapped to a list of word ids,
    and the count/parameter matrices are allocated before topics are
    randomly assigned.

    This is Python 2 code: it relies on ``unicode`` and ``dict.iteritems``.

    Args:
        filename: Name of the corpus file; appended to ``os.getcwd()``
            with a ``/`` separator.

    Side effects:
        Sets ``self.D``, ``self.W``, ``self.Dictionary``,
        ``self.IDListSet``, ``self.ndsum``, ``self.theta``, ``self.phi``,
        ``self.nd``, ``self.nw`` and ``self.Z``; calls
        ``self.SaveDictionary()`` and ``self.RandomAssignTopic()``; prints
        progress messages. ``self.K`` is read and must already be set.
    """
    # Every print below passes one pre-formatted string, so the output is
    # byte-identical to the former multi-item print statements while the
    # block also parses under the Python 3 print function.
    Docs = LoadData.LoadDataFromFile(os.getcwd() + "/" + filename)
    print("%s %s %s" % ("Load ", len(Docs), " docs from the file"))

    StopWordList = LoadData.LoadStopWords()
    # Documents that are ``unicode`` objects are skipped, as before.
    WordListSet = [
        Preprocess.PreprocessText(doc, StopWordList)
        for doc in Docs
        if not isinstance(doc, unicode)
    ]
    # D counts the documents actually kept. It sizes ndsum/theta/nd below,
    # which must line up one-to-one with self.IDListSet; using len(Docs)
    # left surplus rows whenever a document was filtered out above.
    self.D = len(WordListSet)

    self.Dictionary = Preprocess.ConstructDictionary(WordListSet)
    self.W = len(self.Dictionary)
    print("%s %s" % ("Total number of words is: ", self.W))

    print("Begin to save the dictionary...")
    self.SaveDictionary()
    print("Done!!")

    print("Begin to map the word to ID")
    # Swap keys and values of the dictionary for the Word2Id lookup
    # (presumably id -> word becomes word -> id; confirm in Preprocess).
    inv_dict = {v: k for k, v in self.Dictionary.iteritems()}
    self.IDListSet = [Preprocess.Word2Id(wdl, inv_dict) for wdl in WordListSet]
    print("Done!!")

    # Allocate the per-document / per-topic / per-word structures.
    self.ndsum = ListUtil.Initial(self.D)
    self.theta = ListUtil.InitialMat(self.D, self.K, 0.0)
    self.phi = ListUtil.InitialMat(self.K, self.W, 0.0)
    self.nd = ListUtil.InitialMat(self.D, self.K, 0)
    self.nw = ListUtil.InitialMat(self.W, self.K, 0)
    self.Z = []

    print("Begin to initialize the LDA model...")
    self.RandomAssignTopic()
    print("Topic assignment done!!")
def get_file_list():
    """Return the loaded contents of every file under the first doc path.

    Translated from the original note: fetch the contents of all txt files
    inside one doc, one sentence per line, one list per txt document.

    Only the first entry of ``LoadData.get_doc_path_list()`` is processed;
    the loop over every doc path was left disabled by the original author.

    Returns:
        A list with one entry per file path, each being whatever
        ``LoadData.LoadDataFromFile`` returns for that path.

    Raises:
        IndexError: If the doc path list is empty.
    """
    all_doc_paths = LoadData.get_doc_path_list()
    first_doc_path = all_doc_paths[0]
    return [
        LoadData.LoadDataFromFile(txt_path)
        for txt_path in LoadData.get_file_path_list(first_doc_path)
    ]