Esempio n. 1
0
 def ModelInit(self, filename):
     Docs = LoadData.LoadDataFromFile(os.getcwd() + "/" + filename)
     self.D = len(Docs)
     print "Load ", self.D, " docs from the file"
     StopWordList = LoadData.LoadStopWords()
     WordListSet = [
         Preprocess.PreprocessText(doc, StopWordList) for doc in Docs
         if type(doc) != unicode
     ]
     self.Dictionary = Preprocess.ConstructDictionary(WordListSet)
     self.W = len(self.Dictionary)
     print "Total number of words is: ", self.W
     print "Begin to save the dictionary..."
     self.SaveDictionary()
     print "Done!!"
     print "Begin to map the word to ID"
     self.IDListSet = []
     inv_dict = {v: k for k, v in self.Dictionary.iteritems()}
     for wdl in WordListSet:
         IdList = Preprocess.Word2Id(wdl, inv_dict)
         self.IDListSet.append(IdList)
     print "Done!!"
     self.ndsum = ListUtil.Initial(self.D)
     self.theta = ListUtil.InitialMat(self.D, self.K, 0.0)
     self.phi = ListUtil.InitialMat(self.K, self.W, 0.0)
     self.nd = ListUtil.InitialMat(self.D, self.K, 0)
     self.nw = ListUtil.InitialMat(self.W, self.K, 0)
     self.Z = []
     print "Begin to initialize the LDA model..."
     self.RandomAssignTopic()
     print "Topic assignment done!!"
Esempio n. 2
0
def get_file_list(max_docs=1):
    """Load the contents of every file under the leading doc directories.

    Doc directory paths come from ``LoadData.get_doc_path_list()``. For each
    selected directory, every path from ``LoadData.get_file_path_list()`` is
    read with ``LoadData.LoadDataFromFile()``.

    Per the original (translated) note: gets the content of all txt files in
    a doc, one sentence per line, one list per txt document.

    Args:
        max_docs: How many leading doc directories to read. Defaults to 1,
            matching the limit that used to be hard-coded. Pass ``None`` to
            read every directory.

    Returns:
        A flat list with one entry per file read, in traversal order. Each
        entry is whatever ``LoadData.LoadDataFromFile()`` returns. The list
        is empty when there are no doc directories.

    Raises:
        ValueError: If ``max_docs`` is negative.
    """
    if max_docs is not None and max_docs < 0:
        raise ValueError("max_docs must be None or a non-negative integer")

    doc_path_list = LoadData.get_doc_path_list()
    file_list = []
    # Slicing instead of indexing: an empty path list yields an empty result
    # rather than an IndexError, and a None limit selects every directory.
    for doc_path in doc_path_list[:max_docs]:
        for file_path in LoadData.get_file_path_list(doc_path):
            file_list.append(LoadData.LoadDataFromFile(file_path))
    return file_list