# Shared imports for the excerpted functions below. FLAGS and the
# project-local modules (utilities, util, GensimBasedLda, docs_to_matrix,
# train_by_lda) are defined elsewhere in the repo.
import logging
import os
import pickle
import re
import time

import gensim
import numpy as np
from gensim.models import word2vec


def train():
    topics = [50, 100, 200]
    stop_words = utilities.read_stopwords('../data/aux/stop_words')
    processed_docs = []  # was used below without being initialized
    with open('../data/corpus/luru_fetched_news_processed.txt', 'r') as f:
        lines = f.readlines()
        for line in lines:
            words = line.split(' ')
            words_list = []
            for word in words:
                # Python 3: dict.has_key() is gone, use `in`
                if word not in stop_words:
                    words_list.append(word)
            processed_docs.append(words_list)
    word_cnt_dict = gensim.corpora.Dictionary(processed_docs)
    word_cnt_dict.filter_extremes(no_below=20, no_above=0.1)
    if not os.path.exists(FLAGS.save_path):
        os.makedirs(FLAGS.save_path)
    word_cnt_dict.save(
        os.path.join(FLAGS.save_path, 'dict_no_below_20_no_above_1'))
    bag_of_words_corpus = [
        word_cnt_dict.doc2bow(pdoc) for pdoc in processed_docs
    ]
    logging.info('begin to train model...')
    for topic in topics:
        lda_model = gensim.models.LdaModel(bag_of_words_corpus,
                                           num_topics=topic,
                                           id2word=word_cnt_dict)
        lda_model.save(
            os.path.join(FLAGS.save_path,
                         'topic_' + str(topic) + '_LDA.model'))
        print('topic num {} model finished'.format(topic))
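
# A minimal sketch of reloading artifacts saved by train() and inspecting
# them with gensim's standard load/query calls. The 100-topic file name and
# the tokenized sample document are illustrative, not from the repo.
def inspect_trained_model_example():
    save_path = FLAGS.save_path
    word_cnt_dict = gensim.corpora.Dictionary.load(
        os.path.join(save_path, 'dict_no_below_20_no_above_1'))
    lda_model = gensim.models.LdaModel.load(
        os.path.join(save_path, 'topic_100_LDA.model'))

    # Print the ten strongest words of the first ten topics.
    for topic_id, topic_repr in lda_model.show_topics(num_topics=10,
                                                      num_words=10):
        print(topic_id, topic_repr)

    # Infer the topic mixture of a new, already tokenized document.
    bow = word_cnt_dict.doc2bow(['some', 'tokenized', 'document'])
    print(lda_model.get_document_topics(bow))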
def __init__(self,
             lda_model_file,
             dict_file,
             word2vec_model_file,
             duration=2,
             update_hour=2):
    stop_words_file = './data/aux/stop_words'
    self.stop_words = utilities.read_stopwords(stop_words_file)
    self.user_info_file = './database/user_list'
    self.news_info_file = './database/news_list'
    self.web_log = './news/webapp/usedata/log.log'
    lda_model = GensimBasedLda()
    lda_model.load_model(lda_model_file, dict_file)
    self.lda_model = lda_model
    logging.info('begin to load word2vec model...')
    self.word2vec_model = word2vec.KeyedVectors.load_word2vec_format(
        word2vec_model_file, binary=True)
    logging.info('word2vec model has been loaded successfully!')
    self.user_dict = {}  # all news, used to calculate user history preference
    self.news_dict = {}  # news from the last few days
    self.candidate_news_dict = {}
    # deprecated: user info is now derived from analysing the web log
    # self.load_user_info()
    self.load_news_info()  # deprecated
    self.duration = duration
    self.update_hour = update_hour
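
# A sketch of querying the loaded KeyedVectors object with gensim's standard
# similarity API. `recommender` is a hypothetical instance of the class above,
# and the query tokens are placeholders; real queries would use tokens from
# the (Chinese) news vocabulary.
def word2vec_query_example(recommender):
    vectors = recommender.word2vec_model

    # Cosine similarity between two in-vocabulary words.
    print(vectors.similarity('word_a', 'word_b'))

    # Five nearest neighbours of a word.
    print(vectors.most_similar('word_a', topn=5))

    # Out-of-vocabulary lookups raise KeyError, so guard them.
    if 'word_c' in vectors:
        vec = vectors['word_c']
        print(vec.shape)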
def doc2dict(doc_file='', id_word_dict_file='', no_below=20, no_above=0.5):
    """Form a bag-of-words dictionary from docs.

    Args:
        doc_file: docs file location (one title line followed by one content
            line per document)
        id_word_dict_file: path to a previously saved gensim Dictionary; when
            empty, a new dictionary is built from the corpus
        no_below: filter words shown in fewer than no_below docs
        no_above: filter words shown in more than no_above docs (as a fraction)

    Returns:
        processed_docs: a list of lists; every inner list contains the words
            of one document after preprocessing
        id_word_dict: a bow dictionary '{id: word}' for each word
        word_id_dict: a bow dictionary '{word: id}' for each word
        vocab: tuple like ('apple', 'bear', 'cat') listing the vocabulary
    """
    stop_words = utilities.read_stopwords('../data/aux/stop_words')
    processed_docs = []
    with open(doc_file, 'r') as f:
        lines = f.readlines()
        for i in range(0, len(lines), 2):
            title_and_content = lines[i] + lines[i + 1]
            words = re.split(' |\n', title_and_content)
            words_list = []
            for word in words:
                if word not in stop_words:
                    words_list.append(word)
            # each document becomes one list of words
            processed_docs.append(words_list)
    # normalized words and their integer ids, a dictionary {id: word}
    if id_word_dict_file == '':
        id_word_dict = gensim.corpora.Dictionary(processed_docs)
        id_word_dict.filter_extremes(no_below=no_below, no_above=no_above)
        timestamp = str(int(time.time()))
        id_word_dict.save('../data/aux/dict_no_below_20_no_above_05_' +
                          timestamp)
    else:
        id_word_dict = gensim.corpora.Dictionary.load(id_word_dict_file)
    logging.info('id_word_dict load/calculate finished!')
    logging.info(str(len(id_word_dict)) + ' words in the dictionary')
    word_id_dict = {}
    vocab_list = []
    for word_id in range(len(id_word_dict)):
        word = id_word_dict[word_id]
        word_id_dict[word] = word_id
        vocab_list.append(word)
    vocab = tuple(vocab_list)
    # pickle needs a binary file handle in Python 3
    with open('word_id_dict', 'wb') as f:
        pickle.dump(word_id_dict, f)
    logging.info('doc2dict function finished!')
    return processed_docs, id_word_dict, word_id_dict, vocab
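
# A minimal usage sketch for doc2dict; the corpus path is a placeholder.
def doc2dict_usage_example():
    processed_docs, id_word_dict, word_id_dict, vocab = doc2dict(
        doc_file='../data/corpus/some_title_content_corpus.txt')

    # The dictionary converts a tokenized document into (word_id, count)
    # pairs, the input format the gensim LDA trainer expects.
    bow = id_word_dict.doc2bow(processed_docs[0])
    print(bow[:5])

    # The forward and reverse mappings agree by construction.
    print(vocab[:5], word_id_dict[vocab[0]])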
def read_corpus_and_preprocess(doc_file):
    stop_words = utilities.read_stopwords('../data/aux/stop_words')
    processed_docs = []
    with open(doc_file, 'r') as f:
        lines = f.readlines()
        for i in range(0, len(lines), 2):
            title_and_content = lines[i] + lines[i + 1]
            words = re.split(' |\n', title_and_content)
            words_list = []
            for word in words:
                if word not in stop_words:
                    words_list.append(word)
            # each document becomes one list of words
            processed_docs.append(words_list)
    return processed_docs
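
# A small sanity check on the preprocessor's output; useful for catching
# pairing errors in the title/content layout. The path is a placeholder.
def corpus_stats_example():
    from collections import Counter

    processed_docs = read_corpus_and_preprocess('../data/corpus/news.txt')
    token_counts = Counter(token for doc in processed_docs for token in doc)

    print('documents:', len(processed_docs))
    print('distinct tokens:', len(token_counts))
    print('most common:', token_counts.most_common(10))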
def show_topics(self, n_top_words=10):
    # components_ is the topic-word weight matrix of a fitted scikit-learn
    # LatentDirichletAllocation model
    topic_word = self.model.components_
    # topic_word = self.model.topic_word_  # attribute name when using the `lda` package
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(
            self.vocab)[np.argsort(topic_dist)][:-n_top_words - 1:-1]
        print('topic{}: {}'.format(i, ' '.join(topic_words)))


def perplexity(self, chunk, chunk_size):  # was missing the self parameter
    dox_matrix = util.docs_to_matrix(chunk, self.word_id_dict, chunk_size)
    return self.model.perplexity(dox_matrix)


if __name__ == '__main__':
    stop_words = utilities.read_stopwords('../data/aux/stop_words')
    vocab, doc_word = docs_to_matrix()
    #train()
    train_by_lda(vocab, doc_word)
    #predict()
    #model_file = os.path.join('../model/LDA', 'topic50_filter_again',
    #                          'LDA.model')
    #dict_file = '../data/aux/dict_no_below_20_no_above_05'
    #num_topics = 50
    #evaluate(model_file, dict_file, num_topics)
    #model = gensim.models.LdaModel.load(model_file)
    #show_topics(model)
    #query_bow = dictionary.doc2bow(jieba.cut(query, cut_all=False))
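
# Standalone sketch of the perplexity evaluation that perplexity() above
# delegates to: scikit-learn's LatentDirichletAllocation.perplexity on a
# document-term count matrix. The synthetic counts and the topic number (50)
# are illustrative only.
def sklearn_perplexity_example():
    from sklearn.decomposition import LatentDirichletAllocation

    rng = np.random.RandomState(0)
    doc_word_matrix = rng.randint(0, 5, size=(200, 1000))  # synthetic counts

    lda = LatentDirichletAllocation(n_components=50, random_state=0)
    lda.fit(doc_word_matrix)

    # Lower is better; evaluating on held-out documents gives a fairer score.
    print('perplexity:', lda.perplexity(doc_word_matrix))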