Example 1
def train():
    topics = [50, 100, 200]
    stop_words = utilities.read_stopwords('../data/aux/stop_words')
    processed_docs = []  # one token list per document
    with open('../data/corpus/luru_fetched_news_processed.txt', 'r') as f:
        lines = f.readlines()
        for line in lines:
            words = line.split(' ')
            words_list = []
            for word in words:
                if word not in stop_words:
                    words_list.append(word)
            processed_docs.append(words_list)
        word_cnt_dict = gensim.corpora.Dictionary(processed_docs)
        word_cnt_dict.filter_extremes(no_below=20, no_above=0.1)
        if not os.path.exists(FLAGS.save_path):
            os.makedirs(FLAGS.save_path)
        word_cnt_dict.save(
            os.path.join(FLAGS.save_path, 'dict_no_below_20_no_above_1'))

        bag_of_words_corpus = [
            word_cnt_dict.doc2bow(pdoc) for pdoc in processed_docs
        ]
        logging.info('begin to train model...')
        for topic in topics:
            lda_model = gensim.models.LdaModel(bag_of_words_corpus,
                                               num_topics=topic,
                                               id2word=word_cnt_dict)

            lda_model.save(
                os.path.join(FLAGS.save_path,
                             'topic_' + str(topic) + '_LDA.model'))
            print('topic num {} model finished'.format(topic))
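After train() finishes, the saved dictionary and LDA models can be reloaded for inference on new documents. A minimal sketch, assuming FLAGS.save_path was '../model/LDA' and using the 50-topic model; the path and the example tokens are placeholders:

import os
import gensim

save_path = '../model/LDA'  # assumed value of FLAGS.save_path
word_cnt_dict = gensim.corpora.Dictionary.load(
    os.path.join(save_path, 'dict_no_below_20_no_above_1'))
lda_model = gensim.models.LdaModel.load(
    os.path.join(save_path, 'topic_50_LDA.model'))

# a new document, already segmented, with stop words removed
tokens = ['market', 'stocks', 'rally']
bow = word_cnt_dict.doc2bow(tokens)
print(lda_model.get_document_topics(bow))  # [(topic_id, probability), ...]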
Example 2
    def __init__(self,
                 lda_model_file,
                 dict_file,
                 word2vec_model_file,
                 duration=2,
                 update_hour=2):

        stop_words_file = './data/aux/stop_words'
        self.stop_words = utilities.read_stopwords(stop_words_file)
        self.user_info_file = './database/user_list'
        self.news_info_file = './database/news_list'
        self.web_log = './news/webapp/usedata/log.log'

        lda_model = GensimBasedLda()
        lda_model.load_model(lda_model_file, dict_file)
        self.lda_model = lda_model

        logging.info('begin to load word2vec model...')
        self.word2vec_model = word2vec.KeyedVectors.load_word2vec_format(
            word2vec_model_file, binary=True)
        logging.info('word2vec model has been loaded successfully!')

        self.user_dict = {}
        # load all news to calculate user history preference
        self.news_dict = {}
        # news from the last few days
        self.candidate_news_dict = {}
        # deprecated: user info is now obtained by analysing the log
        # self.load_user_info()
        self.load_news_info()

        # deprecated
        self.duration = duration
        self.update_hour = update_hour
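The loaded word2vec KeyedVectors is the piece that turns news text into vectors for similarity-based recommendation. A minimal sketch of one way to embed a segmented title by averaging word vectors; embed_tokens and cosine are hypothetical helpers, not methods of the original class:

import numpy as np

def embed_tokens(keyed_vectors, tokens):
    """Average the vectors of in-vocabulary tokens; zero vector if none is known."""
    vectors = [keyed_vectors[w] for w in tokens if w in keyed_vectors]
    if not vectors:
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(vectors, axis=0)

def cosine(a, b):
    """Cosine similarity between two vectors, 0.0 if either is all zeros."""
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0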
Example 3
def doc2dict(doc_file='', id_word_dict_file='', no_below=20, no_above=0.5):
    """form bow from docs
     Args:
         doc_file:
             docs file location
        id_word_dict_file:
         no_below:
             filter words shown less than no_below number docs
         no_above:
             filter words shown more than no_above number docs(in percentage rate)

    Returns:
        processed_docs:
            a list of list, every list contains the words of a documents after preprocess
        id_word_dict:
            a bow dictionary '{id: word}' for each word
        word_id_dict:
            a bow dictionary '{word: id}' for each word
        vocab: tuple
            like ('apple', 'bear', 'cat') to show vocabulary

    """

    stop_words = utilities.read_stopwords('../data/aux/stop_words')
    processed_docs = []
    with open(doc_file, 'r') as f:
        lines = f.readlines()
        for i in range(0, len(lines), 2):
            title_and_content = lines[i] + lines[i + 1]
            words = re.split(' |\n', title_and_content)
            words_list = []
            for word in words:
                if word not in stop_words:
                    words_list.append(word)
            # each article becomes one list of words
            processed_docs.append(words_list)
    # normalized words and their integer ids, a dictionary {id: word}
    if id_word_dict_file == '':
        id_word_dict = gensim.corpora.Dictionary(processed_docs)
        id_word_dict.filter_extremes(no_below=no_below, no_above=no_above)
        timestamp = str(int(time.time()))
        id_word_dict.save('../data/aux/dict_no_below_20_no_above_05_' +
                          timestamp)
    else:
        id_word_dict = gensim.corpora.Dictionary.load(id_word_dict_file)
    logging.info('id_word_dict load/calculate finished!')
    logging.info(str(len(id_word_dict)) + ' words in the dictionary')
    word_id_dict = {}
    vocab_list = []
    for word_id in range(len(id_word_dict)):
        word = id_word_dict[word_id]
        word_id_dict[word] = word_id
        vocab_list.append(word)
    vocab = tuple(vocab_list)
    with open('word_id_dict', 'wb') as f:
        pickle.dump(word_id_dict, f)
    logging.info('doc2dict function finished!')
    return processed_docs, id_word_dict, word_id_dict, vocab
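The returned processed_docs and word_id_dict are what a document-word count matrix is normally built from (the role util.docs_to_matrix appears to play in Example 5). A minimal sketch under that assumption; build_doc_word_matrix is a hypothetical helper, not the project's own util.docs_to_matrix:

import numpy as np

def build_doc_word_matrix(processed_docs, word_id_dict):
    """Return an (n_docs, vocab_size) matrix of raw word counts."""
    matrix = np.zeros((len(processed_docs), len(word_id_dict)), dtype=np.int64)
    for doc_idx, words in enumerate(processed_docs):
        for word in words:
            word_id = word_id_dict.get(word)
            if word_id is not None:  # words filtered out of the dictionary are skipped
                matrix[doc_idx, word_id] += 1
    return matrix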
Example 4
def read_corpus_and_preprocess(doc_file):
    stop_words = utilities.read_stopwords('../data/aux/stop_words')
    processed_docs = []
    with open(doc_file, 'r') as f:
        lines = f.readlines()
        for i in range(0, len(lines), 2):
            title_and_content = lines[i] + lines[i + 1]
            words = re.split(' |\n', title_and_content)
            words_list = []
            for word in words:
                if word not in stop_words:
                    words_list.append(word)
            # each article becomes one list of words
            processed_docs.append(words_list)
    return processed_docs
Example 5
    def show_topics(self, n_top_words=10):
        topic_word = self.model.components_
        #topic_word = self.model.topic_word_
        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(
                self.vocab)[np.argsort(topic_dist)][:-n_top_words - 1:-1]
            print('topic{}: {}'.format(i, ' '.join(topic_words)))

    def perplexity(self, chunk, chunk_size):
        doc_matrix = util.docs_to_matrix(chunk, self.word_id_dict, chunk_size)
        return self.model.perplexity(doc_matrix)


if __name__ == '__main__':
    stop_words = utilities.read_stopwords('../data/aux/stop_words')

    vocab, doc_word = docs_to_matrix()
    #train()
    train_by_lda(vocab, doc_word)
    #predict()
    #model_file = os.path.join('../model/LDA', 'topic50_filter_again', 'LDA.model')
    #dict_file = '../data/aux/dict_no_below_20_no_above_05'
    #num_topics = 50
    #evaluate(model_file, dict_file, num_topics)
    #model = gensim.models.LdaModel.load(model_file)
    #show_topics(model)

    #query_bow = dictionary.doc2bow(jieba.cut(query, cut_all=False))
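The show_topics() and perplexity() methods in Example 5 read self.model.components_ and call self.model.perplexity(), which matches scikit-learn's LatentDirichletAllocation API. A minimal sketch of how such a model could be fitted under that assumption; the count matrix here is random toy data, not the project's corpus:

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

# toy (n_docs, vocab_size) count matrix; in practice built from the corpus
doc_word = np.random.randint(0, 5, size=(100, 1000))

model = LatentDirichletAllocation(n_components=50, learning_method='online')
model.fit(doc_word)

print(model.components_.shape)     # (n_topics, vocab_size), as read by show_topics()
print(model.perplexity(doc_word))  # what perplexity() wraps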