def construct_vocab():
    """Build the word, character, and target vocabularies from the English
    train and test files.

    The train file contributes to all three dictionaries; the test file
    contributes only to the word and char dictionaries.

    Returns:
        tuple: three lists of vocabulary tokens
        (word tokens, char tokens, target tokens).
    """
    word_dic = Dictionary()
    char_dic = Dictionary()
    target_dic = Dictionary()
    # Reserve the special tokens first so they get stable, low ids.
    word_dic.add_documents([["UNK", "EOS"]])
    char_dic.add_documents([["UNK", "BOW"]])
    target_dic.add_documents([["UNK", "BOW"]])

    # Fix: the original called `f.close` / `g.close` without parentheses —
    # a no-op attribute access — so both handles leaked. Context managers
    # guarantee the files are closed even on error.
    with open(cfg.PATH_TO_ENG_Y_TRAIN) as f:
        for line in f:
            sentence = _tokenize(line)
            word_dic.add_documents([sentence])
            char_dic.add_documents([get_chars(line)])
            target_dic.add_documents([sentence])
            target_dic.add_documents([get_chars(line)])

    with open(cfg.PATH_TO_ENG_Y_TEST) as g:
        for line in g:
            sentence = _tokenize(line)
            word_dic.add_documents([sentence])
            char_dic.add_documents([get_chars(line)])

    return (list(word_dic.itervalues()),
            list(char_dic.itervalues()),
            list(target_dic.itervalues()))
def construct_test(tagger):
    """Tokenize the domain text, write the tokenized sentences to the X-test
    file, and build word/char vocabularies from the same text.

    Args:
        tagger: tokenizer/tagger object forwarded to ``_tokenize``.

    Returns:
        tuple: (word tokens, char tokens) as two lists.
    """
    word_dic = Dictionary()
    char_dic = Dictionary()
    # Reserve the special tokens first so they get stable, low ids.
    word_dic.add_documents([["UNK", "EOS"]])
    char_dic.add_documents([["UNK", "BOW"]])

    # Fix: the original called `f.close` / `g.close` without parentheses —
    # a no-op — leaking the input handle and, worse, leaving the output
    # file possibly unflushed. Context managers close both reliably.
    with open(cfg.PATH_TO_VGR_domain_text2) as f, \
            open(cfg.PATH_TO_X_TEST, 'w') as g:
        for line in f:
            sentence = _tokenize(line, tagger)
            g.write(" ".join(sentence) + "\n")
            word_dic.add_documents([sentence])
            # NOTE: unlike construct_vocab, raw characters of the line are
            # used here (list(line)), not get_chars(line) — kept as-is.
            char_dic.add_documents([list(line)])

    return list(word_dic.itervalues()), list(char_dic.itervalues())
def main():
    """Train a topic model on the corpus, save it, and log coherence/perplexity.

    Reads configuration from the module-level ``args`` namespace, writes the
    vocabulary and trained model under ``args.dump_dir``.
    """
    logger.info('-' * 80)
    logger.info('Loading data')
    corpus = load_corpus(args.dataset_dir)

    logger.info('-' * 80)
    logger.info('Make dictionary')
    dictionary = Dictionary(corpus)
    # Filter out words that occur in fewer than TOKEN_MIN_DOCS documents,
    # or in more than TOKEN_MAX_DOCS_FRAC of the documents.
    dictionary.filter_extremes(no_below=TOKEN_MIN_DOCS,
                               no_above=TOKEN_MAX_DOCS_FRAC)
    vocab_path = os.path.join(args.dump_dir, 'vocab.txt')
    with open(vocab_path, 'w') as f:
        f.write("\n".join(dictionary.itervalues()) + '\n')

    # Bag-of-words representation of the documents.
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]
    logger.info(f'Number of unique tokens: {len(dictionary)}')
    logger.info(f'Number of documents: {len(bow_corpus)}')

    logger.info('-' * 80)
    logger.info('Training model')
    callbacks = []
    if 'perplexity' in args.callbacks:
        callbacks.append(PerplexityMetric(corpus=bow_corpus))
    if 'coherence' in args.callbacks:
        callbacks.append(CoherenceMetric(texts=corpus,
                                         dictionary=dictionary,
                                         coherence=args.coherence,
                                         topn=args.topn))

    model_path = os.path.join(args.dump_dir, 'lda.model')
    if args.model == 'lda':
        model = LdaModel(corpus=bow_corpus, num_topics=args.num_topics,
                         id2word=dictionary, passes=args.num_epochs,
                         update_every=1, eval_every=args.eval_every,
                         iterations=args.iterations, alpha='auto',
                         eta='auto', chunksize=args.batch_size,
                         callbacks=callbacks, log_dir=args.log_dir,
                         model_dir=model_path)
    elif args.model == 'multicore_lda':
        model = LdaMulticore(corpus=bow_corpus, num_topics=args.num_topics,
                             id2word=dictionary, passes=args.num_epochs,
                             eval_every=args.eval_every,
                             iterations=args.iterations, eta='auto',
                             chunksize=args.batch_size, workers=args.workers,
                             callbacks=callbacks, log_dir=args.log_dir,
                             model_dir=model_path)
    elif args.model == 'mallet_lda':
        model = LdaMallet(args.mallet_path, corpus=bow_corpus,
                          num_topics=args.num_topics, id2word=dictionary,
                          workers=args.workers,
                          prefix=os.path.join(args.dump_dir, 'mallet_'),
                          iterations=args.iterations)
    elif args.model == 'gensim_lda':
        model = GensimLdaModel(corpus=bow_corpus, num_topics=args.num_topics,
                               id2word=dictionary, passes=args.num_epochs,
                               update_every=1, eval_every=args.eval_every,
                               iterations=args.iterations, alpha='auto',
                               eta='auto', chunksize=args.batch_size)
    elif args.model == 'gensim_multicore_lda':
        model = GensimLdaMulticore(corpus=bow_corpus,
                                   num_topics=args.num_topics,
                                   id2word=dictionary,
                                   passes=args.num_epochs,
                                   eval_every=args.eval_every,
                                   iterations=args.iterations, eta='auto',
                                   chunksize=args.batch_size,
                                   workers=args.workers)
    else:
        # Fix: an unrecognized model name previously fell through the
        # if/elif chain, leaving `model` unbound and raising an opaque
        # NameError at model.save(); fail fast with a clear message.
        raise ValueError(f'Unknown model: {args.model!r}')
    model.save(model_path)

    logger.info('-' * 80)
    if args.model != 'mallet_lda':
        top_topics = model.top_topics(texts=corpus, coherence='c_v')
        # Average topic coherence is the sum of topic coherences of all
        # topics, divided by the number of topics.
        avg_topic_coherence = sum(t[1] for t in top_topics) / args.num_topics
        logger.info(f'Average topic coherence: {avg_topic_coherence:.4f}.')
        for topic_idx, (topic_words, topic_score) in enumerate(top_topics):
            logger.info(f'Topic #{topic_idx} ({topic_score:.4f}): '
                        + " ".join((t[1] for t in topic_words[:5])))
        logger.info(
            f'Perplexity: {np.exp2(-model.log_perplexity(bow_corpus)):.4f}')
    else:
        # LdaMallet does not expose top_topics/log_perplexity; show topics
        # and compute coherence explicitly instead.
        pprint(model.show_topics(formatted=False))
        # Compute Coherence Score
        coherence_model_lda = CoherenceModel(model=model, texts=corpus,
                                             dictionary=dictionary,
                                             coherence=args.coherence,
                                             topn=args.topn)
        coherence_lda = coherence_model_lda.get_coherence()
        logger.info(f'Coherence : {coherence_lda:.4f}')
output_file_dist = os.path.join(curr_dir, f'distance_{args.smoothing}_{args.mu}') else: output_file_dist = os.path.join(curr_dir, 'distance_js') logging.info('creating the dictionary for ' + str(curr_iter) + '...') if curr_iter == 1: dict_file = os.path.join(curr_dir, 'dict.model') if os.path.exists(dict_file): logging.info(f'loading dictionary file from: {dict_file}') dictionary = Dictionary.load(dict_file) else: dictionary = Dictionary(dark_text) dictionary.add_documents(clean_text) word_dict = {word[1:]: word for word in dictionary.itervalues() if word.startswith('_')} _dict = Dictionary([[word] for word in word_dict.keys()]) dictionary.merge_with(_dict) dictionary = filter_dict(args.vocab_size, dictionary, chain(word_dict.values(), word_dict.keys())) dictionary.save(dict_file) else: dict_file_prev = os.path.join(prev_dir, 'dict.model') dict_file = os.path.join(curr_dir, 'dict.model') mrr_file = os.path.join(prev_dir, 'ranking_list.csv') dictionary = Dictionary.load(dict_file_prev) word_dict = {word[1:]: word for word in dictionary.itervalues() if word.startswith('_')} words = get_dark_words_prev(mrr_file)