def construct_vocab():
    # Build word-, character-, and target-level vocabularies from the
    # training file, plus word/char vocabularies from the test file.
    f = open(cfg.PATH_TO_ENG_Y_TRAIN)
    g = open(cfg.PATH_TO_ENG_Y_TEST)
    word_dic = Dictionary()
    char_dic = Dictionary()
    target_dic = Dictionary()
    # Register the special tokens first so they receive the lowest ids.
    word_dic.add_documents([["UNK", "EOS"]])
    char_dic.add_documents([["UNK", "BOW"]])
    target_dic.add_documents([["UNK", "BOW"]])

    line = f.readline()
    while line:
        sentence = _tokenize(line)
        word_dic.add_documents([sentence])
        char_dic.add_documents([get_chars(line)])
        target_dic.add_documents([sentence])
        target_dic.add_documents([get_chars(line)])
        line = f.readline()
    f.close()

    line = g.readline()
    while line:
        sentence = _tokenize(line)
        word_dic.add_documents([sentence])
        char_dic.add_documents([get_chars(line)])
        line = g.readline()
    g.close()
    return (list(word_dic.itervalues()), list(char_dic.itervalues()),
            list(target_dic.itervalues()))

def construct_test(tagger):
    # Tokenize the raw test text, write the space-joined tokens to
    # cfg.PATH_TO_X_TEST, and build word/char vocabularies over it.
    f = open(cfg.PATH_TO_VGR_domain_text2)
    g = open(cfg.PATH_TO_X_TEST, 'w')
    line = f.readline()
    word_dic = Dictionary()
    char_dic = Dictionary()
    word_dic.add_documents([["UNK", "EOS"]])
    char_dic.add_documents([["UNK", "BOW"]])
    while line:
        sentence = _tokenize(line, tagger)
        g.write(" ".join(sentence) + "\n")
        word_dic.add_documents([sentence])
        char_dic.add_documents([list(line)])
        line = f.readline()
    f.close()
    g.close()
    return list(word_dic.itervalues()), list(char_dic.itervalues())
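
Both functions lean on the same small slice of the gensim Dictionary API: create an empty Dictionary, grow it with add_documents(), and read the vocabulary back out. A minimal, self-contained sketch of just those calls, with a made-up two-document corpus (the cfg paths and the _tokenize/get_chars helpers above are project-specific and not reproduced):

from gensim.corpora import Dictionary

docs = [["the", "cat", "sat"], ["the", "dog", "ran"]]

d = Dictionary()
d.add_documents([["UNK", "EOS"]])  # register special tokens first
d.add_documents(docs)              # grow the vocabulary in place

print(d.token2id)        # token -> integer id mapping
print(list(d.values()))  # vocabulary as plain strings; the snippets
                         # above use the itervalues() alias instead
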
Example #3
def main():
    logger.info('-' * 80)
    logger.info('Loading data')
    corpus = load_corpus(args.dataset_dir)

    logger.info('-' * 80)
    logger.info('Make dictionary')

    dictionary = Dictionary(corpus)
    # Filter out words that occur in fewer than TOKEN_MIN_DOCS documents,
    # or in more than TOKEN_MAX_DOCS_FRAC of all documents.
    dictionary.filter_extremes(no_below=TOKEN_MIN_DOCS,
                               no_above=TOKEN_MAX_DOCS_FRAC)

    vocab_path = os.path.join(args.dump_dir, 'vocab.txt')
    with open(vocab_path, 'w') as f:
        f.write("\n".join(dictionary.itervalues()) + '\n')

    # Bag-of-words representation of the documents.
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

    logger.info(f'Number of unique tokens: {len(dictionary)}')
    logger.info(f'Number of documents: {len(bow_corpus)}')

    logger.info('-' * 80)
    logger.info('Training model')

    callbacks = []
    if 'perplexity' in args.callbacks:
        perplexity_metric = PerplexityMetric(corpus=bow_corpus)
        callbacks.append(perplexity_metric)
    if 'coherence' in args.callbacks:
        coherence_metric = CoherenceMetric(texts=corpus,
                                           dictionary=dictionary,
                                           coherence=args.coherence,
                                           topn=args.topn)
        callbacks.append(coherence_metric)

    model_path = os.path.join(args.dump_dir, 'lda.model')
    if args.model == 'lda':
        model = LdaModel(corpus=bow_corpus,
                         num_topics=args.num_topics,
                         id2word=dictionary,
                         passes=args.num_epochs,
                         update_every=1,
                         eval_every=args.eval_every,
                         iterations=args.iterations,
                         alpha='auto',
                         eta='auto',
                         chunksize=args.batch_size,
                         callbacks=callbacks,
                         log_dir=args.log_dir,
                         model_dir=model_path)
    elif args.model == 'multicore_lda':
        model = LdaMulticore(corpus=bow_corpus,
                             num_topics=args.num_topics,
                             id2word=dictionary,
                             passes=args.num_epochs,
                             eval_every=args.eval_every,
                             iterations=args.iterations,
                             eta='auto',
                             chunksize=args.batch_size,
                             workers=args.workers,
                             callbacks=callbacks,
                             log_dir=args.log_dir,
                             model_dir=model_path)
    elif args.model == 'mallet_lda':
        model = LdaMallet(args.mallet_path,
                          corpus=bow_corpus,
                          num_topics=args.num_topics,
                          id2word=dictionary,
                          workers=args.workers,
                          prefix=os.path.join(args.dump_dir, 'mallet_'),
                          iterations=args.iterations)
    elif args.model == 'gensim_lda':
        model = GensimLdaModel(corpus=bow_corpus,
                               num_topics=args.num_topics,
                               id2word=dictionary,
                               passes=args.num_epochs,
                               update_every=1,
                               eval_every=args.eval_every,
                               iterations=args.iterations,
                               alpha='auto',
                               eta='auto',
                               chunksize=args.batch_size)
    elif args.model == 'gensim_multicore_lda':
        model = GensimLdaMulticore(corpus=bow_corpus,
                                   num_topics=args.num_topics,
                                   id2word=dictionary,
                                   passes=args.num_epochs,
                                   eval_every=args.eval_every,
                                   iterations=args.iterations,
                                   eta='auto',
                                   chunksize=args.batch_size,
                                   workers=args.workers)

    model.save(model_path)

    logger.info('-' * 80)

    if args.model != 'mallet_lda':
        top_topics = model.top_topics(texts=corpus, coherence='c_v')
        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        avg_topic_coherence = sum([t[1] for t in top_topics]) / args.num_topics
        logger.info(f'Average topic coherence: {avg_topic_coherence:.4f}.')
        for topic_idx, (topic_words, topic_score) in enumerate(top_topics):
            logger.info(f'Topic #{topic_idx} ({topic_score:.4f}): ' +
                        " ".join((t[1] for t in topic_words[:5])))
        logger.info(
            f'Perplexity: {np.exp2(-model.log_perplexity(bow_corpus)):.4f}')
    else:
        pprint(model.show_topics(formatted=False))

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=model,
                                         texts=corpus,
                                         dictionary=dictionary,
                                         coherence=args.coherence,
                                         topn=args.topn)
    coherence_lda = coherence_model_lda.get_coherence()
    logger.info(f'Coherence : {coherence_lda:.4f}')
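
Stripped of the custom model wrappers and callbacks, the pipeline in main() boils down to a handful of stock gensim calls. A runnable toy version with an invented three-document corpus and arbitrary hyperparameters standing in for the args values (filter_extremes is skipped here because it would empty a corpus this small):

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel

corpus = [["human", "interface", "computer"],
          ["survey", "user", "computer", "system"],
          ["graph", "trees", "minors", "survey"]]

dictionary = Dictionary(corpus)
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]  # bag-of-words

model = LdaModel(corpus=bow_corpus, num_topics=2, id2word=dictionary,
                 passes=5, alpha='auto', eta='auto')

cm = CoherenceModel(model=model, texts=corpus, dictionary=dictionary,
                    coherence='c_v')
print(f'Coherence: {cm.get_coherence():.4f}')
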
Example #4
            output_file_dist = os.path.join(curr_dir, f'distance_{args.smoothing}_{args.mu}')
        else:
            output_file_dist = os.path.join(curr_dir, 'distance_js')

        logging.info(f'creating the dictionary for {curr_iter}...')

        if curr_iter == 1:
            dict_file = os.path.join(curr_dir, 'dict.model')
            if os.path.exists(dict_file):
                logging.info(f'loading dictionary file from: {dict_file}')
                dictionary = Dictionary.load(dict_file)
            else:
                dictionary = Dictionary(dark_text)
                dictionary.add_documents(clean_text)

                word_dict = {word[1:]: word for word in dictionary.itervalues() if word.startswith('_')}
                _dict = Dictionary([[word] for word in word_dict.keys()])
                dictionary.merge_with(_dict)

                dictionary = filter_dict(args.vocab_size, dictionary, chain(word_dict.values(), word_dict.keys()))
                dictionary.save(dict_file)

        else:
            dict_file_prev = os.path.join(prev_dir, 'dict.model')
            dict_file = os.path.join(curr_dir, 'dict.model')
            mrr_file = os.path.join(prev_dir, 'ranking_list.csv')

            dictionary = Dictionary.load(dict_file_prev)
            word_dict = {word[1:]: word for word in dictionary.itervalues() if word.startswith('_')}
            words = get_dark_words_prev(mrr_file)
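
The underscore handling is the core trick in this snippet: every token stored as '_word' also gets its bare form registered through merge_with(), so both variants end up with ids in a single dictionary. A minimal sketch of that pattern with invented tokens (dark_text, filter_dict, and the ranking files are project-specific):

from gensim.corpora import Dictionary

dictionary = Dictionary([["_exploit", "market", "_vendor"]])

# map bare form -> underscored form for every '_'-prefixed token
word_dict = {w[1:]: w for w in dictionary.values() if w.startswith('_')}

bare = Dictionary([[w] for w in word_dict])  # one pseudo-document per bare form
dictionary.merge_with(bare)                  # adds ids for 'exploit', 'vendor'

print(sorted(dictionary.values()))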