Esempio n. 1
0
def read_topic_key_table(opt):
    """Load the ``topickeys.txt`` file of a topic model into a table.

    The file is read as tab-separated text without a header row; its three
    columns are named ``topic``, ``ratio`` and ``keywords``.

    :param opt: option dictionary, passed on to ``get_topic_model_folder``
    :return: result of ``reformat_topic_key_table`` applied to the raw table
    """
    # Join the path components instead of concatenating strings so the result
    # is correct whether or not the model folder ends with a path separator.
    keyfile = os.path.join(get_topic_model_folder(opt), 'topickeys.txt')
    topic_table = pd.read_csv(keyfile,
                              sep='\t',
                              header=None,
                              names=['topic', 'ratio', 'keywords'])
    return reformat_topic_key_table(topic_table)
Esempio n. 2
0
def load_gsdmm_model(opt):
    """Load the pickled GSDMM topic model from the topic model folder.

    :param opt: option dictionary, passed on to ``get_topic_model_folder``
    :return: the object unpickled from the ``gsdmm.model`` file
    """
    model_path = os.path.join(get_topic_model_folder(opt), 'gsdmm.model')
    print('Loading Topic Model from {}...'.format(model_path))
    # NOTE(security): unpickling can execute arbitrary code; only load model
    # files that come from a trusted source.
    # The context manager guarantees the file handle is closed, even when
    # unpickling fails.
    with open(model_path, 'rb') as model_file:
        gsdmm_model = pickle.load(model_file)
    print('Done.')
    return gsdmm_model
Esempio n. 3
0
def load_lda_model(opt):
    """Load the saved LDA topic model from the topic model folder.

    :param opt: option dictionary, passed on to ``get_topic_model_folder``
    :return: the loaded model, with its ``mallet_path`` attribute reset
    """
    folder = get_topic_model_folder(opt)
    location = os.path.join(folder, 'lda_model')
    print('Loading Topic Model from {}...'.format(location))
    model = gensim.models.LdaModel.load(location)
    # The path stored in the model may be stale, e.g. when it was trained with
    # old code on a VM (paths starting with /data-disk/...), so always reset it.
    model.mallet_path = 'data/topic_models/mallet-2.0.8/bin/mallet'
    print('Done.')
    return model
Esempio n. 4
0
def save_gsdmm_model(gsdmm_model, opt):
    """Pickle a GSDMM model into the topic model folder and write its keywords.

    The root, parent and model folders are created when missing. The model is
    stored as ``gsdmm.model`` and ``write_keywords`` is called afterwards.

    :param gsdmm_model: model object to pickle
    :param opt: option dictionary, passed on to the folder helpers
    """
    model_path = get_topic_model_folder(opt)
    folders = (get_topic_root_folder(opt),
               get_topic_parent_folder(opt),
               model_path)
    for folder in folders:
        # exist_ok avoids the check-then-create race of exists() + mkdir()
        # and also creates any missing intermediate directories.
        os.makedirs(folder, exist_ok=True)
    print('Saving Topic Model to {}...'.format(model_path))
    # os.path.join mirrors how load_gsdmm_model locates the file and does not
    # depend on model_path ending with a path separator.
    with open(os.path.join(model_path, 'gsdmm.model'), 'wb') as f:
        pickle.dump(gsdmm_model, f)
    write_keywords(gsdmm_model, opt)
Esempio n. 5
0
def write_keywords(gsdmm_model, opt, cutoff=20):
    """Write the top keywords of every GSDMM cluster to ``topickeys.txt``.

    One tab-separated line is written per cluster: the cluster index, the
    cluster's share of ``cluster_doc_count``, and the space-separated words
    with the highest values in ``cluster_word_distribution`` for that cluster.

    :param gsdmm_model: model exposing ``cluster_doc_count`` and
        ``cluster_word_distribution``
    :param opt: option dictionary, passed on to ``get_topic_model_folder``
    :param cutoff: maximum number of keywords written per cluster
    """
    doc_counts = gsdmm_model.cluster_doc_count
    # Sum once up front instead of once per cluster.
    total_docs = sum(doc_counts)
    topickey_path = os.path.join(get_topic_model_folder(opt), 'topickeys.txt')
    with open(topickey_path, 'w', encoding="utf-8") as outfile:
        for i, doc_count in enumerate(doc_counts):
            # Guard against a model with no assigned documents, which would
            # otherwise raise ZeroDivisionError.
            importance = doc_count / total_docs if total_docs else 0.0
            top_words = sorted(
                gsdmm_model.cluster_word_distribution[i].items(),
                key=lambda item: item[1],
                reverse=True)[:cutoff]
            keywords = ' '.join(word for word, _ in top_words)
            outfile.write('{}\t{}\t{}\n'.format(i, importance, keywords))
Esempio n. 6
0
def train_ldamallet_topic_model(corpus, id2word, opt):
    """Train an LDA model with the Mallet wrapper and save it.

    The Mallet binary is expected under
    ``<opt['datapath']>/topic_models/mallet-2.0.8/bin/mallet``
    (download: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip).

    :param corpus: preprocessed corpus
    :param id2word: id2word from preprocessing step
    :param opt: option dictionary; reads ``datapath``, ``num_topics`` and the
        optional ``topic_alpha`` (default 50)
    :return: trained topic model
    """
    mallet_path = os.path.join(opt['datapath'], 'topic_models', 'mallet-2.0.8', 'bin', 'mallet')
    prefix = get_topic_model_folder(opt)  # saves model here, e.g.: 'data/topic_models/Semeval_15/ldamallet'
    parent_folder = get_topic_parent_folder(opt)  # e.g.: 'data/topic_models/Semeval_15'
    alpha = opt.get('topic_alpha', 50)
    # exist_ok lets training be re-run when the folders already exist (a plain
    # mkdir would raise FileExistsError) and avoids the exists()/mkdir() race.
    os.makedirs(parent_folder, exist_ok=True)
    os.makedirs(prefix, exist_ok=True)
    lda_model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=opt['num_topics'],
                                                 id2word=id2word, prefix=prefix, alpha=alpha)  # try to change alpha
    # Save model
    save_topic_model(lda_model, opt)
    return lda_model
Esempio n. 7
0
def save_topic_model(topic_model, opt):
    """Save a topic model as ``lda_model`` inside the topic model folder.

    :param topic_model: model object providing a ``save(path)`` method
    :param opt: option dictionary, passed on to ``get_topic_model_folder``
    """
    folder = get_topic_model_folder(opt)
    destination = os.path.join(folder, 'lda_model')
    print('Saving Topic Model to {}...'.format(destination))
    topic_model.save(destination)