def dtm(dtm_path, corpus, dictionary, time_slices, num_topics=40, load=False):
    """Train a Dynamic Topic Model with Blei's DTM binary, or load a saved one.

    Parameters
    ----------
    dtm_path : str
        Path to the local Blei-DTM executable.
    corpus : iterable of list of (int, int)
        Corpus in gensim bag-of-words format.
    dictionary :
        id -> word mapping passed to the wrapper as ``id2word``.
    time_slices : list of int
        Number of documents in each time slice.
    num_topics : int, optional
        Number of topics to fit (default 40).
    load : bool, optional
        If True, skip training and load the model previously saved as 'DTM'.

    Returns
    -------
    DtmModel
        The newly trained (and saved) model, or the loaded one.
    """
    print("Running DTM")
    # Fixed: the original `if load is False ... elif load is True` fell
    # through and implicitly returned None for any truthy value that was
    # not the exact `True` singleton (e.g. load=1). Use plain truthiness.
    if load:
        return DtmModel.load('DTM')
    model = DtmModel(dtm_path,
                     corpus,
                     time_slices,
                     num_topics=num_topics,
                     id2word=dictionary,
                     initialize_lda=True)  # seed first slice with plain LDA
    model.save("DTM")
    return model
# Persist the (artist_id, path, year) index so the corpus can be rebuilt later.
# Fixed: the file was opened inline and never closed; a context manager
# guarantees the handle is released even if pickle.dump raises.
with open(MODEL_SAVE_NAME + "bow_paths.pk", "wb") as bow_paths_file:
    pickle.dump(bow_path_by_artist, bow_paths_file)


class BoWCorpus(object):
    """Streams one sparse bag-of-words vector per artist.

    Iterating the corpus yields, for each (artist_id, path, year) entry in
    ``bow_path_by_artist``, the dense BoW vector of the first song file in
    that artist's directory, converted to gensim's sparse (index, count)
    representation.
    """

    def __iter__(self, bow_path_by_artist=bow_path_by_artist):
        for _artist_id, artist_dir, _year in bow_path_by_artist:
            # Dense counts for the first song found in the artist's directory.
            first_song = os.listdir(BOW_DIR + artist_dir)[0]
            dense_counts = np.load(BOW_DIR + artist_dir + first_song)
            # Keep only the non-zero entries (gensim sparse encoding).
            sparse_doc = []
            for word_id, count in enumerate(dense_counts):
                if count > 0:
                    sparse_doc.append((word_id, count))
            yield sparse_doc


corpus = BoWCorpus()

start = time()

# model='fixed' selects Blei's Document Influence Model (DIM) variant.
model = DtmModel(dtm_path,
                 corpus,
                 time_seq,
                 num_topics=NUM_TOPICS,
                 initialize_lda=True,
                 model='fixed')

# Save model
model.save(MODEL_SAVE_NAME)

# Fixed: was a Python 2 print statement, inconsistent with the print()
# calls used everywhere else in this file. Elapsed seconds -> hours.
print('Model fit in', ((time() - start) / 60.) / 60., 'hours')
# NOTE(review): the three lines below are scrape artifacts (an example
# separator, "Beispiel #3" / "0") plus a close() on a handle `t` that is
# not defined anywhere in this view -- not executable Python as-is.
Beispiel #3
0
t.close()
# Build the model (all hyperparameters come from the `para` dict).
model_gen = DtmModel(dtm_path,
                     corpus=corpus,
                     time_slices=time_series,
                     mode=para['mode'],
                     model=para['model'],
                     num_topics=para['num_topics'],
                     id2word=corpus.dictionary,
                     prefix=None,
                     lda_sequence_min_iter=para['lda_sequence_min_iter'],
                     lda_sequence_max_iter=para['lda_sequence_max_iter'],
                     lda_max_em_iter=para['lda_max_em_iter'],
                     alpha=para['alpha'],
                     top_chain_var=para['top_chain_var'],
                     rng_seed=para['rng_seed'],
                     initialize_lda=para['initialize_lda'])

# model_gen = LdaSeqModel(corpus = corpus, time_slice=time_series, id2word = dictionary, num_topics = num_topics)
# Fixed: the two status messages below were Python 2 print statements,
# inconsistent with the print() calls used elsewhere in this file.
print('model training finish')
# Filename encodes the platform and topic count, e.g. dtm_o_linux_topic_10.model
model_gen.save(main_path + 'result/dtm_o_' + sys.platform + '_topic_' +
               str(para['num_topics']) + '.model')
print('model saving finish')
#model1 = DtmModel.load('topic1.model')
#topics = model1.show_topic(topicid=0, time=0, topn=10)

#for i in range(10):
#    print pd.DataFrame(model.show_topic(topicid=i, time=1, topn=10))

#corpora.textcorpus.TextCorpus(5)
# NOTE(review): "Beispiel #4" / "0" are scrape artifacts (example
# separators), not executable Python.
Beispiel #4
0
import time
from gensim.models.wrappers.dtmmodel import DtmModel
from gensim import corpora
start_time = time.time()

dtm_path = "dtm-linux64"

# Load the list of lemmatized texts.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by a trusted pipeline.
# Fixed: the file was opened inline and never closed; use a context manager.
with open('corpus_geo.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

# Convert to gensim's bag-of-words format.
dictionary = corpora.Dictionary(corpus)
corpus = [dictionary.doc2bow(text) for text in corpus]

# Time slices for the 10-topic run: nine slices of 11468 docs, one of 11472.
time_slice = [11468]*9
time_slice.append(11472)

# For the 20-topic run:
#time_slice = [5734]*9
# time_slice.append(5738)

nb_topics = 10

model = DtmModel(dtm_path, corpus, time_slice, num_topics=nb_topics,
                 id2word=dictionary, initialize_lda=True)

model.save("DTMModel")

print("---- %s seconds ----" % (time.time() - start_time))
# NOTE(review): scrape-artifact separator for the next example.
Beispiel #5
0
    # Interior of an enclosing function whose `def` line is outside this
    # view. Trains DIM (Document Influence Model, model='fixed') variants
    # on several "toy" corpora; the AE run is currently commented out.
    # (The status strings printed below are French and are left untouched
    # because they are runtime output, not comments.)
    #model_DTM.save('dtm_ei_10')
    print("Fin de l'entrainement du modèle DTM pour EI\n")

    print("\n---------------------\n")

    print("Début de l'entrainement du modèle DIM pour AE \n")
    #model_DIM = DtmModel(dtm_path, corpus_AE_toy, time_slices_AE_toy, num_topics=num_topics, id2word=corpus_AE_toy.dictionary, initialize_lda=True, model='fixed')
    #model_DIM.save('dim_ae_10')
    print("Fin de l'entrainement du modèle DIM pour AE\n")

    print("Début de l'entrainement du modèle DIM pour RI \n")
    # Train and save the DIM model for the RI toy corpus.
    model_DIM = DtmModel(dtm_path,
                         corpus_RI_toy,
                         time_slices_RI_toy,
                         num_topics=num_topics,
                         id2word=corpus_RI_toy.dictionary,
                         initialize_lda=True,
                         model='fixed')
    model_DIM.save('dim_ri_10')
    print("Fin de l'entrainement du modèle DIM pour RI\n")

    print("Début de l'entrainement du modèle DIM pour EI \n")
    # NOTE: model_DIM is rebound here -- the RI model object is discarded
    # after saving, and the same name now holds the EI model.
    model_DIM = DtmModel(dtm_path,
                         corpus_EI_toy,
                         time_slices_EI_toy,
                         num_topics=num_topics,
                         id2word=corpus_EI_toy.dictionary,
                         initialize_lda=True,
                         model='fixed')
    model_DIM.save('dim_ei_10')
    print("Fin de l'entrainement du modèle DIM pour EI\n")
# NOTE(review): "Beispiel #6" / "0" are scrape artifacts (example
# separators), not executable Python.
Beispiel #6
0
# Save the preprocessed corpus so tokenization does not have to be redone.
with open(r'dtm\full-preprocessed-pickle', 'wb') as f:
    pickle.dump(texts, f)

# Time slices = number of tweets per day: truncate timestamps to the date
# (first 10 chars), count per day, and order chronologically.
time_slices = df['created_at'].apply(
    lambda x: x[:10]).value_counts().sort_index().values.tolist()

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

################################################################################

dtm_exe_path = r'C:\Program Files\DTM\dtm-win64.exe'

print('({}) Model started training'.format(timestamp()))
start = datetime.now()
# Fixed: corpus[:] made a pointless full shallow copy of the document list
# before handing it to the wrapper; pass the list directly.
dtm_model = DtmModel(dtm_exe_path,
                     corpus=corpus,
                     time_slices=time_slices,
                     num_topics=20,
                     id2word=dictionary)
elapsed = datetime.now() - start
print('({}) Model finished training'.format(timestamp()))
print('Elapsed time:', elapsed)

print('Saving model...')
# NOTE(review): dtm_out_path is defined outside this view -- confirm it exists.
dtm_model.save(dtm_out_path)
print('({}) Model saved'.format(timestamp()))