Example #1
0
def train_dtm(articles, n_topics, timeslices, model_filename):
    """Train a gensim dynamic topic model (LdaSeqModel) and persist its artifacts.

    Args:
        articles: iterable of tokenized documents (one token list per article).
        n_topics: number of topics to fit.
        timeslices: document counts per time slice — assumed to sum to
            len(articles) and to match the article ordering (TODO confirm
            with callers).
        model_filename: base path; the model is saved here, and the pickled
            dictionary/corpus get "_dict.pkl" / "_corpus.pkl" suffixes.
    """
    print("\nCreating common_corpus and common_dictionary")
    common_dictionary = corpora.Dictionary(articles)
    common_corpus = [common_dictionary.doc2bow(doc) for doc in articles]
    chain_var = 0.1  # variance governing topic drift between consecutive slices
    print("Size of vocabulary: ", str(len(common_dictionary)))
    print("\nLDASeq Params:")
    print("time_slice = ", str(timeslices))
    print("n_topics = ", str(n_topics))
    print("chain_var = ", str(chain_var))
    print("\nStart training Ldaseq model")
    ldaseq = LdaSeqModel(corpus=common_corpus,
                         time_slice=timeslices,
                         num_topics=n_topics,
                         id2word=common_dictionary,
                         chain_variance=chain_var)

    model_file = model_filename
    ldaseq.save(model_file)
    print("***** Done training! Saved trained model as", model_file, "*****")
    # Context managers guarantee the handles are closed even if pickling
    # raises (the original open()/close() pairs leaked on error).
    with open(model_file + "_dict.pkl", "wb") as f:
        pickle.dump(common_dictionary, f)
    print("Saved common_dictionary!")
    with open(model_file + "_corpus.pkl", "wb") as f:
        pickle.dump(common_corpus, f)
    print("Saved common_corpus!")
Example #2
0
def model_dtm(category_id):
    """Fit a 3-topic LdaSeqModel for one category and persist it twice.

    Saves via gensim's own ``save`` (under ``datapath``) and additionally as a
    plain pickle; returns the trained model.

    Args:
        category_id: key into the module-level ``time_ranges`` mapping and the
            per-category dictionary/corpus loader.
    """
    dictionary, corpus = load_dict_corpus(category_id)
    time_range = time_ranges[category_id]

    ldaseq = LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=time_range,
                         num_topics=3, chunksize=10, passes=1, random_state=3)

    ldaseq.save(datapath('dynamic_topic_models/dtm_'+str(category_id)+'_model'))
    # `with` closes the pickle handle even on error; the original
    # pickle.dump(..., open(...)) never closed it.
    with open('dynamic_topic_models/dtm_'+str(category_id)+'_model.p', 'wb') as fp:
        pickle.dump(ldaseq, fp)
    return ldaseq
Example #3
0
    def _train_model(self):
        """Fit the configured topic model ('lda' via Mallet, or 'dtm') on a train split.

        Side effects: sets ``self.model``; pickles the full bag-of-words corpus,
        and for the LDA branch also a coherence model and per-document topic
        distributions, to the paths configured on ``self``.
        """
        # Build the vocabulary and bag-of-words corpus from the token lists.
        id2word = corpora.Dictionary(self.data_words)

        # Create Corpus
        texts = self.data_words

        # Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]

        # Keep only the training portion; index [0] is the train split of the
        # corpus (the parallel range() split is discarded).
        corpus_train = train_test_split(corpus,
                                        range(len(corpus)),
                                        test_size=self.test_size,
                                        random_state=self.random_seed)[0]

        if self.model_name == 'lda':
            # Model
            self.model = gensim.models.wrappers.LdaMallet(
                self.path_to_mallet,
                alpha=100,
                corpus=corpus_train,
                num_topics=self.k,
                id2word=id2word,
                random_seed=self.random_seed,
                iterations=self.iterations)
            # Save Coherence Model ('c_v' coherence over the full text set).
            co = 'c_v'  # the way to calculate coherence score
            coherencemodel = CoherenceModel(model=self.model,
                                            texts=texts,
                                            dictionary=id2word,
                                            coherence=co)
            # Context managers close the handles even if pickling raises
            # (the original pickle.dump(..., open(...)) calls leaked them).
            with open(self.coh_path, "wb") as fp:
                pickle.dump(coherencemodel, fp)

            # Save per-document topic distributions over the *full* corpus.
            doc_lda = self.model[corpus]
            with open(self.doc_lda_path, 'wb') as fp:
                pickle.dump(doc_lda, fp)

        elif self.model_name == 'dtm':
            # NOTE(review): ``year_slice`` and ``lda_model`` are not defined in
            # this scope — this branch raises NameError unless they exist as
            # module globals. Confirm where they are meant to come from.
            self.model = LdaSeqModel(corpus=corpus_train,
                                     time_slice=year_slice,
                                     id2word=id2word,
                                     lda_model=lda_model,
                                     num_topics=self.k)

        # Save Corpus
        with open(self.corpus_path, 'wb') as fp:
            pickle.dump(corpus, fp)
Example #4
0
def train_dtm(articles, timeslices, filename, k=10):
    """Train an LdaSeqModel on tokenized articles and save model, dictionary, corpus.

    Args:
        articles: iterable of tokenized documents (one token list per article).
        timeslices: document counts per time slice — assumed to sum to
            len(articles) (TODO confirm with callers).
        filename: base path for the saved model; "_dict.pkl" / "_corpus.pkl"
            suffixes are appended for the pickled dictionary and corpus.
        k: number of topics (default 10).
    """
    chain_var = 0.1  # variance governing topic drift between consecutive slices
    common_dictionary = corpora.Dictionary(articles)
    common_corpus = [common_dictionary.doc2bow(a) for a in articles]
    print("Training DTM for all articles")
    print("Articles:", len(articles))
    print("Topics:", k)
    print("Time slices: ", timeslices)
    print("Vocab len: ", len(common_dictionary))
    ldaseq = LdaSeqModel(corpus=common_corpus,
                         time_slice=timeslices,
                         num_topics=k,
                         id2word=common_dictionary,
                         chain_variance=chain_var)
    model_file = filename
    ldaseq.save(model_file)
    # Context managers close the pickle handles even on error (the originals
    # leaked), and each artifact gets its own clearly named path instead of
    # reusing `dict_filename` for the corpus file.
    dict_filename = model_file + "_dict.pkl"
    with open(dict_filename, "wb") as f:
        pickle.dump(common_dictionary, f)
    corpus_filename = model_file + "_corpus.pkl"
    with open(corpus_filename, "wb") as f:
        pickle.dump(common_corpus, f)
    print("Saved DTM model as", model_file, "!")
Example #5
0
def train_dtm(articles, n_topics, outfile="dtm", dates=None, time_slices=None):
    """Train an LdaSeqModel and save it (plus dictionary and corpus) under trained_models/.

    Args:
        articles: iterable of tokenized documents (one token list per article).
        n_topics: number of topics to fit.
        outfile: model filename (written under "trained_models/").
        dates: optional per-article date labels; used to derive ``time_slices``
            when the latter is not given.
        time_slices: optional document counts per time slice; takes precedence
            over ``dates``. If both are None, None is passed through to
            LdaSeqModel unchanged.
    """
    if time_slices is None and dates is not None:
        # Counter preserves first-seen order, so this assumes `dates` (and
        # `articles`) are already sorted chronologically — TODO confirm.
        counts = Counter(dates)
        time_slices = list(counts.values())
        print("Dates: ", counts)
    print("Time slices:", time_slices)
    chain_var = 0.1  # variance governing topic drift between consecutive slices
    common_dictionary = corpora.Dictionary(articles)
    common_corpus = [common_dictionary.doc2bow(a) for a in articles]
    ldaseq = LdaSeqModel(corpus=common_corpus,
                         time_slice=time_slices,
                         num_topics=n_topics,
                         id2word=common_dictionary,
                         chain_variance=chain_var)
    model_file = "trained_models/" + outfile
    ldaseq.save(model_file)
    # Context managers close the pickle handles even on error (the originals
    # leaked), and the corpus path gets its own variable name.
    dict_filename = model_file + "_dict.pkl"
    with open(dict_filename, "wb") as f:
        pickle.dump(common_dictionary, f)
    corpus_filename = model_file + "_corpus.pkl"
    with open(corpus_filename, "wb") as f:
        pickle.dump(common_corpus, f)

    print("Saved DTM model as", model_file, "!")
Example #6
0
def run_ldaseq(data, sections, **kw):
    """Run LDA sequential model."""
    nips = NipsData()
    nips.load_data()
    _, cnts = papers_per_year(data)  # per-year paper counts become the time slices
    data = nips.combined_sections(sections, data)
    name = get_save_name(sections)
    n_topics = kw.get('num_topics', lda_seq_defaults['num_topics'])
    modname = f"ldaseq_{n_topics}_" + name

    # Reuse a previously trained model when one is cached on disk.
    cached = lda_load_model(modname)
    if cached:
        return cached

    dictionary, bow = lda_get_dictionary(data, name=name, save=True)

    print(f"Running LDAseq on {sections}")
    # Defaults first, then caller overrides on top.
    params = dict(lda_seq_defaults)
    params.update(kw)
    model = LdaSeqModel(corpus=bow, time_slice=cnts, id2word=dictionary, **params)

    save_path, _ = get_save_path(modname)
    model.save(save_path)
    return model
Example #7
0
class model():
    """Train and persist a topic model ('lda' via Mallet, or 'dtm') end to end.

    Instantiation runs the whole pipeline: load pickled tokenized documents,
    train on a train split, and save the model plus auxiliary artifacts to the
    configured paths.
    """

    def __init__(self,
                 data_words_path,
                 model_name,
                 k,
                 path_to_mallet,
                 model_path,
                 corpus_path,
                 coh_path,
                 doc_lda_path,
                 a_sq=None,
                 pub_year=None,
                 test_size=0.2,
                 random_seed=100,
                 iterations=1000):
        # Paths and hyperparameters are stored verbatim; training starts
        # immediately at the end of construction.
        self.data_words_path = data_words_path
        self.model_name = model_name  # 'lda' or 'dtm'
        self.k = k  # number of topics
        self.path_to_mallet = path_to_mallet
        self.model_path = model_path
        self.corpus_path = corpus_path
        self.coh_path = coh_path
        self.doc_lda_path = doc_lda_path
        self.test_size = test_size
        self.random_seed = random_seed
        self.iterations = iterations
        self.a_sq = a_sq  # stored but unused in this class — confirm intent
        self.pub_year = pub_year  # stored but unused in this class — confirm intent
        self._load_data_words()
        self._train_model()
        self._save_model()

    def _load_data_words(self):
        """Load the pickled list of tokenized documents into ``self.data_words``."""
        with open(self.data_words_path, "rb") as fp:
            self.data_words = pickle.load(fp)

    def _train_model(self):
        """Fit the configured topic model on a train split and pickle artifacts."""
        # Build the vocabulary and bag-of-words corpus from the token lists.
        id2word = corpora.Dictionary(self.data_words)

        # Create Corpus
        texts = self.data_words

        # Term Document Frequency
        corpus = [id2word.doc2bow(text) for text in texts]

        # Keep only the training portion; index [0] is the train split of the
        # corpus (the parallel range() split is discarded).
        corpus_train = train_test_split(corpus,
                                        range(len(corpus)),
                                        test_size=self.test_size,
                                        random_state=self.random_seed)[0]

        if self.model_name == 'lda':
            # Model
            self.model = gensim.models.wrappers.LdaMallet(
                self.path_to_mallet,
                alpha=100,
                corpus=corpus_train,
                num_topics=self.k,
                id2word=id2word,
                random_seed=self.random_seed,
                iterations=self.iterations)
            # Save Coherence Model ('c_v' coherence over the full text set).
            co = 'c_v'  # the way to calculate coherence score
            coherencemodel = CoherenceModel(model=self.model,
                                            texts=texts,
                                            dictionary=id2word,
                                            coherence=co)
            # Context managers close the handles even if pickling raises
            # (the original pickle.dump(..., open(...)) calls leaked them).
            with open(self.coh_path, "wb") as fp:
                pickle.dump(coherencemodel, fp)

            # Save per-document topic distributions over the *full* corpus.
            doc_lda = self.model[corpus]
            with open(self.doc_lda_path, 'wb') as fp:
                pickle.dump(doc_lda, fp)

        elif self.model_name == 'dtm':
            # NOTE(review): ``year_slice`` and ``lda_model`` are not defined in
            # this scope — this branch raises NameError unless they exist as
            # module globals. Confirm where they are meant to come from.
            self.model = LdaSeqModel(corpus=corpus_train,
                                     time_slice=year_slice,
                                     id2word=id2word,
                                     lda_model=lda_model,
                                     num_topics=self.k)

        # Save Corpus
        with open(self.corpus_path, 'wb') as fp:
            pickle.dump(corpus, fp)

    def _save_model(self):
        """Persist the trained model via its own ``save`` method."""
        self.model.save(self.model_path)
        multFile.write(str(wordID) + ':' + str(weigth) + ' ')

    multFile.write('\n')

multFile.close()

print(multFile)

# Per-slice document counts for the sequential model — presumably aligned with
# the chronological order of corpus_memory_friendly; confirm upstream.
time_slice = total_yearly_list

#LdaSeqModel(corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100)

# Use LdaSeqModel to generate DTM results.
#ldaseq = LdaSeqModel(corpus=corpus_memory_friendly, id2word=dictionary, time_slice=time_slice, num_topics=5)
ldaseq = LdaSeqModel(corpus=corpus_memory_friendly,
                     id2word=dictionary,
                     time_slice=time_slice,
                     num_topics=4)
# For a given time slice, the distribution of each topic.
ldaseq.print_topics(time=1)
# For a given topic, the word distribution over time.
DTM_topic_0 = ldaseq.print_topic_times(topic=0, top_terms=10)
DTM_topic_1 = ldaseq.print_topic_times(topic=1, top_terms=10)
DTM_topic_2 = ldaseq.print_topic_times(topic=2, top_terms=10)
DTM_topic_3 = ldaseq.print_topic_times(topic=3, top_terms=10)
# NOTE(review): with num_topics=4 the valid topic indices are 0-3, so topic=4
# below is out of range and is expected to fail — either restore num_topics=5
# (as in the commented-out call above) or drop this line. Confirm intent.
DTM_topic_4 = ldaseq.print_topic_times(topic=4, top_terms=10)


def topic_time(DTM_topic, time_stamps):
    for i in range(len(time_slice) - 1):
        if i == 0:
            temp_a1 = pd.DataFrame(DTM_topic[i])