def init_model(self, n_samples=15):
    """Initialize the LDA2Vec model.

    :return: None
    """
    # Number of documents
    n_docs = self.doc_ids.max() + 1
    # Number of unique words in the vocabulary
    self.n_vocab = self.flattened.max() + 1
    # Get the string representation for every compact key
    self.words = self.corpus.word_list(self.vocab)[:self.n_vocab]
    # How many tokens are in each document
    doc_idx, lengths = np.unique(self.doc_ids, return_counts=True)
    self.doc_lengths = np.zeros(self.doc_ids.max() + 1, dtype='int32')
    self.doc_lengths[doc_idx] = lengths
    # Count all token frequencies
    tok_idx, freq = np.unique(self.flattened, return_counts=True)
    self.term_frequency = np.zeros(self.n_vocab, dtype='int32')
    self.term_frequency[tok_idx] = freq
    self.model = LDA2Vec(n_documents=n_docs,
                         n_document_topics=self.n_topics,
                         n_units=self.n_units,
                         n_vocab=self.n_vocab,
                         counts=self.term_frequency,
                         n_samples=n_samples,
                         power=self.power,
                         temperature=self.temperature)
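# A minimal, self-contained sketch (not part of the class above) of the
# counting pattern used in init_model: np.unique(..., return_counts=True)
# returns the ids that actually occur plus their counts, and scattering the
# counts into a zero array leaves empty documents / unseen tokens at zero.
# The toy arrays below are made up purely for illustration.
import numpy as np

toy_doc_ids = np.array([0, 0, 0, 2, 2, 3], dtype='int32')  # token -> document id
toy_tokens = np.array([5, 1, 1, 0, 5, 5], dtype='int32')   # token -> vocabulary id

idx, counts = np.unique(toy_doc_ids, return_counts=True)
doc_lengths = np.zeros(toy_doc_ids.max() + 1, dtype='int32')
doc_lengths[idx] = counts
print(doc_lengths)       # [3 0 2 1] -- document 1 has no tokens

idx, counts = np.unique(toy_tokens, return_counts=True)
term_frequency = np.zeros(toy_tokens.max() + 1, dtype='int32')
term_frequency[idx] = counts
print(term_frequency)    # [1 2 0 0 0 3]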
# How many tokens are in each document
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
doc_lengths[doc_idx] = lengths
# Count all token frequencies
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype='int32')
term_frequency[tok_idx] = freq
# Log every short, printable local variable (the run's hyperparameters)
for key in sorted(locals().keys()):
    val = locals()[key]
    if len(str(val)) < 100 and '<' not in str(val):
        six.print_(key, val)
model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,
                n_units=n_units, n_vocab=n_vocab, counts=term_frequency,
                n_samples=n_samples, power=power, temperature=temperature)
if os.path.exists('lda2vec.hdf5'):
    six.print_("Reloading from saved")
    serializers.load_hdf5("lda2vec.hdf5", model)
if pretrained:
    # Initialize the word embeddings from the pretrained word2vec vectors
    model.sampler.W.data[:, :] = vectors[:n_vocab, :]
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)
j = 0
epoch = 0
# Fraction of the whole corpus covered by a single minibatch
fraction = batchsize * 1.0 / flattened.shape[0]
# Number of topics to fit
n_topics = 20
batchsize = 4096
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]
# How many tokens are in each document
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
doc_lengths[doc_idx] = lengths
# Count all token frequencies
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype='int32')
term_frequency[tok_idx] = freq
model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,
                n_units=n_units, n_vocab=n_vocab, counts=term_frequency,
                n_samples=15)
if os.path.exists('lda2vec.hdf5'):
    print("Reloading from saved")
    serializers.load_hdf5("lda2vec.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
for epoch in range(5000):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
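# A minimal sketch of the checkpoint-then-optimize pattern the scripts above
# share: reload model weights from HDF5 if a previous run saved them, then
# attach Adam with gradient clipping at 5.0.  The tiny `ToyChain` model and
# the 'toy.hdf5' filename are illustrative stand-ins, not part of lda2vec.
import os

import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizers as O
from chainer import serializers


class ToyChain(chainer.Chain):
    def __init__(self):
        super(ToyChain, self).__init__(l1=L.Linear(4, 8), l2=L.Linear(8, 1))

    def __call__(self, x):
        return self.l2(F.relu(self.l1(x)))


model = ToyChain()
if os.path.exists('toy.hdf5'):
    # Resume from the last checkpoint instead of starting over
    serializers.load_hdf5('toy.hdf5', model)
optimizer = O.Adam()
optimizer.setup(model)
optimizer.add_hook(chainer.optimizer.GradientClipping(5.0))
# ...after a training pass, the counterpart call would be
# serializers.save_hdf5('toy.hdf5', model)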
sty_len[sty_idx] = lengths
# How many tokens were written by each author
aut_idx, lengths = np.unique(author_id, return_counts=True)
aut_len = np.zeros(aut_idx.max() + 1, dtype='int32')
aut_len[aut_idx] = lengths
# Count all token frequencies
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype='int32')
term_frequency[tok_idx] = freq
model = LDA2Vec(n_stories=n_stories, n_story_topics=n_story_topics,
                n_authors=n_authors, n_author_topics=n_author_topics,
                n_units=n_units, n_vocab=n_vocab, counts=term_frequency,
                n_samples=15)
if os.path.exists('lda2vec.hdf5'):
    six.print_("Reloading from saved")
    serializers.load_hdf5("lda2vec.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
# Number of unique words in the vocabulary
n_vocab = flattened.max() + 1
# Number of dimensions in a single word vector
n_units = 256
# 'Strength' of the Dirichlet prior; 200.0 seems to work well
clambda = 200.0
# Number of story topics and author topics to fit
n_story_topics = 40
n_author_topics = 20
batchsize = 4096 * 2
counts = corpus.keys_counts[:n_vocab]
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]
model = LDA2Vec(n_stories, n_story_topics, n_authors, n_author_topics,
                n_units=n_units, n_vocab=n_vocab, counts=counts,
                n_samples=7)
if os.path.exists('lda2vec_hn.hdf5'):
    print("Reloading from saved")
    serializers.load_hdf5("lda2vec_hn.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
for epoch in range(5000):
    print("Story topics")
    w = cuda.to_cpu(model.mixture_stories.weights.W.data).copy()
    f = cuda.to_cpu(model.mixture_stories.factors.W.data).copy()
# Number of documents
n_docs = doc_ids.max() + 1
# Number of unique words in the vocabulary
n_vocab = flattened.max() + 1
# Number of dimensions in a single word vector
n_units = 256
# 'Strength' of the Dirichlet prior; 200.0 seems to work well
clambda = 200.0
# Number of topics to fit
n_topics = 20
batchsize = 4096 * 8
counts = corpus.keys_counts[:n_vocab]
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]
model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,
                n_units=n_units, n_vocab=n_vocab, counts=counts,
                n_samples=15)
if os.path.exists('lda2vec.hdf5'):
    print("Reloading from saved")
    serializers.load_hdf5("lda2vec.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
for epoch in range(5000):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.embed.W.data).copy(),
# How many tokens are in each document
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
doc_lengths[doc_idx] = lengths
# Count all token frequencies
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype='int32')
term_frequency[tok_idx] = freq
# Log every short, printable local variable (the run's hyperparameters)
for key in sorted(locals().keys()):
    val = locals()[key]
    if len(str(val)) < 100 and '<' not in str(val):
        print(key, val)
model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,
                n_units=n_units, n_vocab=n_vocab, counts=term_frequency,
                n_samples=15, power=power, temperature=temperature,
                vocab=words, docu_initialW=doc_weights_init)
if os.path.exists('lda2vec.hdf5'):
    print("Reloading from saved")
    serializers.load_hdf5("lda2vec.hdf5", model)
if pretrained:
    logger.info('Use pre-trained Google word2vec')
    model.sampler.W.data[:, :] = vectors[:n_vocab, :]
    np.nan_to_num(model.sampler.W.data, copy=False)
if gpu_id >= 0:
    model.to_gpu()
else:
    model.to_cpu()
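# An illustrative sketch (random data standing in for the pretrained Google
# word2vec matrix) of the initialization above: copy the first n_vocab
# pretrained rows into the embedding weight in place, then squash any NaNs
# so that missing rows cannot poison the optimizer.
import numpy as np

n_vocab, n_units = 1000, 300
pretrained_vectors = np.random.randn(n_vocab + 50, n_units).astype('float32')
pretrained_vectors[3, :] = np.nan                # pretend one row is missing
embed_W = np.zeros((n_vocab, n_units), dtype='float32')
embed_W[:, :] = pretrained_vectors[:n_vocab, :]
np.nan_to_num(embed_W, copy=False)               # NaN -> 0.0, done in place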