Example #1
    def create_topics(self, do_print, epoch):
        """
        for an epoch this function prepares topics with given corpus word list and
        extracts top words from those.
        It stores intermediate results of the data in pyldavis file and the model in a hdf5 file .

        <<<< This is the LDA part >>>>

        :param do_print: print top words in an epoch
        :param epoch: index of an epoch
        :return:
        """
        j=0
        # prepare the topic_term_distributions, document_topic_distributions and term_frequencies using softmax
        data = prepare_topics(weights=cuda.to_cpu(self.model.mixture.weights.W.data).copy(),
                              topic_vectors=cuda.to_cpu(self.model.mixture.factors.W.data).copy(),
                              word_vectors=cuda.to_cpu(self.model.sampler.W.data).copy(),
                              vocab=self.words, doprint=False)

        #top_words = print_top_words_per_topic(data, do_print=do_print)
        #if j % 100 == 0 and j > 100 and do_print:
        #    coherence = topic_coherence(top_words)
        #    for j in range(self.n_topics):
        #        print j, coherence[(j, 'cv')]
        data['doc_lengths'] = self.doc_lengths
        data['term_frequency'] = self.term_frequency
        np.savez('topics_' + self.modelid + '.pyldavis', **data)
        for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
            self.update_per_chunk(d, epoch, f)
            j += 1
        # saves the parameters of model into a file in hdf5 format
        serializers.save_hdf5("lda2vec_" + self.modelid + ".hdf5", self.model)
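
The dictionary returned by prepare_topics uses the same key names as the arguments of pyLDAvis.prepare (see the debug notes in Example #9), so the archive written above can be visualised directly. A minimal sketch, assuming np.savez appended the usual .npz suffix; the modelid value is illustrative and this snippet is not part of the original class:

import numpy as np
import pyLDAvis

modelid = 'mymodel'  # same identifier passed when saving (illustrative)
# np.savez appends ".npz" to the filename given in create_topics.
npz = np.load('topics_' + modelid + '.pyldavis.npz', allow_pickle=True)
keys = ('topic_term_dists', 'doc_topic_dists', 'doc_lengths',
        'vocab', 'term_frequency')
vis_data = {k: npz[k] for k in keys}
vis = pyLDAvis.prepare(**vis_data)
pyLDAvis.save_html(vis, 'topics_' + modelid + '.html')
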
Example #2
model = LDA(n_docs, n_topics, n_units, n_vocab)
if os.path.exists('lda.hdf5'):
    six.print_("Reloading from saved")
    serializers.load_hdf5("lda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / bow.shape[0]
for epoch in range(50000000):
    if epoch % 100 == 0:
        p = cuda.to_cpu(model.proportions.W.data).copy()
        f = cuda.to_cpu(model.factors.W.data).copy()
        w = cuda.to_cpu(model.embedding.W.data).copy()
        d = prepare_topics(p, f, w, words)
        print_top_words_per_topic(d)
    for (ids, batch) in utils.chunks(batchsize, np.arange(bow.shape[0]), bow):
        t0 = time.time()
        optimizer.zero_grads()
        rec, ld = model.forward(ids, batch)
        l = rec + ld
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{ld:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        ld.to_cpu()
        t1 = time.time()
        dt = t1 - t0
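
All of these loops rely on utils.chunks to cut the corpus arrays into aligned minibatches. The sketch below shows one plausible implementation (the actual lda2vec helper may shuffle chunk order differently); it is only here to make the iteration pattern explicit:

import numpy as np

def chunks(batchsize, *arrays):
    """Yield aligned minibatch slices of equally long arrays, in random order."""
    n = len(arrays[0])
    assert all(len(a) == n for a in arrays)
    starts = np.arange(0, n, batchsize)
    np.random.shuffle(starts)
    for s in starts:
        yield tuple(a[s:s + batchsize] for a in arrays)

# e.g. for (ids, batch) in chunks(batchsize, np.arange(bow.shape[0]), bow): ...
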
Example #3
    serializers.load_hdf5("lda2vec.hdf5", model)
if pretrained:
    model.sampler.W.data[:, :] = vectors[:n_vocab, :]
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    coherence = topic_coherence(top_words)
    for t in range(n_topics):
        print(t, coherence[(t, 'cv')])
    kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
    progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
        prior = model.prior()
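
Examples #3 and #4 periodically pickle the top words and coherence scores into progress.shelve. A small sketch (assuming the entries were written exactly as in the loops above) for reading the shelf back and tracking the average 'cv' coherence per stored epoch:

import pickle
import shelve

with shelve.open('progress.shelve') as progress:
    for key in sorted(progress, key=int):
        kw = pickle.loads(progress[key])
        # coherence maps (topic_index, measure) pairs to scores
        cv = [v for (topic, measure), v in kw['coherence'].items() if measure == 'cv']
        print(kw['epoch'], sum(cv) / len(cv))
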
Example #4
    serializers.load_hdf5("lda2vec.hdf5", model)
if pretrained:
    model.sampler.W.data[:, :] = vectors[:n_vocab, :]
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        for t in range(n_topics):
            six.print_(t, coherence[(t, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
Example #5
    def infer(self, docs=None, epochs=200, update_words=False, update_topics=False, topic_vectors=None):
        """ Infers the featurs of a new document that is passed in.
         By running the Lda2vec algorithm again.
        But by updating only the topic distributions"""

        texts = docs
        docs = []
        for text in texts:
            # keep only tokens that are present in the word2vec vocabulary
            docs.append(" ".join(word for word in text.split()
                                 if word in self.word2vec_model.vocab))

        logging.info("preprocessing")
        
        self.preprocess(docs)
        
        logging.info('preprocessed!')
        
        self.infer_model = LDA2Vec(n_documents=self.n_docs,
                                   n_document_topics=self.n_topics,
                                   n_units=300,
                                   n_vocab=self.n_vocab,
                                   counts=self.term_frequency,
                                   n_samples=15,
                                   power=self.power,
                                   temperature=self.temp)
        
        
        if self.words_pretrained:
            self.infer_model.sampler.W.data = self.vectors[:self.n_vocab, :]

        self.infer_model.mixture.factors.W.data = self.train_model.mixture.factors.W.data
        if topic_vectors is not None:
            assert topic_vectors.shape == self.infer_model.mixture.factors.W.data.shape, \
                "topic vectors shape doesn't match"
            self.infer_model.mixture.factors.W.data = topic_vectors


        optimizer = O.Adam()
        optimizer.setup(self.infer_model)
        clip = chainer.optimizer.GradientClipping(5.0)
        optimizer.add_hook(clip)
        
        
        
        j = 0
        msgs = defaultdict(list)
        for epoch in range(epochs):
            print "epoch : ",epoch
            data = prepare_topics(cuda.to_cpu(self.infer_model.mixture.weights.W.data).copy(),
                                  cuda.to_cpu(self.infer_model.mixture.factors.W.data).copy(),
                                  cuda.to_cpu(self.infer_model.sampler.W.data).copy(),
                                  self.words)
            top_words = print_top_words_per_topic(data)
            if j % 100 == 0 and j > 100:
                coherence = topic_coherence(top_words)
                for t in range(self.n_topics):
                    print(t, coherence[(t, 'cv')])
                kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
                #progress[str(epoch)] = pickle.dumps(kw)
            data['doc_lengths'] = self.doc_lengths
            data['term_frequency'] = self.term_frequency
            #np.savez('topics.pyldavis', **data)
            for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
                t0 = time.time()
                optimizer.zero_grads()
                l = self.infer_model.fit_partial(d.copy(), f.copy(), update_words=update_words, update_topics=update_topics)
                prior = self.infer_model.prior()
                loss = prior * self.fraction
                loss.backward()
                optimizer.update()
                msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
                       "P:{prior:1.3e} R:{rate:1.3e}")
                prior.to_cpu()
                loss.to_cpu()
                t1 = time.time()
                dt = t1 - t0
                rate = self.batchsize / dt
                
                

                msgs["E"].append(epoch)
                msgs["L"].append(float(l))

                
                j += 1
            logs = dict(loss=float(l), epoch=epoch, j=j, prior=float(prior.data), rate=rate)
            print(msg.format(**logs))
            print("\n ================================= \n")
            #serializers.save_hdf5("lda2vec.hdf5", self.model)
            msgs["loss_per_epoch"].append(float(l))
        return data, msgs
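
A hedged usage sketch for the infer method above. Here lda2v stands for an already trained wrapper instance exposing the attributes the method references (word2vec_model, train_model, doc_ids, and so on); the documents and epoch count are illustrative:

new_docs = [
    "stock markets rallied after the quarterly earnings report",
    "the home team won the championship in overtime",
]
# Only the document-topic proportions are re-estimated; word and topic
# vectors stay fixed unless the corresponding flags are set to True.
data, msgs = lda2v.infer(docs=new_docs, epochs=50,
                         update_words=False, update_topics=False)
doc_topic = data['doc_topic_dists']   # inferred topic proportions per new doc
loss_curve = msgs['loss_per_epoch']   # one loss value per epoch
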
Example #6
    def train(
            self,
            doc_ids,
            flattened,
            vocab,
            words,
            max_epochs=np.inf,
            verbose=False,  # vocab & words were added to support saving topics (np.savez) during training
            loss_switch_epochs=0,  # num epochs until LDA loss is switched on
            save=False,
            save_every=1000,
            outdir="./out",
            summarize=True,
            summarize_every=1000,
            metadata="metadata.tsv",
            metadata_docs="metadata.docs.tsv"):

        n_vocab = flattened.max() + 1
        # How many tokens are in each document
        doc_idx, lengths = np.unique(doc_ids, return_counts=True)
        doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
        doc_lengths[doc_idx] = lengths
        # Count all token frequencies
        tok_idx, freq = np.unique(flattened, return_counts=True)
        term_frequency = np.zeros(n_vocab, dtype='int32')
        term_frequency[tok_idx] = freq

        if save:
            try:
                os.mkdir(outdir)
            except OSError as e:  #for Python 2
                if e.errno == errno.EEXIST:
                    pass
            saver = tf.train.Saver(tf.global_variables())
            outdir = os.path.abspath(self.log_dir)

        if summarize:
            try:
                self.logger.flush()
            except (AttributeError):  # not yet logging
                self.logger = tf.summary.FileWriter(self.log_dir,
                                                    self.sesh.graph)
            merged = self._addSummaries(metadata, metadata_docs)

        j = 0
        epoch = 0

        fraction = self.batch_size / len(flattened)  # == batch / n_corpus
        self.sesh.run(tf.assign(self.fraction, fraction))
        progress = shelve.open('progress.shelve')

        # turn on LDA loss after n iters of training
        # number of minibatches in one full pass over the corpus
        iters_per_epoch = int(np.ceil(len(flattened) / self.batch_size))
        n = iters_per_epoch * loss_switch_epochs
        self.sesh.run(tf.assign(self.switch_loss, n))

        now = datetime.now().isoformat()[11:]
        print("------- Training begin: {} -------\n".format(now))

        while epoch < max_epochs:
            try:

                # doc_ids, word_idxs
                for d, f in utils.chunks(self.batch_size, doc_ids, flattened):
                    t0 = datetime.now()

                    feed_dict = self.make_feed_dict(d, f)

                    # if len(feed_dict[self.pivot_idxs]) == 0:
                    # 	print("Empty batch. Skipping...")
                    # 	continue

                    fetches = [
                        self.loss_lda, self.loss_word2vec, self.loss,
                        self.train_op
                    ]
                    loss_lda, loss_word2vec, loss, _ = self.sesh.run(
                        fetches, feed_dict=feed_dict)

                    if j > 5:
                        print(loss_lda, loss_word2vec, loss)  #py2

                    j += 1

                    if verbose and j % 1000 == 0:
                        msg = (
                            "J:{j:05d} E:{epoch:05d} L_nce:{l_word2vec:1.3e} "
                            "L_dirichlet:{l_lda:1.3e} R:{rate:1.3e}")

                        t1 = datetime.now()
                        dt = (t1 - t0).total_seconds()
                        rate = self.batch_size / dt
                        logs = dict(l_word2vec=loss_word2vec,
                                    epoch=epoch,
                                    j=j,
                                    l_lda=loss_lda,
                                    rate=rate)

                        print(msg.format(**logs))

                    if save and j % save_every == 0:
                        outfile = os.path.join(
                            outdir, "{}_lda2vec".format(self.datetime))
                        saver.save(self.sesh, outfile, global_step=self.step)

                    if summarize and j % summarize_every == 0:
                        summary = self.sesh.run(merged, feed_dict=feed_dict)
                        self.logger.add_summary(summary, global_step=self.step)

                    #if j % 100 == 0 and j > 100 and epoch > 1:
                    #    coherence = topic_coherence(top_words)
                    #    for j in range(n_topics):
                    #        print(j, coherence[(j, 'cv')])
                    #    kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
                    #    progress[str(epoch)] = pickle.dumps(kw)

                epoch += 1
                a = self.mixture.W.eval(session=self.sesh)
                b = self.mixture.factors.eval(session=self.sesh)
                c = self.sampler.W.eval(session=self.sesh)
                data = prepare_topics(a, b, c, words)
                print("------- epoch: {}-------\n".format(epoch))
                top_words = print_top_words_per_topic(data)
                data['doc_lengths'] = doc_lengths
                data['term_frequency'] = term_frequency
                np.savez('topics.pyldavis', **data)

            except (KeyboardInterrupt):
                break

        print("epoch", epoch)
        print("max", max_epochs)
        now = datetime.now().isoformat()[11:]
        print("------- Training end: {} -------\n".format(now))

        if save:
            outfile = os.path.join(outdir, "{}_lda2vec".format(self.datetime))
            saver.save(self.sesh, outfile, global_step=self.step)

        try:
            self.logger.flush()
            self.logger.close()
        except (AttributeError):  # not logging
            pass
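
The bookkeeping at the top of train (document lengths and term frequencies computed with np.unique(..., return_counts=True)) can be checked in isolation. A self-contained toy example:

import numpy as np

doc_ids = np.array([0, 0, 0, 1, 1, 3])        # token -> document index
flattened = np.array([5, 2, 5, 7, 2, 5])      # token -> vocabulary index
n_vocab = flattened.max() + 1

# Tokens per document; gaps (e.g. document 2) stay at zero.
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
doc_lengths[doc_idx] = lengths                # -> [3, 2, 0, 1]

# Corpus-wide frequency of every vocabulary item.
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype='int32')
term_frequency[tok_idx] = freq                # term_frequency[5] == 3
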
Example #7
model = NSLDA(counts, n_docs, n_topics, n_units, n_vocab)
if os.path.exists('nslda.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("nslda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / flattened.shape[0]
for epoch in range(50000000):
    p = cuda.to_cpu(model.proportions.W.data).copy()
    f = cuda.to_cpu(model.factors.W.data).copy()
    w = cuda.to_cpu(model.loss_func.W.data).copy()
    d = prepare_topics(p, f, w, words)
    print_top_words_per_topic(d)
    for (doc_ids, flat) in utils.chunks(batchsize, doc_id, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        rec, ld = model.forward(doc_ids, flat)
        l = rec + ld * fraction * strength
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{ld:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        ld.to_cpu()
        t1 = time.time()
        dt = t1 - t0
Example #8
if os.path.exists('lda2vec_hn.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("lda2vec_hn.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
for epoch in range(5000):
    print "Story topics"
    w = cuda.to_cpu(model.mixture_stories.weights.W.data).copy()
    f = cuda.to_cpu(model.mixture_stories.factors.W.data).copy()
    v = cuda.to_cpu(model.embed.W.data).copy()
    d = prepare_topics(w, f, v, words)
    print_top_words_per_topic(d)
    print "Author topics"
    w = cuda.to_cpu(model.mixture_authors.weights.W.data).copy()
    f = cuda.to_cpu(model.mixture_authors.factors.W.data).copy()
    d = prepare_topics(w, f, v, words)
    print_top_words_per_topic(d)
    for s, a, f in utils.chunks(batchsize, story_id, author_id, flattened):
        t0 = time.time()
        l = model.fit_partial(s.copy(), a.copy(), f.copy())
        prior = model.prior()
        loss = l + prior * fraction * clambda
        optimizer.zero_grads()
        loss.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
Example #9
for epoch in range(200):
    # After the first execution of the statement below, data.keys() =>
    # dict_keys(['vocab', 'doc_lengths', 'doc_topic_dists', 'topic_term_dists', 'term_frequency'])
    #
    # Also the data['vocab'] is mostly <OoV>
    # (Pdb) print(sum(x != '<OoV>' for x in data['vocab']), 'out of', len(data['vocab']), ' is NOT <OoV>')
    # 27 out of 5835  is NOT <OoV>
    #
    # Debug>>>
    # (Pdb) model.mixture.weights.W.data.shape -> (11314, 20) (weights)
    # (Pdb) model.mixture.factors.W.data.shape -> (20, 300) (factors -> factor_vector)
    # (Pdb) model.sampler.W.data.shape -> (5837, 300) (word_vectors)
    # (Pdb) len(words) -> 5837 (vocab)
    if gpu_id >= 0:
        data = prepare_topics(cuda.to_gpu(model.mixture.weights.W.data).copy(),
                              cuda.to_gpu(model.mixture.factors.W.data).copy(),
                              cuda.to_gpu(model.sampler.W.data).copy(),
                              words, normalize = False)
    else:
        data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                              cuda.to_cpu(model.mixture.factors.W.data).copy(),
                              cuda.to_cpu(model.sampler.W.data).copy(),
                              words, normalize = False)

    top_words = print_top_words_per_topic(data)

    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        for t in range(n_topics):
            print(t, coherence[(t, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
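
The shape notes above show what goes into prepare_topics: document weights (n_docs, n_topics), topic factors (n_topics, n_units) and word vectors (n_vocab, n_units). Conceptually the two distributions are softmaxes over those matrices; the sketch below is an assumption about what prepare_topics computes (the real function may normalize vectors and apply a temperature), not a copy of its implementation:

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def rough_prepare_topics(weights, factors, word_vectors, vocab):
    # (n_docs, n_topics): per-document topic proportions
    doc_topic_dists = softmax(weights)
    # (n_topics, n_vocab): similarity of each topic vector to each word vector
    topic_term_dists = softmax(factors.dot(word_vectors.T))
    return dict(doc_topic_dists=doc_topic_dists,
                topic_term_dists=topic_term_dists,
                vocab=vocab)
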
Example #10
    serializers.load_hdf5("lda2vec%3d.hdf5" % latest, model)
# model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')
steps = flattened.shape[0] // batchsize
print('steps per epoch: %d' % steps)
for epoch in range(5000):
    ts = prepare_topics(model.mixture_sty.weights.W.data,
                        model.mixture_sty.factors.W.data, model.sampler.W.data,
                        words)

    print_top_words_per_topic(ts)
    ts['doc_lengths'] = sty_len
    ts['term_frequency'] = term_frequency
    np.savez('topics.story.pyldavis', **ts)
    ta = prepare_topics(model.mixture_aut.weights.W.data,
                        model.mixture_aut.factors.W.data, model.sampler.W.data,
                        words)

    print_top_words_per_topic(ta)
    ta['doc_lengths'] = aut_len
    ta['term_frequency'] = term_frequency
    np.savez('topics.author.pyldavis', **ta)
    for s, a, f in utils.chunks(batchsize, story_id, author_id, flattened):