Example #1
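# Standard lda2vec/Chainer setup: move the model to the GPU and optimize with Adam,
# clipping gradient norms at 5.0 to keep updates stable.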
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')
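# Each epoch: dump the current topics for pyLDAvis, print and score the top words,
# checkpoint progress to a shelve file, then train on minibatches of (doc id, word) pairs.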
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    coherence = topic_coherence(top_words)
    for k in range(n_topics):
        print(k, coherence[(k, 'cv')])
    kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
    progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
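    # One minibatch step: fit_partial presumably accumulates the word-prediction loss,
    # while the Dirichlet prior is scaled by the batch's share of the corpus before backprop.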
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
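
Every loop in these excerpts draws minibatches through utils.chunks, which takes a batch size plus any number of aligned arrays. A minimal sketch of such a helper, assuming it simply slices along the first axis (the actual lda2vec helper may also shuffle the chunk order):

def chunks(batchsize, *arrays):
    """Yield aligned minibatch slices from each input array (sketch only)."""
    n = len(arrays[0])
    for start in range(0, n, batchsize):
        # the same slice is applied to every array so rows stay aligned
        yield tuple(a[start:start + batchsize] for a in arrays)
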
Example #2
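# Resume from an earlier HDF5 snapshot if one exists, then train on the GPU with Adam.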
if os.path.exists('lda.hdf5'):
    six.print_("Reloading from saved")
    serializers.load_hdf5("lda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / bow.shape[0]
for epoch in range(50000000):
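    # Every 100 epochs, copy proportions, factors, and embeddings back to the CPU
    # and print the top words per topic.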
    if epoch % 100 == 0:
        p = cuda.to_cpu(model.proportions.W.data).copy()
        f = cuda.to_cpu(model.factors.W.data).copy()
        w = cuda.to_cpu(model.embedding.W.data).copy()
        d = prepare_topics(p, f, w, words)
        print_top_words_per_topic(d)
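    # Minibatch update: the total loss is the reconstruction term plus the LDA term.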
    for (ids, batch) in utils.chunks(batchsize, np.arange(bow.shape[0]), bow):
        t0 = time.time()
        optimizer.zero_grads()
        rec, ld = model.forward(ids, batch)
        l = rec + ld
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{ld:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        ld.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
Example #3
    def infer(self, docs=None, epochs=200, update_words=False, update_topics=False, topic_vectors=None):
        """Infer the features of new documents by running the lda2vec
        algorithm again, updating only the topic distributions."""

        texts = docs
        docs = []
        for text in texts:
            docs.append(" ".join(word for word in text.split() if word in self.word2vec_model.vocab))

        logging.info("preprocessing")
        
        self.preprocess(docs)
        
        logging.info('preprocessed!')
        
        self.infer_model = LDA2Vec(n_documents=self.n_docs,
                                   n_document_topics=self.n_topics,
                                   n_units=300,
                                   n_vocab=self.n_vocab,
                                   counts=self.term_frequency,
                                   n_samples=15,
                                   power=self.power,
                                   temperature=self.temp)
        
        
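        # Start the inference model from the pretrained word vectors (if available)
        # and the topic factors learned during training.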
        if self.words_pretrained:
            self.infer_model.sampler.W.data = self.vectors[:self.n_vocab, :]

        self.infer_model.mixture.factors.W.data = self.train_model.mixture.factors.W.data
        if topic_vectors is not None:
            assert topic_vectors.shape == self.infer_model.mixture.factors.W.data.shape, "topic vectors shape doesn't match"
            self.infer_model.mixture.factors.W.data = topic_vectors


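        # Adam with gradient norms clipped at 5.0, as in the other training loops above.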
        optimizer = O.Adam()
        optimizer.setup(self.infer_model)
        clip = chainer.optimizer.GradientClipping(5.0)
        optimizer.add_hook(clip)

        j = 0
        msgs = defaultdict(list)
        for epoch in range(epochs):
            print "epoch : ",epoch
            data = prepare_topics(cuda.to_cpu(self.infer_model.mixture.weights.W.data).copy(),
                                  cuda.to_cpu(self.infer_model.mixture.factors.W.data).copy(),
                                  cuda.to_cpu(self.infer_model.sampler.W.data).copy(),
                                  self.words)
            top_words = print_top_words_per_topic(data)
            if j % 100 == 0 and j > 100:
                coherence = topic_coherence(top_words)
                for k in range(self.n_topics):
                    print(k, coherence[(k, 'cv')])
                kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
                #progress[str(epoch)] = pickle.dumps(kw)
            data['doc_lengths'] = self.doc_lengths
            data['term_frequency'] = self.term_frequency
            #np.savez('topics.pyldavis', **data)
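            # Minibatch pass over the documents being inferred; update_words and
            # update_topics control which parameter groups fit_partial may update.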
            for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
                t0 = time.time()
                optimizer.zero_grads()
                l = self.infer_model.fit_partial(d.copy(), f.copy(), update_words=update_words, update_topics=update_topics)
                prior = self.infer_model.prior()
                loss = prior * self.fraction
                loss.backward()
                optimizer.update()
                msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
                       "P:{prior:1.3e} R:{rate:1.3e}")
                prior.to_cpu()
                loss.to_cpu()
                t1 = time.time()
                dt = t1 - t0
                rate = self.batchsize / dt

                msgs["E"].append(epoch)
                msgs["L"].append(float(l.data))
                j += 1
            logs = dict(loss=float(l.data), epoch=epoch, j=j, prior=float(prior.data), rate=rate)
            print(msg.format(**logs))
            print("\n ================================= \n")
            #serializers.save_hdf5("lda2vec.hdf5", self.model)
            msgs["loss_per_epoch"].append(float(l))
        return data, msgs
Example #4
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')
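# Same setup as Example #1, but topic coherence is only computed every 100 minibatch
# steps (tracked by the counter j).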
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        for k in range(n_topics):
            six.print_(k, coherence[(k, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
        prior = model.prior()
        loss = prior * fraction
Example #5
    serializers.load_hdf5("lda2vec.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
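# Two document mixtures are trained jointly, one over stories and one over authors;
# each is dumped to its own pyLDAvis archive every epoch.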
for epoch in range(5000):
    ts = prepare_topics(cuda.to_cpu(model.mixture_sty.weights.W.data).copy(),
                        cuda.to_cpu(model.mixture_sty.factors.W.data).copy(),
                        cuda.to_cpu(model.sampler.W.data).copy(),
                        words)
    print_top_words_per_topic(ts)
    ts['doc_lengths'] = sty_len
    ts['term_frequency'] = term_frequency
    np.savez('topics.story.pyldavis', **ts)
    ta = prepare_topics(cuda.to_cpu(model.mixture_aut.weights.W.data).copy(),
                        cuda.to_cpu(model.mixture_aut.factors.W.data).copy(),
                        cuda.to_cpu(model.sampler.W.data).copy(),
                        words)
    print_top_words_per_topic(ta)
    ta['doc_lengths'] = aut_len
    ta['term_frequency'] = term_frequency
    np.savez('topics.author.pyldavis', **ta)
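    # Each minibatch carries aligned story ids, author ids, and flattened word indices.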
    for s, a, f in utils.chunks(batchsize, story_id, author_id, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(s.copy(), a.copy(), f.copy())
Example #6
    serializers.load_hdf5("lda2vec.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
for epoch in range(5000):
    ts = prepare_topics(
        cuda.to_cpu(model.mixture_sty.weights.W.data).copy(),
        cuda.to_cpu(model.mixture_sty.factors.W.data).copy(),
        cuda.to_cpu(model.sampler.W.data).copy(), words)
    print_top_words_per_topic(ts)
    ts['doc_lengths'] = sty_len
    ts['term_frequency'] = term_frequency
    np.savez('topics.story.pyldavis', **ts)
    ta = prepare_topics(
        cuda.to_cpu(model.mixture_aut.weights.W.data).copy(),
        cuda.to_cpu(model.mixture_aut.factors.W.data).copy(),
        cuda.to_cpu(model.sampler.W.data).copy(), words)
    print_top_words_per_topic(ta)
    ta['doc_lengths'] = aut_len
    ta['term_frequency'] = term_frequency
    np.savez('topics.author.pyldavis', **ta)
    for s, a, f in utils.chunks(batchsize, story_id, author_id, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(s.copy(), a.copy(), f.copy())
Example #7
    def train(
            self,
            doc_ids,
            flattened,
            vocab,
            words,  # vocab & words added so topics can be saved as npz during training
            max_epochs=np.inf,
            verbose=False,
            loss_switch_epochs=0,  # number of epochs until the LDA loss is switched on
            save=False,
            save_every=1000,
            outdir="./out",
            summarize=True,
            summarize_every=1000,
            metadata="metadata.tsv",
            metadata_docs="metadata.docs.tsv"):

        n_vocab = flattened.max() + 1
        # How many tokens are in each document
        doc_idx, lengths = np.unique(doc_ids, return_counts=True)
        doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
        doc_lengths[doc_idx] = lengths
        # Count all token frequencies
        tok_idx, freq = np.unique(flattened, return_counts=True)
        term_frequency = np.zeros(n_vocab, dtype='int32')
        term_frequency[tok_idx] = freq

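        # Optionally create the output directory and a TF Saver for periodic checkpoints.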
        if save:
            try:
                os.mkdir(outdir)
            except OSError as e:  #for Python 2
                if e.errno == errno.EEXIST:
                    pass
            saver = tf.train.Saver(tf.global_variables())
            outdir = os.path.abspath(self.log_dir)

        if summarize:
            try:
                self.logger.flush()
            except (AttributeError):  # not yet logging
                self.logger = tf.summary.FileWriter(self.log_dir,
                                                    self.sesh.graph)
            merged = self._addSummaries(metadata, metadata_docs)

        j = 0
        epoch = 0

        fraction = self.batch_size / len(flattened)  # == batch / n_corpus
        self.sesh.run(tf.assign(self.fraction, fraction))
        progress = shelve.open('progress.shelve')

        # turn on LDA loss after n iters of training
        iters_per_epoch = int(np.ceil(len(flattened) / self.batch_size))
        n = iters_per_epoch * loss_switch_epochs
        self.sesh.run(tf.assign(self.switch_loss, n))

        now = datetime.now().isoformat()[11:]
        print("------- Training begin: {} -------\n".format(now))

        while epoch < max_epochs:
            try:

                # doc_ids, word_idxs
                for d, f in utils.chunks(self.batch_size, doc_ids, flattened):
                    t0 = datetime.now()

                    feed_dict = self.make_feed_dict(d, f)

                    # if len(feed_dict[self.pivot_idxs]) == 0:
                    # 	print("Empty batch. Skipping...")
                    # 	continue

                    fetches = [
                        self.loss_lda, self.loss_word2vec, self.loss,
                        self.train_op
                    ]
                    loss_lda, loss_word2vec, loss, _ = self.sesh.run(
                        fetches, feed_dict=feed_dict)

                    if j > 5:
                        print(loss_lda, loss_word2vec, loss)  #py2

                    j += 1

                    if verbose and j % 1000 == 0:
                        msg = (
                            "J:{j:05d} E:{epoch:05d} L_nce:{l_word2vec:1.3e} "
                            "L_dirichlet:{l_lda:1.3e} R:{rate:1.3e}")

                        t1 = datetime.now()
                        dt = (t1 - t0).total_seconds()
                        rate = self.batch_size / dt
                        logs = dict(l_word2vec=loss_word2vec,
                                    epoch=epoch,
                                    j=j,
                                    l_lda=loss_lda,
                                    rate=rate)

                        print(msg.format(**logs))

                    if save and j % save_every == 0:
                        outfile = os.path.join(
                            outdir, "{}_lda2vec".format(self.datetime))
                        saver.save(self.sesh, outfile, global_step=self.step)

                    if summarize and j % summarize_every == 0:
                        summary = self.sesh.run(merged, feed_dict=feed_dict)
                        self.logger.add_summary(summary, global_step=self.step)

                    #if j % 100 == 0 and j > 100 and epoch > 1:
                    #    coherence = topic_coherence(top_words)
                    #    for j in range(n_topics):
                    #        print(j, coherence[(j, 'cv')])
                    #    kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
                    #    progress[str(epoch)] = pickle.dumps(kw)

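                # End of epoch: evaluate the document mixture, topic factors, and word
                # vectors from the session and refresh the pyLDAvis dump.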
                epoch += 1
                a = self.mixture.W.eval(session=self.sesh)
                b = self.mixture.factors.eval(session=self.sesh)
                c = self.sampler.W.eval(session=self.sesh)
                data = prepare_topics(a, b, c, words)
                print("------- epoch: {}-------\n".format(epoch))
                top_words = print_top_words_per_topic(data)
                data['doc_lengths'] = doc_lengths
                data['term_frequency'] = term_frequency
                np.savez('topics.pyldavis', **data)

            except (KeyboardInterrupt):
                break

        print("epoch", epoch)
        print("max", max_epochs)
        now = datetime.now().isoformat()[11:]
        print("------- Training end: {} -------\n".format(now))

        if save:
            outfile = os.path.join(outdir, "{}_lda2vec".format(self.datetime))
            saver.save(self.sesh, outfile, global_step=self.step)

        try:
            self.logger.flush()
            self.logger.close()
        except (AttributeError):  # not logging
            pass
Example #8
model = NSLDA(counts, n_docs, n_topics, n_units, n_vocab)
if os.path.exists('nslda.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("nslda.hdf5", model)
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)

j = 0
fraction = batchsize * 1.0 / flattened.shape[0]
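# Print the current topic-word summaries every epoch, then update on minibatches.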
for epoch in range(50000000):
    p = cuda.to_cpu(model.proportions.W.data).copy()
    f = cuda.to_cpu(model.factors.W.data).copy()
    w = cuda.to_cpu(model.loss_func.W.data).copy()
    d = prepare_topics(p, f, w, words)
    print_top_words_per_topic(d)
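    # The LDA term is scaled by the batch fraction and by a strength factor defined
    # elsewhere in the script.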
    for (doc_ids, flat) in utils.chunks(batchsize, doc_id, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        rec, ld = model.forward(doc_ids, flat)
        l = rec + ld * fraction * strength
        l.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{rec:1.3e} "
               "P:{ld:1.3e} R:{rate:1.3e}")
        l.to_cpu()
        rec.to_cpu()
        ld.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
Example #9
    # (Pdb) model.mixture.weights.W.data.shape -> (11314, 20) (weights)
    # (Pdb) model.mixture.factors.W.data.shape -> (20, 300) (factors -> factor_vector)
    # (Pdb) model.sampler.W.data.shape -> (5837, 300) (word_vectors)
    # (Pdb) len(words) -> 5837 (vocab)
    if gpu_id >= 0:
        # weights live on the GPU, so copy them back to NumPy for prepare_topics
        data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                              cuda.to_cpu(model.mixture.factors.W.data).copy(),
                              cuda.to_cpu(model.sampler.W.data).copy(),
                              words, normalize=False)
    else:
        data = prepare_topics(model.mixture.weights.W.data.copy(),
                              model.mixture.factors.W.data.copy(),
                              model.sampler.W.data.copy(),
                              words, normalize=False)

    top_words = print_top_words_per_topic(data)

    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        for k in range(n_topics):
            print(k, coherence[(k, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)

    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)

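    # Skip-gram context window of five tokens; word2vec_only would presumably disable
    # the topic loss for early epochs.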
    window_size = 5
    word2vec_only = False
    # word2vec_only = epoch <= 5
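
Each example converts the trained weight matrices back to NumPy and hands them to prepare_topics before writing a pyLDAvis archive. A rough sketch of the kind of conversion involved, assuming row-wise softmaxes over the document weights and over the topic-word similarities (the real lda2vec function returns more fields and handles the normalize flag seen in Example #9):

import numpy as np

def softmax_rows(x):
    # numerically stable row-wise softmax
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def prepare_topics_sketch(weights, factors, word_vectors, vocab):
    """Illustrative only; not the exact lda2vec prepare_topics."""
    # document-topic proportions from per-document topic weights
    doc_topic_dists = softmax_rows(weights)
    # topic-word distributions from topic vectors dotted with word vectors
    topic_term_dists = softmax_rows(np.dot(factors, word_vectors.T))
    return dict(doc_topic_dists=doc_topic_dists,
                topic_term_dists=topic_term_dists,
                vocab=vocab)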