Example #1
    def gen_lda_model(self,
                      vectorizer,
                      n_topics,
                      finalCorpus,
                      update_mat=False):
        #vectorizer = gen_feature_vectorizer(self,t)
        t0 = time()
        #print(finalCorpus)
        if update_mat or self.tfMatrix is None:  # don't re-create the term matrix every time we generate an lda_model (e.g. for different topic numbers),
            # unless this is a new vectorizer or a new corpus
            t0 = time()
            #print('[gen_lda_model] finalCorpus:', len(finalCorpus), type(finalCorpus), len(finalCorpus[0]), finalCorpus.columns, "||", finalCorpus[0][1])
            #try this
            #finalCorpus = ['This is the first document.', 'This is the second second document.', 'And the third one.','Is this the first document?']
            #finalCorpus = finalCorpus.iloc[0:5]
            self.tfMatrix = vectorizer.fit_transform(finalCorpus)
            #transformer = TfidfTransformer()               # this would be a substitute for TfidfVectorizer, but already using it...
            #self.tfMatrix = transformer.fit_transform(TermDocMatrix)
            print("[tffeature]: gen_lda_model: transform done in %0.3fs." %
                  (time() - t0))
            print("[tffeature]: gen_lda_model: tfMatrix shape:",
                  self.tfMatrix.shape)
            self.tfMatrix = normalize(self.tfMatrix, norm='l1', axis=1)

        # rule of thumb:
        # alpha >> beta - simpler topic distributions, with more topic weight appearing in documents
        # alpha << beta - more complicated topic distributions, with less topic weight appearing in documents
        # alpha, beta small - concentration in simpler topics, but also fewer topics per document
        # alpha, beta big   - concentration in complicated topics, but many topics per document
        # default alpha = beta = 1/n_topics (which, depending on the number of topics, can be big or small)
        alpha = 1.0 / float(n_topics)
        beta = (1.0 / float(n_topics))**2
        #beta = 1.0/float(n_topics)

        lda_model = LatentDirichletAllocation(n_components=n_topics,
                                              max_iter=20,
                                              learning_method='online',
                                              random_state=0,
                                              n_jobs=-1)
        t0 = time()
        lda_model.fit(self.tfMatrix)
        print("[tffeature]: gen_lda_model: topics done in %0.3fs." %
              (time() - t0))
        print("\n[tffeature]: gen_lda_model: Topics in LDA model:")
        print_top_words(lda_model, vectorizer.get_feature_names(), 20)

        return lda_model
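Note that alpha and beta are computed above but never passed to the model, so scikit-learn falls back to its default priors of 1/n_components. A minimal sketch of how they could be supplied, assuming the same settings as above (treating beta as the topic-word prior is this snippet's own convention, not a scikit-learn requirement):

from sklearn.decomposition import LatentDirichletAllocation

n_topics = 10
alpha = 1.0 / n_topics          # doc-topic prior, per the rule of thumb above
beta = (1.0 / n_topics) ** 2    # topic-word prior

lda_model = LatentDirichletAllocation(n_components=n_topics,
                                      doc_topic_prior=alpha,
                                      topic_word_prior=beta,
                                      max_iter=20,
                                      learning_method='online',
                                      random_state=0,
                                      n_jobs=-1)
# lda_model.fit(tfMatrix)  # tfMatrix as produced by the vectorizer above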
Example #2
def main():
    # Report url
    all_reports_url = 'http://www.gov.cn/guowuyuan/baogao.htm'
    x = html_parse(gov_url=all_reports_url)
#    plot_word_cloud(x.corpus[2],max_words = 30)
    
    # LDA
    n_features = 1000
    # Remove stop words
    stop_word_txt = "中文停用词表.txt"
    with open(stop_word_txt, 'rb') as fp:
        stopword = fp.read().decode('utf-8')
    stopword_list = stopword.splitlines()

    tf_vectorizer = CountVectorizer(stop_words=stopword_list, max_features=n_features)
    tf = tf_vectorizer.fit_transform(x.corpus)
    n_topics = 5
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=500,learning_method='batch')
    lda.fit(tf) 
    
    # Get feature name
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words= 20)
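The scikit-learn based snippets on this page call a print_top_words helper that is not shown here (the utils.print_top_words used by the TensorFlow examples has a different signature). A minimal sketch consistent with how it is called, and with the list-of-lists return value that Example #8 consumes, could look like this; the actual projects' versions may differ:

def print_top_words(model, feature_names, n_top_words):
    top_words = []
    for topic_idx, topic in enumerate(model.components_):
        # indices of the n_top_words largest weights, in descending order
        words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #%d: %s" % (topic_idx, " ".join(words)))
        top_words.append(words)
    return top_words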
Example #3
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default="configs/1k.yaml",
                        help="Which configuration to use. See into 'config' folder")

    opt = parser.parse_args()

    with open(opt.config, 'r') as ymlfile:
        config = yaml.load(ymlfile, Loader=yaml.FullLoader)
    print(config)
    # dataset
    config_dataset = config['dataset']
    data_path = osp.join(
        config_dataset['folder-path'], config_dataset['data-file'])
    vocab_path = osp.join(
        config_dataset['folder-path'], config_dataset['vocab-file'])
    seedword_path = osp.join(
        config_dataset['folder-path'], config_dataset['sw-file'])
    labels = config_dataset['labels']

    # model
    config_model = config['model']
    n_encoder_1 = config_model['n_encoder_1']
    n_encoder_2 = config_model['n_encoder_2']
    n_latent = config_model['n_latent']
    dr = config_model['dropout']
    ld = config_model['lambda']
    al = config_model['alpha']

    # training
    config_training = config['training']
    lr = config_training['lr']
    bs = config_training['bs']
    d_step = config_training['d_step']
    epochs = config_training['epochs']
    n_topwords = config_training['n_topwords']
    ratio = config_training['ratio']
    exp = config_training['exp']
    result = config_training['result']
    write = config_training['write']

    # create result folders
    os.makedirs(result, exist_ok=True)

    dataset = np.load(data_path)
    with open(vocab_path, 'rb') as vocabfile:
        vocab = pickle.load(vocabfile)

    dataset_x, dataset_y = dataset[:, 0], dataset[:, 1]
    id_vocab = utils.sort_values(vocab)
    vocab_size = len(vocab)
    seedwords = utils.read_seedword(seedword_path)
    tfms_x = [Onehotify(vocab_size)]
    tfms_y = []

    # Split data
    sss = StratifiedShuffleSplit(n_splits=exp, test_size=ratio, random_state=0)
    splitted_data = sss.split(dataset_x, dataset_y)

    gamma_prior = np.zeros((vocab_size, n_latent))
    gamma_prior_batch = np.zeros((bs, vocab_size, n_latent))
    for idx_topic, seed_topic in enumerate(seedwords):
        for idx_word, seed_word in enumerate(seed_topic):
            idx_vocab = vocab[seed_word]
            gamma_prior[idx_vocab, idx_topic] = 1.0  # V x K
            gamma_prior_batch[:, idx_vocab, :] = 1.0  # N x V x K

    model = AVIAD(n_encoder_1, n_encoder_2, 
                    vocab_size, n_latent, 
                    gamma_prior=gamma_prior, ld=ld, al=al, lr=lr, dr=dr)

    for ds_train_idx, ds_test_idx in splitted_data:
        train_ds, test_ds = URSADataset(dataset_x[ds_train_idx], dataset_y[ds_train_idx], tfms_x, tfms_y), \
            URSADataset(dataset_x[ds_test_idx], dataset_y[ds_test_idx], tfms_x, tfms_y)
        
        train_dl, test_dl = DataLoader(train_ds, bs, False), DataLoader(test_ds, bs, False)
        beta = None

        for epoch in range(epochs):
            avg_cost = 0.
            sum_t_c = 0.

            for batch_train_x,  batch_train_y in train_dl:
                t_c = time.time()
                cost, beta = model.partial_fit(batch_train_x, gamma_prior_batch)
                c_elap = time.time() - t_c

                # Compute average loss
                avg_cost += cost / len(train_ds) * bs

                # Compute avg time
                sum_t_c += c_elap

                if np.isnan(avg_cost):
                    print('Encountered NaN, stopping training. Please check the learning_rate settings and the momentum.')
                    sys.exit()

            # Display logs per epoch step
            if (epoch + 1) % d_step == 0:
                print("##################################################")
                print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
                utils.print_top_words(epoch + 1, beta, id_vocab, n_topwords, result, write)
                print("##################################################")

            print("epoch={}, cost={:.9f}, sum_t_c={:.2f}".format((epoch + 1), avg_cost, sum_t_c))

        gamma = model.gamma_test()
        utils.print_gamma(gamma, vocab, seedwords)
        utils.classification_evaluate_dl(model, train_dl, n_latent, labels, show=True)
        utils.classification_evaluate_dl(model, test_dl, n_latent, labels, show=True)
        utils.print_top_words(epoch + 1, beta, id_vocab, n_topwords, result, write)
        utils.calc_perp(model, test_dl, gamma_prior_batch)  
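For reference, the YAML file read by Example #3 must provide at least the keys accessed above. Written as the Python dict that yaml.load would return, a hypothetical configuration could look like this (all values are placeholders, not from the original project):

config = {
    'dataset': {
        'folder-path': 'data/',          # placeholder paths
        'data-file': 'dataset.npy',
        'vocab-file': 'vocab.pkl',
        'sw-file': 'seedwords.txt',
        'labels': ['neg', 'pos'],
    },
    'model': {
        'n_encoder_1': 100, 'n_encoder_2': 100, 'n_latent': 10,
        'dropout': 0.6, 'lambda': 1.0, 'alpha': 1.0,
    },
    'training': {
        'lr': 0.001, 'bs': 128, 'd_step': 10, 'epochs': 100,
        'n_topwords': 10, 'ratio': 0.2, 'exp': 1,
        'result': 'results/', 'write': True,
    },
}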
Example #4
def train(sess, model,
          train_url,
          test_url,
          batch_size,
          vocab_size,
          training_epochs=200,
          alternate_epochs=1,  # 10
          lexicon=[],
          result_file='test.txt',
          B=1,
          warm_up_period=100):
  """Train the NVDM model."""
  train_set, train_count = utils.data_set(train_url)
  test_set, test_count = utils.data_set(test_url)
  # hold-out development dataset
  train_size = len(train_set)
  validation_size = int(train_size * 0.1)
  dev_set = train_set[:validation_size]
  dev_count = train_count[:validation_size]
  train_set = train_set[validation_size:]
  train_count = train_count[validation_size:]
  print('sizes', train_size, validation_size, len(dev_set), len(train_set))
  optimize_jointly = True
  dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
  test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False)

  warm_up = 0
  start_min_alpha = 0.00001
  min_alpha = start_min_alpha
  warm_up_alpha = False
  start_B = 4
  curr_B = B

  # for early stopping
  best_print_ana_ppx = 1e10
  early_stopping_iters = 30
  no_improvement_iters = 0
  stopped = False
  epoch = -1
  #for epoch in range(training_epochs):
  while not stopped:
    epoch += 1
    train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)
    if warm_up < 1.:
      warm_up += 1. / warm_up_period
    else:
      warm_up = 1.
   
    # train
    #for switch in range(0, 2):  # alternating optimization (disabled); re-enable to use the branches below
    if optimize_jointly:
      optim = model.optim_all
      print_mode = 'updating encoder and decoder'
    elif switch == 0:  # 'switch' is only defined when the alternating loop above is enabled
      optim = model.optim_dec
      print_mode = 'updating decoder'
    else:
      optim = model.optim_enc
      print_mode = 'updating encoder'
    for i in range(alternate_epochs):
      loss_sum = 0.0
      ana_loss_sum = 0.0
      ppx_sum = 0.0
      kld_sum = 0.0
      ana_kld_sum = 0.0
      word_count = 0
      doc_count = 0
      recon_sum=0.0
      for idx_batch in train_batches:
        data_batch, count_batch, mask = utils.fetch_data(
        train_set, train_count, idx_batch, vocab_size)
        input_feed = {model.x.name: data_batch,
                      model.mask.name: mask,
                      model.keep_prob.name: 0.75,
                      model.warm_up.name: warm_up,
                      model.min_alpha.name: min_alpha,
                      model.B.name: curr_B}
        _, (loss, recon, kld, ana_loss, ana_kld) = sess.run(
            (optim,
             [model.true_objective, model.recons_loss, model.kld,
              model.analytical_objective, model.analytical_kld]),
            input_feed)
        loss_sum += np.sum(loss)
        ana_loss_sum += np.sum(ana_loss)
        kld_sum += np.sum(kld) / np.sum(mask) 
        ana_kld_sum += np.sum(ana_kld) / np.sum(mask)
        word_count += np.sum(count_batch)
        # to avoid nan error
        count_batch = np.add(count_batch, 1e-12)
        # per document loss
        ppx_sum += np.sum(np.divide(loss, count_batch)) 
        doc_count += np.sum(mask)
        recon_sum+=np.sum(recon)
      print_loss = recon_sum/len(train_batches)
      dec_vars = utils.variable_parser(tf.trainable_variables(), 'decoder')
      phi = dec_vars[0]
      phi = sess.run(phi)
      utils.print_top_words(phi, lexicon,result_file=None)
      print_ppx = np.exp(loss_sum / word_count)
      print_ana_ppx = np.exp(ana_loss_sum / word_count)
      print_ppx_perdoc = np.exp(ppx_sum / doc_count)
      print_kld = kld_sum/len(train_batches)
      print_ana_kld = ana_kld_sum/len(train_batches)
      

      print('| Epoch train: {:d} |'.format(epoch+1), 
               print_mode, '{:d}'.format(i),
               '| Corpus ppx: {:.5f}'.format(print_ppx),  # perplexity for all docs
               '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity for per doc
               '| KLD: {:.5}'.format(print_kld),
               '| Loss: {:.5}'.format(print_loss),
               '| ppx anal.: {:.5f}'.format(print_ana_ppx),
               '| KLD anal.: {:.5f}'.format(print_ana_kld))
    if warm_up_alpha:
      if min_alpha>0.0001:
        min_alpha-=(start_min_alpha-0.0001)/training_epochs
    #-------------------------------
    # dev
    loss_sum = 0.0
    kld_sum = 0.0
    ppx_sum = 0.0
    word_count = 0
    doc_count = 0
    recon_sum=0.0
    print_ana_ppx = 0.0
    ana_loss_sum = 0.0
    for idx_batch in dev_batches:
      data_batch, count_batch, mask = utils.fetch_data(
          dev_set, dev_count, idx_batch, vocab_size)
      input_feed = {model.x.name: data_batch,
                    model.mask.name: mask,
                    model.keep_prob.name: 1.0,
                    model.warm_up.name: 1.0,
                    model.min_alpha.name: min_alpha,
                    model.B.name: B}
      loss, recon, kld, ana_loss = sess.run(
          [model.objective, model.recons_loss, model.analytical_kld, model.analytical_objective],
          input_feed)
      loss_sum += np.sum(loss)
      ana_loss_sum += np.sum(ana_loss)
      kld_sum += np.sum(kld) / np.sum(mask)  
      word_count += np.sum(count_batch)
      count_batch = np.add(count_batch, 1e-12)
      ppx_sum += np.sum(np.divide(loss, count_batch))
      doc_count += np.sum(mask) 
      recon_sum+=np.sum(recon)
    print_ana_ppx = np.exp(ana_loss_sum / word_count)
    print_ppx = np.exp(loss_sum / word_count)
    print_ppx_perdoc = np.exp(ppx_sum / doc_count)
    print_kld = kld_sum/len(dev_batches)
    print_loss = recon_sum/len(dev_batches)
    if print_ana_ppx < best_print_ana_ppx:
      no_improvement_iters = 0
      best_print_ana_ppx = print_ana_ppx
      # perplexity improved on the validation set -> save the improved model
      tf.train.Saver().save(sess, 'models/improved_model_bernoulli')
    else:
      no_improvement_iters += 1
      print('no_improvement_iters', no_improvement_iters, 'best ppx', best_print_ana_ppx)
      if no_improvement_iters >= early_stopping_iters:
          # the model has not improved for early_stopping_iters epochs -> stop training
          stopped = True
          print('stop training after', epoch, 'iterations, no_improvement_iters', no_improvement_iters)
          # reload the best stored model
          print('load stored model')
          tf.train.Saver().restore(sess, 'models/improved_model_bernoulli')
          
    print('| Epoch dev: {:d} |'.format(epoch+1), 
           '| Perplexity: {:.9f}'.format(print_ppx),
           '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
           '| KLD: {:.5}'.format(print_kld)  ,
           '| Loss: {:.5}'.format(print_loss))  
    #-------------------------------
    # test
    #if epoch%10==0 or epoch==training_epochs-1:
    if FLAGS.test:
      #if epoch==training_epochs-1:
      if stopped:
        #only do it once in the end
        coherence=utils.topic_coherence(test_set,phi, lexicon)
        print('topic coherence',str(coherence))
      loss_sum = 0.0
      kld_sum = 0.0
      ppx_sum = 0.0
      word_count = 0
      doc_count = 0
      recon_sum = 0.0
      ana_loss_sum = 0.0
      ana_kld_sum = 0.0
      for idx_batch in test_batches:
        data_batch, count_batch, mask = utils.fetch_data(
          test_set, test_count, idx_batch, vocab_size)
        input_feed = {model.x.name: data_batch,
                      model.mask.name: mask,
                      model.keep_prob.name: 1.0,
                      model.warm_up.name: 1.0,
                      model.min_alpha.name: min_alpha,
                      model.B.name: B}
        loss, recon, kld, ana_loss, ana_kld = sess.run(
            [model.objective, model.recons_loss, model.kld,
             model.analytical_objective, model.analytical_kld],
            input_feed)
        loss_sum += np.sum(loss)
        kld_sum += np.sum(kld)/np.sum(mask) 
        ana_loss_sum += np.sum(ana_loss)
        ana_kld_sum += np.sum(ana_kld) / np.sum(mask)
        word_count += np.sum(count_batch)
        count_batch = np.add(count_batch, 1e-12)
        ppx_sum += np.sum(np.divide(loss, count_batch))
        doc_count += np.sum(mask) 
        recon_sum+=np.sum(recon)
      print_loss = recon_sum/len(test_batches)
      print_ppx = np.exp(loss_sum / word_count)
      print_ppx_perdoc = np.exp(ppx_sum / doc_count)
      print_kld = kld_sum/len(test_batches)
      print_ana_ppx = np.exp(ana_loss_sum / word_count)
      print_ana_kld = ana_kld_sum / len(test_batches)
      print('| Epoch test: {:d} |'.format(epoch+1), 
             '| Perplexity: {:.9f}'.format(print_ppx),
             '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
             '| KLD: {:.5}'.format(print_kld),
             '| Loss: {:.5}'.format(print_loss),
             '| ppx anal.: {:.5f}'.format(print_ana_ppx),
             '| KLD anal.: {:.5f}'.format(print_ana_kld))
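The perplexity figures logged in the train, dev, and test blocks above all follow the same bookkeeping; a small standalone restatement for reference (function names are mine, not the repository's):

import numpy as np

def corpus_perplexity(loss_sum, word_count):
    # exp of the average negative log-likelihood per word, as in print_ppx above
    return np.exp(loss_sum / word_count)

def per_doc_perplexity(losses, counts):
    # as in print_ppx_perdoc above: average the per-document loss / word-count
    # ratios first, then exponentiate
    losses = np.asarray(losses, dtype=float)
    counts = np.asarray(counts, dtype=float) + 1e-12  # same nan guard as above
    return np.exp(np.sum(losses / counts) / len(losses))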
Example #5
    clf = LatentDirichletAllocation(n_components=n_topics)
    count_vect = CountVectorizer(stop_words=stop_words)

    print('Fitting count_vectorizer')
    mat = count_vect.fit_transform(mc.get_text())
    print('Done!')
    print('-'*100)

    print('Fitting LDA model')
    clf.fit(mat)
    print('Done!')
    print('-'*100)

    print('Perplexity = {}'.format(clf.perplexity(mat)))
    print('-'*100)

    print('Topics:')
    print_top_words(clf, count_vect.get_feature_names(), 10)
    print('-'*100)

    mc.disconnect()

    print('Saving to ' + save_path)
    if not exists(save_path):
        print('Path doesn\'t exist, creating one')
        makedirs(save_path)
    with open(save_path + 'lda_model_' + str(int(time())) + '.pickle', 'wb') as f:
        pickle.dump(clf, f)
    print('-'*100)
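A hedged follow-up sketch for loading the model pickled above back for inspection; the file name and timestamp are placeholders, not from the original:

import pickle

model_path = 'lda_models/lda_model_1514764800.pickle'  # hypothetical path/timestamp
with open(model_path, 'rb') as f:
    clf = pickle.load(f)
print(clf.components_.shape)  # (n_topics, vocabulary size)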
Example #6
    print("preprocessing data")
    df = utils.preprocess_data(df, analyzer, tt)
    df.to_csv("data/tesi_US_preprocessed.csv", index=None)
else:
    print("loading preprocessed data")
    df = pd.read_csv("data/tesi_US_preprocessed.csv")

print("training vectorizer")
TDmat = cv.fit_transform(df['preprocessed'])
joblib.dump(cv, "models/cv_{}.pkl".format(n_features))

if isinstance(n_topics, list):
    topic_numbers = n_topics
else:
    topic_numbers = [n_topics]

for num in topic_numbers:
    lda = LatentDirichletAllocation(n_components=num,
                                    max_iter=12,
                                    learning_method='online',
                                    learning_offset=30.,
                                    random_state=0,
                                    n_jobs=6)
    print("training lda with {} topics".format(num))
    lda.fit(TDmat)  # TDmat is the same matrix cv.fit_transform produced above
    utils.print_top_words(lda, cv.get_feature_names(), n_top_words)

    joblib.dump(lda, "models/lda_{}_{}.pkl".format(num, n_features))
    utils.visualize_lda(lda, TDmat, cv, True,
                        "html/lda_{}_{}.html".format(num, n_features))
Example #7
    def train(self):
        logging.info('Training...')

        best_hr = [0.]
        best_ndcg = [0.]

        # file to store the generated topic words
        if os.path.exists('res/topics_' + self.dataset + '.txt'):
            os.remove('res/topics_' + self.dataset + '.txt')
            logging.info('Successfully removed existing topic file!')

        batch_num = int(len(self.train_pairs) / self.batch_size) + 1
        for epoch in range(self.max_epoch):

            if (epoch + 1) % 20 == 0:
                self.init_lr = self.init_lr * self.lr_decay

            logging.info('Training at epoch ' + str(epoch + 1) + ' ...')

            loss_total, gen_loss_total, latent_loss_total, reg_loss_total, cf_loss_total = 0., 0., 0., 0., 0.
            for batch in range(batch_num):
                batch_data, batch_labels = utils.load_batch(
                    self.train_pairs, self.train_labels, batch,
                    self.batch_size)
                batch_data = np.transpose(batch_data)
                docu1 = batch_data[0]
                docu2 = batch_data[1]

                docus = np.concatenate((docu1, docu2), axis=0)
                get_emb = [
                    list(range(len(docu1))),
                    list(range(len(docu1),
                               len(docu1) + len(docu2)))
                ]

                feed_dict = {
                    self.batch_data: self.doc_contents[docus],
                    self.batch_labels: np.array(batch_labels),
                    self.learning_rate: self.init_lr,
                    self.keep_prob: 1.,
                    self.get_emb: get_emb
                }

                _, loss_tmp, gen_loss_tmp, latent_loss_tmp, reg_loss_tmp, cf_loss_tmp = self.sess.run(
                    (self.train_op, self.loss, self.gen_loss, self.latent_loss,
                     self.reg_loss, self.cf_loss),
                    feed_dict=feed_dict)

                loss_total += loss_tmp
                gen_loss_total += gen_loss_tmp
                latent_loss_total += latent_loss_tmp
                reg_loss_total += reg_loss_tmp
                cf_loss_total += cf_loss_tmp

            if (epoch + 1) % self.trained_print_step == 0:
                logging.info(
                    'Epoch {0}: avg batch loss = {1}, gen loss = {2}, latent loss = {3}, reg loss = {4}, cf loss = {5}\n'
                    .format(epoch + 1, loss_total / batch_num,
                            gen_loss_total / batch_num,
                            latent_loss_total / batch_num,
                            reg_loss_total / batch_num,
                            1000. * cf_loss_total / batch_num))

            if (epoch + 1) % self.test_step == 0:
                logging.info('Testing at epoch ' + str(epoch + 1) + ' ...')

                z_test = self.sess.run(self.z,
                                       feed_dict={
                                           self.batch_data: self.doc_contents,
                                           self.keep_prob: 1.0
                                       })
                feed_dict = {self.z_test: z_test, self.keep_prob: 1.0}

                # ave_rank, ave_auc = self._auc_test(feed_dict)
                # logging.info('ave rank = ' + str(ave_rank) + ', ave auc = ' + str(ave_auc) + '\n')

                hits, ndcgs = self._hit_test(feed_dict)
                logging.info('HR = ' + str(hits))
                logging.info('NDCGS = ' + str(ndcgs) + '\n')
                if best_hr[-1] < hits[-1]:
                    best_hr = hits
                if best_ndcg[-1] < ndcgs[-1]:
                    best_ndcg = ndcgs

            if (epoch + 1) % self.print_words_step == 0:
                utils.print_top_words(self.sess.run(self.weights_words),
                                      self.vocab, self.dataset)

        logging.info('BEST HR = ' + str(best_hr))
        logging.info('BEST NDCGS = ' + str(best_ndcg) + '\n\n\n')
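The HR and NDCG values logged above come from the class's own _hit_test, which is not shown; for reference, a standard leave-one-out formulation of the two metrics for a single ranked list (an illustrative sketch, not this repository's implementation):

import numpy as np

def hit_ratio_at_k(ranked_items, positive_item, k):
    # 1 if the held-out positive item appears in the top-k ranking, else 0
    return 1.0 if positive_item in ranked_items[:k] else 0.0

def ndcg_at_k(ranked_items, positive_item, k):
    # discounted gain of the single positive item, 0 if it misses the top-k
    topk = list(ranked_items[:k])
    if positive_item in topk:
        return 1.0 / np.log2(topk.index(positive_item) + 2)
    return 0.0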
Example #8
    def get_top_N_topic_terms(self, N, lda_model, vectorizer):
        top_words_list = print_top_words(lda_model,
                                         vectorizer.get_feature_names(), N)
        flat_top_words_list = set(
            [item for sublist in top_words_list for item in sublist])
        return list(flat_top_words_list)
Example #9
    def _train_all(self, train_doc, x_valid):

        logging.info('Combined pre-training...')

        tf.reset_default_graph()
        input_dim = train_doc.shape[1]  # the same as self.layers_list[-1]

        with tf.variable_scope('inference'):
            rec = {
                'W1':
                tf.get_variable(name='W1',
                                initializer=tf.constant(
                                    self.encode_weights[0]),
                                dtype=tf.float32),
                'b1':
                tf.get_variable(name='b1',
                                initializer=tf.constant(self.encode_biases[0]),
                                dtype=tf.float32)
            }
            for layer_id in range(1, len(self.layers_list)):
                key_w = 'W' + str(layer_id + 1)
                key_b = 'b' + str(layer_id + 1)
                rec[key_w] = tf.get_variable(
                    name=key_w,
                    initializer=tf.constant(self.encode_weights[layer_id]),
                    dtype=tf.float32)
                rec[key_b] = tf.get_variable(name=key_b,
                                             initializer=tf.constant(
                                                 self.encode_biases[layer_id]),
                                             dtype=tf.float32)

            rec['W_z_mean'] = tf.get_variable(name='W_z_mean',
                                              initializer=tf.constant(
                                                  self.encode_weights[-2]),
                                              dtype=tf.float32)
            rec['b_z_mean'] = tf.get_variable(name='b_z_mean',
                                              initializer=tf.constant(
                                                  self.encode_biases[-2]),
                                              dtype=tf.float32)
            rec['W_z_log_sigma'] = tf.get_variable(
                name='W_z_log_sigma',
                initializer=tf.constant(self.encode_weights[-1]),
                dtype=tf.float32)
            rec['b_z_log_sigma'] = tf.get_variable(name='b_z_log_sigma',
                                                   initializer=tf.constant(
                                                       self.encode_biases[-1]),
                                                   dtype=tf.float32)

        with tf.variable_scope('generation'):
            gen = {}
            key_w = 'Wz'
            key_b = 'bz'
            gen[key_w] = tf.get_variable(name=key_w,
                                         initializer=tf.constant(
                                             self.decode_weights[-1]),
                                         dtype=tf.float32)
            gen[key_b] = tf.get_variable(name=key_b,
                                         initializer=tf.constant(
                                             self.decode_biases[-1]),
                                         dtype=tf.float32)
            for layer_id in reversed(range(1, len(self.layers_list))):
                key_w = 'W' + str(layer_id + 1)
                key_b = 'b' + str(layer_id + 1)
                gen[key_w] = tf.transpose(rec[key_w])
                gen[key_b] = rec['b' + str(layer_id)]

            gen['W1'] = tf.transpose(rec['W1'])
            gen['b1'] = tf.get_variable(
                'b1',
                shape=[input_dim],
                initializer=tf.constant_initializer(0.),
                dtype=tf.float32)

        for key in rec:
            self.weights.append(rec[key])

        self.weights += [gen['Wz'], gen['bz'], gen['b1']]
        self.saver = tf.train.Saver(self.weights)

        doc_x_ = tf.placeholder(name='doc_x_',
                                shape=[None, input_dim],
                                dtype=tf.float32)
        net = utils.activate(tf.matmul(doc_x_, rec['W1']) + rec['b1'],
                             activator=self.activations[0])
        learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate')

        for layer_id in range(1, len(self.layers_list)):
            key_w = 'W' + str(layer_id + 1)
            key_b = 'b' + str(layer_id + 1)
            net = utils.activate(tf.matmul(net, rec[key_w]) + rec[key_b],
                                 activator=self.activations[layer_id])

        z_mean = tf.matmul(net, rec['W_z_mean']) + rec['b_z_mean']
        z_log_sigma_sq = tf.matmul(net,
                                   rec['W_z_log_sigma']) + rec['b_z_log_sigma']

        eps = tf.random_normal((self.batch_size, self.hidden_dim),
                               0,
                               1,
                               seed=0,
                               dtype=tf.float32)
        z = z_mean + tf.sqrt(tf.maximum(tf.exp(z_log_sigma_sq), 1e-10)) * eps

        net = utils.activate(tf.matmul(z, gen['Wz']) + gen['bz'],
                             activator=self.activations[-1])

        self.weights_words = gen['Wz']
        for layer_id in reversed(range(1, len(self.layers_list))):
            key_w = 'W' + str(layer_id + 1)
            key_b = 'b' + str(layer_id + 1)
            net = utils.activate(tf.matmul(net, gen[key_w]) + gen[key_b],
                                 activator=self.activations[layer_id])
            self.weights_words = tf.matmul(self.weights_words, gen[key_w])

        x_recon = tf.squeeze(tf.matmul(net, gen['W1']) + gen['b1'])
        self.weights_words = tf.matmul(self.weights_words, gen['W1'])

        gen_loss = tf.reduce_mean(
            tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(
                labels=doc_x_, logits=x_recon),
                          axis=1))

        latent_loss = 0.5 * tf.reduce_mean(
            tf.reduce_sum(tf.square(z_mean) + tf.exp(z_log_sigma_sq) -
                          z_log_sigma_sq - 1,
                          axis=1))

        loss = gen_loss + latent_loss

        train_op = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(loss)

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        batch_num = int(len(train_doc) / self.batch_size) + 1
        for epoch in range(self.max_epoch * 2):
            loss_total, gen_loss_total, latent_loss_total = 0., 0., 0.
            for batch in range(batch_num):
                batch_x_, idx = utils.get_batch(train_doc, self.batch_size)
                feed_dict = {doc_x_: batch_x_, learning_rate: self.init_lr}
                _, loss_tmp, gen_loss_tmp, latent_loss_tmp = sess.run(
                    (train_op, loss, gen_loss, latent_loss),
                    feed_dict=feed_dict)
                loss_total += loss_tmp
                gen_loss_total += gen_loss_tmp
                latent_loss_total += latent_loss_tmp

            if (epoch + 1) % self.print_step == 0:
                if x_valid:
                    valid_loss = self._validate_test(train_doc, sess, gen_loss,
                                                     doc_x_, learning_rate)
                    logging.info(
                        'Epoch {0}: avg batch loss = {1}, gen loss = {2}, latent loss = {3}, valid loss = {4}'
                        .format(epoch + 1, loss_total / batch_num,
                                gen_loss_total / batch_num,
                                latent_loss_total / batch_num, valid_loss))
                else:
                    logging.info(
                        'Epoch {0}: avg batch loss = {1}, gen loss = {2}, latent loss = {3}'
                        .format(epoch + 1, loss_total / batch_num,
                                gen_loss_total / batch_num,
                                latent_loss_total / batch_num))

            # print out the topic words generated in stacked variational auto-encoder
            if (epoch + 1) % self.print_words_step == 0:
                utils.print_top_words(sess.run(self.weights_words), self.vocab,
                                      self.dataset)

        self.saver.save(sess, self.pretrain_dir)
        logging.info('Weights saved at ' + self.pretrain_dir)
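The latent_loss above is the closed-form KL divergence between the approximate posterior N(z_mean, exp(z_log_sigma_sq)) and the standard normal prior; a NumPy restatement of the same formula, for reference:

import numpy as np

def gaussian_kl(z_mean, z_log_sigma_sq):
    # 0.5 * sum(mu^2 + sigma^2 - log sigma^2 - 1), averaged over the batch
    return 0.5 * np.mean(np.sum(np.square(z_mean) + np.exp(z_log_sigma_sq)
                                - z_log_sigma_sq - 1.0, axis=1))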
Example #10
def main():
    # Hyper Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default="configs/1k.yaml",
                        help="Which configuration to use. See into 'config' folder")

    opt = parser.parse_args()

    with open(opt.config, 'r') as ymlfile:
        config = yaml.load(ymlfile, Loader=yaml.FullLoader)
    print(config)
    
    # dataset
    config_dataset = config['dataset']
    folder_path = config_dataset['folder_path']
    data_path = osp.join(folder_path, config_dataset['data_file'])
    vocab_path = osp.join(folder_path, config_dataset['vocab_file'])
    labels = config_dataset['labels']
    maxlen = config_dataset['maxlen']
    num_classes = len(labels)

    # model
    config_model = config['model']
    n_encoder_1 = config_model['n_encoder_1']
    n_encoder_2 = config_model['n_encoder_2']
    n_latent = config_model['n_latent']
    n_sentiment = config_model['n_sentiment']
    dr = config_model['dropout']
    ld = config_model['lambda']
    al = config_model['alpha']

    # training
    config_training = config['training']
    lr = config_training['lr']
    cls_lr = config_training['cls_lr']
    bs = config_training['bs']
    d_step = config_training['d_step']
    epochs = config_training['epochs']
    n_topwords = config_training['n_topwords']
    n_labeled = config_training['n_labeled']
    ratio = config_training['ratio']
    exp = config_training['exp']
    result = config_training['result']
    write = config_training['write']

    # create result folders
    os.makedirs(result, exist_ok=True)

    dataset = np.load(data_path)
    with open(vocab_path, 'rb') as vocabfile:
        vocab = pickle.load(vocabfile)

    (dataset_train_x, dataset_train_y), (dataset_test_x, dataset_test_y) = dataset
    id_vocab = utils.sort_values(vocab)
    vocab_size = len(vocab)
    tfms_unlabeled_x = [Onehotify(vocab_size)]
    tfms_labeled_x = [Padify(maxlen)]
    tfms_y = [YOnehotify(num_classes)]

    network_architecture = dict(n_hidden_recog_1=n_encoder_1, # 1st layer encoder neurons
                                n_hidden_recog_2=n_encoder_2, # 2nd layer encoder neurons
                                n_hidden_gener_1=vocab_size, # 1st layer decoder neurons
                                n_input=vocab_size, # input dimension (vocabulary size)
                                n_input_pi=maxlen,
                                n_z=n_latent,
                                n_p=num_classes)

    model = AVIJST(network_architecture,
                              learning_rate=lr,
                              cls_learning_rate=cls_lr,
                              batch_size=bs)

    # Split data
    sss = StratifiedShuffleSplit(n_splits=exp, test_size=n_labeled, random_state=0)
    splitted_train = sss.split(dataset_train_x, dataset_train_y)
    for _, ds_train_labeled_idx in splitted_train:
        train_unlabeled_ds = Dataset(np.concatenate((dataset_train_x, dataset_test_x), axis=0),\
                                     np.concatenate((dataset_train_y, dataset_test_y), axis=0),\
                                     tfms_unlabeled_x, tfms_y)
        train_unlabeled_pi_ds = Dataset(np.concatenate((dataset_train_x, dataset_test_x), axis=0),\
                                        np.concatenate((dataset_train_y, dataset_test_y), axis=0),\
                                        tfms_labeled_x, tfms_y)
        train_labeled_ds = Dataset(dataset_train_x[ds_train_labeled_idx], dataset_train_y[ds_train_labeled_idx], tfms_labeled_x, tfms_y)
        test_ds = Dataset(dataset_test_x, dataset_test_y, tfms_labeled_x, tfms_y)
        
        train_unlabeled_dl = DataLoader(train_unlabeled_ds, bs, False)
        train_unlabeled_pi_dl = DataLoader(train_unlabeled_pi_ds, bs, False)
        train_labeled_dl = DataLoader(train_labeled_ds, bs, False)
        test_dl = DataLoader(test_ds, bs, False)

        start = time.time()
        print ("train_labeled_ds: ", len(train_labeled_ds))
        print ("train_unlabeled_pi_ds: ", len(train_unlabeled_pi_ds))
        print ("train_unlabeled_ds: ", len(train_unlabeled_ds))
        
        train_labeled_iter = iter(train_labeled_dl)
        for epoch in range(epochs):
            avg_loss = 0.
            avg_kl_s_loss = 0.
            avg_kl_z_loss = 0.
            avg_cls_loss = 0.
            sum_t_c = 0.

            for idx, ((train_unlabeled_x, _), (train_unlabeled_pi_x, _)) in enumerate(zip(train_unlabeled_dl, train_unlabeled_pi_dl)):
                t_c = time.time()
                # Labeled
                if len(train_labeled_ds) > 0:
                    train_labeled_x, train_labeled_y = None, None
                    try:
                        train_labeled_x, train_labeled_y = next(train_labeled_iter)
                    except StopIteration:
                        train_labeled_iter = iter(train_labeled_dl)
                        train_labeled_x, train_labeled_y = next(train_labeled_iter)
                    loss_l = model.cls_fit(train_labeled_x, train_labeled_y)
                    avg_cls_loss += loss_l / len(train_unlabeled_ds) * bs
                
                # Unlabeled
                loss_u, kl_s_loss, kl_z_loss, emb = model.partial_fit(train_unlabeled_x, train_unlabeled_pi_x)
                avg_loss += loss_u / len(train_unlabeled_ds) * bs
                avg_kl_s_loss += kl_s_loss / len(train_unlabeled_ds) * bs
                avg_kl_z_loss += kl_z_loss / len(train_unlabeled_ds) * bs
                c_elap = time.time() - t_c
                # Compute avg time
                sum_t_c += c_elap

            # Display logs per epoch step
            if (epoch + 1) % d_step == 0:
                weights = model.get_weights()
                print("##################################################")
                print("Epoch:", "%04d" % (epoch+1), \
                        "cls_cost=", "{:.9f}".format(avg_cls_loss), \
                        "cost=", "{:.9f}".format(avg_loss), \
                        "kl_s=", "{:.9f}".format(avg_kl_s_loss), \
                        "kl_z=", "{:.9f}".format(avg_kl_z_loss), \
                        "sum_t_c={:.2f}".format(sum_t_c))
                utils.classification_evaluate_dl(lambda x: model.senti_prop(x), test_dl, n_sentiment, labels=['negative', 'positive'])
                utils.print_top_words(epoch + 1, weights, id_vocab, n_topwords, result, write, printout=False)
                print("##################################################")

        weights = model.get_weights()
        utils.classification_evaluate_dl(model, train_dl, n_latent, labels, show=True)
        utils.classification_evaluate_dl(model, test_dl, n_latent, labels, show=True)
        utils.print_top_words(epoch + 1, weights, id_vocab, n_topwords, result, write)
        utils.calc_perp(model, test_dl, gamma_prior_batch)
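The try/except around next(train_labeled_iter) above only restarts the smaller labeled loader once it is exhausted; the same pattern can be written as a generator helper (the name cycle is mine, not the project's):

def cycle(dataloader):
    # endlessly re-iterate a finite DataLoader
    while True:
        for batch in dataloader:
            yield batch

# train_labeled_iter = cycle(train_labeled_dl)
# train_labeled_x, train_labeled_y = next(train_labeled_iter)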
Example #11
    ngram_range=(1, 1),
)
bow = cvec.fit_transform(lyrics)

# TF-IDF
tvec = TfidfVectorizer(
    min_df=5,
    max_df=0.95,
    stop_words=stop_words,
    ngram_range=(1, 3),
)
tfidf = tvec.fit_transform(lyrics)

# Topic Modelling

n_top_words = 7  # number of top words to print per topic

bow_feature_names = cvec.get_feature_names()
tfidf_feature_names = tvec.get_feature_names()

# NMF with TFIDF
nmf2 = NMF(n_components=5,
           random_state=42,
           beta_loss='kullback-leibler',
           solver='mu',
           max_iter=1000,
           alpha=.01,
           l1_ratio=.5).fit(tfidf)
print("\nTopics in NMF model (tfidf):")
print_top_words(nmf2, tfidf_feature_names, n_top_words)
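The bag-of-words matrix and its feature names are built above but only the TF-IDF matrix is fed to NMF; a hedged companion fit that reuses them with LDA and the same print_top_words helper (the model settings are illustrative, not from the original):

from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=5,
                                max_iter=10,
                                learning_method='online',
                                random_state=42).fit(bow)
print("\nTopics in LDA model (bow):")
print_top_words(lda, bow_feature_names, n_top_words)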