Example #1
def generate_autoencoder_training_data(
        no_need_start_tag=FLAGS.no_need_start_tag):
    dataLoader = DataLoader(base_dir=FLAGS.data_dir, dataset=FLAGS.dataset)
    (X_train, y_train), (X_test, y_test), (X_unsup, ) = dataLoader.load_data(
        include_unsup=True)
    if X_unsup is not None:
        X_train_total = np.concatenate([X_train, X_test, X_unsup])
    else:
        X_train_total = np.concatenate([X_train, X_test])
    logger.info("dataset shape: %s" % X_train_total.shape)
    X = []
    y = []
    weight = []
    if no_need_start_tag:
        start_index = 1
    else:
        start_index = 0
    for doc in X_train_total:
        # Input: the full sequence followed by the same sequence without its
        # last token (read pass, then reconstruction pass).
        seq_all = doc[start_index:]
        seq_but_last = doc[start_index:-1]
        X.append(np.array(seq_all + seq_but_last).reshape(-1, 1))
        seq_len = len(seq_all)
        seq_but_last_len = len(seq_but_last)
        # Targets are masked out (weight 0) during the read pass; the
        # reconstruction pass must reproduce the full sequence token by token.
        y.append(np.array([0] * seq_but_last_len + seq_all).reshape(-1, 1))
        weight.append(
            np.array([0] * seq_but_last_len + [1] * seq_len,
                     dtype=np.float32).reshape(-1, 1))
    pickle_data = {"X": X, "y": y, "weight": weight}
    rand_index = np.random.choice(len(X), 1)[0]
    logger.info("random sampled X: %s" % X[rand_index])
    logger.info("random sampled y: %s" % y[rand_index])
    logger.info("random sampled weight: %s" % weight[rand_index])
    with open(osp.join(FLAGS.output_dir, "imdb_ae_dataset.pickle"), "wb") as f:
        pickle.dump(pickle_data, f)
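As a quick sanity check of the layout written above (not part of the original snippet), the pickle can be reloaded and one example inspected; "output" is a hypothetical stand-in for FLAGS.output_dir, and the keys match the ones written by the function:

import pickle
import os.path as osp

# "output" is a hypothetical path; point it at FLAGS.output_dir.
with open(osp.join("output", "imdb_ae_dataset.pickle"), "rb") as f:
    data = pickle.load(f)

x0, y0, w0 = data["X"][0], data["y"][0], data["weight"][0]
# All three arrays share one shape; the loss weight is 1 only on the
# reconstruction half of each example.
assert x0.shape == y0.shape == w0.shape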
Example #2
def generate_language_training_data(no_need_start_tag=FLAGS.no_need_start_tag):
    dataLoader = DataLoader(base_dir=FLAGS.data_dir, dataset=FLAGS.dataset)
    (X_train, y_train), (X_test, y_test), (X_unsup, ) = dataLoader.load_data(
        include_unsup=True)
    if X_unsup is not None:
        X_train_total = np.concatenate([X_train, X_test, X_unsup])
    else:
        X_train_total = np.concatenate([X_train, X_test])
    logger.info("dataset shape: %s" % X_train_total.shape)
    X = []
    y = []
    if no_need_start_tag:
        start_index = 1
    else:
        start_index = 0
    for doc in X_train_total:
        # Next-word pairs: y is the input shifted left by one token.
        X.append(doc[start_index:-1])
        y.append(doc[start_index + 1:])
    pickle_data = {"X": X, "y": y}
    rand_index = np.random.choice(len(X), 1)[0]
    logger.info("random sampled X: %s" % X[rand_index])
    logger.info("random sampled y: %s" % y[rand_index])
    with open(osp.join(FLAGS.output_dir, "imdb_lm_dataset.pickle"), "wb") as f:
        pickle.dump(pickle_data, f)
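For a concrete picture of the pairs the loop above builds (illustration only, assuming no_need_start_tag is set; the ids below are a toy sequence, not the real tag indices produced by the DataLoader):

doc = [1, 10, 11, 12, 2]       # toy ids: <start> w1 w2 w3 <end>
start_index = 1                # no_need_start_tag drops the start tag
X_doc = doc[start_index:-1]    # [10, 11, 12]
y_doc = doc[start_index + 1:]  # [11, 12, 2] -> next-word targets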
Example #3
def eval(self, save_model_path, batch_size=2, seq_len=200):
    wordCounter = DataLoader.reload_word_counter(
        vocab_abspath=getDatasetFilePath(
            self.arguments["inputs"]["datapath"],
            self.arguments["inputs"]["dataset"], "word_freqs"))
    vocab_size = self.arguments["ae_sequence"]["vocab_size"]
    EOS_TAG = 2
    # Seed every sequence in the batch with the EOS tag as its start word.
    choosed_words_index = np.array([EOS_TAG] * batch_size).astype(np.int64)
    self.loss_layer.build([
        (-1, self.arguments["ae_sequence"]["rnn_cell_size"])
    ])
    rnn_to_embedding = layers.RnnOutputToEmbedding(
        vocab_size=self.arguments["ae_sequence"]["vocab_size"],
        embedding_weights=self.sequences["ae_sequence"].embedding_layer.var,
        var_w=self.loss_layer.lin_w,
        var_b=self.loss_layer.lin_b)
    self.eval_lm_model = seq.EvalLanguageModel(
        language_model_seq=self.sequences["ae_sequence"],
        lm_lin_w=self.loss_layer.lin_w,
        lm_lin_b=self.loss_layer.lin_b,
        rnnOutputToEmbedding=rnn_to_embedding)
    generated_sequences = self.eval_lm_model(
        start_word_indexes=choosed_words_index, time_steps=seq_len)
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True))) as sess:
        self._resotre_training_model(
            sess=sess, save_model_path=save_model_path)
        generated_sequences_val = sess.run(generated_sequences)
    generated_sentences = wordCounter.reverse(
        indices=generated_sequences_val, num_words=vocab_size)
    for sentence in generated_sentences:
        logger.info(sentence)
        logger.info("-" * 100)
Example #4
def test_data_loader(base_dir):
    dataLoader = DataLoader(base_dir=base_dir, dataset="imdb")
    training_dataset, testing_dataset, unsup_dataset = dataLoader.load_data()
    logger.info(
        "training_dataset shape: %s; testing_dataset shape: %s; unsup_dataset: %s"
        % (training_dataset[0].shape, testing_dataset[0].shape,
           unsup_dataset[0].shape))
    training_sample_index = np.random.choice(len(training_dataset[0]), 1)[0]
    testing_dataset_index = np.random.choice(len(testing_dataset[0]), 1)[0]
    unsup_dataset_index = np.random.choice(len(unsup_dataset[0]), 1)[0]
    logger.info("training sample: %s; sample label: %s" %
                (training_dataset[0][training_sample_index],
                 training_dataset[1][training_sample_index]))
    logger.info("testing sample: %s; sample label: %s" %
                (testing_dataset[0][testing_dataset_index],
                 testing_dataset[1][testing_dataset_index]))
    logger.info("unsup sample: %s" % (unsup_dataset[0][unsup_dataset_index], ))
Example #5
def eval(self, save_model_path, batch_size=2, seq_len=200):
    wordCounter = DataLoader.reload_word_counter(
        vocab_abspath=getDatasetFilePath(
            self.arguments["inputs"]["datapath"],
            self.arguments["inputs"]["dataset"], "word_freqs"))
    words_list = np.array(wordCounter.words_list)
    words = words_list[:, 0]
    freqs = words_list[:, 1]
    vocab_size = self.arguments["lm_sequence"]["vocab_size"]
    freqs = freqs[:vocab_size].astype(np.float32)
    # Zero out the 10 most frequent words and renormalize the distribution...
    freqs[:10] = 0
    freqs /= np.sum(freqs)
    # ...then discard it: with p=None the start words are sampled uniformly.
    freqs = None
    choosed_words_index = np.random.choice(
        vocab_size, batch_size, p=freqs).astype(np.int64)
    choosed_words = words[choosed_words_index]
    # Offset the indices to account for the special tokens.
    choosed_words_index += 3
    self.loss_layer.build([
        (-1, self.arguments["lm_sequence"]["rnn_cell_size"])
    ])
    rnn_to_embedding = layers.RnnOutputToEmbedding(
        vocab_size=self.arguments["lm_sequence"]["vocab_size"],
        embedding_weights=self.sequences["lm_sequence"].embedding_layer.var,
        var_w=self.loss_layer.lin_w,
        var_b=self.loss_layer.lin_b)
    self.eval_lm_model = seq.EvalLanguageModel(
        language_model_seq=self.sequences["lm_sequence"],
        lm_lin_w=self.loss_layer.lin_w,
        lm_lin_b=self.loss_layer.lin_b,
        rnnOutputToEmbedding=rnn_to_embedding)
    generated_sequences = self.eval_lm_model(
        start_word_indexes=choosed_words_index,
        time_steps=seq_len,
        zero_states=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True))) as sess:
        self._resotre_training_model(
            sess=sess, save_model_path=save_model_path)
        generated_sequences_val = sess.run(generated_sequences)
    generated_sentences = wordCounter.reverse(
        indices=generated_sequences_val, num_words=vocab_size)
    logger.info("choosed words: %s " % choosed_words)
    for sentence in generated_sentences:
        logger.info(sentence)
        logger.info("-" * 100)
Example #6
def eval(self,
         inputs_docs,
         save_model_path,
         lower_case=True,
         apply_filter=False):
    batch_size = len(inputs_docs)
    wordCounter = DataLoader.reload_word_counter(
        vocab_abspath=getDatasetFilePath(
            self.arguments["inputs"]["datapath"], "summary", "word_freqs"))
    wordCounter.lower_case = lower_case
    filter_words_list = wordCounter.load_filter_words(
        getDatasetFilePath(self.arguments["inputs"]["datapath"], "summary",
                           "summary_word_freqs"),
        max_words=self.arguments["lm_sequence"]["vocab_size"],
        apply_filter_onthego=apply_filter)
    logger.info("filtered %s words" % len(filter_words_list))
    inputs_docs_idx = wordCounter.transform_docs(
        docs=inputs_docs,
        max_words=self.arguments["lm_sequence"]["vocab_size"])
    # Zero-pad every document to the length of the longest one and record the
    # original lengths for the encoder.
    max_len = max(map(len, inputs_docs_idx))
    seq_len = []
    for i, idx in enumerate(inputs_docs_idx):
        padding_size = max_len - len(idx)
        inputs_docs_idx[i] = np.pad(idx, [(0, padding_size)],
                                    "constant",
                                    constant_values=[0] * 2)
        seq_len.append(len(idx))
    to_embedding_layers_placeholder = tf.placeholder(
        tf.int32, shape=[None, max_len])
    seq_len_placeholder = tf.placeholder(tf.int32, shape=[None])
    outputs, final_sequence_lengths = self.eval_layer(
        batch_size=batch_size,
        sos_tag=SOS_TAG,
        eos_tag=EOS_TAG,
        encoder_len=seq_len_placeholder,
        encoder_embed_inputs=self.to_embedding(
            to_embedding_layers_placeholder),
        beam_width=self.arguments["summary"]["beam_width"],
        maximum_iterations=self.arguments["summary"]["maximum_iterations"])
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True))) as sess:
        self._resotre_training_model(
            sess=sess, save_model_path=save_model_path)
        output_idx, final_sequence_lengths_val = sess.run(
            [outputs, final_sequence_lengths],
            feed_dict={
                to_embedding_layers_placeholder: inputs_docs_idx,
                seq_len_placeholder: seq_len
            })
    # Keep only the top-scoring beam for each input.
    output_idx = output_idx[:, :, 0]
    final_sequence_lengths_val = final_sequence_lengths_val[:, 0]
    output_words = wordCounter.reverse(
        output_idx,
        self.arguments["lm_sequence"]["vocab_size"],
        return_list=True)
    inputs_docs = wordCounter.reverse(
        inputs_docs_idx, self.arguments["lm_sequence"]["vocab_size"])
    for i in range(len(inputs_docs)):
        logger.info("-" * 20 + " doc-%s " % i + "-" * 20)
        logger.info("doc: " + inputs_docs[i])
        logger.info("doc_idx: " + str(inputs_docs_idx[i].tolist()))
        logger.info(
            "title: " +
            " ".join(output_words[i][:final_sequence_lengths_val[i]]))
Example #7
def generate_classification_data(validation_rate=FLAGS.validation_rate,
                                 shuffle_onval=FLAGS.shuffle_onval,
                                 no_need_start_tag=FLAGS.no_need_start_tag):
    dataLoader = DataLoader(base_dir=FLAGS.data_dir, dataset=FLAGS.dataset)
    (X_train, y_train), (X_test, y_test) = dataLoader.load_data(
        include_unsup=False)
    logger.info("training dataset shape: %s, testing dataset shape: %s" %
                (X_train.shape, X_test.shape))
    weight_train = []
    weight_test = []
    if no_need_start_tag:
        for i in range(X_train.shape[0]):
            X_train[i] = X_train[i][1:]
        for i in range(X_test.shape[0]):
            X_test[i] = X_test[i][1:]
    # Per-token loss weights ramp linearly from 0 at the first token to 1 at
    # the last token of each sequence.
    for i in range(X_train.shape[0]):
        seq_len = len(X_train[i])
        seq_weights = np.zeros(seq_len)
        if seq_len < 2:
            seq_weights[:] = 1
        else:
            seq_weights[:] = np.arange(0, seq_len) / (seq_len - 1)
        weight_train.append(seq_weights.tolist())
    for i in range(X_test.shape[0]):
        seq_len = len(X_test[i])
        seq_weights = np.zeros(seq_len)
        if seq_len < 2:
            seq_weights[:] = 1
        else:
            seq_weights[:] = np.arange(0, seq_len) / (seq_len - 1)
        weight_test.append(seq_weights.tolist())
    pickle_data = {
        "X_train": X_train,
        "y_train": y_train,
        "weight_train": weight_train,
        "X_test": X_test,
        "y_test": y_test,
        "weight_test": weight_test
    }
    if 0 < validation_rate < 1:
        X_train, X_val, y_train, y_val, weight_train, weight_val = train_test_split(
            X_train,
            y_train,
            weight_train,
            test_size=validation_rate,
            shuffle=shuffle_onval)
        pickle_data["X_train"] = X_train
        pickle_data["y_train"] = y_train
        pickle_data["X_val"] = X_val
        pickle_data["y_val"] = y_val
        pickle_data["weight_train"] = weight_train
        pickle_data["weight_val"] = weight_val
    else:
        logger.info("No validation set.")
    rand_index = np.random.choice(len(X_train), 1)[0]
    logger.info("random sampled X: %s" % X_train[rand_index])
    logger.info("random sampled y: %s" % y_train[rand_index])
    logger.info("random sampled weight: %s" % weight_train[rand_index])
    rand_index = np.random.choice(len(X_test), 1)[0]
    logger.info("random sampled X: %s" % X_test[rand_index])
    logger.info("random sampled y: %s" % y_test[rand_index])
    logger.info("random sampled weight: %s" % weight_test[rand_index])
    with open(osp.join(FLAGS.output_dir, "imdb_classification_dataset.pickle"),
              "wb") as f:
        pickle.dump(pickle_data, f)
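For reference, the per-token weights built above grow linearly from 0 at the first position to 1 at the last; a toy length-5 sequence makes the pattern concrete (illustration only):

import numpy as np

seq_len = 5
weights = np.arange(0, seq_len) / (seq_len - 1)
print(weights)  # [0.   0.25 0.5  0.75 1.  ]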