def generate_autoencoder_training_data(no_need_start_tag=FLAGS.no_need_start_tag):
    dataLoader = DataLoader(base_dir=FLAGS.data_dir, dataset=FLAGS.dataset)
    (X_train, y_train), (X_test, y_test), (X_unsup, ) = dataLoader.load_data(
        include_unsup=True)
    if X_unsup is not None:
        X_train_total = np.concatenate([X_train, X_test, X_unsup])
    else:
        X_train_total = np.concatenate([X_train, X_test])
    logger.info("dataset shape: %s" % X_train_total.shape)
    X = []
    y = []
    weight = []
    if no_need_start_tag:
        start_index = 1
    else:
        start_index = 0
    for doc in X_train_total:
        seq_all = doc[start_index:]
        seq_but_last = doc[start_index:-1]
        X.append(np.array(seq_all + seq_but_last).reshape(-1, 1))
        seq_len = len(seq_all)
        seq_but_last_len = len(seq_but_last)
        y.append(np.array([0] * seq_but_last_len + seq_all).reshape(-1, 1))
        weight.append(
            np.array([0] * seq_but_last_len + [1] * seq_len,
                     dtype=np.float32).reshape(-1, 1))
    pickle_data = {"X": X, "y": y, "weight": weight}
    rand_index = np.random.choice(len(X), 1)[0]
    logger.info("random sampled X: %s" % X[rand_index])
    logger.info("random sampled y: %s" % y[rand_index])
    logger.info("random sampled weight: %s" % weight[rand_index])
    with open(osp.join(FLAGS.output_dir, "imdb_ae_dataset.pickle"), "wb") as f:
        pickle.dump(pickle_data, f)
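# A minimal sketch, added for illustration, of the layout that the loop above
# produces for a toy document [5, 6, 7] with no start tag stripped (this toy
# example is not part of the original pipeline):
import numpy as np

doc = [5, 6, 7]
seq_all, seq_but_last = doc, doc[:-1]
X = np.array(seq_all + seq_but_last).reshape(-1, 1)             # 5, 6, 7, 5, 6
y = np.array([0] * len(seq_but_last) + seq_all).reshape(-1, 1)  # 0, 0, 5, 6, 7
weight = np.array([0] * len(seq_but_last) + [1] * len(seq_all),
                  dtype=np.float32).reshape(-1, 1)              # 0, 0, 1, 1, 1
# The loss is masked out while the encoder reads the document; reconstruction
# targets and unit weights only cover the teacher-forced decoder steps.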
def generate_language_training_data(no_need_start_tag=FLAGS.no_need_start_tag):
    dataLoader = DataLoader(base_dir=FLAGS.data_dir, dataset=FLAGS.dataset)
    (X_train, y_train), (X_test, y_test), (X_unsup, ) = dataLoader.load_data(
        include_unsup=True)
    if X_unsup is not None:
        X_train_total = np.concatenate([X_train, X_test, X_unsup])
    else:
        X_train_total = np.concatenate([X_train, X_test])
    logger.info("dataset shape: %s" % X_train_total.shape)
    X = []
    y = []
    if no_need_start_tag:
        start_index = 1
    else:
        start_index = 0
    for doc in X_train_total:
        X.append(doc[start_index:-1])
        y.append(doc[start_index + 1:])
    pickle_data = {"X": X, "y": y}
    rand_index = np.random.choice(len(X), 1)[0]
    logger.info("random sampled X: %s" % X[rand_index])
    logger.info("random sampled y: %s" % y[rand_index])
    with open(osp.join(FLAGS.output_dir, "imdb_lm_dataset.pickle"), "wb") as f:
        pickle.dump(pickle_data, f)
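# A minimal usage sketch (assumption: the pickle written above already exists
# under output_dir). inspect_lm_dataset is a hypothetical helper, not part of
# the original module.
import os.path as osp
import pickle


def inspect_lm_dataset(output_dir):
    with open(osp.join(output_dir, "imdb_lm_dataset.pickle"), "rb") as f:
        lm_data = pickle.load(f)
    # Each target sequence is its input shifted left by one token
    # (next-word prediction).
    sample_X, sample_y = lm_data["X"][0], lm_data["y"][0]
    assert list(sample_X[1:]) == list(sample_y[:-1])
    return lm_data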
def eval(self, save_model_path, batch_size=2, seq_len=200):
    wordCounter = DataLoader.reload_word_counter(
        vocab_abspath=getDatasetFilePath(
            self.arguments["inputs"]["datapath"],
            self.arguments["inputs"]["dataset"], "word_freqs"))
    vocab_size = self.arguments["ae_sequence"]["vocab_size"]
    EOS_TAG = 2
    # Every generated sequence starts from the EOS tag.
    choosed_words_index = np.array([EOS_TAG] * batch_size).astype(np.int64)
    self.loss_layer.build([
        (-1, self.arguments["ae_sequence"]["rnn_cell_size"])
    ])
    rnn_to_embedding = layers.RnnOutputToEmbedding(
        vocab_size=self.arguments["ae_sequence"]["vocab_size"],
        embedding_weights=self.sequences["ae_sequence"].embedding_layer.var,
        var_w=self.loss_layer.lin_w,
        var_b=self.loss_layer.lin_b)
    self.eval_lm_model = seq.EvalLanguageModel(
        language_model_seq=self.sequences["ae_sequence"],
        lm_lin_w=self.loss_layer.lin_w,
        lm_lin_b=self.loss_layer.lin_b,
        rnnOutputToEmbedding=rnn_to_embedding)
    generated_sequences = self.eval_lm_model(
        start_word_indexes=choosed_words_index, time_steps=seq_len)
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True))) as sess:
        self._resotre_training_model(sess=sess, save_model_path=save_model_path)
        generated_sequences_val = sess.run(generated_sequences)
        generated_sentences = wordCounter.reverse(
            indices=generated_sequences_val, num_words=vocab_size)
        for sentence in generated_sentences:
            logger.info(sentence)
            logger.info("-" * 100)
def test_data_loader(base_dir):
    dataLoader = DataLoader(base_dir=base_dir, dataset="imdb")
    training_dataset, testing_dataset, unsup_dataset = dataLoader.load_data()
    logger.info(
        "training_dataset shape: %s; testing_dataset shape: %s; unsup_dataset: %s"
        % (training_dataset[0].shape, testing_dataset[0].shape,
           unsup_dataset[0].shape))
    training_sample_index = np.random.choice(len(training_dataset[0]), 1)[0]
    testing_dataset_index = np.random.choice(len(testing_dataset[0]), 1)[0]
    unsup_dataset_index = np.random.choice(len(unsup_dataset[0]), 1)[0]
    logger.info("training sample: %s; sample label: %s" %
                (training_dataset[0][training_sample_index],
                 training_dataset[1][training_sample_index]))
    logger.info("testing sample: %s; sample label: %s" %
                (testing_dataset[0][testing_dataset_index],
                 testing_dataset[1][testing_dataset_index]))
    logger.info("unsup sample: %s" % (unsup_dataset[0][unsup_dataset_index], ))
def eval(self, save_model_path, batch_size=2, seq_len=200):
    wordCounter = DataLoader.reload_word_counter(
        vocab_abspath=getDatasetFilePath(
            self.arguments["inputs"]["datapath"],
            self.arguments["inputs"]["dataset"], "word_freqs"))
    words_list = np.array(wordCounter.words_list)
    words = words_list[:, 0]
    freqs = words_list[:, 1]
    vocab_size = self.arguments["lm_sequence"]["vocab_size"]
    freqs = freqs[:vocab_size].astype(np.float32)
    freqs[:10] = 0
    freqs /= np.sum(freqs)
    # Setting freqs back to None discards the frequency distribution computed
    # above, so the start words are sampled uniformly from the vocabulary.
    freqs = None
    choosed_words_index = np.random.choice(vocab_size, batch_size,
                                           p=freqs).astype(np.int64)
    choosed_words = words[choosed_words_index]
    # add up special tokens index
    choosed_words_index += 3
    self.loss_layer.build([
        (-1, self.arguments["lm_sequence"]["rnn_cell_size"])
    ])
    rnn_to_embedding = layers.RnnOutputToEmbedding(
        vocab_size=self.arguments["lm_sequence"]["vocab_size"],
        embedding_weights=self.sequences["lm_sequence"].embedding_layer.var,
        var_w=self.loss_layer.lin_w,
        var_b=self.loss_layer.lin_b)
    self.eval_lm_model = seq.EvalLanguageModel(
        language_model_seq=self.sequences["lm_sequence"],
        lm_lin_w=self.loss_layer.lin_w,
        lm_lin_b=self.loss_layer.lin_b,
        rnnOutputToEmbedding=rnn_to_embedding)
    generated_sequences = self.eval_lm_model(
        start_word_indexes=choosed_words_index,
        time_steps=seq_len,
        zero_states=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True))) as sess:
        self._resotre_training_model(sess=sess, save_model_path=save_model_path)
        generated_sequences_val = sess.run(generated_sequences)
        generated_sentences = wordCounter.reverse(
            indices=generated_sequences_val, num_words=vocab_size)
        logger.info("choosed words: %s " % choosed_words)
        for sentence in generated_sentences:
            logger.info(sentence)
            logger.info("-" * 100)
def eval(self, inputs_docs, save_model_path, lower_case=True, apply_filter=False):
    batch_size = len(inputs_docs)
    wordCounter = DataLoader.reload_word_counter(
        vocab_abspath=getDatasetFilePath(
            self.arguments["inputs"]["datapath"], "summary", "word_freqs"))
    wordCounter.lower_case = lower_case
    filter_words_list = wordCounter.load_filter_words(
        getDatasetFilePath(self.arguments["inputs"]["datapath"], "summary",
                           "summary_word_freqs"),
        max_words=self.arguments["lm_sequence"]["vocab_size"],
        apply_filter_onthego=apply_filter)
    logger.info("filtered %s words" % len(filter_words_list))
    inputs_docs_idx = wordCounter.transform_docs(
        docs=inputs_docs,
        max_words=self.arguments["lm_sequence"]["vocab_size"])
    # Pad every document to the batch maximum length and record the true lengths.
    max_len = max(map(len, inputs_docs_idx))
    seq_len = []
    for i, idx in enumerate(inputs_docs_idx):
        padding_size = max_len - len(idx)
        inputs_docs_idx[i] = np.pad(idx, [(0, padding_size)], "constant",
                                    constant_values=[0] * 2)
        seq_len.append(len(idx))
    to_embedding_layers_placeholder = tf.placeholder(tf.int32,
                                                     shape=[None, max_len])
    seq_len_placeholder = tf.placeholder(tf.int32, shape=[None])
    outputs, final_sequence_lengths = self.eval_layer(
        batch_size=batch_size,
        sos_tag=SOS_TAG,
        eos_tag=EOS_TAG,
        encoder_len=seq_len_placeholder,
        encoder_embed_inputs=self.to_embedding(to_embedding_layers_placeholder),
        beam_width=self.arguments["summary"]["beam_width"],
        maximum_iterations=self.arguments["summary"]["maximum_iterations"])
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True))) as sess:
        self._resotre_training_model(sess=sess, save_model_path=save_model_path)
        output_idx, final_sequence_lengths_val = sess.run(
            [outputs, final_sequence_lengths],
            feed_dict={
                to_embedding_layers_placeholder: inputs_docs_idx,
                seq_len_placeholder: seq_len
            })
        # Keep only the first beam for each document.
        output_idx = output_idx[:, :, 0]
        final_sequence_lengths_val = final_sequence_lengths_val[:, 0]
        output_words = wordCounter.reverse(
            output_idx,
            self.arguments["lm_sequence"]["vocab_size"],
            return_list=True)
        inputs_docs = wordCounter.reverse(
            inputs_docs_idx, self.arguments["lm_sequence"]["vocab_size"])
        for i in range(len(inputs_docs)):
            logger.info("-" * 20 + " doc-%s " % i + "-" * 20)
            logger.info("doc: " + inputs_docs[i])
            logger.info("doc_idx: " + str(inputs_docs_idx[i].tolist()))
            logger.info(
                "title: " +
                " ".join(output_words[i][:final_sequence_lengths_val[i]]))
def generate_classification_data(validation_rate=FLAGS.validation_rate,
                                 shuffle_onval=FLAGS.shuffle_onval,
                                 no_need_start_tag=FLAGS.no_need_start_tag):
    dataLoader = DataLoader(base_dir=FLAGS.data_dir, dataset=FLAGS.dataset)
    (X_train, y_train), (X_test, y_test) = dataLoader.load_data(
        include_unsup=False)
    logger.info("training dataset shape: %s, testing dataset shape: %s" %
                (X_train.shape, X_test.shape))
    weight_train = []
    weight_test = []
    if no_need_start_tag:
        for i in range(X_train.shape[0]):
            X_train[i] = X_train[i][1:]
        for i in range(X_test.shape[0]):
            X_test[i] = X_test[i][1:]
    for i in range(X_train.shape[0]):
        seq_len = len(X_train[i])
        seq_weights = np.zeros(seq_len)
        if seq_len < 2:
            seq_weights[:] = 1
        else:
            seq_weights[:] = np.arange(0, seq_len) / (seq_len - 1)
        weight_train.append(seq_weights.tolist())
    for i in range(X_test.shape[0]):
        seq_len = len(X_test[i])
        seq_weights = np.zeros(seq_len)
        if seq_len < 2:
            seq_weights[:] = 1
        else:
            seq_weights[:] = np.arange(0, seq_len) / (seq_len - 1)
        weight_test.append(seq_weights.tolist())
    pickle_data = {
        "X_train": X_train,
        "y_train": y_train,
        "weight_train": weight_train,
        "X_test": X_test,
        "y_test": y_test,
        "weight_test": weight_test
    }
    if 0 < validation_rate < 1:
        X_train, X_val, y_train, y_val, weight_train, weight_val = train_test_split(
            X_train, y_train, weight_train,
            test_size=validation_rate,
            shuffle=shuffle_onval)
        pickle_data["X_train"] = X_train
        pickle_data["y_train"] = y_train
        pickle_data["X_val"] = X_val
        pickle_data["y_val"] = y_val
        pickle_data["weight_train"] = weight_train
        pickle_data["weight_val"] = weight_val
    else:
        logger.info("No validation set.")
    rand_index = np.random.choice(len(X_train), 1)[0]
    logger.info("random sampled X: %s" % X_train[rand_index])
    logger.info("random sampled y: %s" % y_train[rand_index])
    logger.info("random sampled weight: %s" % weight_train[rand_index])
    rand_index = np.random.choice(len(X_test), 1)[0]
    logger.info("random sampled X: %s" % X_test[rand_index])
    logger.info("random sampled y: %s" % y_test[rand_index])
    logger.info("random sampled weight: %s" % weight_test[rand_index])
    with open(osp.join(FLAGS.output_dir, "imdb_classification_dataset.pickle"),
              "wb") as f:
        pickle.dump(pickle_data, f)
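# An illustrative sketch (not part of the original script) of the per-token
# weighting computed above: weights ramp linearly from 0 on the first token to
# 1 on the last, so later timesteps dominate the classification loss.
import numpy as np

seq_len = 5
seq_weights = np.arange(0, seq_len) / (seq_len - 1)
print(seq_weights.tolist())  # [0.0, 0.25, 0.5, 0.75, 1.0]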