def generate_text():
    """
    Generates the text that is hopefully Trumpian.
    """
    # Minimize TF warnings which are not helpful in generate mode.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.logging.set_verbosity(tf.logging.ERROR)

    net_features = network.construct()
    sess = tf.Session()
    Config.import_model(sess)

    # Enable the user to enter multiple text strings with a do-while loop
    while True:
        _generate_output(sess, net_features)
        if not Config.Generate.loop:
            break
        while True:
            print("")
            logging.info("Please supply a new seed text then press enter when complete: ")
            Config.Generate.seed_text = input("")
            if len(Config.Generate.seed_text) > Config.Generate.min_seed_len:
                print("You entered: \"" + Config.Generate.seed_text + "\"")
                print("")
                break
            logging.info("Invalid Seed Text. Must be at least %d characters long"
                         % Config.Generate.min_seed_len)
    sess.close()
def _build_target_vector(idx):
    """
    Creates a one hot vector for the target with "1" in the correct character
    location and zero everywhere else.

    :param idx: Integer corresponding to the expected character
    :type idx: int

    :return: One hot vector for the target character
    :rtype: np.array
    """
    assert (0 <= idx < Config.vocab_size())
    one_hot = np.zeros([Config.vocab_size()])
    one_hot[idx] = 1
    return one_hot
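
# Illustrative sketch (not part of the original module): shows the shape and
# contents expected from _build_target_vector.  It assumes the character map
# has already been built so that Config.vocab_size() returns at least 3.
def _example_target_vector():
    """Hypothetical usage example for _build_target_vector."""
    vec = _build_target_vector(2)
    # A vocabulary of N characters yields a length-N vector with a single 1.
    assert vec.shape == (Config.vocab_size(),)
    assert vec[2] == 1 and vec.sum() == 1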
def _print_basic_text_statistics():
    # Print basic statistics on the training set
    logging.info("Total Number of Characters: %d" % Config.dataset_size)
    if Config.word_count > 0:
        logging.info("Total Word Count: \t%d" % Config.word_count)
    logging.info("Vocabulary Size: \t%d" % Config.vocab_size())
    logging.info("Training Set Size: \t%d" % Config.Train.size())
    logging.info("Validation Set Size: \t%d" % Config.Validation.size())
def _build_feed_forward(ff_input, rand_func):
    """
    Feed-Forward Network Builder

    Constructs and initializes the feed-forward network.

    :param ff_input: Input to the feed-forward network.
    :type ff_input: tf.Tensor
    :param rand_func: Function used to generate randomly initialized weight and bias tensors.
    :type rand_func: Callable

    :return: Output of the feed-forward network.
    :rtype: tf.Tensor
    """
    # Separate from the for loop in case no hidden layers
    input_width = int(ff_input.shape[1])
    ff_in = ff_input
    for i in range(0, Config.FF.depth):
        if i > 0:
            input_width = Config.FF.hidden_width
            # noinspection PyUnboundLocalVariable
            ff_in = hidden_out
        bias_input = rand_func([Config.FF.hidden_width])
        hidden_layer = rand_func([input_width, Config.FF.hidden_width])
        a_hidden = tf.add(tf.matmul(ff_in, hidden_layer), bias_input)
        hidden_out = tf.nn.relu(a_hidden)

    # Construct the output layer
    bias_input = rand_func([Config.vocab_size()])
    out_layer = rand_func([input_width, Config.vocab_size()])
    # noinspection PyUnboundLocalVariable
    a_out = tf.nn.relu(tf.add(tf.matmul(ff_in, out_layer), bias_input))
    return a_out
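
# Minimal sketch of a weight initializer that could be passed as rand_func
# above.  The project's actual initializer is defined elsewhere, so the
# distribution and standard deviation here are assumptions for illustration only.
def _example_rand_func(shape, stddev=0.1):
    """Returns a trainable tf.Variable drawn from a truncated normal."""
    return tf.Variable(tf.truncated_normal(shape, stddev=stddev))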
def build_training_and_verification_sets():
    """
    Training and Verification Set Builder

    Builds the training and verification datasets.  Depending on the
    configuration, this may be from the source files or from pickled files.
    """
    if not Config.Train.restore:
        input_str = read_input()
        create_examples(input_str)
        # Character to integer map required during text generation
        Config.export_character_to_integer_map()
        # Export the training and verification data in case
        # the previous setup will be trained on again
        Config.export_train_and_verification_data()
        Config.word_count = len(input_str.split(" "))
    else:
        Config.import_character_to_integer_map()
        Config.import_train_and_verification_data()
        Config.dataset_size = Config.Train.size() + Config.Validation.size()

    _print_basic_text_statistics()
def _build_input_sequence(int_sequence):
    """
    One-Hot Sequence Builder

    Converts a list of character indices into a sequence of one hot vectors.

    :param int_sequence: List of the character indices
    :type int_sequence: List[int]

    :return: Input sequence converted into a matrix of one hot rows
    :rtype: np.ndarray
    """
    assert (0 < len(int_sequence) <= Config.sequence_length)

    one_hots = []
    while len(one_hots) < Config.sequence_length:
        idx = len(one_hots)
        char_id = 0  # This is used to pad the list as needed
        if idx < len(int_sequence):
            char_id = int_sequence[idx]
        vec = np.zeros([Config.vocab_size()])
        vec[char_id] = 1
        one_hots.append(vec)
    seq = np.vstack(one_hots)
    return seq
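
# Illustrative sketch (not part of the original module): demonstrates how
# _build_input_sequence pads short inputs.  It assumes Config.sequence_length
# is at least 2 and the vocabulary holds at least four characters.  A list of
# k < sequence_length indices yields a (sequence_length, vocab_size) matrix
# whose trailing rows are one-hot vectors for character id 0, the padding value.
def _example_input_sequence_padding():
    """Hypothetical usage example for _build_input_sequence."""
    seq = _build_input_sequence([3, 1])
    assert seq.shape == (Config.sequence_length, Config.vocab_size())
    assert seq[0][3] == 1 and seq[1][1] == 1      # real characters
    assert all(row[0] == 1 for row in seq[2:])    # zero-padded remainder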
def construct():
    """
    Trump Neural Network Constructor

    Builds all layers of the neural network.
    """
    # create data input placeholder
    input_x = tf.placeholder(tf.int32, shape=[Config.batch_size, None])
    # create target input placeholder
    target = tf.placeholder(tf.float32,
                            shape=[Config.batch_size, Config.vocab_size()])

    # Create the embedding matrix
    embed_matrix = tf.get_variable(
        "word_embeddings", [Config.vocab_size(), Config.RNN.hidden_size])
    embedded = tf.nn.embedding_lookup(embed_matrix, input_x)

    # create RNN cell
    cells = []
    for _ in range(Config.RNN.num_layers):
        cells.append(tf.nn.rnn_cell.BasicLSTMCell(Config.RNN.hidden_size))
    if Config.is_train() or Config.Generate.enable_dropout:
        cells = [
            tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=0.8)
            for cell in cells
        ]
        # cells = [tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=1.0, output_keep_prob=0.8,
        #                                        state_keep_prob=1.0) for cell in cells]
    # else:
    #     cells = [tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=0.98, output_keep_prob=0.98,
    #                                            state_keep_prob=1.0) for cell in cells]

    # get rnn outputs
    seq_len = tf.placeholder(tf.int32, shape=[Config.batch_size])
    multi_cell = tf.contrib.rnn.MultiRNNCell(cells)
    rnn_output, rnn_state = tf.nn.dynamic_rnn(multi_cell, embedded,
                                              sequence_length=seq_len,
                                              dtype=tf.float32)

    # Flatten the [batch, time, hidden] output and gather the output produced
    # at each sequence's last valid time step
    seq_end = tf.range(Config.batch_size) * tf.shape(rnn_output)[1] + (seq_len - 1)
    rnn_final_output = tf.gather(
        tf.reshape(rnn_output, [-1, Config.RNN.hidden_size]), seq_end)

    softmax_out = setup_feed_forward(rnn_final_output)
    final_output = softmax_out

    return {
        'X': input_x,
        'target': target,
        'seq_len': seq_len,
        'output': final_output
    }
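
# Illustrative sketch (not part of the original module): mirrors, in NumPy, the
# flatten-and-gather trick used above to pull out each sequence's final valid
# RNN output when the batch contains variable-length sequences.  All sizes
# below are made up for the example.
def _example_gather_last_outputs():
    """Hypothetical NumPy analogue of the seq_end/gather computation."""
    import numpy as np  # local import in case this module does not import numpy

    batch_size, max_time, hidden = 2, 4, 3
    rnn_output = np.arange(batch_size * max_time * hidden,
                           dtype=np.float32).reshape(batch_size, max_time, hidden)
    seq_len = np.array([4, 2])  # actual lengths of the two sequences
    seq_end = np.arange(batch_size) * max_time + (seq_len - 1)
    flat = rnn_output.reshape(-1, hidden)  # [batch * time, hidden]
    last_outputs = flat[seq_end]           # [batch, hidden]
    assert np.array_equal(last_outputs[0], rnn_output[0, 3])
    assert np.array_equal(last_outputs[1], rnn_output[1, 1])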
                               feed_dict={
                                   x: input_x,
                                   seq_len: phrase_seq_len
                               })
        pred_char_id = Config.DecisionEngine.function(sess, softmax_out)
        pred_char = Config.Generate.int2char()[pred_char_id]
        generated_text.append(pred_char)

        if cur_seq_len == Config.sequence_length:
            # Delete off the front of the list if it has reached the specified sequence length
            del input_x[0][0]
        else:
            # Shave last dummy element off since fixed batch size
            del input_x[0][Config.sequence_length - 1]
            cur_seq_len += 1
        input_x[0].insert(cur_seq_len - 1, pred_char_id)
        Config.Generate.prev_char = pred_char

    logging.info("Output Text: " + Config.Generate.seed_text + "".join(generated_text))


if __name__ == "__main__":
    Config.parse_args()
    Config.import_character_to_integer_map()
    Config.Generate.build_int2char()
    Config.Generate.build_seed_x()
    generate_text()
def create_examples(input_string):
    """
    From the input, produce examples where the input is a sequence of integers
    representing a string of characters, and the target is the character
    immediately following the input sequence.
    """
    sequences = []
    targets = []
    depths = []
    Config.char2int = {c: i for i, c in enumerate(sorted(set(input_string)))}

    # ToDo Discuss with Ben how we want to train on text shorter than the window size?
    # Get all examples
    if Config.dataset_size == -1:
        # iterate over the file window by window
        i = 0
        while i + Config.sequence_length + 1 < len(input_string):
            sequences += [[
                Config.char2int[c]
                for c in input_string[i:i + Config.sequence_length]
            ]]
            depths.append(Config.sequence_length)
            targets += [Config.char2int[input_string[i + Config.sequence_length]]]
            i += 1
    else:
        # get size many examples
        for z in range(Config.dataset_size):
            # get a random starting point
            r = random.choice(
                range(len(input_string) - Config.sequence_length - 1))
            sequences.append([
                Config.char2int[c]
                for c in input_string[r:r + Config.sequence_length]
            ])
            depths.append(Config.sequence_length)
            targets.append(
                Config.char2int[input_string[r + Config.sequence_length]])
    assert (len(sequences) == len(targets))

    # Define how to randomly split the input data into train and test
    shuffled_list = list(range(len(sequences)))
    random.shuffle(shuffled_list)

    # Determine whether to do a validation split
    if Config.perform_validation():
        split_point = int(Config.training_split_ratio * len(sequences))
    else:
        split_point = len(sequences)

    Config.Train.x = [sequences[idx] for idx in shuffled_list[:split_point]]
    Config.Train.depth = [depths[idx] for idx in shuffled_list[:split_point]]
    Config.Train.t = list(
        map(lambda idx: _build_target_vector(targets[idx]),
            shuffled_list[:split_point]))

    if Config.perform_validation():
        Config.Validation.x = [
            sequences[idx] for idx in shuffled_list[split_point:]
        ]
        Config.Validation.depth = [
            depths[idx] for idx in shuffled_list[split_point:]
        ]
        Config.Validation.t = list(
            map(lambda idx: _build_target_vector(targets[idx]),
                shuffled_list[split_point:]))
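
# Illustrative sketch (not part of the original module): shows the sliding
# window behaviour of the Config.dataset_size == -1 branch on a toy string,
# assuming a sequence length of 3.  Each example is the integer encoding of a
# 3-character window, and its target is the character that follows the window.
def _example_sliding_window():
    """Hypothetical walk-through of the window/target pairing."""
    text = "abcdabcd"
    char2int = {c: i for i, c in enumerate(sorted(set(text)))}  # a:0 b:1 c:2 d:3
    seq_len = 3
    windows, targets = [], []
    i = 0
    while i + seq_len + 1 < len(text):
        windows.append([char2int[c] for c in text[i:i + seq_len]])
        targets.append(char2int[text[i + seq_len]])
        i += 1
    # First example: "abc" -> "d", i.e. [0, 1, 2] with target 3.
    assert windows[0] == [0, 1, 2] and targets[0] == 3
    assert len(windows) == len(targets) == 4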
def run_training():
    net_features = network.construct()
    input_x = net_features["X"]
    target = net_features["target"]
    seq_len = net_features["seq_len"]

    # Setup the training procedure
    cross_h = tf.nn.softmax_cross_entropy_with_logits(
        logits=net_features["output"], labels=target)
    loss_op = tf.reduce_sum(cross_h)
    optimizer = tf.train.AdamOptimizer(
        learning_rate=Config.Train.learning_rate)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss_op, tvars), 5.)
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0.0))
    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)
    # train_op = optimizer.minimize(loss_op)

    sess = tf.Session()
    if Config.Train.restore:
        Config.import_model(sess)
    else:
        sess.run(tf.global_variables_initializer())

    num_batches = 0
    for epoch in range(0, Config.Train.num_epochs):
        # Shuffle the batches for each epoch
        shuffled_list = list(range(Config.Train.size()))
        random.shuffle(shuffled_list)

        train_err = 0
        for batch in range(0, Config.Train.num_batch()):
            end_batch = min((batch + 1) * Config.batch_size,
                            Config.Train.size())
            start_batch = max(0, end_batch - Config.batch_size)

            # Use the randomized batches
            train_x = list(
                map(lambda idx: Config.Train.x[idx],
                    shuffled_list[start_batch:end_batch]))
            train_t = list(
                map(lambda idx: Config.Train.t[idx],
                    shuffled_list[start_batch:end_batch]))
            seqlen = list(
                map(lambda idx: Config.Train.depth[idx],
                    shuffled_list[start_batch:end_batch]))

            _, err = sess.run([train_op, loss_op],
                              feed_dict={
                                  input_x: train_x,
                                  target: train_t,
                                  seq_len: seqlen
                              })
            train_err += err

            num_batches += 1
            BATCH_PRINT_FREQUENCY = 1000
            if num_batches % BATCH_PRINT_FREQUENCY == 0:
                print("Epoch %d: Total Batches %d: Last Batch Error: %0.3f"
                      % (epoch, num_batches, err))

        # ToDo It would be nice to add perplexity here.
        logging.info("EPOCH #%05d COMPLETED" % epoch)
        train_err /= Config.Train.num_batch()
        logging.info("Epoch %05d: Average Batch Training Error: \t\t%0.3f"
                     % (epoch, train_err))

        if Config.perform_validation():
            test_err = _calculate_validation_error(sess, loss_op, input_x,
                                                   target, seq_len)
            logging.info("Epoch %05d: Average Batch Verification Error: \t%0.3f"
                         % (epoch, test_err))

        if epoch % Config.Train.checkpoint_frequency == 0:
            Config.export_model(sess, epoch)
    sess.close()
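
# Illustrative sketch (not part of the original module): demonstrates the
# start/end index arithmetic used in the batching loops above and below.
# Because the placeholders are declared with a fixed Config.batch_size, the
# final batch is slid backwards so it overlaps the previous one instead of
# coming up short.  The sizes below are made up for the example.
def _example_fixed_size_batching():
    """Hypothetical walk-through of the min/max batch index computation."""
    dataset_size, batch_size = 10, 4
    bounds = []
    for batch in range(3):  # ceil(10 / 4) == 3 batches
        end_batch = min((batch + 1) * batch_size, dataset_size)
        start_batch = max(0, end_batch - batch_size)
        bounds.append((start_batch, end_batch))
    # The last batch re-uses two examples so every batch has exactly 4 items.
    assert bounds == [(0, 4), (4, 8), (6, 10)]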
def _calculate_validation_error(sess, loss_op, input_x, target, seq_len):
    """
    Determines the validation error
    """
    validation_err = 0
    for batch in range(0, Config.Validation.num_batch()):
        end_batch = min((batch + 1) * Config.batch_size,
                        Config.Validation.size())
        start_batch = max(0, end_batch - Config.batch_size)

        # Use the randomized batches
        valid_x = Config.Validation.x[start_batch:end_batch]
        valid_t = Config.Validation.t[start_batch:end_batch]
        seqlen = Config.Validation.depth[start_batch:end_batch]

        err = sess.run(loss_op,
                       feed_dict={
                           input_x: valid_x,
                           target: valid_t,
                           seq_len: seqlen
                       })
        validation_err += err
    validation_err /= Config.Validation.num_batch()
    return validation_err


if __name__ == "__main__":
    Config.parse_args()
    data_parser.build_training_and_verification_sets()
    run_training()