def main(args): """Main function to train the model. Args: args: Parsed arguments. Returns: Execution status defined by `constants.ExitCode`. """ # Validate paths. if not validate_paths(args): return constants.ExitCode.INVALID_PATH # Extract paths. input_dir = args.input_dir model_dir = args.model_dir log_dir = args.log_dir existing_model = args.existing_model # Extract model parameters. batch_size = args.batch_size dropout_pkeep = args.dropout_pkeep hidden_state_size = args.hidden_state_size hidden_layer_size = args.hidden_layer_size learning_rate = args.learning_rate # Extract additional flags. debug = args.debug validation = args.validation # Split corpus for training and validation. # validation_text will be empty if validation is False. code_text, validation_text, input_ranges = utils.read_data_files( input_dir, validation=validation) # Bail out if we don't have enough corpus for training. if len(code_text) < batch_size * constants.TRAINING_SEQLEN + 1: return constants.ExitCode.CORPUS_TOO_SMALL # Get corpus files info. Will be used in debug mode to generate sample text. files_info_list = [] if debug: files_info_list = utils.get_files_info(input_dir) assert files_info_list # Calculate validation batch size. It will be 0 if we choose not to validate. validation_batch_size = len(validation_text) // constants.VALIDATION_SEQLEN # Display some stats on the data. epoch_size = len(code_text) // (batch_size * constants.TRAINING_SEQLEN) utils.print_data_stats(len(code_text), len(validation_text), epoch_size) # Set global random seed, so any random sequence generated is repeatable. # It could also be removed. tf.random.set_seed(0) # Build the RNN model. model = utils.build_model(hidden_layer_size * hidden_state_size, dropout_pkeep, batch_size, debug) # Choose Adam optimizer to compute gradients. optimizer = tf.keras.optimizers.Adam(learning_rate) # Init Tensorboard stuff. # This will save Tensorboard information in folder specified in command line. # Two sets of data are saved so that you can compare training and # validation curves visually in Tensorboard. timestamp = str(math.trunc(time.time())) summary_writer = tf.summary.create_file_writer( os.path.join(log_dir, timestamp + '-training')) validation_writer = tf.summary.create_file_writer( os.path.join(log_dir, timestamp + '-validation')) # For display: init the progress bar. step_size = batch_size * constants.TRAINING_SEQLEN frequency = constants.DISPLAY_FREQ * step_size progress = utils.Progress(constants.DISPLAY_FREQ, size=constants.DISPLAY_LEN, msg='Training on next {} batches'.format( constants.DISPLAY_FREQ)) # We continue training on existing model, or start with a new model. if existing_model: print('Continue training on existing model: {}'.format(existing_model)) try: model.load_weights(existing_model) except: print(('Failed to restore existing model since model ' 'parameters do not match.'), file=sys.stderr) return constants.ExitCode.TENSORFLOW_ERROR else: print('No existing model provided. Start training with a new model.') # Num of bytes we have trained so far. steps = 0 # Training loop. for input_batch, expected_batch, epoch in utils.rnn_minibatch_sequencer( code_text, batch_size, constants.TRAINING_SEQLEN, nb_epochs=constants.EPOCHS): # Train on one mini-batch. seq_loss, batch_loss, accuracy, output_bytes = train_step( model, optimizer, input_batch, expected_batch, train=True) # Log training data for Tensorboard display a mini-batch of sequences # every `frequency` batches. 
if debug and steps % frequency == 0: utils.print_learning_learned_comparison(input_batch, output_bytes, seq_loss, input_ranges, batch_loss, accuracy, epoch_size, steps, epoch) with summary_writer.as_default(): # pylint: disable=not-context-manager tf.summary.scalar('batch_loss', batch_loss, step=steps) tf.summary.scalar('batch_accuracy', accuracy, step=steps) summary_writer.flush() # Run a validation step every `frequency` batches. # The validation text should be a single sequence but that's too slow. # We cut it up and batch the pieces (slightly inaccurate). if validation and steps % frequency == 0 and validation_batch_size: utils.print_validation_header(len(code_text), input_ranges) validation_x, validation_y, _ = next( utils.rnn_minibatch_sequencer(validation_text, validation_batch_size, constants.VALIDATION_SEQLEN, 1)) validation_model = utils.build_model( hidden_layer_size * hidden_state_size, dropout_pkeep, validation_batch_size, False) last_weights = tf.train.latest_checkpoint(model_dir) if last_weights: validation_model.load_weights( tf.train.latest_checkpoint(model_dir)) validation_model.build( tf.TensorShape([validation_batch_size, None])) validation_model.reset_states() # Run one single inference step _, batch_loss, accuracy, _ = train_step(validation_model, optimizer, validation_x, validation_y, train=False) utils.print_validation_stats(batch_loss, accuracy) # Save validation data for Tensorboard. with validation_writer.as_default(): # pylint: disable=not-context-manager tf.summary.scalar('batch_loss', batch_loss, step=steps) tf.summary.scalar('batch_accuracy', accuracy, step=steps) validation_writer.flush() # Display a short text generated with the current weights and biases. # If enabled, there will be a large output. if debug and steps // 4 % frequency == 0: utils.print_text_generation_header() file_info = utils.random_element_from_list(files_info_list) first_byte, file_size = file_info['first_byte'], file_info[ 'file_size'] ry = np.array([[first_byte]]) sample = [first_byte] generation_model = utils.build_model( hidden_layer_size * hidden_state_size, dropout_pkeep, 1, False) last_weights = tf.train.latest_checkpoint(model_dir) if last_weights: generation_model.load_weights( tf.train.latest_checkpoint(model_dir)) generation_model.build(tf.TensorShape([1, None])) generation_model.reset_states() for _ in range(file_size - 1): prediction = generation_model(ry) prediction = tf.squeeze(prediction, 0).numpy() rc = utils.sample_from_probabilities( prediction, topn=10 if epoch <= 1 else 2) sample.append(rc) ry = np.array([[rc]]) print(repr(utils.decode_to_text(sample))) utils.print_text_generation_footer() # Save a checkpoint every `10 * frequency` batches. Each checkpoint is # a version of model. if steps // 10 % frequency == 0: saved_model_name = constants.RNN_MODEL_NAME + '_' + timestamp saved_model_path = os.path.join(model_dir, saved_model_name) model.save_weights(saved_model_path) print('Saved model: {}'.format(saved_model_path)) # Display progress bar. if debug: progress.step(reset=steps % frequency == 0) # Update state. steps += step_size # Save the model after training is done. saved_model_name = constants.RNN_MODEL_NAME + '_' + timestamp saved_model_path = os.path.join(model_dir, saved_model_name) model.save_weights(saved_model_path) print('Saved model: {}'.format(saved_model_path)) return constants.ExitCode.SUCCESS
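
# NOTE: `train_step` is called by the training loop above but is defined
# elsewhere in the script. The sketch below is a hypothetical stand-in, kept
# here only to illustrate the expected contract: it assumes the model returns
# per-byte logits of shape [batch, seqlen, ALPHA_SIZE] and that the caller
# expects (seq_loss, batch_loss, accuracy, output_bytes). The real helper in
# the repository may differ.
def train_step(model, optimizer, input_batch, expected_batch, train=True):
  """One forward (and optionally backward) pass on a mini-batch (sketch only)."""
  with tf.GradientTape() as tape:
    logits = model(input_batch)
    loss = tf.keras.losses.sparse_categorical_crossentropy(
        expected_batch, logits, from_logits=True)
    # Per-sequence loss, then the scalar mini-batch loss.
    seq_loss = tf.reduce_mean(loss, 1)
    batch_loss = tf.reduce_mean(seq_loss)

  if train:
    grads = tape.gradient(batch_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

  # Predicted byte per position, plus byte-level accuracy for display.
  output_bytes = tf.argmax(logits, axis=-1)
  accuracy = tf.reduce_mean(
      tf.cast(
          tf.equal(tf.cast(expected_batch, tf.int64), output_bytes),
          tf.float32))
  return seq_loss, batch_loss, accuracy, output_bytes
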
def main(args): """Main function to train the model. Args: args: Parsed arguments. Returns: Execution status defined by `constants.ExitCode`. """ # Validate paths. if not validate_paths(args): return constants.ExitCode.INVALID_PATH # Extract paths. input_dir = args.input_dir model_dir = args.model_dir log_dir = args.log_dir existing_model = args.existing_model # Extract model parameters. batch_size = args.batch_size dropout_pkeep = args.dropout_pkeep hidden_state_size = args.hidden_state_size hidden_layer_size = args.hidden_layer_size learning_rate = args.learning_rate # Extract additional flags. debug = args.debug validation = args.validation # Split corpus for training and validation. # validation_text will be empty if validation is False. code_text, validation_text, input_ranges = utils.read_data_files( input_dir, validation=validation) # Bail out if we don't have enough corpus for training. if len(code_text) < batch_size * constants.TRAINING_SEQLEN + 1: return constants.ExitCode.CORPUS_TOO_SMALL # Get corpus files info. Will be used in debug mode to generate sample text. files_info_list = [] if debug: files_info_list = utils.get_files_info(input_dir) if not files_info_list: raise AssertionError # Calculate validation batch size. It will be 0 if we choose not to validate. validation_batch_size = len(validation_text) // constants.VALIDATION_SEQLEN # Display some stats on the data. epoch_size = len(code_text) // (batch_size * constants.TRAINING_SEQLEN) utils.print_data_stats(len(code_text), len(validation_text), epoch_size) # Set graph-level random seed, so any random sequence generated in this # graph is repeatable. It could also be removed. tf.set_random_seed(0) # Define placeholder for learning rate, dropout and batch size. lr = tf.placeholder(tf.float32, name='lr') pkeep = tf.placeholder(tf.float32, name='pkeep') batchsize = tf.placeholder(tf.int32, name='batchsize') # Input data. input_bytes = tf.placeholder(tf.uint8, [None, None], name='input_bytes') input_onehot = tf.one_hot(input_bytes, constants.ALPHA_SIZE, 1.0, 0.0) # Expected outputs = same sequence shifted by 1, since we are trying to # predict the next character. expected_bytes = tf.placeholder(tf.uint8, [None, None], name='expected_bytes') expected_onehot = tf.one_hot(expected_bytes, constants.ALPHA_SIZE, 1.0, 0.0) # Input state. hidden_state = tf.placeholder( tf.float32, [None, hidden_state_size * hidden_layer_size], name='hidden_state') # "naive dropout" implementation. cells = [rnn.GRUCell(hidden_state_size) for _ in range(hidden_layer_size)] dropcells = [ rnn.DropoutWrapper(cell, input_keep_prob=pkeep) for cell in cells ] multicell = rnn.MultiRNNCell(dropcells, state_is_tuple=False) multicell = rnn.DropoutWrapper(multicell, output_keep_prob=pkeep) output_raw, next_state = tf.nn.dynamic_rnn(multicell, input_onehot, dtype=tf.float32, initial_state=hidden_state) next_state = tf.identity(next_state, name='next_state') # Reshape training outputs. output_flat = tf.reshape(output_raw, [-1, hidden_state_size]) output_logits = layers.linear(output_flat, constants.ALPHA_SIZE) # Reshape expected outputs. expected_flat = tf.reshape(expected_onehot, [-1, constants.ALPHA_SIZE]) # Compute training loss. loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=output_logits, labels=expected_flat) loss = tf.reshape(loss, [batchsize, -1]) # Use softmax to normalize training outputs. output_onehot = tf.nn.softmax(output_logits, name='output_onehot') # Use argmax to get the max value, which is the predicted bytes. 
output_bytes = tf.argmax(output_onehot, 1) output_bytes = tf.reshape(output_bytes, [batchsize, -1], name='output_bytes') # Choose Adam optimizer to compute gradients. optimizer = tf.train.AdamOptimizer(lr).minimize(loss) # Stats for display. seqloss = tf.reduce_mean(loss, 1) batchloss = tf.reduce_mean(seqloss) accuracy = tf.reduce_mean( tf.cast(tf.equal(expected_bytes, tf.cast(output_bytes, tf.uint8)), tf.float32)) loss_summary = tf.summary.scalar('batch_loss', batchloss) acc_summary = tf.summary.scalar('batch_accuracy', accuracy) summaries = tf.summary.merge([loss_summary, acc_summary]) # Init Tensorboard stuff. # This will save Tensorboard information in folder specified in command line. # Two sets of data are saved so that you can compare training and # validation curves visually in Tensorboard. timestamp = str(math.trunc(time.time())) summary_writer = tf.summary.FileWriter( os.path.join(log_dir, timestamp + '-training')) validation_writer = tf.summary.FileWriter( os.path.join(log_dir, timestamp + '-validation')) # Init for saving models. # They will be saved into a directory specified in command line. saver = tf.train.Saver(max_to_keep=constants.MAX_TO_KEEP) # For display: init the progress bar. step_size = batch_size * constants.TRAINING_SEQLEN frequency = constants.DISPLAY_FREQ * step_size progress = utils.Progress(constants.DISPLAY_FREQ, size=constants.DISPLAY_LEN, msg='Training on next {} batches'.format( constants.DISPLAY_FREQ)) # Set initial state. state = np.zeros([batch_size, hidden_state_size * hidden_layer_size]) session = tf.Session() # We continue training on exsiting model, or start with a new model. if existing_model: print('Continue training on existing model: {}'.format(existing_model)) try: saver.restore(session, existing_model) except: print(('Failed to restore existing model since model ' 'parameters do not match.'), file=sys.stderr) return constants.ExitCode.TENSORFLOW_ERROR else: print('No existing model provided. Start training with a new model.') session.run(tf.global_variables_initializer()) # Num of bytes we have trained so far. steps = 0 # Training loop. for input_batch, expected_batch, epoch in utils.rnn_minibatch_sequencer( code_text, batch_size, constants.TRAINING_SEQLEN, nb_epochs=constants.EPOCHS): # Train on one mini-batch. feed_dict = { input_bytes: input_batch, expected_bytes: expected_batch, hidden_state: state, lr: learning_rate, pkeep: dropout_pkeep, batchsize: batch_size } _, predicted, new_state = session.run( [optimizer, output_bytes, next_state], feed_dict=feed_dict) # Log training data for Tensorboard display a mini-batch of sequences # every `frequency` batches. if debug and steps % frequency == 0: feed_dict = { input_bytes: input_batch, expected_bytes: expected_batch, hidden_state: state, pkeep: 1.0, batchsize: batch_size } predicted, seq_loss, batch_loss, acc_value, summaries_value = session.run( [output_bytes, seqloss, batchloss, accuracy, summaries], feed_dict=feed_dict) utils.print_learning_learned_comparison(input_batch, predicted, seq_loss, input_ranges, batch_loss, acc_value, epoch_size, steps, epoch) summary_writer.add_summary(summaries_value, steps) # Run a validation step every `frequency` batches. # The validation text should be a single sequence but that's too slow. # We cut it up and batch the pieces (slightly inaccurate). 
if validation and steps % frequency == 0 and validation_batch_size: utils.print_validation_header(len(code_text), input_ranges) validation_x, validation_y, _ = next( utils.rnn_minibatch_sequencer(validation_text, validation_batch_size, constants.VALIDATION_SEQLEN, 1)) null_state = np.zeros( [validation_batch_size, hidden_state_size * hidden_layer_size]) feed_dict = { input_bytes: validation_x, expected_bytes: validation_y, hidden_state: null_state, pkeep: 1.0, batchsize: validation_batch_size } batch_loss, acc_value, summaries_value = session.run( [batchloss, accuracy, summaries], feed_dict=feed_dict) utils.print_validation_stats(batch_loss, acc_value) # Save validation data for Tensorboard. validation_writer.add_summary(summaries_value, steps) # Display a short text generated with the current weights and biases. # If enabled, there will be a large output. if debug and steps // 4 % frequency == 0: utils.print_text_generation_header() file_info = utils.random_element_from_list(files_info_list) first_byte, file_size = file_info['first_byte'], file_info[ 'file_size'] ry = np.array([[first_byte]]) rh = np.zeros([1, hidden_state_size * hidden_layer_size]) sample = [first_byte] for _ in range(file_size - 1): feed_dict = { input_bytes: ry, pkeep: 1.0, hidden_state: rh, batchsize: 1 } ryo, rh = session.run([output_onehot, next_state], feed_dict=feed_dict) rc = utils.sample_from_probabilities( ryo, topn=10 if epoch <= 1 else 2) sample.append(rc) ry = np.array([[rc]]) print(repr(utils.decode_to_text(sample))) utils.print_text_generation_footer() # Save a checkpoint every `10 * frequency` batches. Each checkpoint is # a version of model. if steps // 10 % frequency == 0: saved_model_name = constants.RNN_MODEL_NAME + '_' + timestamp saved_model_path = os.path.join(model_dir, saved_model_name) saved_model = saver.save(session, saved_model_path, global_step=steps) print('Saved model: {}'.format(saved_model)) # Display progress bar. if debug: progress.step(reset=steps % frequency == 0) # Update state. state = new_state steps += step_size # Save the model after training is done. saved_model_name = constants.RNN_MODEL_NAME + '_' + timestamp saved_model_path = os.path.join(model_dir, saved_model_name) saved_model = saver.save(session, saved_model_path, global_step=steps) print('Saved model: {}'.format(saved_model)) return constants.ExitCode.SUCCESS
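
# NOTE: Every entry point above starts with `validate_paths(args)`, whose
# definition is outside this section. The sketch below is a minimal,
# hypothetical version for the training script, assuming the parsed arguments
# carry `input_dir`, `model_dir`, and `log_dir`; the actual helper may create
# missing directories or apply additional checks.
def validate_paths(args):
  """Return True if all paths required for training exist (sketch only)."""
  for path in (args.input_dir, args.model_dir, args.log_dir):
    if not path or not os.path.exists(path):
      print('Invalid path: {}'.format(path), file=sys.stderr)
      return False
  return True
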
def main(args): """Main function to generate inputs. Args: args: Parsed arguments. Returns: Execution status defined by `constants.ExitCode`. """ # Validate required paths. if not validate_paths(args): return constants.ExitCode.INVALID_PATH # Extract paths. input_dir = args.input_dir output_dir = args.output_dir model_path = args.model_path # Extract model parameters. count = args.count hidden_state_size = args.hidden_state_size hidden_layer_size = args.hidden_layer_size # Use timestamp as part of identifier for each testcase generated. timestamp = str(math.trunc(time.time())) with tf.compat.v1.Session() as session: print('\nusing model {} to generate {} inputs...'.format( model_path, count)) # Restore the model. new_saver = tf.compat.v1.train.import_meta_graph( model_path + constants.MODEL_META_SUFFIX) new_saver.restore(session, model_path) corpus_files_info = utils.get_files_info(input_dir) if not corpus_files_info: return constants.ExitCode.CORPUS_TOO_SMALL new_units_count = 0 while new_units_count < count: # Reset hidden states each time to generate new inputs, so that # different rounds will not interfere. state = np.zeros( [BATCH_SIZE, hidden_state_size * hidden_layer_size], dtype=np.float32) # Randomly select `BATCH_SIZE` number of inputs from corpus. # Record their first byte and file length. new_files_bytes = [] corpus_files_length = [] for i in range(BATCH_SIZE): file_info = utils.random_element_from_list(corpus_files_info) first_byte, file_size = file_info['first_byte'], file_info[ 'file_size'] new_files_bytes.append([first_byte]) corpus_files_length.append(file_size) # Use 1st and 3rd quartile values as lower and upper bound respectively. # Also make sure they are within upper and lower bounds. max_length = int(np.percentile(corpus_files_length, 75)) max_length = min(max_length, UPPER_LENGTH_LIMIT) min_length = int(np.percentile(corpus_files_length, 25)) min_length = max(LOWER_LENGTH_LIMIT, min_length) # Reset in special cases where min_length exceeds upper limit. if min_length >= max_length: min_length = LOWER_LENGTH_LIMIT input_bytes = np.array(new_files_bytes) for _ in range(max_length - 1): feed_dict = { 'input_bytes:0': input_bytes, 'pkeep:0': 1.0, 'hidden_state:0': state, 'batchsize:0': BATCH_SIZE } try: output, new_state = session.run( ['output_onehot:0', 'next_state:0'], feed_dict=feed_dict) except ValueError: print(('Failed to run TensorFlow operations since ' 'model parameters do not match.'), file=sys.stderr) return constants.ExitCode.TENSORFLOW_ERROR for i in range(BATCH_SIZE): predicted_byte = utils.sample_from_probabilities(output[i], topn=TOPN) new_files_bytes[i].append(predicted_byte) input_bytes[i][0] = predicted_byte # Update state. state = new_state # Use timestamp as part of file name. for i in range(BATCH_SIZE): new_file_name = '{}_{:0>8}'.format(timestamp, new_units_count) new_file_path = os.path.join(output_dir, new_file_name) # Use existing input length if possible, but make sure it is between # min_length and max_length. new_file_length = max(min_length, min(corpus_files_length[i], max_length)) new_file_byte_array = bytearray( new_files_bytes[i][:new_file_length]) with open(new_file_path, 'wb') as new_file: new_file.write(new_file_byte_array) print('generate input: {}, feed byte: {}, input length: {}'. format(new_file_path, new_files_bytes[i][0], new_file_length)) # Have we got enough inputs? new_units_count += 1 if new_units_count >= count: break print('Done.') return constants.ExitCode.SUCCESS
def main(args): """Main function to generate inputs. Args: args: Parsed arguments. Returns: Execution status defined by `constants.ExitCode`. """ # Validate required paths. if not validate_paths(args): return constants.ExitCode.INVALID_PATH # Extract paths. input_dir = args.input_dir output_dir = args.output_dir model_path = args.model_path # Extract model parameters. count = args.count hidden_state_size = args.hidden_state_size hidden_layer_size = args.hidden_layer_size # Use timestamp as part of identifier for each testcase generated. timestamp = str(math.trunc(time.time())) print('\nusing model {} to generate {} inputs...'.format( model_path, count)) # Restore the RNN model by building it and loading the weights. model = utils.build_model(hidden_layer_size * hidden_state_size, constants.DROPOUT_PKEEP, constants.BATCH_SIZE, False) try: model.load_weights(model_path) except ValueError: print('Incompatible model parameters.', file=sys.stderr) return constants.ExitCode.TENSORFLOW_ERROR model.build(tf.TensorShape([constants.BATCH_SIZE, None])) model.reset_states() corpus_files_info = utils.get_files_info(input_dir) if not corpus_files_info: return constants.ExitCode.CORPUS_TOO_SMALL new_units_count = 0 while new_units_count < count: # Reset hidden states each time to generate new inputs, so that # different rounds will not interfere. model.reset_states() # Randomly select `BATCH_SIZE` number of inputs from corpus. # Record their first byte and file length. new_files_bytes = [] corpus_files_length = [] for i in range(BATCH_SIZE): file_info = utils.random_element_from_list(corpus_files_info) first_byte, file_size = file_info['first_byte'], file_info[ 'file_size'] new_files_bytes.append([first_byte]) corpus_files_length.append(file_size) # Use 1st and 3rd quartile values as lower and upper bound respectively. # Also make sure they are within upper and lower bounds. max_length = int(np.percentile(corpus_files_length, 75)) max_length = min(max_length, UPPER_LENGTH_LIMIT) min_length = int(np.percentile(corpus_files_length, 25)) min_length = max(LOWER_LENGTH_LIMIT, min_length) # Reset in special cases where min_length exceeds upper limit. if min_length >= max_length: min_length = LOWER_LENGTH_LIMIT input_bytes = np.array(new_files_bytes) for _ in range(max_length - 1): try: output = model(input_bytes).numpy() except tf.errors.InvalidArgumentError: print(('Failed to run TensorFlow operations since ' 'model parameters do not match.'), file=sys.stderr) return constants.ExitCode.TENSORFLOW_ERROR for i in range(BATCH_SIZE): predicted_byte = utils.sample_from_probabilities(output[i], topn=TOPN) new_files_bytes[i].append(predicted_byte) input_bytes[i][0] = predicted_byte # Use timestamp as part of file name. for i in range(BATCH_SIZE): new_file_name = '{}_{:0>8}'.format(timestamp, new_units_count) new_file_path = os.path.join(output_dir, new_file_name) # Use existing input length if possible, but make sure it is between # min_length and max_length. new_file_length = max(min_length, min(corpus_files_length[i], max_length)) new_file_byte_array = bytearray( new_files_bytes[i][:new_file_length]) with open(new_file_path, 'wb') as new_file: new_file.write(new_file_byte_array) print('generate input: {}, feed byte: {}, input length: {}'.format( new_file_path, new_files_bytes[i][0], new_file_length)) # Have we got enough inputs? new_units_count += 1 if new_units_count >= count: break print('Done.') return constants.ExitCode.SUCCESS