def train_joint(self, sess, image_data, reward_data, y_data, num_epochs, logs):
    """Jointly trains the planner module and infers rewards.

    Essentially performs train_reward, but also runs the planner
    optimization ops in the same step, so the planner weights are updated
    while the rewards are being inferred.
    """
    if self.config.verbosity >= 3:
        print(fmt_row(10, ["Iteration", "Train Cost", "Train Err", "Iter Time"]))
    if reward_data is None:
        # Initialize the reward array to random values
        reward_data = np.random.randn(*image_data.shape)

    planner_ops = [self.planner_optimize_op, self.step1_cost]
    batch_size = self.config.batchsize
    num_batches = int(image_data.shape[0] / batch_size)
    errors, costs, times = [], [], []
    for epoch in range(num_epochs):
        tstart = time.time()
        errors_per_batch, costs_per_batch = [], []
        for batch_num in range(num_batches):
            start, end = batch_num * batch_size, (batch_num + 1) * batch_size
            # We can't feed reward_data directly to self.reward.
            # See train_reward for the explanation.
            fd = {
                "reward_input:0": reward_data[start:end],
            }
            sess.run([self.assign_reward.op], feed_dict=fd)
            fd = {
                "image:0": image_data[start:end],
                "y:0": y_data[start:end]
            }
            # Run both the reward (step 2) and planner (step 1) ops, but only
            # report the error and the step 2 cost.
            _, err, cost, _, _ = sess.run(
                [self.reward_optimize_op, self.err, self.step2_cost] + planner_ops,
                feed_dict=fd)
            errors_per_batch.append(err)
            costs_per_batch.append(cost)
            # Write the updated rewards for this batch back into reward_data.
            reward_data[start:end] = self.reward.eval()

        epoch_error = sum(errors_per_batch) / len(errors_per_batch)
        errors.append(epoch_error)
        epoch_cost = sum(costs_per_batch) / len(costs_per_batch)
        costs.append(epoch_cost)
        elapsed = time.time() - tstart
        times.append(elapsed)
        if self.config.verbosity >= 3:
            print(fmt_row(10, [epoch, epoch_cost, epoch_error, elapsed]))

    logs['train_joint_errs'].append(errors)
    logs['train_joint_costs'].append(costs)
    logs['train_joint_times'].append(times)
    self.accuracy = 100 * (1 - errors[-1])
    logs['accuracy'] = self.accuracy
    return reward_data
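# A minimal usage sketch, not from the original code: how train_joint might be
# invoked once the model graph has been built. The names `model`, `image_data`,
# and `y_data`, and the epoch count, are hypothetical placeholders for whatever
# the surrounding script defines.
from collections import defaultdict

import tensorflow as tf

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    logs = defaultdict(list)
    # Passing reward_data=None makes train_joint initialize the rewards randomly.
    inferred_rewards = model.train_joint(
        sess, image_data, None, y_data, num_epochs=20, logs=logs)
    print('Final accuracy: {}'.format(logs['accuracy']))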
def train_reward(self, sess, image_data, reward_data, y_data, num_epochs, logs):
    """Infers the rewards using backprop, holding the planner fixed.

    Due to Tensorflow constraints, the number of MDPs in image_data must be
    a multiple of batch_size; the rewards are inferred one batch at a time.
    The rewards are initialized to the values in reward_data. If reward_data
    is None, the rewards are initialized randomly.
    """
    if self.config.verbosity >= 3:
        print(fmt_row(10, ["Iteration", "Train Cost", "Train Err", "Iter Time"]))
    if reward_data is None:
        reward_data = np.random.randn(*image_data.shape)

    batch_size = self.config.batchsize
    num_batches = int(image_data.shape[0] / batch_size)
    costs, errs = [], []
    for batch_num in range(num_batches):
        if self.config.verbosity >= 2 and batch_num % 10 == 0:
            print('Batch {} of {}'.format(batch_num, num_batches))
        start, end = batch_num * batch_size, (batch_num + 1) * batch_size
        # We can't feed reward_data directly to self.reward, because then it
        # would be treated as a constant and backprop could not update it.
        # Instead, we first run an op that assigns the reward to a Variable,
        # and only then run the backprop op.
        fd = {
            "reward_input:0": reward_data[start:end],
        }
        sess.run([self.assign_reward.op], feed_dict=fd)

        if batch_num % 10 == 0:
            costs.append([])
            errs.append([])
        for epoch in range(num_epochs):
            tstart = time.time()
            fd = {
                "image:0": image_data[start:end],
                "y:0": y_data[start:end]
            }
            _, e_, c_ = sess.run(
                [self.reward_optimize_op, self.err, self.step2_cost],
                feed_dict=fd)
            elapsed = time.time() - tstart
            if self.config.verbosity >= 3 and batch_num % 10 == 0:
                print(fmt_row(10, [epoch, c_, e_, elapsed]))
            costs[-1].append(c_)
            errs[-1].append(e_)
        # Write the inferred rewards for this batch back into reward_data.
        reward_data[start:end] = self.reward.eval()

    logs['train_reward_costs'].append(costs)
    logs['train_reward_errs'].append(errs)
    return reward_data
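# The comment above describes a standard TF1 pattern: a value passed through
# feed_dict is a constant, so to optimize the reward with backprop it must live
# in a tf.Variable that is set via an assign op. A minimal self-contained sketch
# of that pattern; the names reward_input, reward_var, and assign_reward are
# illustrative and are not the actual graph built elsewhere in this codebase.
import numpy as np
import tensorflow as tf

batch_size, imsize = 2, 8
reward_input = tf.placeholder(tf.float32, [batch_size, imsize, imsize], name="reward_input")
reward_var = tf.Variable(tf.zeros([batch_size, imsize, imsize]), name="reward")
assign_reward = tf.assign(reward_var, reward_input)

# A stand-in differentiable cost; gradients flow only into reward_var.
cost = tf.reduce_sum(tf.square(reward_var - 1.0))
optimize_op = tf.train.AdamOptimizer(0.1).minimize(cost, var_list=[reward_var])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Seed the Variable from a numpy array, then update it with backprop.
    sess.run(assign_reward,
             feed_dict={reward_input: np.random.randn(batch_size, imsize, imsize)})
    for _ in range(100):
        sess.run(optimize_op)
    inferred = sess.run(reward_var)  # the optimized rewards can be read back out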
TrA = []
TeA = []
# Launch the graph
with tf.Session() as sess:
    if config.log:
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(config.logdir, sess.graph)
    sess.run(init)
    if Have_trained:
        # Optionally restore the latest checkpoint before continuing training
        model_file = tf.train.latest_checkpoint('ckpt/')
        saver.restore(sess, model_file)
    batch_size = config.batchsize
    print(fmt_row(10, ["Epoch", "Train Cost", "Train Acc", "Epoch Time", "Test Acc"]))
    for epoch in range(int(config.epochs)):
        # Decay the learning rate by 5% each epoch
        learning_rate = learning_rate * 0.95
        tstart = time.time()
        avg_err, avg_cost = 0.0, 0.0
        avg_err_L = []
        avg_cost_L = []
        num_batches = int(Xtrain.shape[0] / batch_size)
        # Loop over all batches
        for i in range(0, Xtrain.shape[0], batch_size):
            j = i + batch_size
            if j <= Xtrain.shape[0]:
                # Run optimization op (backprop) and cost op (to get loss value)
                fd = {X: Xtrain[i:j],
                      S1: S1train[i:j],
                      S2: S2train[i:j],
                      y: ytrain[i * config.statebatchsize:j * config.statebatchsize],
                      LR: learning_rate,
                      keep_drop: 0.5}
                _, e_, c_ = sess.run([optimizer, err, cost], feed_dict=fd)
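# The manual schedule above (learning_rate *= 0.95 each epoch, fed through the
# LR placeholder) is an exponential decay. A hedged alternative sketch using
# TF1's built-in schedule; the initial rate, decay_steps, and the choice of
# RMSProp are assumptions for illustration, not taken from the original script.
import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
lr = tf.train.exponential_decay(
    learning_rate=0.001,   # hypothetical initial rate
    global_step=global_step,
    decay_steps=100,       # e.g. one epoch's worth of batches
    decay_rate=0.95,
    staircase=True)

# Dummy cost so the sketch is self-contained; a real script would use its own cost op.
w = tf.Variable(1.0)
cost = tf.square(w)
optimizer = tf.train.RMSPropOptimizer(lr).minimize(cost, global_step=global_step)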
saver = tf.train.Saver()

Xtrain, S1train, S2train, ytrain, Xtest, S1test, S2test, ytest = process_gridworld_data(
    input=config.input, imsize=config.imsize)

# Launch the graph
with tf.Session() as sess:
    if config.log:
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(config.logdir, sess.graph)
    sess.run(init)
    batch_size = config.batchsize
    print(fmt_row(10, ["Epoch", "Train Cost", "Train Err", "Epoch Time"]))
    for epoch in range(int(config.epochs)):
        tstart = time.time()
        avg_err, avg_cost = 0.0, 0.0
        num_batches = int(Xtrain.shape[0] / batch_size)
        # Loop over all batches
        for i in range(0, Xtrain.shape[0], batch_size):
            j = i + batch_size
            if j <= Xtrain.shape[0]:
                # Run optimization op (backprop) and cost op (to get loss value)
                fd = {X: Xtrain[i:j],
                      S1: S1train[i:j],
                      S2: S2train[i:j],
                      y: ytrain[i * config.statebatchsize:j * config.statebatchsize]}
                _, e_, c_ = sess.run([optimizer, err, cost], feed_dict=fd)
def train_planner(self, sess, train_data, validation_data, num_epochs, logs):
    """Trains the planner module given MDPs with reward functions and the
    corresponding policies.

    Validation can be turned off by explicitly passing None.
    """
    if self.config.verbosity >= 3:
        print(fmt_row(10, ["Epoch", "Train Cost", "Train Err", "Valid Err", "Epoch Time"]))

    avg_costs, train_errs, validation_errs, times = [], [], [], []
    for epoch in range(int(num_epochs)):
        _, (avg_cost, avg_err), elapsed, epoch_dist = self.run_epoch(
            sess, train_data, [self.planner_optimize_op],
            [self.step1_cost, self.err], [self.pred_dist, self.y_dist])
        # Display logs per epoch step
        if self.config.verbosity >= 3 and epoch % self.config.display_step == 0:
            if validation_data is not None:
                _, (validation_err,), _, _ = self.run_epoch(
                    sess, validation_data, [], [self.err])
            else:
                validation_err = 'N/A'
            print(fmt_row(10, [epoch, avg_cost, avg_err, validation_err, elapsed]))
            avg_costs.append(avg_cost)
            train_errs.append(avg_err)
            validation_errs.append(validation_err)
            times.append(elapsed)
        elif self.config.verbosity >= 2 and epoch % self.config.display_step == 0:
            print('Epoch {} of {}'.format(epoch, num_epochs))

        # if self.config.log:
        #     summary = tf.Summary()
        #     summary.ParseFromString(sess.run(summary_op))
        #     summary.value.add(tag='Average error', simple_value=float(avg_err))
        #     summary.value.add(tag='Average cost', simple_value=float(avg_cost))
        #     summary_writer.add_summary(summary, epoch)

    logs['train_planner_costs'].append(avg_costs)
    logs['train_planner_train_errs'].append(train_errs)
    logs['train_planner_validation_errs'].append(validation_errs)
    logs['train_planner_times'].append(times)

    # TODO(rohinmshah): This seems redundant
    num_actions = self.config.num_actions
    action_dists = [np.zeros(num_actions), np.zeros(num_actions)]
    action_dists = [d + b_d for d, b_d in zip(action_dists, epoch_dist)]
    action_dists = [d / np.sum(d) for d in action_dists]
    pred = action_dists[0].tolist()
    actual = action_dists[1].tolist()
    logs['train_planner_predicted_action_dists'].append(pred)
    logs['train_planner_actual_action_dists'].append(actual)
    if self.config.verbosity >= 3:
        print("Action Distribution Comparison")
        print("------------------------------")
        print(fmt_row(10, ["Predicted"] + pred))
        print(fmt_row(10, ["Actual"] + actual))

    if validation_data is not None:
        _, (err,), _, _ = self.run_epoch(sess, validation_data, [], [self.err])
        logs['accuracy'].append(100 * (1 - err))
        if self.config.verbosity >= 1:
            print('Validation Accuracy: ' + str(100 * (1 - err)))

    # Save a checkpoint so the trained planner can later be restored and used
    # for reward inference.
    if self.config.savemodel:
        saver = tf.train.Saver()
        saver.save(sess, "model_save_sess_0/")
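# A hedged sketch, not from the original code, of how a checkpoint written by
# the saver.save call above could be restored in a fresh session before running
# reward inference. It assumes the model graph has already been rebuilt, and the
# `model` object and data arrays are hypothetical placeholders.
from collections import defaultdict

import tensorflow as tf

with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess, "model_save_sess_0/")
    logs = defaultdict(list)
    # Infer rewards with the restored planner held fixed.
    inferred_rewards = model.train_reward(
        sess, image_data, None, y_data, num_epochs=50, logs=logs)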
init = tf.global_variables_initializer()
saver = tf.train.Saver()

Xtrain, S1train, S2train, ytrain, Xtest, S1test, S2test, ytest = process_gridworld_data(
    input=config.input, imsize=config.imsize)

# Launch the graph
with tf.Session() as sess:
    if config.log:
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(config.logdir, sess.graph)
    sess.run(init)
    batch_size = config.batchsize
    print(fmt_row(10, ["Epoch", "Train Cost", "Train Err", "Epoch Time"]))
    for epoch in range(int(config.epochs)):
        tstart = time.time()
        avg_err, avg_cost = 0.0, 0.0
        num_batches = int(Xtrain.shape[0] / batch_size)
        # Loop over all batches
        for i in range(0, Xtrain.shape[0], batch_size):
            j = i + batch_size
            if j <= Xtrain.shape[0]:
                # Run optimization op (backprop) and cost op (to get loss value)
                fd = {X: Xtrain[i:j],
                      S1: S1train[i:j],
                      S2: S2train[i:j],
                      y: ytrain[i * config.statebatchsize:j * config.statebatchsize]}
                _, e_, c_ = sess.run([optimizer, err, cost], feed_dict=fd)
                avg_err += e_
                avg_cost += c_
        # Display logs per epoch step