Example #1
    def train_joint(self, sess, image_data, reward_data, y_data, num_epochs, logs):
        """Trains the planner module given MDPs with reward functions and the
        corresponding policies.

        Essentially performs train_reward, but also passes planner graph

        Validation can be turned off by explicitly passing None.
        """
        if self.config.verbosity >= 3:
            print(fmt_row(10, ["Iteration", "Train Cost", "Train Err", "Iter Time"]))
        if reward_data is None:
            # Initialize the reward array to random values
            reward_data = np.random.randn(*image_data.shape)

        planner_ops = [self.planner_optimize_op, self.step1_cost]
        batch_size = self.config.batchsize
        num_batches = int(image_data.shape[0] / batch_size)
        errors, costs, times = [], [], []
        for epoch in range(num_epochs):
            tstart = time.time()
            errors_per_batch, costs_per_batch = [], []
            for batch_num in range(num_batches):
                start, end = batch_num * batch_size, (batch_num + 1) * batch_size
                # We can't feed in reward_data directly to self.reward.
                # See train_reward for the explanation.
                fd = {
                    "reward_input:0": reward_data[start:end],
                }
                sess.run([self.assign_reward.op], feed_dict=fd)

                fd = {
                    "image:0": image_data[start:end],
                    "y:0": y_data[start:end]
                }

                # Run both step1 & step2 ops, report only error & step2 cost
                _, err, cost, _, _ = sess.run(
                    [self.reward_optimize_op, self.err, self.step2_cost] + planner_ops,
                    feed_dict=fd)
                errors_per_batch.append(err)
                costs_per_batch.append(cost)
                # Save the updated rewards for this batch so they persist
                # across epochs
                reward_data[start:end] = self.reward.eval()

            epoch_error = sum(errors_per_batch) / len(errors_per_batch)
            errors.append(epoch_error)
            epoch_cost = sum(costs_per_batch) / len(costs_per_batch)
            costs.append(epoch_cost)
            elapsed = time.time() - tstart
            times.append(elapsed)

            if self.config.verbosity >= 3:
                print(fmt_row(10, [epoch, epoch_cost, epoch_error, elapsed]))

        logs['train_joint_errs'].append(errors)
        logs['train_joint_costs'].append(costs)
        logs['train_joint_times'].append(times)
        self.accuracy = 100 * (1 - errors[-1])
        logs['accuracy'] = self.accuracy

        return reward_data
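
The train_joint and train_reward methods above rely on graph nodes built elsewhere: a reward variable, an op that assigns an externally supplied reward into it, and two optimizers that update disjoint variable sets. A minimal sketch of how those pieces could be wired up in TF1 (names, shapes, losses, and optimizer choices here are assumptions, not the repository's actual graph code):

import tensorflow as tf

batch_size, imsize = 20, 8

# Placeholder used only to load externally supplied rewards into the variable.
reward_input = tf.placeholder(tf.float32, [batch_size, imsize, imsize], name="reward_input")
# The reward is a Variable rather than a placeholder so that backprop can update it.
reward = tf.Variable(tf.zeros([batch_size, imsize, imsize]), name="reward")
assign_reward = tf.assign(reward, reward_input)

# Stand-in planner weights and losses; the real model derives these from the planner network.
planner_weights = tf.Variable(tf.random_normal([imsize, imsize]), name="planner_weights")
step1_cost = tf.reduce_mean(tf.square(reward * planner_weights))  # planner loss (stand-in)
step2_cost = step1_cost                                           # reward-inference loss (stand-in)

# Two optimizers over disjoint variable sets: one updates the planner, one updates the reward.
planner_optimize_op = tf.train.AdamOptimizer(1e-3).minimize(step1_cost, var_list=[planner_weights])
reward_optimize_op = tf.train.AdamOptimizer(1e-2).minimize(step2_cost, var_list=[reward])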
Example #2
    def train_reward(self, sess, image_data, reward_data, y_data, num_epochs, logs):
        """Infers the reward using backprop, holding the planner fixed.

        Due to Tensorflow constraints, image_data must contain exactly
        batch_size number of MDPs on which the reward should be inferred.

        The rewards are initialized to the values in reward_data. If reward_data
        is None, the rewards are initialized to all zeroes.
        """
        if self.config.verbosity >= 3:
            print(fmt_row(10, ["Iteration", "Train Cost", "Train Err", "Iter Time"]))
        if reward_data is None:
            reward_data = np.random.randn(*image_data.shape)

        batch_size = self.config.batchsize
        num_batches = int(image_data.shape[0] / batch_size)
        costs, errs = [], []
        for batch_num in range(num_batches):
            if self.config.verbosity >= 2 and batch_num % 10 == 0:
                print('Batch {} of {}'.format(batch_num, num_batches))
            start, end = batch_num * batch_size, (batch_num + 1) * batch_size
            # We can't feed in reward_data directly to self.reward, because then
            # it will treat it as a constant and will not be able to update it
            # with backprop. Instead, we first run an op that assigns the
            # reward, and only then do the backprop.
            fd = {
                "reward_input:0": reward_data[start:end],
            }
            sess.run([self.assign_reward.op], feed_dict=fd)

            if batch_num % 10 == 0:
                costs.append([])
                errs.append([])
            for epoch in range(num_epochs):
                tstart = time.time()
                fd = {
                    "image:0": image_data[start:end],
                    "y:0": y_data[start:end]
                }
                _, e_, c_ = sess.run(
                    [self.reward_optimize_op, self.err, self.step2_cost],
                    feed_dict=fd)
                elapsed = time.time() - tstart
                if batch_num % 10 == 0:
                    if self.config.verbosity >= 3:
                        print(fmt_row(10, [epoch, c_, e_, elapsed]))
                    costs[-1].append(c_)
                    errs[-1].append(e_)

            reward_data[start:end] = self.reward.eval()

        logs['train_reward_costs'].append(costs)
        logs['train_reward_errs'].append(errs)
        return reward_data
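
Continuing the sketch after Example #1 (same assumed names, not the repository's actual code), the assign-then-backprop pattern described in the comment looks like this at the session level:

import numpy as np
import tensorflow as tf

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_rewards = np.random.randn(batch_size, imsize, imsize).astype(np.float32)
    # Load this batch's rewards into the variable; feeding them directly would
    # bypass the variable, so gradients could not update them.
    sess.run(assign_reward.op, feed_dict={reward_input: batch_rewards})
    # Take a gradient step on the reward variable while the planner stays fixed.
    sess.run(reward_optimize_op)
    # Read the updated rewards back out, as train_reward does with self.reward.eval().
    updated_rewards = sess.run(reward)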
Example #3
TrA = []
TeA = []
# Launch the graph
with tf.Session() as sess:
    if config.log:
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(config.logdir, sess.graph)
    sess.run(init)
    if Have_trained:
        model_file = tf.train.latest_checkpoint('ckpt/')
        saver.restore(sess, model_file)

    batch_size = config.batchsize
    print(fmt_row(10, ["Epoch", "Train Cost",
                       "Train Acc", "Epoch Time", "Test Acc"]))
    for epoch in range(int(config.epochs)):
        learning_rate = learning_rate * 0.95
        tstart = time.time()
        avg_err, avg_cost = 0.0, 0.0
        avg_err_L = []
        avg_cost_L = []
        num_batches = int(Xtrain.shape[0] / batch_size)
        # Loop over all batches
        for i in range(0, Xtrain.shape[0], batch_size):
            j = i + batch_size
            if j <= Xtrain.shape[0]:
                # Run optimization op (backprop) and cost op (to get loss value)
                fd = {X: Xtrain[i:j], S1: S1train[i:j], S2: S2train[i:j],
                      y: ytrain[i * config.statebatchsize:j * config.statebatchsize],
                      LR: learning_rate, keep_drop: 0.5}
                _, e_, c_ = sess.run([optimizer, err, cost], feed_dict=fd)
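
The snippet above anneals the learning rate by 5% per epoch and feeds the current value through a placeholder, so the optimizer graph never has to be rebuilt. A minimal self-contained sketch of that pattern (LR, w, and loss are illustrative names, not taken from the snippet, and the optimizer choice is an assumption):

import tensorflow as tf

LR = tf.placeholder(tf.float32, [], name="learning_rate")
w = tf.Variable(5.0)
loss = tf.square(w)
# Building the optimizer once with a placeholder rate lets each epoch feed a smaller value.
train_op = tf.train.RMSPropOptimizer(learning_rate=LR, epsilon=1e-6).minimize(loss)

learning_rate = 0.001
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(30):
        learning_rate = learning_rate * 0.95   # same multiplicative decay as above
        sess.run(train_op, feed_dict={LR: learning_rate})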
Example #4
saver = tf.train.Saver()

Xtrain, S1train, S2train, ytrain, Xtest, S1test, S2test, ytest = process_gridworld_data(
    input=config.input, imsize=config.imsize)

# Launch the graph
with tf.Session() as sess:
    if config.log:
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(config.logdir, sess.graph)
    sess.run(init)

    batch_size = config.batchsize
    print(fmt_row(10, ["Epoch", "Train Cost", "Train Err", "Epoch Time"]))
    for epoch in range(int(config.epochs)):
        tstart = time.time()
        avg_err, avg_cost = 0.0, 0.0
        num_batches = int(Xtrain.shape[0] / batch_size)
        # Loop over all batches
        for i in range(0, Xtrain.shape[0], batch_size):
            j = i + batch_size
            if j <= Xtrain.shape[0]:
                # Run optimization op (backprop) and cost op (to get loss value)
                fd = {
                    X: Xtrain[i:j],
                    S1: S1train[i:j],
                    S2: S2train[i:j],
                    y: ytrain[i * config.statebatchsize:j * config.statebatchsize]
                }
                _, e_, c_ = sess.run([optimizer, err, cost], feed_dict=fd)
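
The y labels are sliced with scaled indices because each gridworld in a batch contributes config.statebatchsize labelled states. A small numeric illustration with hypothetical values:

batch_size = 12
statebatchsize = 10
i, j = 24, 36                      # this batch covers gridworlds 24..35
label_start = i * statebatchsize   # 240
label_end = j * statebatchsize     # 360, i.e. 12 gridworlds * 10 states each
print(label_end - label_start)     # 120 labels for this batch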
Example #5
    def train_planner(self, sess, train_data, validation_data, num_epochs, logs):
        """Trains the planner module given MDPs with reward functions and the
        corresponding policies.

        Validation can be turned off by explicitly passing None.
        """
        if self.config.verbosity >= 3:
            print(fmt_row(10, ["Epoch", "Train Cost", "Train Err", "Valid Err", "Epoch Time"]))

        avg_costs, train_errs, validation_errs, times = [], [], [], []
        for epoch in range(int(num_epochs)):
            _, (avg_cost, avg_err), elapsed, epoch_dist = self.run_epoch(
                sess, train_data, [self.planner_optimize_op],
                [self.step1_cost, self.err], [self.pred_dist, self.y_dist])

            # Record (and optionally display) logs every display_step epochs
            if epoch % self.config.display_step == 0:
                if validation_data is not None:
                    _, (validation_err,), _, _ = self.run_epoch(sess, validation_data, [], [self.err])
                else:
                    validation_err = 'N/A'
                avg_costs.append(avg_cost)
                train_errs.append(avg_err)
                validation_errs.append(validation_err)
                times.append(elapsed)
                if self.config.verbosity >= 3:
                    print(fmt_row(10, [epoch, avg_cost, avg_err, validation_err, elapsed]))
                elif self.config.verbosity >= 2:
                    print('Epoch {} of {}'.format(epoch, num_epochs))

            # if self.config.log:
            #     summary = tf.Summary()
            #     summary.ParseFromString(sess.run(summary_op))
            #     summary.value.add(tag='Average error', simple_value=float(avg_err))
            #     summary.value.add(tag='Average cost', simple_value=float(avg_cost))
            #     summary_writer.add_summary(summary, epoch)
        logs['train_planner_costs'].append(avg_costs)
        logs['train_planner_train_errs'].append(train_errs)
        logs['train_planner_validation_errs'].append(validation_errs)
        logs['train_planner_times'].append(times)

        # TODO(rohinmshah): This seems redundant
        num_actions = self.config.num_actions
        action_dists = [np.zeros(num_actions), np.zeros(num_actions)]
        action_dists = [d + b_d for d, b_d in zip(action_dists, epoch_dist)]
        action_dists = [d / (np.sum(d)) for d in action_dists]
        pred = action_dists[0].tolist()
        actual = action_dists[1].tolist()
        logs['train_planner_predicted_action_dists'].append(pred)
        logs['train_planner_actual_action_dists'].append(actual)
        if self.config.verbosity >= 3:
            print("Action Distribution Comparison")
            print("------------------------------")
            print(fmt_row(10, ["Predicted"] + pred))
            print(fmt_row(10, ["Actual"]+ actual))

        if validation_data is not None:
            _, (err,), _, _ = self.run_epoch(sess, validation_data, [], [self.err])
            logs['accuracy'].append(100 * (1 - err))
            if self.config.verbosity >= 1:
                print('Validation Accuracy: ' + str(100 * (1 - err)))

        # Save a checkpoint of the trained model
        if self.config.savemodel:
            saver = tf.train.Saver()
            # This allows a later run to restore the model and perform reward inference
            saver.save(sess, "model_save_sess_0/")
init = tf.global_variables_initializer()
saver = tf.train.Saver()

Xtrain, S1train, S2train, ytrain, Xtest, S1test, S2test, ytest = process_gridworld_data(input=config.input, imsize=config.imsize)

# Launch the graph
with tf.Session() as sess:
    if config.log:
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(config.logdir, sess.graph)
    sess.run(init)

    batch_size = config.batchsize
    print(fmt_row(10, ["Epoch", "Train Cost", "Train Err", "Epoch Time"]))
    for epoch in range(int(config.epochs)):
        tstart = time.time()
        avg_err, avg_cost = 0.0, 0.0
        num_batches = int(Xtrain.shape[0] / batch_size)
        # Loop over all batches
        for i in range(0, Xtrain.shape[0], batch_size):
            j = i + batch_size
            if j <= Xtrain.shape[0]:
                # Run optimization op (backprop) and cost op (to get loss value)
                fd = {X: Xtrain[i:j], S1: S1train[i:j], S2: S2train[i:j],
                      y: ytrain[i * config.statebatchsize:j * config.statebatchsize]}
                _, e_, c_ = sess.run([optimizer, err, cost], feed_dict=fd)
                avg_err += e_
                avg_cost += c_
        # Display logs per epoch step
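
The listing is cut off here. A typical continuation of this kind of loop (an assumption, not the original file) averages the accumulated sums over the batches and prints one row per epoch:

        elapsed = time.time() - tstart
        print(fmt_row(10, [epoch, avg_cost / num_batches, avg_err / num_batches, elapsed]))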