Example #1
    def train(self):
        gc_dump_time = time.time()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # This seems like a rather sequential method
            pool = SimpleReplayPool(
                max_pool_size=self.replay_pool_size,
                observation_dim=self.env.observation_space.flat_dim,
                action_dim=self.env.action_space.flat_dim,
                replacement_prob=self.replacement_prob,
            )
            self.start_worker()

            self.init_opt()
            # This initializes the optimizer parameters
            sess.run(tf.global_variables_initializer())
            itr = 0
            path_length = 0
            path_return = 0
            terminal = False
            initial = False
            observation = self.env.reset()

            #with tf.variable_scope("sample_policy"):
            #with suppress_params_loading():
            #sample_policy = pickle.loads(pickle.dumps(self.policy))
            with tf.variable_scope("sample_policy"):
                sample_policy = Serializable.clone(self.policy)

            for epoch in range(self.n_epochs):
                logger.push_prefix('epoch #%d | ' % epoch)
                logger.log("Training started")
                train_qf_itr, train_policy_itr = 0, 0
                for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
                    # Execute policy
                    if terminal:  # or path_length > self.max_path_length:
                        # Note that if the last time step ends an episode, the very
                        # last state and observation will be ignored and not added
                        # to the replay pool
                        observation = self.env.reset()
                        self.es.reset()
                        sample_policy.reset()
                        self.es_path_returns.append(path_return)
                        path_length = 0
                        path_return = 0
                        initial = True
                    else:
                        initial = False

                    # Draw a set of candidate actions from the dropout policy.
                    actions = []
                    for i in range(100):
                        action, _ = sample_policy.get_action_with_dropout(
                            observation)
                        actions.append(action)

                    tiled_observations = [observation] * len(actions)

                    # Evaluate every candidate under several dropout masks of
                    # the Q-network.
                    all_qvals = []
                    for i in range(100):
                        q_vals = self.qf.get_qval_dropout(
                            np.vstack(tiled_observations), np.vstack(actions))
                        all_qvals.append(q_vals)

                    # Flat argmax over the stacked Q-value samples, modulo the
                    # number of candidates, picks the action with the highest
                    # sampled Q-value; execute and store that action.
                    action_max = np.argmax(np.vstack(all_qvals)) % len(actions)
                    action = actions[action_max]

                    next_observation, reward, terminal, _ = self.env.step(
                        action)
                    path_length += 1
                    path_return += reward

                    if not terminal and path_length >= self.max_path_length:
                        terminal = True
                        # only include the terminal transition in this case if the flag was set
                        if self.include_horizon_terminal_transitions:
                            pool.add_sample(observation, action,
                                            reward * self.scale_reward,
                                            terminal, initial)
                    else:
                        pool.add_sample(observation, action,
                                        reward * self.scale_reward, terminal,
                                        initial)

                    observation = next_observation

                    if pool.size >= self.min_pool_size:
                        for update_itr in range(self.n_updates_per_sample):
                            # Train policy
                            batch = pool.random_batch(self.batch_size)
                            itrs = self.do_training(itr, batch)
                            train_qf_itr += itrs[0]
                            train_policy_itr += itrs[1]
                        sample_policy.set_param_values(
                            self.policy.get_param_values())

                    itr += 1
                    if time.time() - gc_dump_time > 100:
                        gc.collect()
                        gc_dump_time = time.time()

                logger.log("Training finished")
                logger.log("Trained qf %d steps, policy %d steps" %
                           (train_qf_itr, train_policy_itr))
                if pool.size >= self.min_pool_size:
                    self.evaluate(epoch, pool)
                    params = self.get_epoch_snapshot(epoch)
                    logger.save_itr_params(epoch, params)
                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
            self.env.terminate()
            self.policy.terminate()
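The loop above selects each action by drawing many dropout samples from both the policy and the Q-function and executing the candidate with the highest sampled Q-value. A minimal NumPy sketch of that selection step, with sample_action and sample_qvals as hypothetical stand-ins for get_action_with_dropout and get_qval_dropout:

import numpy as np

def select_action_by_dropout_qmax(observation, sample_action, sample_qvals,
                                  n_action_samples=100, n_q_samples=100):
    # sample_action(obs) and sample_qvals(obs_batch, act_batch) are
    # hypothetical stand-ins for the dropout calls used in the example above.
    actions = [sample_action(observation) for _ in range(n_action_samples)]
    obs_batch = np.vstack([observation] * len(actions))
    act_batch = np.vstack(actions)
    # Each row holds the Q-values of all candidates under one dropout mask.
    all_qvals = np.vstack([sample_qvals(obs_batch, act_batch)
                           for _ in range(n_q_samples)])
    # Flat argmax modulo the number of candidates recovers the column
    # (action) index of the single highest sampled Q-value.
    return actions[np.argmax(all_qvals) % len(actions)]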
Example #2
    def train(self, e, environment_name, penalty):
        with tf.Session() as sess:

            self.initialize_uninitialized(sess)

            # This seems like a rather sequential method
            pool = SimpleReplayPool(
                max_pool_size=self.replay_pool_size,
                observation_dim=self.env.observation_space.flat_dim,
                action_dim=self.env.action_space.flat_dim,
                replacement_prob=self.replacement_prob,
            )

            binary_pool = SimpleReplayPool(
                max_pool_size=self.replay_pool_size,
                observation_dim=self.env.observation_space.flat_dim,
                action_dim=2,
                replacement_prob=self.replacement_prob,
            )

            self.start_worker()
            self.init_opt()

            num_experiment = e

            self.initialize_uninitialized(sess)
            itr = 0
            path_length = 0
            path_return = 0
            terminal = False
            initial = False

            ### assigning query cost here
            query_cost = 0.9

            observation = self.env.reset()

            with tf.variable_scope("sample_policy"):
                sample_policy = Serializable.clone(self.policy)

            with tf.variable_scope("sample_target_gate_qf"):
                target_gate_qf = Serializable.clone(self.gate_qf)

            oracle_policy = self.oracle_policy

            oracle_interaction = 0
            agent_interaction = 0
            agent_interaction_per_episode = np.zeros(shape=(self.n_epochs))
            oracle_interaction_per_episode = np.zeros(shape=(self.n_epochs))

            for epoch in range(self.n_epochs):
                logger.push_prefix('epoch #%d | ' % epoch)
                logger.log("Training started")
                train_qf_itr, train_policy_itr = 0, 0

                for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
                    # Execute policy
                    if terminal:  # or path_length > self.max_path_length:
                        # Note that if the last time step ends an episode, the very
                        # last state and observation will be ignored and not added
                        # to the replay pool
                        observation = self.env.reset()
                        self.agent_strategy.reset()
                        sample_policy.reset()
                        self.es_path_returns.append(path_return)
                        path_length = 0
                        path_return = 0
                        initial = True
                    else:
                        initial = False

                    ## softmax binary output here from Beta(s)
                    agent_action, binary_action = self.agent_strategy.get_action_with_binary(
                        itr, observation, policy=sample_policy)  # qf=qf)

                    # Round the soft gate into a hard 0/1 choice: sigma[0]
                    # selects the agent action, sigma[1] the oracle action.
                    sigma = np.round(binary_action)
                    oracle_action = self.get_oracle_action(
                        itr, observation, policy=oracle_policy)

                    action = sigma[0] * agent_action + sigma[1] * oracle_action

                    next_observation, reward, terminal, _ = self.env.step(
                        action)

                    ## sigma[1] for oracle interaction
                    if sigma[1] == 1.0:
                        oracle_interaction += 1
                        if penalty:
                            reward = reward - query_cost

                    ## for no oracle interaction
                    elif sigma[0] == 1.0:
                        agent_interaction += 1

                    path_length += 1
                    path_return += reward
                    """
                    CHECK THIS - To do here
                    Discrete binary actions to be added to the replay buffer
                    Not the binary action probabilities
                    """
                    binary_action = sigma

                    if not terminal and path_length >= self.max_path_length:
                        terminal = True
                        if self.include_horizon_terminal_transitions:
                            pool.add_sample(observation, action,
                                            reward * self.scale_reward,
                                            terminal, initial)
                            binary_pool.add_sample(observation, binary_action,
                                                   reward * self.scale_reward,
                                                   terminal, initial)

                    else:
                        pool.add_sample(observation, action,
                                        reward * self.scale_reward, terminal,
                                        initial)
                        binary_pool.add_sample(observation, binary_action,
                                               reward * self.scale_reward,
                                               terminal, initial)

                    observation = next_observation

                    if pool.size >= self.min_pool_size:
                        for update_itr in range(self.n_updates_per_sample):
                            # Train policy
                            # batches from pool containing continuous actions and discrete actions
                            batch = pool.random_batch(self.batch_size)
                            binary_batch = binary_pool.random_batch(
                                self.batch_size)

                            itrs = self.do_training(itr, batch, binary_batch)
                            train_qf_itr += itrs[0]
                            train_policy_itr += itrs[1]
                        sample_policy.set_param_values(
                            self.policy.get_param_values())

                    itr += 1

                agent_interaction_per_episode[epoch] = agent_interaction
                oracle_interaction_per_episode[epoch] = oracle_interaction
                np.save(
                    '/Users/Riashat/Documents/PhD_Research/RLLAB/rllab/learning_active_learning/learning_ask_help/DDPG/Oracle_Interactions/oracle_interactons_'
                    + str(environment_name) + '_' + 'exp_' +
                    str(num_experiment) + '.npy',
                    oracle_interaction_per_episode)
                np.save(
                    '/Users/Riashat/Documents/PhD_Research/RLLAB/rllab/learning_active_learning/learning_ask_help/DDPG/Oracle_Interactions/agent_interactions_'
                    + str(environment_name) + '_' + 'exp_' +
                    str(num_experiment) + '.npy',
                    agent_interaction_per_episode)
                # np.save('/home/ml/rislam4/Documents/RLLAB/rllab/Active_Imitation_Learning/Imitation_Learning_RL/learning_ask_help/DDPG/Oracle_Interactions/oracle_interactons_'  + str(environment_name) +  '_' + 'exp_' + str(num_experiment) + '.npy', oracle_interaction_per_episode)
                # np.save('/home/ml/rislam4/Documents/RLLAB/rllab/Active_Imitation_Learning/Imitation_Learning_RL/learning_ask_help/DDPG/Oracle_Interactions/agent_interactions_'  + str(environment_name) +  '_' + 'exp_' + str(num_experiment) + '.npy', agent_interaction_per_episode)

                logger.record_tabular('Oracle Interactions',
                                      oracle_interaction)
                logger.record_tabular('Agent Interactions', agent_interaction)

                logger.log("Training finished")
                logger.log("Trained qf %d steps, policy %d steps" %
                           (train_qf_itr, train_policy_itr))
                # logger.log("Pool sizes agent (%d) oracle (%d)" %(agent_only_pool.size, oracle_only_pool.size))

                if pool.size >= self.min_pool_size:
                    self.evaluate(epoch, pool)
                    params = self.get_epoch_snapshot(epoch)
                    logger.save_itr_params(epoch, params)
                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
            self.env.terminate()
            self.policy.terminate()
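Example #2 gates between the agent and an oracle: a two-dimensional soft output Beta(s) is rounded into a hard choice, the chosen action is executed, and each oracle query is optionally penalized with a fixed cost. A minimal sketch of that gating step, assuming the same (agent_action, oracle_action, binary_action) triple as above:

import numpy as np

def gated_step(env, agent_action, oracle_action, binary_action,
               query_cost=0.9, penalty=True):
    # Round the soft gate into a hard 0/1 choice between acting
    # autonomously (sigma[0]) and querying the oracle (sigma[1]).
    sigma = np.round(binary_action)
    action = sigma[0] * agent_action + sigma[1] * oracle_action
    next_observation, reward, terminal, _ = env.step(action)
    queried = sigma[1] == 1.0
    if queried and penalty:
        # Charge a fixed cost for every oracle query.
        reward -= query_cost
    return next_observation, reward, terminal, action, queried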
Example #3
    def train(self):
        with tf.Session() as sess:
            # sess.run(tf.global_variables_initializer())
            # only initialise the uninitialised ones
            self.initialize_uninitialized(sess)

            pool = SimpleReplayPool(
                max_pool_size=self.replay_pool_size,
                observation_dim=self.env.observation_space.flat_dim,
                action_dim=self.env.action_space.flat_dim,
                replacement_prob=self.replacement_prob,
            )

            self.start_worker()
            self.init_opt()

            # This initializes the optimizer parameters
            self.initialize_uninitialized(sess)
            itr = 0
            path_length = 0
            path_return = 0
            terminal = False
            initial = False
            observation = self.env.reset()

            with tf.variable_scope("sample_policy"):
                sample_policy = Serializable.clone(self.policy)

            oracle_policy = self.oracle_policy

            for epoch in range(self.n_epochs):
                logger.push_prefix('epoch #%d | ' % epoch)
                logger.log("Training started")
                train_qf_itr, train_policy_itr = 0, 0

                for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
                    # Execute policy
                    if terminal:  # or path_length > self.max_path_length:
                        # Note that if the last time step ends an episode, the very
                        # last state and observation will be ignored and not added
                        # to the replay pool
                        observation = self.env.reset()
                        self.agent_strategy.reset()
                        sample_policy.reset()
                        self.es_path_returns.append(path_return)
                        path_length = 0
                        path_return = 0
                        initial = True
                    else:
                        initial = False

                    ### both continuous actions
                    ### binary_action is continuous here
                    ### it will be approximated as a discrete action with the regularizers
                    ### taken from conditional computation (Bengio)
                    agent_action, binary_action = self.agent_strategy.get_action_with_binary(
                        itr, observation, policy=sample_policy)  # qf=qf)
                    sigma = np.round(binary_action)
                    oracle_action = self.get_oracle_action(
                        itr, observation, policy=oracle_policy)

                    action = sigma[0] * agent_action + sigma[1] * oracle_action
                    next_observation, reward, terminal, _ = self.env.step(
                        action)
                    path_length += 1
                    path_return += reward

                    ### including both the agent and oracle samples in the same replay buffer
                    if not terminal and path_length >= self.max_path_length:
                        terminal = True
                        # only include the terminal transition in this case if the flag was set
                        if self.include_horizon_terminal_transitions:
                            pool.add_sample(observation, action,
                                            reward * self.scale_reward,
                                            terminal, initial)

                    else:
                        pool.add_sample(observation, action,
                                        reward * self.scale_reward, terminal,
                                        initial)

                    observation = next_observation

                    if pool.size >= self.min_pool_size:

                        for update_itr in range(self.n_updates_per_sample):
                            # Train policy
                            batch = pool.random_batch(self.batch_size)
                            itrs = self.do_training(itr, batch)
                            train_qf_itr += itrs[0]
                            train_policy_itr += itrs[1]
                        sample_policy.set_param_values(
                            self.policy.get_param_values())

                    itr += 1

                logger.log("Training finished")
                logger.log("Trained qf %d steps, policy %d steps" %
                           (train_qf_itr, train_policy_itr))
                # logger.log("Pool sizes agent (%d) oracle (%d)" %(agent_only_pool.size, oracle_only_pool.size))

                if pool.size >= self.min_pool_size:
                    self.evaluate(epoch, pool)
                    params = self.get_epoch_snapshot(epoch)
                    logger.save_itr_params(epoch, params)
                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
            self.env.terminate()
            self.policy.terminate()
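Examples #2 and #3 call self.initialize_uninitialized(sess) instead of running tf.global_variables_initializer() a second time, so that previously restored variables (for example the oracle policy) are not overwritten. The helper itself is not shown; a minimal sketch of what it presumably does, using the standard TF1 idiom:

import tensorflow as tf

def initialize_uninitialized(sess):
    # Initialize only the variables the session has not initialized yet.
    global_vars = tf.global_variables()
    is_initialized = sess.run(
        [tf.is_variable_initialized(v) for v in global_vars])
    not_initialized = [v for v, ok in zip(global_vars, is_initialized) if not ok]
    if not_initialized:
        sess.run(tf.variables_initializer(not_initialized))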
Example #4
    def train(self):
        gc_dump_time = time.time()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # This seems like a rather sequential method
            pool = SimpleReplayPool(
                max_pool_size=self.replay_pool_size,
                observation_dim=self.env.observation_space.flat_dim,
                action_dim=self.env.action_space.flat_dim,
                replacement_prob=self.replacement_prob,
            )
            self.start_worker()

            self.init_opt()
            # This initializes the optimizer parameters
            sess.run(tf.global_variables_initializer())
            itr = 0
            path_length = 0
            path_return = 0
            terminal = False
            initial = False
            observation = self.env.reset()

            #with tf.variable_scope("sample_policy"):
            #with suppress_params_loading():
            #sample_policy = pickle.loads(pickle.dumps(self.policy))
            with tf.variable_scope("sample_policy"):
                sample_policy = Serializable.clone(self.policy)

            updates_until_next_sampling = 0
            for epoch in range(self.n_epochs):
                logger.push_prefix('epoch #%d | ' % epoch)
                logger.log("Training started")
                train_qf_itr, train_policy_itr = 0, 0
                for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
                    if updates_until_next_sampling <= 0:
                        samples = self.es.generate_samples(
                            self.env, sample_policy, self.epoch_length + 1,
                            self.max_path_length)
                        updates_until_next_sampling = len(samples)
                    sample = samples[-updates_until_next_sampling]
                    updates_until_next_sampling -= 1
                    observation, action, reward, terminal, initial, path_length = sample
                    if not terminal and path_length >= self.max_path_length:
                        terminal = True
                        # only include the terminal transition in this case if the flag was set
                        if self.include_horizon_terminal_transitions:
                            pool.add_sample(observation, action,
                                            reward * self.scale_reward,
                                            terminal, initial)
                    else:
                        pool.add_sample(observation, action,
                                        reward * self.scale_reward, terminal,
                                        initial)

                    if pool.size >= self.min_pool_size:
                        for update_itr in range(self.n_updates_per_sample):
                            # Train policy
                            batch = pool.random_batch(self.batch_size)
                            itrs = self.do_training(itr, batch)
                            train_qf_itr += itrs[0]
                            train_policy_itr += itrs[1]
                        sample_policy.set_param_values(
                            self.policy.get_param_values())

                    itr += 1
                    if time.time() - gc_dump_time > 100:
                        gc.collect()
                        gc_dump_time = time.time()

                logger.log("Training finished")
                logger.log("Trained qf %d steps, policy %d steps" %
                           (train_qf_itr, train_policy_itr))
                if pool.size >= self.min_pool_size:
                    self.evaluate(epoch, pool)
                    params = self.get_epoch_snapshot(epoch)
                    logger.save_itr_params(epoch, params)
                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
            self.env.terminate()
            self.policy.terminate()
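Example #4 replaces the per-step environment interaction with a pre-generated list of transitions from self.es.generate_samples(...), consumed one per training iteration. The generator is not shown; a hypothetical sketch of a rollout helper that returns tuples in the (observation, action, reward, terminal, initial, path_length) order unpacked above:

def generate_samples(env, policy, n_samples, max_path_length):
    # Roll the sampling policy in the environment and collect transitions
    # in the tuple layout expected by the training loop in the example.
    samples = []
    observation = env.reset()
    path_length = 0
    initial = True
    while len(samples) < n_samples:
        action, _ = policy.get_action(observation)
        next_observation, reward, terminal, _ = env.step(action)
        path_length += 1
        samples.append((observation, action, reward, terminal, initial,
                        path_length))
        if terminal or path_length >= max_path_length:
            observation = env.reset()
            policy.reset()
            path_length = 0
            initial = True
        else:
            observation = next_observation
            initial = False
    return samples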
Example #5
    def train(self):
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # This seems like a rather sequential method
            pool = SimpleReplayPool(
                max_pool_size=self.replay_pool_size,
                observation_dim=self.env.observation_space.flat_dim,
                action_dim=self.env.action_space.flat_dim,
                replacement_prob=self.replacement_prob,
            )
            self.start_worker()

            self.init_opt()
            # This initializes the optimizer parameters
            sess.run(tf.global_variables_initializer())
            itr = 0
            path_length = 0
            path_return = 0
            terminal = False
            initial = False
            observation = self.env.reset()

            with tf.variable_scope("sample_policy"):
                sample_policy = Serializable.clone(self.policy)

            for epoch in range(self.n_epochs):
                logger.push_prefix('epoch #%d | ' % epoch)
                logger.log("Training started")
                train_qf_itr, train_policy_itr = 0, 0

                #sample a policy function from the posterior at every episode
                #move in the entire episode with the sampled policy function?

                for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):
                    # Execute policy
                    if terminal:  # or path_length > self.max_path_length:
                        # Note that if the last time step ends an episode, the very
                        # last state and observation will be ignored and not added
                        # to the replay pool
                        observation = self.env.reset()
                        self.es.reset()
                        sample_policy.reset()
                        self.es_path_returns.append(path_return)
                        path_length = 0
                        path_return = 0
                        initial = True
                    else:
                        initial = False

                    action = self.es.get_action(itr,
                                                observation,
                                                policy=sample_policy)  # qf=qf)

                    next_observation, reward, terminal, _ = self.env.step(
                        action)
                    path_length += 1
                    path_return += reward

                    if not terminal and path_length >= self.max_path_length:
                        terminal = True
                        # only include the terminal transition in this case if the flag was set
                        if self.include_horizon_terminal_transitions:
                            pool.add_sample(observation, action,
                                            reward * self.scale_reward,
                                            terminal, initial)
                    else:
                        pool.add_sample(observation, action,
                                        reward * self.scale_reward, terminal,
                                        initial)

                    observation = next_observation

                    if pool.size >= self.min_pool_size:
                        for update_itr in range(self.n_updates_per_sample):
                            # Train policy
                            batch = pool.random_batch(self.batch_size)
                            itrs = self.do_training(itr, epoch, batch)
                            train_qf_itr += itrs[0]
                            train_policy_itr += itrs[1]
                        sample_policy.set_param_values(
                            self.policy.get_param_values())

                    itr += 1

                logger.log("Training finished")
                logger.log("Trained qf %d steps, policy %d steps" %
                           (train_qf_itr, train_policy_itr))
                if pool.size >= self.min_pool_size:
                    self.evaluate(epoch, pool)
                    params = self.get_epoch_snapshot(epoch)
                    logger.save_itr_params(epoch, params)
                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
            self.env.terminate()
            self.policy.terminate()
    def lp_exploration(self):
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # This seems like a rather sequential method
            pool = SimpleReplayPool(
                max_pool_size=self.replay_pool_size,
                observation_dim=self.env.observation_space.flat_dim,
                action_dim=self.env.action_space.flat_dim,
                replacement_prob=self.replacement_prob,
            )
            self.start_worker()

            with tf.variable_scope("sample_policy", reuse=True):
                sample_policy = Serializable.clone(self.policy)

            self.init_opt()

            # This initializes the optimizer parameters
            sess.run(tf.global_variables_initializer())

            itr = 0
            path_length = 0
            path_return = 0
            terminal = False
            initial = False
            observation = self.env.reset()

            self.initial_action = self.env.action_space.sample()

            chain_actions = np.array([self.initial_action])
            chain_states = np.array([observation])

            action_trajectory_chain = 0
            state_trajectory_chain = 0

            end_traj_action = 0
            end_traj_state = 0

            H_vector = np.random.uniform(
                low=self.env.action_space.low,
                high=self.env.action_space.high,
                size=(self.env.action_space.shape[0], ))
            H = (self.b_step_size / LA.norm(H_vector)) * H_vector

            all_H = np.array([H])
            all_theta = np.array([])

            last_action_chosen = self.initial_action

            for epoch in range(self.max_exploratory_steps):

                print("LP Exploration Episode", epoch)
                print("Replay Buffer Sample Size", pool.size)

                # logger.push_prefix('epoch #%d | ' % epoch)
                # logger.log("Training started")
                train_qf_itr, train_policy_itr = 0, 0

                if epoch == 0:
                    next_action = last_action_chosen + H
                else:
                    next_action = last_action_chosen

                for epoch_itr in pyprind.prog_bar(range(self.epoch_length)):

                    if self.env.action_space.shape[0] == 6:
                        one_vector = self.one_vector_6D()

                    elif self.env.action_space.shape[0] == 21:
                        one_vector = self.one_vector_21D()

                    elif self.env.action_space.shape[0] == 3:
                        one_vector = self.one_vector_3D()

                    elif self.env.action_space.shape[0] == 10:
                        one_vector = self.one_vector_10D()

                    theta_mean = np.arccos(
                        np.exp(np.true_divide(-self.b_step_size,
                                              self.L_p))) * one_vector
                    sigma_iden = self.sigma**2 * np.identity(
                        self.env.action_space.shape[0] - 1)

                    eta = np.random.multivariate_normal(theta_mean, sigma_iden)
                    eta = np.concatenate((np.array([0]), eta), axis=0)
                    """
                    Map H_t to Spherical coordinate
                    """
                    if self.env.action_space.shape[0] == 3:
                        H_conversion = self.cart2pol_3D(H)
                    elif self.env.action_space.shape[0] == 6:
                        H_conversion = self.cart2pol_6D(H)
                    elif self.env.action_space.shape[0] == 10:
                        H_conversion = self.cart2pol_10D(H)
                    elif self.env.action_space.shape[0] == 21:
                        H_conversion = self.cart2pol_21D(H)

                    H = H_conversion + eta
                    """
                    Map H_t to Cartesian coordinate
                    """
                    if self.env.action_space.shape[0] == 3:
                        H_conversion = self.pol2cart_3D(H)
                    elif self.env.action_space.shape[0] == 6:
                        H_conversion = self.pol2cart_6D(H)
                    elif self.env.action_space.shape[0] == 10:
                        H_conversion = self.pol2cart_10D(H)
                    elif self.env.action_space.shape[0] == 21:
                        # map back to Cartesian coordinates
                        H_conversion = self.pol2cart_21D(H)

                    H = H_conversion

                    phi_t = next_action
                    phi_t_1 = phi_t + H

                    chosen_action = np.array([phi_t_1])
                    chain_actions = np.append(chain_actions,
                                              chosen_action,
                                              axis=0)

                    chosen_action = chosen_action[0, :]

                    # Execute policy
                    if terminal:  # or path_length > self.max_path_length:
                        # Note that if the last time step ends an episode, the very
                        # last state and observation will be ignored and not added
                        # to the replay pool
                        observation = self.env.reset()
                        # self.es.reset()
                        sample_policy.reset()
                        self.es_path_returns.append(path_return)
                        path_length = 0
                        path_return = 0
                        initial = True
                    else:
                        initial = False

                    chosen_state, reward, terminal, _ = self.env.step(
                        chosen_action)

                    chain_states = np.append(chain_states,
                                             np.array([chosen_state]),
                                             axis=0)

                    action = chosen_action
                    state = chosen_state
                    end_traj_state = chosen_state
                    end_traj_action = chosen_action

                    #updates to be used in next iteration
                    H = phi_t_1 - phi_t
                    all_H = np.append(all_H, np.array([H]), axis=0)
                    next_action = phi_t_1

                    path_length += 1
                    path_return += reward

                    if not terminal and path_length >= self.max_path_length:
                        terminal = True

                        #originally, it was only line above
                        #added these below
                        terminal_state = chosen_state
                        last_action_chosen = self.env.action_space.sample()
                        H_vector = np.random.uniform(
                            low=self.env.action_space.low,
                            high=self.env.action_space.high,
                            size=(self.env.action_space.shape[0], ))
                        H = (self.b_step_size / LA.norm(H_vector)) * H_vector
                        next_action = last_action_chosen + H

                        # path_length = 0
                        # path_return = 0
                        # state = self.env.reset()
                        # sample_policy.reset()

                        # only include the terminal transition in this case if the flag was set
                        if self.include_horizon_terminal_transitions:
                            pool.add_sample(observation, action,
                                            reward * self.scale_reward,
                                            terminal, initial)
                    else:
                        pool.add_sample(observation, action,
                                        reward * self.scale_reward, terminal,
                                        initial)

                    observation = state

                    if pool.size >= self.min_pool_size:
                        for update_itr in range(self.n_updates_per_sample):
                            # Train policy
                            batch = pool.random_batch(self.batch_size)
                            itrs = self.do_training(itr, batch)

                            train_qf_itr += itrs[0]
                            train_policy_itr += itrs[1]
                        sample_policy.set_param_values(
                            self.policy.get_param_values())

                    itr += 1

                last_action_chosen = action
                #last_action_chosen = last_action_chosen[0, :]

            action_trajectory_chain = chain_actions
            state_trajectory_chain = chain_states
            end_trajectory_action = end_traj_action
            end_trajectory_state = end_traj_state

            self.env.terminate()
            self.policy.terminate()

            return self.qf, self.policy, action_trajectory_chain, state_trajectory_chain, end_trajectory_action, end_trajectory_state
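lp_exploration perturbs the running step vector H in spherical coordinates: the angles receive Gaussian noise whose mean is arccos(exp(-b_step_size / L_p)), and the result is mapped back to Cartesian coordinates. The cart2pol_*/pol2cart_* helpers are not shown; a minimal sketch of the 3D pair they presumably implement:

import numpy as np

def cart2pol_3D(v):
    # (x, y, z) -> (r, theta, phi): radius, polar angle, azimuth.
    x, y, z = v
    r = np.linalg.norm(v)
    theta = np.arccos(z / r)      # polar angle measured from the z-axis
    phi = np.arctan2(y, x)        # azimuth in the x-y plane
    return np.array([r, theta, phi])

def pol2cart_3D(p):
    # (r, theta, phi) -> (x, y, z), inverse of cart2pol_3D.
    r, theta, phi = p
    return np.array([r * np.sin(theta) * np.cos(phi),
                     r * np.sin(theta) * np.sin(phi),
                     r * np.cos(theta)])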