        os.system(
            "cp /home/gordon/software/simple-rl/srl/basis_functions/simple_basis_functions.py {0}"
            .format(results_dir))
        os.system(
            "cp /home/gordon/software/simple-rl/srl/environments/cartpole.py {0}"
            .format(results_dir))

        f_returns = open("{0}{1}".format(results_dir, "/EpisodeReturn.fso"),
                         "w", 1)
        f_num_steps = open("{0}{1}".format(results_dir, "/NumSteps.fso"), "w",
                           1)
        # f_timings = open("{0}{1}".format(results_dir, "/AvgStepTime.fso"), "w", 1)

        # initialise policy and value functions
        # policy = PyBrainANNApproximator(actor_config["alpha"])
        policy = ANNApproximator(12, "sigmoid")
        # policy = LinearApprox(actor_config)
        # basis_functions = PolynomialBasisFunctions(idx=run)
        basis_functions = BasisFunctions(idx=run)

        cartpole_environment = CartPole(x=0.05,
                                        xdot=0.0,
                                        theta=-0.05,
                                        thetadot=0.0)

        # traditional TD(lambda) learning algorithm for the critic
        td_lambda = TDLinear(
            len(
                basis_functions.computeFeatures(
                    [0.0 for _ in range(critic_config["num_input_dims"])])),
            critic_config["alpha"], CONFIG["gamma"], CONFIG["lambda"])
Example No. 2
        # f_timings = open("{0}{1}".format(results_dir, "/AvgStepTime.fso"), "w", 1)

        basis_functions = BasisFunctions(
            resolution=CONFIG["critic_config"]["rbf_basis_resolution"],
            scalar=CONFIG["critic_config"]["rbf_basis_scalar"],
            num_dims=CONFIG["critic_config"]["number_of_dims_in_state"])

        # initialise policy and value functions
        # policy = PyBrainANNApproximator(actor_config["alpha"])
        if CONFIG["policy_type"] == "linear":
            policy = LinearApprox(actor_config,
                                  basis_functions=basis_functions)
            # policy.setParams(np.linspace(-0.2, 0.2, CONFIG["critic_config"]["rbf_basis_resolution"]))
        elif CONFIG["policy_type"] == "ann":
            policy = ANNApproximator(
                CONFIG["actor_config"]["num_input_dims"],
                CONFIG["actor_config"]["num_hidden_units"],
                hlayer_activation_func="tanh")
            # policy.setParams(list(np.load("/tmp/policy_params0.npy")))
        elif CONFIG["policy_type"] == "synth":
            policy = SynthPolicy()

        # Init the type of Critic Algorithm you wish according to CONFIG dict
        if CONFIG["critic algorithm"] == "trad":
            td_lambda = TDLinear(
                len(
                    basis_functions.computeFeatures([
                        0.0 for _ in range(critic_config["num_input_dims"])
                    ])),
                critic_config["alpha"],
                CONFIG["gamma"],
                CONFIG["lambda"],
Example No. 3
class CartPoleSimulation(object):
    def __init__(self):
        args = sys.argv
        if "-r" in args:
            self.results_dir_name = args[args.index("-r") + 1]
        else:
            self.results_dir_name = "cartpole_run"

        self.position_normaliser = DynamicNormalizer([-2.4, 2.4], [-1.0, 1.0])
        self.position_deriv_normaliser = DynamicNormalizer([-1.75, 1.75], [-1.0, 1.0])
        self.angle_normaliser = DynamicNormalizer([-0.25944, 0.25944], [-1.0, 1.0])
        self.angle_deriv_normaliser = DynamicNormalizer([-1.5, 1.5], [-1.0, 1.0])

        self.angle_dt_moving_window = SlidingWindow(5)
        self.last_150_episode_returns = SlidingWindow(150)
        self.last_action = None
        # self.last_action_greedy = None

    def update_critic(self, reward):
        state_t_value = self.approx_critic.computeOutput(list(self.state_t))
        state_t_p1_value = self.approx_critic.computeOutput(list(self.state_t_plus_1))

        if CONFIG["critic algorithm"] == "ann_trad":
            td_error = reward + (CONFIG["gamma"] * state_t_p1_value) - state_t_value
        elif CONFIG["critic algorithm"] == "ann_true":
            td_error = reward + (CONFIG["gamma"] * state_t_p1_value) - \
                self.approx_critic.computeOutputThetaMinusOne(list(self.state_t))
        prev_critic_weights = self.approx_critic.getParams()
        critic_gradient = self.approx_critic.calculateGradient(list(self.state_t))
        self.traces_policy.updateTrace(self.approx_policy.calculateGradient(list(self.state_t)), 1.0)

        p = self.approx_critic.getParams()
        if CONFIG["critic algorithm"] == "ann_trad":
            self.traces_critic.updateTrace(critic_gradient, 1.0)  # for standard TD(lambda)
            X, T = self.traces_critic.getTraces()
            for x, trace in zip(X, T):
                # print("updating critic using gradient vector: {0}\t{1}".format(x, trace))
                p += critic_config["alpha"] * td_error * (x * trace)
            # self.approx_critic.setParams(prev_critic_weights + CONFIG["critic_config"]["alpha"] * td_error * critic_gradient)
        elif CONFIG["critic algorithm"] == "ann_true":
            # For True TD(lambda)
            #print("UPDATING ANN CRITC with TRUE TD(lambda)")
            self.traces_critic.updateTrace(critic_gradient)    # for True TD(lambda)
            part_1 = td_error * self.traces_critic.e
            part_2 = critic_config["alpha"] * \
                    np.dot((self.approx_critic.computeOutputThetaMinusOne(list(self.state_t)) - state_t_value), critic_gradient)
            p += part_1 + part_2
        
        self.approx_critic.setParams(p)
        return td_error, critic_gradient, state_t_value, state_t_p1_value

    def update_policy(self, td_error, exploration):
        UPDATE_CONDITION = False
        if CONFIG["actor update rule"] == "cacla":
            if td_error > 0.0:
                UPDATE_CONDITION = True
            else:
                UPDATE_CONDITION = False
        elif CONFIG["actor update rule"] == "td lambda":
            UPDATE_CONDITION = True
        
        if UPDATE_CONDITION:
            # print("Updating the policy")
            # get original values
            params = self.approx_policy.getParams()
            old_action = self.approx_policy.computeOutput(list(self.state_t))
            policy_gradient = self.approx_policy.calculateGradient()

            # now update
            if CONFIG["actor update rule"] == "cacla":
                # policy.setParams(params + actor_config["alpha"] * (policy_gradient * exploration))
                X, T = self.traces_policy.getTraces()
                p = self.approx_policy.getParams()
                #print("Number of traces: {0}".format(len(T)))
                for x, trace in zip(X, T):
                    # print("updating critic using gradient vector: {0}\t{1}".format(x, trace))
                    p += actor_config["alpha"] * (x * trace) * exploration
                self.approx_policy.setParams(p)
                # self.approx_policy_greedy.setParams(p)
            else:
                self.approx_policy.setParams(params + actor_config["alpha"] * (policy_gradient * td_error))
                # self.approx_policy_greedy.setParams(params + actor_config["alpha"] * (policy_gradient * td_error))

    def run(self):
        # Loop number of runs
        for run in range(CONFIG["num_runs"]):
            # Create logging directory and files
            results_dir = "/home/gordon/data/tmp/{0}{1}".format(self.results_dir_name, run)
            if not os.path.exists(results_dir):
                os.makedirs(results_dir)
            filename = os.path.basename(sys.argv[0])
            os.system("cp {0} {1}".format(filename, results_dir))

            f_returns = open("{0}{1}".format(results_dir, "/EpisodeReturn.fso"), "w", 1)
            # f_returns_greedy = open("{0}{1}".format(results_dir, "/GreedyEpisodeReturn.fso"), "w", 1)

            # env_name = 'MountainCarContinuous-v0'
            env_name = 'Pendulum-v0'
            self.env = gym.make(env_name)
            self.env = wrappers.Monitor(self.env, directory="{0}_gym".format(results_dir), force=True)
            self.env.seed(0)
            # self.env_greedy = gym.make(env_name)

            # policies and critics
            self.approx_critic = ANNApproximator(actor_config["num_input_dims"],
                                            actor_config["num_hidden_units"], hlayer_activation_func="tanh")
            self.approx_policy = ANNApproximator(actor_config["num_input_dims"], actor_config["num_hidden_units"], hlayer_activation_func="tanh")
            # self.approx_policy_greedy = ANNApproximator(actor_config["num_input_dims"], actor_config["num_hidden_units"], hlayer_activation_func="tanh")
            prev_critic_gradient = np.zeros(self.approx_critic.getParams().shape)

            # Set up trace objects
            if CONFIG["critic algorithm"] == "ann_trad":
                self.traces_critic = Traces(CONFIG["lambda"], CONFIG["min_trace_value"])
            elif CONFIG["critic algorithm"] == "ann_true":
                self.traces_critic = TrueTraces(critic_config["alpha"], CONFIG["gamma"], CONFIG["lambda"])
            self.traces_policy = Traces(CONFIG["lambda"], CONFIG["min_trace_value"])

            for episode_number in range(CONFIG["num_episodes"]):
                reward_cum = 0.0
                reward_cum_greedy = 0.0

                if episode_number % CONFIG["log_actions"] == 0:
                    f_actions = open("{0}{1}".format(results_dir, "/actions{0}.csv".format(episode_number)), "w", 1)
                    # f_actions_greedy = open("{0}{1}".format(results_dir, "/greedy_actions{0}.csv".format(episode_number)), "w", 1)

                # reset everything for the next episode
                self.traces_critic.reset()
                self.traces_policy.reset()
                observation = self.env.reset()
                # self.env_greedy.reset()

                episode_ended = False
                episode_ended_learning = False
                # episode_ended_greedy = False

                for step_number in range(CONFIG["max_num_steps"]):
                    # Update the state for timestep t
                    # self.update_state_t()
                    self.state_t = deepcopy(observation)

                    # action_t_greedy = self.approx_policy_greedy.computeOutput(list(self.state_t_greedy))
                    action_t_deterministic = self.approx_policy.computeOutput(list(self.state_t))
                    if step_number % 5 == 0:
                        exploration = np.random.normal(0.0, CONFIG["exploration_sigma"])
                    action_t = np.clip(action_t_deterministic + exploration, -2, 2)

                    observation, reward, episode_ended_learning, diagnostics = self.env.step([action_t])
                    self.state_t_plus_1 = deepcopy(observation)
                    # self.env_greedy.performAction(action_t_greedy)

                    # Update the state for timestep t + 1, after action is performed
                    # self.update_state_t_p1(next_state)

                    if self.last_action is None:
                        self.last_action = action_t
                    # if self.last_action_greedy is None:
                    #     self.last_action_greedy = action_t_greedy
                    # action_diff = self.last_action - action_t_deterministic
                    # reward = self.env.getReward(action_diff)
                    # print("action {0} reward: {1}".format(action_t, reward))
                    self.last_action = deepcopy(action_t_deterministic)

                    # Always log the greedy actions
                    # if episode_number % CONFIG["log_actions"] == 0:
                        # if step_number == 0:
                        #     state_keys = list(self.state_t_greedy.keys())
                        #     state_keys.append("action")
                        #     label_logging_format = "#{" + "}\t{".join(
                        #         [str(state_keys.index(el)) for el in state_keys]) + "}\n"
                        #     f_actions_greedy.write(label_logging_format.format(*state_keys))
                        #
                        # logging_list = list(self.state_t_greedy)
                        # logging_list.append(action_t_greedy)
                        #
                        # action_logging_format = "{" + "}\t{".join(
                        #     [str(logging_list.index(el)) for el in logging_list]) + "}\n"
                        # f_actions_greedy.write(action_logging_format.format(*logging_list))

                    if not episode_ended_learning:
                        # ---- Critic Update ----
                        (td_error, critic_gradient, state_t_value, state_tp1_value) = self.update_critic(reward)

                        # ---- Policy Update -------
                        self.update_policy(td_error, exploration)

                        # only log the learning actions whilst learning
                        if episode_number % CONFIG["log_actions"] == 0:
                            # if step_number == 0:
                        #     #     state_keys = list(self.state_t.keys())
                        #     #     state_keys.append("exploration")
                        #     #     state_keys.append("reward")
                        #     #     state_keys.append("tde")
                        #     #     state_keys.append("st")
                        #     #     state_keys.append("stp1")
                        #     #     state_keys.append("explore_action")
                        #     #     state_keys.append("action")
                        #     #     label_logging_format = "#{" + "}\t{".join(
                        #     #         [str(state_keys.index(el)) for el in state_keys]) + "}\n"
                        #     #     f_actions.write(label_logging_format.format(*state_keys))
                        #
                            logging_list = list(self.state_t)
                        #     logging_list.append(exploration)
                        #     logging_list.append(reward)
                        #     logging_list.append(td_error)
                        #     logging_list.append(state_t_value)
                        #     logging_list.append(state_tp1_value)
                            logging_list.append(action_t)
                            logging_list.append(action_t_deterministic)
                            action_logging_format = "{" + "}\t{".join(
                                [str(logging_list.index(el)) for el in logging_list]) + "}\n"
                            f_actions.write(action_logging_format.format(*logging_list))

                        prev_critic_gradient = deepcopy(critic_gradient)
                    
                        reward_cum += reward
                    # if not episode_ended_greedy:
                    #     reward_cum_greedy += self.env_greedy.getReward(action_t_greedy - self.last_action_greedy)
                    # self.last_action_greedy = deepcopy(action_t_greedy)
                    # episode_ended_learning = self.env.episodeEnded()
                    # episode_ended_greedy = self.env_greedy.episodeEnded()
                    self.state_t = deepcopy(self.state_t_plus_1)

                    if episode_ended_learning:  # and episode_ended_greedy:
                        # episode complete, start a new one
                        break
                # episode either ended early due to failure or completed max number of steps
                print("Episode ended - Learning {0} {1}".format(episode_number, reward_cum))
                # print("Episode ended - Greedy {0} {1}".format(episode_number, reward_cum_greedy))
                # the greedy rollout above is commented out, so average the learning return
                last_150_avg = sum(self.last_150_episode_returns.getWindow(reward_cum)) / 150.0

                f_returns.write("{0}\t{1}\n".format(episode_number, reward_cum))
                # f_returns_greedy.write("{0}\t{1}\n".format(episode_number, reward_cum_greedy))
                if last_150_avg > 1995:
                    break
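# ---------------------------------------------------------------------------
# Editor's note: an illustrative sketch of the CACLA actor rule used in
# update_policy() above, shown without eligibility traces for clarity. The
# function name and arguments are hypothetical, not the ANNApproximator API;
# exploration is (action_taken - deterministic_action), as in the loop above.
def cacla_actor_step(params, policy_gradient, td_error, exploration, alpha):
    """Move the policy towards the explored action only if the TD error is positive."""
    if td_error > 0.0:
        return params + alpha * policy_gradient * exploration
    return params  # non-positive TD error: leave the policy unchanged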
Example No. 4
    def run(self):
        # Loop number of runs
        for run in range(CONFIG["num_runs"]):
            # Create logging directory and files
            results_dir = "/home/gordon/data/tmp/{0}{1}".format(self.results_dir_name, run)
            if not os.path.exists(results_dir):
                os.makedirs(results_dir)
            filename = os.path.basename(sys.argv[0])
            os.system("cp {0} {1}".format(filename, results_dir))

            f_returns = open("{0}{1}".format(results_dir, "/EpisodeReturn.fso"), "w", 1)
            # f_returns_greedy = open("{0}{1}".format(results_dir, "/GreedyEpisodeReturn.fso"), "w", 1)

            # env_name = 'MountainCarContinuous-v0'
            env_name = 'Pendulum-v0'
            self.env = gym.make(env_name)
            self.env = wrappers.Monitor(self.env, directory="{0}_gym".format(results_dir), force=True)
            self.env.seed(0)
            # self.env_greedy = gym.make(env_name)

            # policies and critics
            self.approx_critic = ANNApproximator(actor_config["num_input_dims"],
                                            actor_config["num_hidden_units"], hlayer_activation_func="tanh")
            self.approx_policy = ANNApproximator(actor_config["num_input_dims"], actor_config["num_hidden_units"], hlayer_activation_func="tanh")
            # self.approx_policy_greedy = ANNApproximator(actor_config["num_input_dims"], actor_config["num_hidden_units"], hlayer_activation_func="tanh")
            prev_critic_gradient = np.zeros(self.approx_critic.getParams().shape)

            # Set up trace objects
            if CONFIG["critic algorithm"] == "ann_trad":
                self.traces_critic = Traces(CONFIG["lambda"], CONFIG["min_trace_value"])
            elif CONFIG["critic algorithm"] == "ann_true":
                self.traces_critic = TrueTraces(critic_config["alpha"], CONFIG["gamma"], CONFIG["lambda"])
            self.traces_policy = Traces(CONFIG["lambda"], CONFIG["min_trace_value"])

            for episode_number in range(CONFIG["num_episodes"]):
                reward_cum = 0.0
                reward_cum_greedy = 0.0

                if episode_number % CONFIG["log_actions"] == 0:
                    f_actions = open("{0}{1}".format(results_dir, "/actions{0}.csv".format(episode_number)), "w", 1)
                    # f_actions_greedy = open("{0}{1}".format(results_dir, "/greedy_actions{0}.csv".format(episode_number)), "w", 1)

                # reset everything for the next episode
                self.traces_critic.reset()
                self.traces_policy.reset()
                observation = self.env.reset()
                # self.env_greedy.reset()

                episode_ended = False
                episode_ended_learning = False
                # episode_ended_greedy = False

                for step_number in range(CONFIG["max_num_steps"]):
                    # Update the state for timestep t
                    # self.update_state_t()
                    self.state_t = deepcopy(observation)

                    # action_t_greedy = self.approx_policy_greedy.computeOutput(list(self.state_t_greedy))
                    action_t_deterministic = self.approx_policy.computeOutput(list(self.state_t))
                    if step_number % 5 == 0:
                        exploration = np.random.normal(0.0, CONFIG["exploration_sigma"])
                    action_t = np.clip(action_t_deterministic + exploration, -2, 2)

                    observation, reward, episode_ended_learning, diagnostics = self.env.step([action_t])
                    self.state_t_plus_1 = deepcopy(observation)
                    # self.env_greedy.performAction(action_t_greedy)

                    # Update the state for timestep t + 1, after action is performed
                    # self.update_state_t_p1(next_state)

                    if self.last_action is None:
                        self.last_action = action_t
                    # if self.last_action_greedy is None:
                    #     self.last_action_greedy = action_t_greedy
                    # action_diff = self.last_action - action_t_deterministic
                    # reward = self.env.getReward(action_diff)
                    # print("action {0} reward: {1}".format(action_t, reward))
                    self.last_action = deepcopy(action_t_deterministic)

                    # Always log the greedy actions
                    # if episode_number % CONFIG["log_actions"] == 0:
                        # if step_number == 0:
                        #     state_keys = list(self.state_t_greedy.keys())
                        #     state_keys.append("action")
                        #     label_logging_format = "#{" + "}\t{".join(
                        #         [str(state_keys.index(el)) for el in state_keys]) + "}\n"
                        #     f_actions_greedy.write(label_logging_format.format(*state_keys))
                        #
                        # logging_list = list(self.state_t_greedy)
                        # logging_list.append(action_t_greedy)
                        #
                        # action_logging_format = "{" + "}\t{".join(
                        #     [str(logging_list.index(el)) for el in logging_list]) + "}\n"
                        # f_actions_greedy.write(action_logging_format.format(*logging_list))

                    if not episode_ended_learning:
                        # ---- Critic Update ----
                        (td_error, critic_gradient, state_t_value, state_tp1_value) = self.update_critic(reward)

                        # ---- Policy Update -------
                        self.update_policy(td_error, exploration)

                        # only log the learning actions whilst learning
                        if episode_number % CONFIG["log_actions"] == 0:
                            # if step_number == 0:
                        #     #     state_keys = list(self.state_t.keys())
                        #     #     state_keys.append("exploration")
                        #     #     state_keys.append("reward")
                        #     #     state_keys.append("tde")
                        #     #     state_keys.append("st")
                        #     #     state_keys.append("stp1")
                        #     #     state_keys.append("explore_action")
                        #     #     state_keys.append("action")
                        #     #     label_logging_format = "#{" + "}\t{".join(
                        #     #         [str(state_keys.index(el)) for el in state_keys]) + "}\n"
                        #     #     f_actions.write(label_logging_format.format(*state_keys))
                        #
                            logging_list = list(self.state_t)
                        #     logging_list.append(exploration)
                        #     logging_list.append(reward)
                        #     logging_list.append(td_error)
                        #     logging_list.append(state_t_value)
                        #     logging_list.append(state_tp1_value)
                            logging_list.append(action_t)
                            logging_list.append(action_t_deterministic)
                            action_logging_format = "{" + "}\t{".join(
                                [str(logging_list.index(el)) for el in logging_list]) + "}\n"
                            f_actions.write(action_logging_format.format(*logging_list))

                        prev_critic_gradient = deepcopy(critic_gradient)
                    
                        reward_cum += reward
                    # if not episode_ended_greedy:
                    #     reward_cum_greedy += self.env_greedy.getReward(action_t_greedy - self.last_action_greedy)
                    # self.last_action_greedy = deepcopy(action_t_greedy)
                    # episode_ended_learning = self.env.episodeEnded()
                    # episode_ended_greedy = self.env_greedy.episodeEnded()
                    self.state_t = deepcopy(self.state_t_plus_1)

                    if episode_ended_learning:  # and episode_ended_greedy:
                        # episode complete, start a new one
                        break
                # episode either ended early due to failure or completed max number of steps
                print("Episode ended - Learning {0} {1}".format(episode_number, reward_cum))
                # print("Episode ended - Greedy {0} {1}".format(episode_number, reward_cum_greedy))
                # the greedy rollout above is commented out, so average the learning return
                last_150_avg = sum(self.last_150_episode_returns.getWindow(reward_cum)) / 150.0

                f_returns.write("{0}\t{1}\n".format(episode_number, reward_cum))
                # f_returns_greedy.write("{0}\t{1}\n".format(episode_number, reward_cum_greedy))
                if last_150_avg > 1995:
                    break
            # Create logging directory and files
            results_dir = "/tmp/{0}{1}".format(results_dir_name, run)
            if not os.path.exists(results_dir):
                os.makedirs(results_dir)
            filename = os.path.basename(sys.argv[0])
            os.system("cp {0} {1}".format(filename, results_dir))
            os.system("cp /home/gordon/software/simple-rl/srl/basis_functions/simple_basis_functions.py {0}".format(results_dir))
            os.system("cp /home/gordon/software/simple-rl/srl/environments/cartpole.py {0}".format(results_dir))

            f_returns = open("{0}{1}".format(results_dir, "/EpisodeReturn.fso"), "w", 1)
            f_num_steps = open("{0}{1}".format(results_dir, "/NumSteps.fso"), "w", 1)
            f_timings = open("{0}{1}".format(results_dir, "/AvgStepTime.fso"), "w", 1)

            # initialise policy and value functions
            # policy = LinearApprox(actor_config)
            policy = ANNApproximator(4, controlled_var[run], hlayer_activation_func="tanh")
            # basis_functions = PolynomialBasisFunctions(idx=run)
            basis_functions = BasisFunctions(idx=0)

            cartpole_environment = CartPoleEnvironment()

            # traditional TD(lambda) learning algorithm for the critic
            # td_lambda = Traditional_TD_LAMBDA(ANNApproximator(critic_config["alpha"]), CONFIG["lambda"], CONFIG["min_trace_value"])
            # tmp_critic_config = deepcopy(critic_config)
            # tmp_critic_config["alpha"] = critic_config["alpha"][run]
            td_lambda = TrueOnlineTDLambda(basis_functions, critic_config, CONFIG["lambda"], CONFIG["gamma"])
            # td_lambda = ANNApproximator(critic_config["alpha"])

            # Loop number of episodes
            for episode_number in range(CONFIG["num_episodes"]):
                global episode_number, results_dir
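# ---------------------------------------------------------------------------
# Editor's note: an illustrative sketch of the TrueOnlineTDLambda critic
# constructed above, assuming it follows van Seijen and Sutton's true online
# TD(lambda) with dutch traces. The class internals are assumptions; only the
# update equations below are standard.
import numpy as np

def true_online_td_step(theta, e, v_old, phi_t, phi_tp1, reward, alpha, gamma, lam):
    """One true online TD(lambda) step; returns (theta, e, v_old)."""
    v_t = np.dot(theta, phi_t)
    v_tp1 = np.dot(theta, phi_tp1)
    delta = reward + gamma * v_tp1 - v_t
    # dutch-style eligibility trace
    e = gamma * lam * e + phi_t - alpha * gamma * lam * np.dot(e, phi_t) * phi_t
    # weight update with the v_old correction term
    theta = theta + alpha * (delta + v_t - v_old) * e - alpha * (v_t - v_old) * phi_t
    return theta, e, v_tp1  # v_tp1 becomes v_old on the next step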
Example No. 6
class NessieRlSimulation(object):
    def __init__(self):
        args = sys.argv
        if "-r" in args:
            self.results_dir_name = args[args.index("-r") + 1]
        else:
            self.results_dir_name = "nessie_run"

        self.position_normaliser = DynamicNormalizer([-2.4, 2.4], [-1.0, 1.0])
        self.position_deriv_normaliser = DynamicNormalizer([-1.75, 1.75],
                                                           [-1.0, 1.0])
        self.angle_normaliser = DynamicNormalizer([-3.14, 3.14], [-1.0, 1.0])
        self.angle_deriv_normaliser = DynamicNormalizer([-0.02, 0.02],
                                                        [-1.0, 1.0])

        self.angle_dt_moving_window = SlidingWindow(5)
        self.last_150_episode_returns = SlidingWindow(150)

        self.thrusters = Thrusters()
        self.env = ROSBehaviourInterface()
        self.environment_info = EnvironmentInfo()

        self.ounoise = OUNoise()

        self.prev_action = 0.0

    def update_critic(self, reward, update):
        state_t_value = self.approx_critic.computeOutput(self.state_t.values())
        state_t_p1_value = self.approx_critic.computeOutput(
            self.state_t_plus_1.values())
        # print("state t: {0}".format(state_t_value))
        # print("state tp1: {0}".format(state_t_p1_value))

        if CONFIG["critic algorithm"] == "ann_trad":
            td_error = reward + (CONFIG["gamma"] *
                                 state_t_p1_value) - state_t_value
        elif CONFIG["critic algorithm"] == "ann_true":
            td_error = reward + (CONFIG["gamma"] * state_t_p1_value) - \
                self.approx_critic.computeOutputThetaMinusOne(self.state_t.values())
        prev_critic_weights = self.approx_critic.getParams()
        critic_gradient = self.approx_critic.calculateGradient(
            self.state_t.values())
        self.traces_policy.updateTrace(
            self.approx_policy.calculateGradient(self.state_t.values()), 1.0)

        if update:
            p = self.approx_critic.getParams()
            if CONFIG["critic algorithm"] == "ann_trad":
                self.traces_critic.updateTrace(critic_gradient,
                                               1.0)  # for standard TD(lambda)
                X, T = self.traces_critic.getTraces()
                for x, trace in zip(X, T):
                    # print("updating critic using gradient vector: {0}\t{1}".format(x, trace))
                    p += critic_config["alpha"] * td_error * (x * trace)
                # self.approx_critic.setParams(prev_critic_weights + CONFIG["critic_config"]["alpha"] * td_error * critic_gradient)
            elif CONFIG["critic algorithm"] == "ann_true":
                # For True TD(lambda)
                #print("UPDATING ANN CRITC with TRUE TD(lambda)")
                self.traces_critic.updateTrace(
                    critic_gradient)  # for True TD(lambda)
                part_1 = td_error * self.traces_critic.e
                part_2 = critic_config["alpha"] * \
                        np.dot((self.approx_critic.computeOutputThetaMinusOne(self.state_t.values()) - state_t_value), critic_gradient)
                p += part_1 + part_2

            self.approx_critic.setParams(p)
        return (td_error, critic_gradient, state_t_value, state_t_p1_value)

    def update_state_t(self):
        raw_angle = deepcopy(self.environment_info.raw_angle_to_goal)
        # print("raw angle:")
        # raw_angle_dt = raw_angle - self.prev_angle_dt_t
        # print("raw angle dt: {0}".format(raw_angle_dt))
        self.state_t = {
            "angle": self.angle_normaliser.scale_value(raw_angle),
            "angle_deriv": self.prev_angle_dt_t
        }
        self.prev_angle_dt_t = deepcopy(raw_angle)

    def update_state_t_p1(self):
        raw_angle = deepcopy(self.environment_info.raw_angle_to_goal)
        angle_tp1 = self.angle_normaliser.scale_value(raw_angle)
        angle_t = self.state_t["angle"]

        # if (abs(angle_t)) > 0.5:
        #     if angle_t > 0 and angle_tp1 < 0:
        #         angle_change = (1.0 - angle_t) + (-1.0 - angle_tp1)
        #     elif angle_t < 0 and angle_tp1 > 0:
        #         angle_change = (1.0 - angle_tp1) + (-1.0 - angle_t)
        #     else:
        #         angle_change = angle_tp1 - angle_t
        # else:
        abs_angle_tp1 = np.abs(angle_tp1)
        abs_angle_t = np.abs(angle_t)
        if abs_angle_tp1 > abs_angle_t:
            sign = -1
        else:
            sign = 1
        angle_change = sign * abs(abs_angle_tp1 - abs_angle_t)

        # print("angle t: {0}".format(abs_angle_t))
        # print("angle tp1: {0}".format(abs_angle_tp1))
        # print("angle change: {0}".format(angle_change))

        tmp_angle_change = sum(
            self.angle_dt_moving_window.getWindow(angle_change)) / 5.0
        self.state_t_plus_1 = {
            "angle":
            self.angle_normaliser.scale_value(raw_angle),
            "angle_deriv":
            self.angle_deriv_normaliser.scale_value(tmp_angle_change)
        }
        self.prev_angle_dt_t = self.angle_deriv_normaliser.scale_value(
            tmp_angle_change)

    def update_policy(self, td_error, exploration):
        UPDATE_CONDITION = False
        if CONFIG["actor update rule"] == "cacla":
            if td_error > 0.0:
                UPDATE_CONDITION = True
            else:
                UPDATE_CONDITION = False
        elif CONFIG["actor update rule"] == "td lambda":
            UPDATE_CONDITION = True

        if UPDATE_CONDITION:
            # get original values
            params = self.approx_policy.getParams()
            old_action = self.approx_policy.computeOutput(
                self.state_t.values())
            policy_gradient = self.approx_policy.calculateGradient()

            # now update
            if CONFIG["actor update rule"] == "cacla":
                # policy.setParams(params + actor_config["alpha"] * (policy_gradient * exploration))
                X, T = self.traces_policy.getTraces()
                p = self.approx_policy.getParams()
                #print("Number of traces: {0}".format(len(T)))
                for x, trace in zip(X, T):
                    # print("updating critic using gradient vector: {0}\t{1}".format(x, trace))
                    p += actor_config["alpha"] * (x * trace) * exploration
                self.approx_policy.setParams(p)
            else:
                self.approx_policy.setParams(params + actor_config["alpha"] *
                                             (policy_gradient * td_error))

    def run(self):
        # Loop number of runs
        if CONFIG["test_policy"]:
            runs = TEST_CONFIG["run_numbers"]
        else:
            runs = range(CONFIG["num_runs"])

        for run in runs:
            if CONFIG["test_policy"]:
                self.results_dir_name = "nessie_validate_{0}".format(
                    TEST_CONFIG["folder"])
                results_to_load_directory = "/tmp/{0}{1}".format(
                    TEST_CONFIG["folder"], run)
            # Create logging directory and files
            results_dir = "/home/gordon/data/tmp/{0}{1}".format(
                self.results_dir_name, run)
            if not os.path.exists(results_dir):
                os.makedirs(results_dir)
            filename = os.path.basename(sys.argv[0])
            os.system("cp {0} {1}".format(filename, results_dir))
            os.system(
                "cp /home/gordon/rosbuild_ws/ros_simple_rl/src/srl/environments/ros_behaviour_interface.py {0}"
                .format(results_dir))
            os.system(
                "cp /home/gordon/rosbuild_ws/ros_simple_rl/src/utilities/orstein_exploration.py {0}"
                .format(results_dir))

            if CONFIG["test_policy"]:
                os.system("cp {0}/Epi* {1}/LearningEpisodeReturn.fso".format(
                    results_to_load_directory, results_dir))
                os.system("cp {0}/basic* {1}/LearningMainScript.py".format(
                    results_to_load_directory, results_dir))

            f_returns = open(
                "{0}{1}".format(results_dir, "/EpisodeReturn.fso"), "w", 1)

            # policies and critics
            self.approx_critic = ANNApproximator(
                actor_config["num_input_dims"],
                actor_config["num_hidden_units"],
                hlayer_activation_func="tanh")
            if not CONFIG["generate_initial_weights"]:
                critic_init = "/home/gordon/data/tmp/critic_params_48h.npy"
                self.approx_critic.setParams(list(np.load(critic_init)))

            self.approx_policy = ANNApproximator(
                actor_config["num_input_dims"],
                actor_config["num_hidden_units"],
                hlayer_activation_func="tanh")
            if not CONFIG["generate_initial_weights"]:
                policy_init = "/home/gordon/data/tmp/initial_2dim_48h_policy_params.npy"
                self.approx_policy.setParams(list(np.load(policy_init)))

            # if CONFIG["test_policy"] is True:
            #    if not os.path.exists("/tmp/{0}".format(results_to_validate)):
            #        continue
            #self.approx_policy.setParams()
            prev_critic_gradient = np.zeros(
                self.approx_critic.getParams().shape)

            # Set up trace objects
            if CONFIG["critic algorithm"] == "ann_trad":
                self.traces_critic = Traces(CONFIG["lambda"],
                                            CONFIG["min_trace_value"])
            elif CONFIG["critic algorithm"] == "ann_true":
                self.traces_critic = TrueTraces(critic_config["alpha"],
                                                CONFIG["gamma"],
                                                CONFIG["lambda"])
            self.traces_policy = Traces(CONFIG["lambda"],
                                        CONFIG["min_trace_value"])

            exploration_sigma = CONFIG["exploration_sigma"]

            for episode_number in range(CONFIG["num_episodes"]):

                if CONFIG["test_policy"] and episode_number not in TEST_CONFIG[
                        "episode_numbers"]:
                    # don't do anything for the episode number if we are testing policies and
                    # this episode's policy is not in the list to test
                    continue

                reward_cum = 0.0
                reward_cum_greedy = 0.0

                if episode_number % CONFIG["log_actions"] == 0:
                    f_actions = open(
                        "{0}{1}".format(
                            results_dir,
                            "/actions{0}.csv".format(episode_number)), "w", 1)

                # If testing a learnt policy, load it
                if CONFIG["test_policy"]:
                    policy_to_load = "{0}/policy_params{1}.npy".format(
                        results_to_load_directory, episode_number)
                    critic_to_load = "{0}/critic_params{1}.npy".format(
                        results_to_load_directory, episode_number)

                    print("policy_to_load: {0}".format(policy_to_load))
                    self.approx_policy.setParams(list(np.load(policy_to_load)))
                    self.approx_critic.setParams(list(np.load(critic_to_load)))

                # reset everything for the next episode
                self.traces_critic.reset()
                self.traces_policy.reset()

                # self.env.nav_reset()
                self.env.reset()
                self.ounoise.reset()

                self.angle_dt_moving_window.reset()

                episode_ended = False
                episode_ended_learning = False

                # if episode_number > 5 and exploration_sigma > 0.1:
                exploration_sigma *= CONFIG["exploration_decay"]

                self.prev_angle_dt_t = 0.0
                self.prev_angle_dt_tp1 = 0.0

                if CONFIG["generate_initial_weights"]:
                    self.approx_policy = ANNApproximator(
                        actor_config["num_input_dims"],
                        actor_config["num_hidden_units"],
                        hlayer_activation_func="tanh")

                for step_number in range(CONFIG["max_num_steps"]):
                    # Update the state for timestep t
                    self.update_state_t()

                    action_t_deterministic = self.approx_policy.computeOutput(
                        self.state_t.values())

                    # if episode_number > 9:
                    #     control_rate = 0.5
                    # else:
                    control_rate = 3
                    if step_number % (control_rate * CONFIG["spin_rate"]) == 0:
                        # exploration = self.ounoise.get_action(action_t_deterministic)
                        # exploration = np.random.normal(0.0, exploration_sigma)
                        tmp_action = self.ounoise.get_action(
                            action_t_deterministic)[0]
                        exploration = tmp_action - action_t_deterministic

                        # exploration = self.ounoise.function(action_t_deterministic, 0, 0.2, 0.1)[0]
                    # else:
                    #    action_t = deepcopy(self.prev_action)

                    # self.prev_action = deepcopy(action_t)

                    if not CONFIG["generate_initial_weights"] and not CONFIG[
                            "test_policy"]:
                        action_t = np.clip(
                            action_t_deterministic + exploration, -10, 10)
                    else:
                        action_t = np.clip(action_t_deterministic, -10, 10)

                    # TODO - investigate what happens with the action!!!
                    self.env.performAction("gaussian_variance", action_t)

                    # TODO - time rather than rospy.sleep?!
                    time.sleep(1.0 / CONFIG["spin_rate"])

                    # Update the state for timestep t + 1, after action is performed
                    self.update_state_t_p1()

                    to_end = False
                    # if self.state_t["angle"] > 0.9:
                    #     reward = -10
                    #     to_end = True
                    # else:
                    reward = self.env.getReward(self.state_t_plus_1, action_t)

                    if not episode_ended_learning:
                        if not CONFIG["generate_initial_weights"]:
                            # ---- Critic Update ----
                            (td_error, critic_gradient, state_t_value,
                             state_tp1_value) = self.update_critic(
                                 reward, not CONFIG["test_policy"])

                            if episode_number % CONFIG["log_actions"] == 0:
                                if step_number == 0:
                                    state_keys = list(self.state_t.keys())
                                    state_keys.append("exploration")
                                    state_keys.append("reward")
                                    state_keys.append("tde")
                                    state_keys.append("st")
                                    state_keys.append("stp1")
                                    state_keys.append("explore_action")
                                    state_keys.append("action")
                                    label_logging_format = "#{" + "}\t{".join([
                                        str(state_keys.index(el))
                                        for el in state_keys
                                    ]) + "}\n"
                                    f_actions.write(
                                        label_logging_format.format(
                                            *state_keys))

                                logging_list = list(self.state_t.values())
                                logging_list.append(exploration)
                                logging_list.append(reward)
                                logging_list.append(td_error)
                                logging_list.append(state_t_value)
                                logging_list.append(state_tp1_value)
                                logging_list.append(action_t)
                                logging_list.append(action_t_deterministic)
                                action_logging_format = "{" + "}\t{".join([
                                    str(logging_list.index(el))
                                    for el in logging_list
                                ]) + "}\n"
                                f_actions.write(
                                    action_logging_format.format(
                                        *logging_list))

                            if not CONFIG["test_policy"]:
                                # ---- Policy Update -------
                                self.update_policy(td_error, exploration)

                            prev_critic_gradient = deepcopy(critic_gradient)

                        reward_cum += reward
                        if to_end:
                            reward_cum = -3000  # for logging only
                            break

                    # TODO - add check for if episode ended early. i.e. moving average
                    """ episode_ended_learning = self.env.episodeEnded()

                     if episode_ended_learning:
                        # episode complete, start a new one
                        break """
                # episode either ended early due to failure or completed max number of steps
                print("Episode ended - Learning {0} {1}".format(
                    episode_number, reward_cum))

                f_returns.write("{0}\t{1}\n".format(episode_number,
                                                    reward_cum))

                np.save(
                    "{0}/policy_params{1}".format(results_dir, episode_number),
                    self.approx_policy.getParams())
                np.save(
                    "{0}/critic_params{1}".format(results_dir, episode_number),
                    self.approx_critic.getParams())
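# ---------------------------------------------------------------------------
# Editor's note: the OUNoise object used above for exploration is assumed to be
# an Ornstein-Uhlenbeck process (temporally correlated noise), roughly as
# sketched below. Class, method, and parameter names here are hypothetical.
import numpy as np

class OUNoiseSketch(object):
    def __init__(self, mu=0.0, theta=0.15, sigma=0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = mu

    def reset(self):
        self.state = self.mu

    def sample(self):
        # mean-reverting random walk: x += theta * (mu - x) + sigma * N(0, 1)
        self.state += self.theta * (self.mu - self.state) + self.sigma * np.random.randn()
        return self.state

    def get_action(self, deterministic_action):
        # perturb the deterministic policy output with correlated noise, so that
        # (returned_action - deterministic_action) can be used as the exploration term
        return deterministic_action + self.sample()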
Example No. 7
    def run(self):
        # Loop number of runs
        if CONFIG["test_policy"]:
            runs = TEST_CONFIG["run_numbers"]
        else:
            runs = range(CONFIG["num_runs"])

        for run in runs:
            if CONFIG["test_policy"]:
                self.results_dir_name = "nessie_validate_{0}".format(
                    TEST_CONFIG["folder"])
                results_to_load_directory = "/tmp/{0}{1}".format(
                    TEST_CONFIG["folder"], run)
            # Create logging directory and files
            results_dir = "/home/gordon/data/tmp/{0}{1}".format(
                self.results_dir_name, run)
            if not os.path.exists(results_dir):
                os.makedirs(results_dir)
            filename = os.path.basename(sys.argv[0])
            os.system("cp {0} {1}".format(filename, results_dir))
            os.system(
                "cp /home/gordon/rosbuild_ws/ros_simple_rl/src/srl/environments/ros_behaviour_interface.py {0}"
                .format(results_dir))
            os.system(
                "cp /home/gordon/rosbuild_ws/ros_simple_rl/src/utilities/orstein_exploration.py {0}"
                .format(results_dir))

            if CONFIG["test_policy"]:
                os.system("cp {0}/Epi* {1}/LearningEpisodeReturn.fso".format(
                    results_to_load_directory, results_dir))
                os.system("cp {0}/basic* {1}/LearningMainScript.py".format(
                    results_to_load_directory, results_dir))

            f_returns = open(
                "{0}{1}".format(results_dir, "/EpisodeReturn.fso"), "w", 1)

            # policies and critics
            self.approx_critic = ANNApproximator(
                actor_config["num_input_dims"],
                actor_config["num_hidden_units"],
                hlayer_activation_func="tanh")
            if not CONFIG["generate_initial_weights"]:
                critic_init = "/home/gordon/data/tmp/critic_params_48h.npy"
                self.approx_critic.setParams(list(np.load(critic_init)))

            self.approx_policy = ANNApproximator(
                actor_config["num_input_dims"],
                actor_config["num_hidden_units"],
                hlayer_activation_func="tanh")
            if not CONFIG["generate_initial_weights"]:
                policy_init = "/home/gordon/data/tmp/initial_2dim_48h_policy_params.npy"
                self.approx_policy.setParams(list(np.load(policy_init)))

            # if CONFIG["test_policy"] is True:
            #    if not os.path.exists("/tmp/{0}".format(results_to_validate)):
            #        continue
            #self.approx_policy.setParams()
            prev_critic_gradient = np.zeros(
                self.approx_critic.getParams().shape)

            # Set up trace objects
            if CONFIG["critic algorithm"] == "ann_trad":
                self.traces_critic = Traces(CONFIG["lambda"],
                                            CONFIG["min_trace_value"])
            elif CONFIG["critic algorithm"] == "ann_true":
                self.traces_critic = TrueTraces(critic_config["alpha"],
                                                CONFIG["gamma"],
                                                CONFIG["lambda"])
            self.traces_policy = Traces(CONFIG["lambda"],
                                        CONFIG["min_trace_value"])

            exploration_sigma = CONFIG["exploration_sigma"]

            for episode_number in range(CONFIG["num_episodes"]):

                if CONFIG["test_policy"] and episode_number not in TEST_CONFIG[
                        "episode_numbers"]:
                    # don't do anything for the episode number if we are testing policies and
                    # this episode's policy is not in the list to test
                    continue

                reward_cum = 0.0
                reward_cum_greedy = 0.0

                if episode_number % CONFIG["log_actions"] == 0:
                    f_actions = open(
                        "{0}{1}".format(
                            results_dir,
                            "/actions{0}.csv".format(episode_number)), "w", 1)

                # If testing a learnt policy, load it
                if CONFIG["test_policy"]:
                    policy_to_load = "{0}/policy_params{1}.npy".format(
                        results_to_load_directory, episode_number)
                    critic_to_load = "{0}/critic_params{1}.npy".format(
                        results_to_load_directory, episode_number)

                    print("policy_to_load: {0}".format(policy_to_load))
                    self.approx_policy.setParams(list(np.load(policy_to_load)))
                    self.approx_critic.setParams(list(np.load(critic_to_load)))

                # reset everything for the next episode
                self.traces_critic.reset()
                self.traces_policy.reset()

                # self.env.nav_reset()
                self.env.reset()
                self.ounoise.reset()

                self.angle_dt_moving_window.reset()

                episode_ended = False
                episode_ended_learning = False

                # if episode_number > 5 and exploration_sigma > 0.1:
                exploration_sigma *= CONFIG["exploration_decay"]

                self.prev_angle_dt_t = 0.0
                self.prev_angle_dt_tp1 = 0.0

                if CONFIG["generate_initial_weights"]:
                    self.approx_policy = ANNApproximator(
                        actor_config["num_input_dims"],
                        actor_config["num_hidden_units"],
                        hlayer_activation_func="tanh")

                for step_number in range(CONFIG["max_num_steps"]):
                    # Update the state for timestep t
                    self.update_state_t()

                    action_t_deterministic = self.approx_policy.computeOutput(
                        self.state_t.values())

                    # if episode_number > 9:
                    #     control_rate = 0.5
                    # else:
                    control_rate = 3
                    if step_number % (control_rate * CONFIG["spin_rate"]) == 0:
                        # exploration = self.ounoise.get_action(action_t_deterministic)
                        # exploration = np.random.normal(0.0, exploration_sigma)
                        tmp_action = self.ounoise.get_action(
                            action_t_deterministic)[0]
                        exploration = tmp_action - action_t_deterministic

                        # exploration = self.ounoise.function(action_t_deterministic, 0, 0.2, 0.1)[0]
                    # else:
                    #    action_t = deepcopy(self.prev_action)

                    # self.prev_action = deepcopy(action_t)

                    if not CONFIG["generate_initial_weights"] and not CONFIG[
                            "test_policy"]:
                        action_t = np.clip(
                            action_t_deterministic + exploration, -10, 10)
                    else:
                        action_t = np.clip(action_t_deterministic, -10, 10)

                    # TODO - investigate what happens with the action!!!
                    self.env.performAction("gaussian_variance", action_t)

                    # TODO - time rather than rospy.sleep?!
                    time.sleep(1.0 / CONFIG["spin_rate"])

                    # Update the state for timestep t + 1, after action is performed
                    self.update_state_t_p1()

                    to_end = False
                    # if self.state_t["angle"] > 0.9:
                    #     reward = -10
                    #     to_end = True
                    # else:
                    reward = self.env.getReward(self.state_t_plus_1, action_t)

                    if not episode_ended_learning:
                        if not CONFIG["generate_initial_weights"]:
                            # ---- Critic Update ----
                            (td_error, critic_gradient, state_t_value,
                             state_tp1_value) = self.update_critic(
                                 reward, not CONFIG["test_policy"])

                            if episode_number % CONFIG["log_actions"] == 0:
                                if step_number == 0:
                                    state_keys = list(self.state_t.keys())
                                    state_keys.append("exploration")
                                    state_keys.append("reward")
                                    state_keys.append("tde")
                                    state_keys.append("st")
                                    state_keys.append("stp1")
                                    state_keys.append("explore_action")
                                    state_keys.append("action")
                                    label_logging_format = "#{" + "}\t{".join([
                                        str(state_keys.index(el))
                                        for el in state_keys
                                    ]) + "}\n"
                                    f_actions.write(
                                        label_logging_format.format(
                                            *state_keys))

                                logging_list = list(self.state_t.values())
                                logging_list.append(exploration)
                                logging_list.append(reward)
                                logging_list.append(td_error)
                                logging_list.append(state_t_value)
                                logging_list.append(state_tp1_value)
                                logging_list.append(action_t)
                                logging_list.append(action_t_deterministic)
                                action_logging_format = "{" + "}\t{".join([
                                    str(logging_list.index(el))
                                    for el in logging_list
                                ]) + "}\n"
                                f_actions.write(
                                    action_logging_format.format(
                                        *logging_list))

                            if not CONFIG["test_policy"]:
                                # ---- Policy Update -------
                                self.update_policy(td_error, exploration)

                            prev_critic_gradient = deepcopy(critic_gradient)

                        reward_cum += reward
                        if to_end:
                            reward_cum = -3000  # for logging only
                            break

                    # TODO - add check for if episode ended early. i.e. moving average
                    """ episode_ended_learning = self.env.episodeEnded()

                     if episode_ended_learning:
                        # episode complete, start a new one
                        break """
                # episode either ended early due to failure or completed max number of steps
                print("Episode ended - Learning {0} {1}".format(
                    episode_number, reward_cum))

                f_returns.write("{0}\t{1}\n".format(episode_number,
                                                    reward_cum))

                np.save(
                    "{0}/policy_params{1}".format(results_dir, episode_number),
                    self.approx_policy.getParams())
                np.save(
                    "{0}/critic_params{1}".format(results_dir, episode_number),
                    self.approx_critic.getParams())
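# ---------------------------------------------------------------------------
# Editor's note: the DynamicNormalizer used throughout these examples is
# assumed to rescale a raw value from a source range into a target range
# (typically [-1, 1]); a minimal sketch is below. The real class may also grow
# its source range dynamically, which is not shown here.
class DynamicNormalizerSketch(object):
    def __init__(self, input_range, output_range):
        self.in_lo, self.in_hi = input_range
        self.out_lo, self.out_hi = output_range

    def scale_value(self, value):
        # clamp to the source range, then map linearly onto the target range
        value = max(self.in_lo, min(self.in_hi, value))
        fraction = (value - self.in_lo) / float(self.in_hi - self.in_lo)
        return self.out_lo + fraction * (self.out_hi - self.out_lo)

# Example: DynamicNormalizerSketch([-2.4, 2.4], [-1.0, 1.0]).scale_value(1.2) -> 0.5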