import numpy as np
from dotmap import DotMap


def run_controller(self, horizon, policy):
    """
    Rolls out self._env for horizon timesteps under the given policy,
    converting each raw observation with state2touch, and returns the
    logged trajectory as a DotMap.
    """
    logs = DotMap()
    logs.states = []
    logs.actions = []
    logs.rewards = []
    logs.times = []
    logs.obs = []

    observation = self._env.reset()
    print("Env has been reset")
    for t in range(horizon):
        # self._env.render()
        state = self.state2touch(observation)
        print("Go from state to touch")
        # print(state)
        action = policy.act(state)
        print("Get an action based on policy")

        observation, reward, done, info = self._env.step(action)
        print("Perform an action")

        # Log this timestep
        # logs.times.append()
        logs.actions.append(action.tolist())
        logs.rewards.append(reward)
        logs.states.append(state)
        logs.obs.append(observation)

    # Convert the per-step logs to numpy arrays
    logs.actions = np.array(logs.actions)
    logs.rewards = np.array(logs.rewards)
    logs.states = np.array(logs.states)
    logs.obs = np.array(logs.obs)
    return logs
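# Hedged usage sketch for the method above: the wrapper object (here called
# TouchSim) and the RandomPolicy stand-in are hypothetical; only the
# run_controller(horizon, policy) interface comes from the code itself.
import numpy as np

class RandomPolicy:
    """Placeholder policy that ignores the state and samples a random action."""
    def __init__(self, action_dim=2, low=-1.0, high=1.0):
        self.action_dim = action_dim
        self.low = low
        self.high = high

    def act(self, state):
        # Return a numpy array so the caller's action.tolist() works
        return np.random.uniform(self.low, self.high, self.action_dim)

# sim = TouchSim()                                  # hypothetical env/touch wrapper
# logs = sim.run_controller(horizon=100, policy=RandomPolicy())
# logs.rewards would then be a length-100 numpy array of per-step rewards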
def run_controller(env, horizon, policy, video = False):
    """
    Runs a Reacher3d gym environment for horizon timesteps, making actions according to policy

    :param env: A gym object
    :param horizon: The number of states forward to look
    :param policy: A policy object (see other python file)
    """

    # What is going on here?
    # nol 29 feb - the action only acts on the first 5 observation variables
    def obs2q(obs):
        return obs[0:5]

    logs = DotMap()
    logs.states = []
    logs.actions = []
    logs.rewards = []
    logs.times = []

    observation = env.reset()
    for t in range(horizon):
        if video:
            env.render()
        state = observation
        # policy.act returns a second value; bind it to a separate name so the
        # loop index t is not overwritten
        action, act_time = policy.act(obs2q(state))

        # print(action)

        observation, reward, done, info = env.step(action)

        # Log
        # logs.times.append()
        logs.actions.append(action)
        logs.rewards.append(reward)
        logs.states.append(observation)

    # Convert the per-step logs to numpy arrays
    logs.actions = np.array(logs.actions)
    logs.rewards = np.array(logs.rewards)
    logs.states = np.array(logs.states)
    return logs
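# Hedged usage sketch for run_controller above. DummyEnv and ZeroPolicy are
# stand-ins (not from the original code) that mimic the old gym reset/step API
# so the rollout loop can be exercised without the real Reacher3d environment.
import numpy as np

class DummyEnv:
    """Minimal stand-in environment with gym-style reset/step/render."""
    def reset(self):
        return np.zeros(10)

    def step(self, action):
        obs = np.random.randn(10)
        reward = -float(np.linalg.norm(obs[0:3]))
        return obs, reward, False, {}

    def render(self):
        pass

class ZeroPolicy:
    """Stand-in policy: returns a zero action plus a dummy second value."""
    def act(self, q):
        return np.zeros(5), 0.0

# logs = run_controller(DummyEnv(), horizon=10, policy=ZeroPolicy())
# logs.rewards would then be a length-10 numpy array of per-step rewards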
Example n. 3
        plt.title('begin')
        plt.imshow(goal_log['states'][0])
        plt.subplot(1, 2, 2)
        plt.title('end')
        plt.imshow(goal_log['states'][-1])
    # goal_img = goal_log['states'][-1]
    # goal_diff_img: the goal is represented as the difference between the
    # last two logged goal states
    goal_img = goal_log['states'][-1] - goal_log['states'][-2]
    obs_init = sim._env.reset()
    state_init = sim.state2touch(obs_init)

    # Init the error to a large value so the while loop below runs at least once
    error = 1e+3
    logs = DotMap()
    logs.states = []
    logs.actions = []
    logs.rewards = []
    logs.times = []
    logs.obs = []
    ii = 0
    while error > parsed.thresh:
        # for ii in range(200):
        # take a random action on the 0th step, since there is no logged
        # image to compute a difference from yet
        if ii == 0:
            action = np.random.uniform(-10, 10, 2)
            error = 1e+3
        elif ii == 1:
            # first step with a logged state: difference image relative to the initial state
            curr_img = logs['states'][-1] - state_init
            error, action = compute_action(goal_img, curr_img)
        else:
            # later steps: difference image between the last two logged states
            curr_img = logs['states'][-1] - logs['states'][-2]