Example 1

import tensorflow as tf  # tf.identity is the default final activation below

# ActorCritic is assumed to be defined or imported elsewhere in the
# original project.


class GoalController(object):
    def __init__(self,
                 state_dim,
                 action_bound=1.0,
                 final_activation=tf.identity,
                 training_batch_size=32,
                 GAMMA=0.95,
                 lr=0.001,
                 replay_buffer_size=1024):

        # The goal controller's "actions" are goal states, so the underlying
        # ActorCritic's action dimension equals the state dimension.
        self.AC = ActorCritic(state_dim,
                              state_dim,
                              final_activation=final_activation,
                              action_bound=action_bound,
                              training_batch_size=training_batch_size,
                              GAMMA=GAMMA,
                              lr=lr,
                              replay_buffer_size=replay_buffer_size)

    def add_to_replay_buffer(self, state, goal_state, reward, resulting_state):
        # Here, reward is the ordinary reward for this transition (unlike in
        # StateController, where the reward is derived from the distance to
        # the goal).
        self.AC.add_to_replay_buffer(state, goal_state, reward,
                                     resulting_state)

    def add_batch_to_replay_buffer(self, states, goal_states, rewards,
                                   resulting_states):
        for s, gs, r, rs in zip(states, goal_states, rewards,
                                resulting_states):
            self.AC.add_to_replay_buffer(s, gs, r, rs)

    def train_from_replay_buffer(self):
        self.AC.train_from_replay_buffer()

    def get_goal_state(self, current_states):
        return self.AC.get_actions(current_states)
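
# Usage sketch (not from the original source): GoalController's "actions"
# are goal states, so get_goal_state() proposes a target state for each
# current state. The dimensions and dummy values below are illustrative
# assumptions only.
def _goal_controller_usage_sketch():
    import numpy as np
    state_dim = 4
    controller = GoalController(state_dim, action_bound=1.0)
    current_states = np.zeros((1, state_dim), dtype=np.float32)
    goal = controller.get_goal_state(current_states)[0]
    # After acting in the environment, record the transition and train.
    controller.add_to_replay_buffer(current_states[0], goal, 0.0,
                                    current_states[0])
    controller.train_from_replay_buffer()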
Example 2

import numpy as np

# ActorCritic is assumed to be defined or imported elsewhere in the
# original project.


class StateController(object):
    def __init__(self,
                 state_dim,
                 action_dim,
                 action_bound=0.4,
                 training_batch_size=32,
                 GAMMA=0.95,
                 lr=0.001,
                 replay_buffer_size=1024):

        # The actor's input is the state concatenated with the goal state.
        new_state_dim = 2 * state_dim
        self.state_dim = state_dim
        self.AC = ActorCritic(
            new_state_dim,
            action_dim,
            action_bound=action_bound,
            training_batch_size=training_batch_size,
            GAMMA=GAMMA,
            lr=lr,
            replay_buffer_size=replay_buffer_size)

    def get_reward(self, resulting_state, goal_state):
        # Squared distance to the goal; axis=-1 handles both a single state
        # and a batch of states.
        return np.sum((resulting_state - goal_state)**2, axis=-1)

    def add_to_replay_buffer(self, state, goal_state, action, resulting_state):
        # The stored "state" is the state concatenated with the goal.
        combined_state = np.concatenate((state, goal_state))
        # The reward is computed from the distance between the resulting
        # state and the goal rather than taken from the environment.
        reward = self.get_reward(resulting_state, goal_state)
        real_resulting_state = np.concatenate((resulting_state, goal_state))
        self.AC.add_to_replay_buffer(combined_state, action, reward,
                                     real_resulting_state)

    def add_batch_to_replay_buffer(self, states, goal_states, actions,
                                   resulting_states):
        for s, gs, a, rs in zip(states, goal_states, actions,
                                resulting_states):
            # Route through add_to_replay_buffer so the state and goal are
            # combined and the reward is computed per transition.
            self.add_to_replay_buffer(s, gs, a, rs)

    def train_from_replay_buffer(self):
        self.AC.train_from_replay_buffer()

    def get_actions(self, states, goal_states):
        combined_states = np.concatenate((states, goal_states), 1)
        return self.AC.get_actions(combined_states)

    def get_random_visited_state(self):
        # Take the first state_dim entries (the state half of the combined
        # state-plus-goal vector) from one sampled replay-buffer entry.
        return self.AC.get_batch(1)[0][0][0:self.state_dim]
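
# Hedged sketch (illustrative, not from the original repo) of how the two
# controllers can compose: the high-level GoalController proposes a goal
# state, the low-level StateController acts on (state, goal), and the
# resulting transition is stored for training. env is assumed to be a
# Gym-style environment whose step() returns (obs, reward, done, info).
def _hierarchical_step_sketch(goal_controller, state_controller, env, obs):
    obs = np.asarray(obs)
    goal = goal_controller.get_goal_state(np.asarray([obs]))[0]
    action = state_controller.get_actions(np.asarray([obs]),
                                          np.asarray([goal]))[0]
    new_obs, _, done, _ = env.step(action)
    state_controller.add_to_replay_buffer(obs, goal, action,
                                          np.asarray(new_obs))
    return new_obs, done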
Example 3

import random

import numpy as np

# ActorCritic and MinSpreadHolder are assumed to be defined or imported
# elsewhere in the original project.


class Runner(object):
    def __init__(self, env, GOAL_STATE, GAMMA=0.95, lr=0.001):
        self.env = env
        self.GOAL_STATE = GOAL_STATE
        self.states_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.actor_critic = ActorCritic(
            self.states_dim, self.action_dim, GAMMA=GAMMA, lr=lr)
        self.min_spread_holder = MinSpreadHolder(self.states_dim)

    def render_if_true(self, render):
        if render:
            self.env.render()

    def get_reward(self, state):
        shifted_goal_state = self.shift_observation(self.GOAL_STATE)
        diff = state - shifted_goal_state
        reward = -1 * np.mean(np.multiply(diff, diff))
        return reward

    def add_observed_batch(self, obs_batch):
        self.min_spread_holder.add_batch(obs_batch)

    def shift_observation(self, obs):
        return self.min_spread_holder.transform(obs)

    def play_random_game(self, render=True, add_to_all_observations=False):
        env = self.env
        observation = env.reset()
        games_observations = []

        for t in range(1000):
            games_observations.append(observation)
            self.render_if_true(render)
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                if add_to_all_observations:
                    self.add_observed_batch(np.asarray(games_observations))
                print('Episode finished after {} timesteps'.format(t + 1))
                break

    def play_game_from_actor_with_random(self,
                                         render=True,
                                         add_to_buffer=True,
                                         prob_random=0.0):
        env = self.env
        obs = env.reset()
        games_observations = []
        for t in range(1000):
            self.render_if_true(render)
            obs = np.asarray(obs)
            games_observations.append(obs)
            shifted_obs = self.shift_observation(obs)

            # get_actions operates on a batch, so unwrap the single action.
            action = self.actor_critic.get_actions(
                np.asarray([shifted_obs]))[0]
            if not render and (random.random() < prob_random):
                action = env.action_space.sample()
            # if not render:
            #     for i in range(len(action)):
            #         if random.random() < prob_random:
            #             action[i] = (random.random() * 0.8) - 0.4

            new_obs, reward, done, info = env.step(action)
            shifted_new_obs = self.shift_observation(new_obs)
            if add_to_buffer:
                # real_reward = 0.0 if not done else -1.0
                real_reward = self.get_reward(
                    shifted_new_obs) if not done else -2.0
                self.actor_critic.add_to_replay_buffer(
                    shifted_obs, action, real_reward, shifted_new_obs)
            if done:
                self.add_observed_batch(np.asarray(games_observations))
                print('Episode finished after {} timesteps'.format(t + 1))
                break

            obs = new_obs

    def train_from_replay_buffer(self, should_print):
        losses = self.actor_critic.train_from_replay_buffer(should_print)
        return np.mean(losses)
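
# Hedged training-loop sketch for the Runner above (illustrative only).
# Random games are played first so that MinSpreadHolder sees enough
# observations to normalise them; exploratory policy rollouts then
# alternate with replay training. The game counts and exploration
# probability are assumptions, not values from the original source.
def _runner_training_sketch(runner, warmup_games=10, training_games=100):
    for _ in range(warmup_games):
        runner.play_random_game(render=False, add_to_all_observations=True)
    for i in range(training_games):
        runner.play_game_from_actor_with_random(render=False,
                                                add_to_buffer=True,
                                                prob_random=0.1)
        mean_loss = runner.train_from_replay_buffer(should_print=(i % 10 == 0))
        print('game {} mean loss {}'.format(i, mean_loss))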
Example 4

import json
import random
from time import sleep

import numpy as np

# ActorCritic, shift_state and get_reward are assumed to be defined or
# imported elsewhere in the original project.


class Runner(object):
    def __init__(self, env, GAMMA=0.5):
        self.env = env
        self.states_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.actor_critic = ActorCritic(self.states_dim,
                                        self.action_dim,
                                        lr=1e-10)
        self.all_observations = np.asarray([])

    def get_means_stddevs(self, num_games=100, min_std_dev=0.01):
        observations = []
        env = self.env
        for i in range(num_games):
            obs = env.reset()
            while True:
                observations.append(obs)
                action = env.action_space.sample()
                obs, reward, done, info = env.step(action)
                if done:
                    print('game {} done'.format(i))
                    break
        observations = np.asarray(observations)
        mean = np.mean(observations, axis=0)
        stddev = np.maximum(np.std(observations, axis=0), min_std_dev)
        return mean, stddev

    def write_mean_stddev_to_file(self, num_games=100, min_std_dev=0.01):
        mean, stddev = self.get_means_stddevs(num_games, min_std_dev)
        with open('./mujoco_data/mean_state.json', 'w') as f:
            f.write(json.dumps(mean.tolist()))
        with open('./mujoco_data/stddev_state.json', 'w') as f:
            f.write(json.dumps(stddev.tolist()))
        print('written')

    def get_min_spread(self, num_games=100, min_spread=0.05):
        observations = []
        env = self.env
        for i in range(num_games):
            obs = env.reset()
            while True:
                observations.append(obs)
                action = env.action_space.sample()
                obs, reward, done, info = env.step(action)
                if done:
                    print('game {} done'.format(i))
                    break
        observations = np.asarray(observations)
        min_obs = observations.min(axis=0)
        max_obs = observations.max(axis=0)
        spread = np.maximum(max_obs - min_obs, min_spread)
        return min_obs, spread

    def write_min_spread_to_file(self, num_games=100, min_spread=0.05):
        min_obs, spread = self.get_min_spread(num_games, min_spread)
        print(min_obs)
        print(spread)
        print(min_obs.shape, spread.shape)
        with open('./mujoco_data/min_state.json', 'w') as f:
            f.write(json.dumps(min_obs.tolist()))
        with open('./mujoco_data/spread_state.json', 'w') as f:
            f.write(json.dumps(spread.tolist()))
        print('written')

    def play_random_game(self, render=True):
        env = self.env
        observation = env.reset()

        for t in range(1000):
            if render:
                env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print('Episode finished after {} timesteps'.format(t + 1))
                break

    def play_game_from_actor(self, render=True, add_to_buffer=True):
        env = self.env
        obs = env.reset()
        for t in range(1000):
            if render:
                env.render()
                sleep(0.05)
            obs = np.asarray(obs)
            shifted_obs = shift_state(obs)
            # get_actions operates on a batch, so unwrap the single action.
            action = self.actor_critic.get_actions(np.asarray(
                [shifted_obs]))[0]
            new_obs, reward, done, info = env.step(action)

            if done:
                print('Episode finished after {} timesteps'.format(t + 1))
                break

            if add_to_buffer:
                shifted_new_obs = shift_state(new_obs)
                # real_reward = get_reward(shifted_obs, shifted_new_obs)
                real_reward = get_reward(shifted_new_obs)
                self.actor_critic.add_to_replay_buffer(shifted_obs, action,
                                                       real_reward,
                                                       shifted_new_obs)

            obs = new_obs

    def play_game_from_actor_with_random(self,
                                         render=True,
                                         add_to_buffer=True,
                                         prob_random=0.05):
        env = self.env
        obs = env.reset()
        for t in range(1000):
            if render:
                env.render()
                sleep(0.01)
            obs = np.asarray(obs)
            shifted_obs = shift_state(obs)

            # get_actions operates on a batch, so unwrap the single action.
            action = self.actor_critic.get_actions(np.asarray(
                [shifted_obs]))[0]
            if not render:
                for i in range(len(action)):
                    if random.random() < prob_random:
                        action[i] = (random.random() * 0.8) - 0.4

            # random_move = random.random() < prob_random
            # if random_move and not render:
            #     print('Random move!')
            #     action = env.action_space.sample()
            # else:
            #     action = self.actor_critic.get_actions(
            #         np.asarray([shifted_obs]))[0]  # I think zero.
            new_obs, reward, done, info = env.step(action)

            if done:
                if add_to_buffer:
                    # Terminal transition: fixed penalty, and the current
                    # state stands in for the (non-existent) next state.
                    real_reward = -0.10
                    self.actor_critic.add_to_replay_buffer(
                        shifted_obs, action, real_reward, shifted_obs)
                print('Episode finished after {} timesteps'.format(t + 1))
                break

            if add_to_buffer:
                shifted_new_obs = shift_state(new_obs)
                # real_reward = get_reward(shifted_obs, shifted_new_obs)
                real_reward = get_reward(shifted_new_obs)
                self.actor_critic.add_to_replay_buffer(shifted_obs, action,
                                                       real_reward,
                                                       shifted_new_obs)

            obs = new_obs

    def train_from_replay_buffer(self, should_print):
        losses = self.actor_critic.train_from_replay_buffer(should_print)
        return np.mean(losses)
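
# Hedged usage sketch for this Runner (illustrative only). The
# normalisation files under ./mujoco_data/ are written from random play
# first, on the assumption that shift_state() reads them, and the
# directory is assumed to exist. Game counts are not values from the
# original source.
def _mujoco_runner_usage_sketch(env):
    runner = Runner(env)
    runner.write_min_spread_to_file(num_games=100, min_spread=0.05)
    for i in range(200):
        runner.play_game_from_actor_with_random(render=False,
                                                add_to_buffer=True,
                                                prob_random=0.05)
        runner.train_from_replay_buffer(should_print=(i % 20 == 0))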