Example #1
    def evaluate(self):
        test_env = TimeLimit(gym.make('PepperPush-v0'), max_episode_steps=100)
        test_env.reset()
        # evaluate_policy returns (mean_reward, std_reward) when
        # return_episode_rewards is False
        results = evaluate_policy(self.model,
                                  test_env,
                                  n_eval_episodes=75,
                                  return_episode_rewards=False)

        return results[0]
Example #2
def collect_fixed_set_of_states(conf: dict, env: TimeLimit) -> list:
    # Collect samples to evaluate the agent on a fixed set of samples
    # (DQN paper). Collect a fixed set of states by running a random policy
    # before training starts and track the average of the maximum predicted
    # Q for these states.
    env.reset()
    exclude = conf['preprocess']['exclude']
    fixed_states = []

    while True:
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        state = next_state
        preprocessed_state = preprocess_frame(state, exclude)
        fixed_states.append(preprocessed_state)
        if done:
            break
    env.close()
    print(f'Collected a fixed set of {len(fixed_states)} states!')

    return fixed_states
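# Sketch (not from the original source): the DQN-paper metric mentioned above is
# the average of the maximum predicted Q over this fixed set of states. A minimal
# helper, assuming a PyTorch `q_network` that maps a batch of preprocessed frames
# to per-action values.
import numpy as np
import torch


def average_max_q(q_network, fixed_states) -> float:
    # Mean over the fixed states of max_a Q(s, a)
    with torch.no_grad():
        max_qs = [q_network(torch.as_tensor(s, dtype=torch.float32).unsqueeze(0)).max().item()
                  for s in fixed_states]
    return float(np.mean(max_qs))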
def render_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # (~24 fps)

        actionIndex = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[actionIndex]
        obs, reward, is_done, info = env.step(action)
        state = get_state(obs)
        env.render()
    env.close()
def run_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    total_reward = 0

    while not is_done:
        actionIndex = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[actionIndex]
        obs, reward, is_done, info = env.step(action)
        state = get_state(obs)
        total_reward += reward

    return total_reward
Example #5
def render_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # (~24 fps)

        rgb = env.render('rgb_array')
        upscaled = repeat_upsample(rgb, 3, 4)
        viewer.imshow(upscaled)

        actionIndex = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[actionIndex]
        obs, reward, is_done, _ = env.step(action)
        if reward != 0:
            print(reward)
        state = get_state(obs)
    env.close()
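# Sketch (assumption, not from the original source): the snippet above uses a
# module-level `viewer` and a `repeat_upsample` helper that are not shown here.
# One common way to provide them with the old gym rendering API:
import numpy as np
from gym.envs.classic_control import rendering

viewer = rendering.SimpleImageViewer()


def repeat_upsample(rgb_array, k=1, l=1):
    # Nearest-neighbour upscaling: repeat each pixel k times vertically
    # and l times horizontally.
    return np.repeat(np.repeat(rgb_array, k, axis=0), l, axis=1)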
def monte_carlo_control_epsilon_greedy(
    env: TimeLimit,
    stats: dict,
    num_episodes: int,
    policy: Callable,
    discount_factor: float = 1.0,
    max_epsilon: float = 1.0,
    min_epsilon: float = 0.001,
    decay_rate: float = 0.00005
) -> np.ndarray:
    """
    Monte Carlo Control using Epsilon-Greedy policies. Finds the optimal
    state-action value function.

    Args:
        env: OpenAI gym environment
        stats: Dictionary containing statistics about the experiment
        num_episodes: Number of episodes to sample
        policy: Function that returns an action according to a policy
        discount_factor: Gamma discount factor
        max_epsilon: Maximum epsilon value from which the decay starts
        min_epsilon: Minimum epsilon value the decay converges to
        decay_rate: Rate of the exponential decay

    Returns:
        The optimal state-action value function.
    """

    num_wins = 0
    num_actions = env.action_space.n
    num_states = env.observation_space.n
    q_table = np.zeros((num_states, num_actions))
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)
    epsilon = max_epsilon

    for i_episode in tqdm(range(num_episodes)):

        unique_state_action_pairs = set()
        state_action_pairs_in_episode = []
        rewards_in_episode = []
        done = False
        state = env.reset()
        t = 0

        while not done:

            action = policy(env, q_table, state, epsilon)
            next_state, reward, done, _ = env.step(action)
            state_action_pairs_in_episode.append([state, action])
            unique_state_action_pairs.add((state, action))
            rewards_in_episode.append(reward)
            state = next_state

            stats['train/episode_rewards'][i_episode] += reward
            stats['train/episode_lengths'][i_episode] = t

            if reward == 1:
                num_wins += 1

            t += 1

        state_action_pairs_in_episode = np.array(
            state_action_pairs_in_episode).astype(int)

        # Update Q for every (state, action) pair visited in this episode
        for state_action in unique_state_action_pairs:
            # Index of the first visit: both the state and the action must match
            first_occurrence_idx = np.where(
                (state_action_pairs_in_episode == state_action).all(axis=1)
            )[0][0]
            # Sum up all the rewards since the first occurrence
            G = sum([r * (discount_factor ** i) for i, r in
                     enumerate(rewards_in_episode[first_occurrence_idx:])])
            # Average return for this (state, action) pair over all sampled episodes
            returns_sum[state_action] += G
            returns_count[state_action] += 1.0
            st, act = state_action
            q_table[st, act] = (
                    returns_sum[state_action] / returns_count[state_action]
            )

        stats['train/epsilon'][i_episode] = epsilon

        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(
            -decay_rate * i_episode)

        win_ratio = num_wins / (i_episode + 1)
        stats['train/win_ratio'][i_episode] = win_ratio

        if i_episode % 5000 == 0 and i_episode > 0:
            print(f'Current win ratio is {win_ratio}, epsilon: {epsilon}')

        # The policy is improved implicitly by changing the q_table
    print(f'Win ratio: {round(num_wins / num_episodes, 5)}')

    return q_table
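The function above expects a TimeLimit-wrapped discrete environment, a dict of preallocated stat arrays, and a `policy(env, q_table, state, epsilon)` callable. A minimal usage sketch, assuming FrozenLake-v0 and a hand-rolled epsilon-greedy helper (both are assumptions, not part of the original example):

import gym
import numpy as np
from gym.wrappers.time_limit import TimeLimit


def epsilon_greedy_policy(env, q_table, state, epsilon):
    # Explore with probability epsilon, otherwise act greedily w.r.t. Q
    if np.random.rand() < epsilon:
        return env.action_space.sample()
    return int(np.argmax(q_table[state]))


num_episodes = 50000
env = TimeLimit(gym.make('FrozenLake-v0').unwrapped, max_episode_steps=100)
stats = {key: np.zeros(num_episodes) for key in
         ('train/episode_rewards', 'train/episode_lengths',
          'train/epsilon', 'train/win_ratio')}
q_table = monte_carlo_control_epsilon_greedy(env, stats, num_episodes,
                                             epsilon_greedy_policy)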
Example #7
def q_learning_control_epsilon_greedy(
    env: TimeLimit,
    stats: dict,
    num_episodes: int,
    policy: Callable,
    discount_factor: float = 1.0,
    learning_rate: float = 0.5,
    max_epsilon: float = 1.0,
    min_epsilon: float = 0.001,
    decay_rate: float = 0.00005
) -> np.ndarray:
    """
    Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy
    policy while following an epsilon-greedy policy.

    Args:
        env: OpenAI environment
        stats: Dictionary containing statistics about the experiment
        num_episodes: Number of episodes to run for
        policy: Function that returns an action according to a policy
        discount_factor: Gamma discount factor
        learning_rate: TD learning rate
        max_epsilon: Maximum epsilon value from which the decay starts
        min_epsilon: Minimum epsilon value the decay converges to
        decay_rate: Rate of the exponential decay

    Returns:
        Q table with state-action values
    """

    num_wins = 0
    num_actions = env.action_space.n
    num_states = env.observation_space.n
    q_table = np.zeros((num_states, num_actions))
    epsilon = max_epsilon

    for i_episode in tqdm(range(num_episodes)):

        state = env.reset()

        for t in itertools.count():

            action = policy(env, q_table, state, epsilon)
            next_state, reward, done, info = env.step(action)

            stats["train/episode_rewards"][i_episode] += reward
            stats["train/episode_lengths"][i_episode] = t

            td_target = reward + discount_factor * np.max(q_table[next_state])
            td_error = td_target - q_table[state, action]
            q_table[state, action] += learning_rate * td_error

            state = next_state

            if done:
                if reward == 1:
                    num_wins += 1
                break

        stats["train/epsilon"][i_episode] = epsilon

        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(
            -decay_rate * i_episode)

        win_ratio = num_wins / (i_episode + 1)
        stats['train/win_ratio'][i_episode] = win_ratio

        if i_episode % 5000 == 0 and i_episode > 0:
            print(f'Current win ratio is {win_ratio}, epsilon: {epsilon}')

    return q_table
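Once training finishes, the greedy policy is just a per-state argmax over the returned Q table. A short follow-up sketch, assuming a TimeLimit-wrapped discrete env and the q_table returned by the function above:

import numpy as np


def run_greedy_episode(env, q_table) -> float:
    # Roll out the greedy policy implied by the learned Q table
    state = env.reset()
    done, total_reward = False, 0.0
    while not done:
        action = int(np.argmax(q_table[state]))
        state, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward


wins = sum(run_greedy_episode(env, q_table) > 0 for _ in range(1000))
print(f'Greedy policy win ratio: {wins / 1000:.3f}')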
Example #8
proj = la.svd(proj, full_matrices=False)[2]
enc_dim = proj.shape[0]
weights = np.load(p_dir + "weights.npz")
biases = np.load(p_dir + "biases.npz")
weights = [v for k, v in weights.items()]
biases = [v for k, v in biases.items()]

saveload_path = "./experiments/learned_controllers/pendulum/{}".format(i)
model = DDPG.load(saveload_path + "model")

# now let's test the model
# specify the test task
n_test_steps = 100

# restart the env
env = TimeLimit(RestartablePendulumEnv(), max_episode_steps=200)
env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])

# for each test state, start the env in the state, then run forward and collect rewards
for k in range(3):
    high = np.array([np.pi, 1])
    start_state = np.random.uniform(low=-high, high=high)
    obs = env.reset(state=start_state)
    for j in range(n_test_steps):
        action, _states = model.predict(obs)
        obs, reward, dones, info = env.step(action)
        env.render()

# clean up and save results
env.close()
del model
Example #9
def main(k):
    path = './direction_BS_woNorm/150/{}'.format(k)
    if not os.path.exists(path):
        os.makedirs(path)
    ############## Hyperparameters ##############
    env_name = "fishEvasion-v0" # used when creating the environment with gym.make
    render = False              # render the environment in training if true
    # solved_reward = 100         # stop training if avg_reward > solved_reward
    log_interval = 27           # print avg reward in the interval
    max_episodes = 10000        # max training episodes
    max_timesteps = 150         # max timesteps in one episode
    
    update_timestep = 4050      # update policy every n timesteps
    action_std = 0.5            # constant std for action distribution (Multivariate Normal)
    K_epochs = 80               # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    gamma = 0.99                # discount factor
    
    lr = 0.0003                 # parameters for Adam optimizer
    betas = (0.9, 0.999)
    
    random_seed = None
    #############################################
    
    # creating environment
    env = fish.FishEvasionEnv(dt = 0.1)

    # set the length of an episode
    from gym.wrappers.time_limit import TimeLimit
    env = TimeLimit(env, max_episode_steps=max_timesteps)

    # get observation and action dimensions from the environment
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    if random_seed:
        print("Random Seed: {}".format(random_seed))
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)
    
    memory = Memory()
    ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
    # ------------------------------------------------------------------
    # start training from an existing policy    
    # ppo.policy_old.load_state_dict(torch.load('./direction_policy/PPO_{}_{:06d}.pth'.format(env_name,4380),map_location=device))
    # ppo.policy.load_state_dict(torch.load('./direction_policy/PPO_{}_{:06d}.pth'.format(env_name,4380),map_location=device))
    # ------------------------------------------------------------------
    
    # logging variables
    running_reward = 0
    avg_length = 0
    time_step = 0

    # training loop
    for i_episode in range(1, max_episodes+1):
        # ------------------------------------------------------------------
        # set a specific distribution for beta 
        # beta0 = angle_normalize(i_episode*3,center = 0)
        # print(beta0)
        # ------------------------------------------------------------------
        state = env.reset()
        for t in range(max_timesteps):
            time_step += 1
            # Running policy_old:
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)

            # Storing reward and is_terminals:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            
            # update if it is time
            # ------------------------------------------------------------------
            if time_step % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0
            # ------------------------------------------------------------------
            running_reward += reward
            if render:
                env.render()
            # break if episode ends
            if done:
                break
        avg_length += t

        # ------------------------------------------------------------------
        # stop training if avg_reward > solved_reward
        # if running_reward > (log_interval*solved_reward):
        #     print("########## Solved! ##########")
        #     torch.save(ppo.policy.state_dict(), './PPO_continuous_forwardWoPos_solved_{}.pth'.format(env_name))
        #     break
        # ------------------------------------------------------------------
    
        # save every 50 episodes
        if i_episode % 50 == 0:
            torch.save(ppo.policy.state_dict(), path+'/PPO_{}_direction{:06d}.pth'.format(env_name,i_episode)) 

        # ------------------------------------------------------------------
        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            running_reward = running_reward / log_interval
            print('Episode {} \t Avg length: {} \t Avg reward: {}'.format(i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
def q_learning(env: TimeLimit,
               estimator: CNN_DQN,
               n_episode,
               target_update_every=10,
               gamma=1.0,
               epsilon=0.1,
               epsilon_decay=0.99,
               replay_size=32):

    step = 0
    for episode in range(n_episode):

        policy = estimator.gen_epsilon_greedy_policy(epsilon, n_action)
        obs = env.reset()
        state = get_state(obs)
        is_done = False
        while not is_done:

            actionAi = policy(state)
            actionGym = ACTIONS[actionAi]

            next_obs, reward, is_done, _ = env.step(actionGym)
            next_state = get_state(next_obs)

            total_reward_episode[episode] += reward

            memory.append((state, actionAi, next_state, reward, is_done))

            if is_done:
                break

            estimator.replay(memory, replay_size, gamma)

            state = next_state

            step += 1
            sys.stdout.write("                                                                                                                  \r"\
                + 'step {}'.format(step))

        print('Episode {}: reward: {}, epsilon: {}'.format(
            episode, total_reward_episode[episode], epsilon))

        epsilon = max(epsilon * epsilon_decay, 0.01)

        if (episode % target_update_every) == 1:

            # update the target network
            estimator.copy_target()
            estimator.save()

        if (episode % 100) == 1:
            # render_episode(env, estimator)

            # check if NN is well trained
            total_wins = 0
            for test in range(100):
                total_wins += 1 if run_episode(env, estimator) > 0 else 0
            if (total_wins > 90):
                estimator.copy_target()
                estimator.save()
                print('Finished training due to successful model')
                break
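The snippet above relies on several module-level names (`ACTIONS`, `n_action`, `memory`, `total_reward_episode`) that are not shown here. A minimal setup sketch for them; the Pong environment id, the action subset, and the commented-out estimator call are assumptions, not part of the original example:

from collections import deque

import gym
from gym.wrappers.time_limit import TimeLimit

n_episode = 1000
ACTIONS = [0, 2, 3]                # assumed NOOP / UP / DOWN action subset for Pong
n_action = len(ACTIONS)
memory = deque(maxlen=100000)      # replay buffer of (s, a, s', r, done) tuples
total_reward_episode = [0] * n_episode

env = TimeLimit(gym.make('PongDeterministic-v4'), max_episode_steps=10000)
# estimator = CNN_DQN(...)         # project-specific network, constructor not shown
# q_learning(env, estimator, n_episode)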
Example #11
def sarsa(env: TimeLimit,
          stats: dict,
          num_episodes: int,
          policy: Callable,
          discount_factor: float = 1.0,
          learning_rate: float = 0.5,
          max_epsilon: float = 1.0,
          min_epsilon: float = 0.03,
          decay_rate: float = 0.00005) -> np.ndarray:
    """
    SARSA algorithm: On-policy TD control. Finds an optimal Q state-action
    value function.

    Args:
        env: OpenAI environment
        stats: Dictionary containing statistics about the experiment
        num_episodes: Number of episodes to run for
        policy: Function that returns an action according to a policy
        discount_factor: Gamma discount factor
        learning_rate: TD learning rate
        max_epsilon: Maximum epsilon value from which the decay starts
        min_epsilon: Minimum epsilon value the decay converges to
        decay_rate: Rate of the exponential decay

    Returns:
        Q table with state-action values
    """

    num_wins = 0
    num_actions = env.action_space.n
    num_states = env.observation_space.n
    q_table = np.zeros((num_states, num_actions))
    epsilon = max_epsilon

    for i_episode in tqdm(range(num_episodes)):

        state = env.reset()
        action = policy(env, q_table, state, epsilon)

        for t in itertools.count():

            next_state, reward, done, _ = env.step(action)
            next_action = policy(env, q_table, next_state, epsilon)

            if reward == 1:
                num_wins += 1

            stats['episode_rewards'][i_episode] += reward
            stats['episode_lengths'][i_episode] = t

            td_target = reward + discount_factor * q_table[next_state,
                                                           next_action]
            td_delta = td_target - q_table[state, action]
            q_table[state, action] += learning_rate * td_delta

            if done:
                break

            action = next_action
            state = next_state

        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(
            -decay_rate * i_episode)

        if i_episode % 5000 == 0 and i_episode > 0:
            print(f"Current win ratio is {num_wins / i_episode}, "
                  f"epsilon: {epsilon}")

    print(f"Win ratio: {round(num_wins / num_episodes, 5)}")

    return q_table
def n_step_sarsa(env: TimeLimit,
                 stats: dict,
                 num_episodes: int,
                 policy: Callable,
                 discount_factor: float = 0.9,
                 learning_rate: float = 0.8,
                 max_epsilon: float = 1.0,
                 min_epsilon: float = 0.001,
                 decay_rate: float = 0.00005,
                 n_steps: int = 5):
    """
    N-step Sarsa for estimating Q (N-step bootstrapping).

    Args:
        env: OpenAI environment
        stats: Dictionary containing statistics about the experiment
        num_episodes: Number of episodes to run for
        policy: Function that returns an action according to a policy
        discount_factor: Gamma discount factor
        learning_rate: TD learning rate
        max_epsilon: Maximum epsilon value from which the decay starts
        min_epsilon: Minimum epsilon value the decay converges to
        decay_rate: Rate of the exponential decay
        n_steps: Steps to bootstrap

    Returns:
        Q table with state-action values
    """

    num_wins = 0
    num_actions = env.action_space.n
    num_states = env.observation_space.n
    q_table = np.zeros((num_states, num_actions))
    epsilon = max_epsilon

    for i_episode in tqdm(range(num_episodes)):

        T = np.inf

        state = env.reset()

        action = policy(env, q_table, state, epsilon)

        actions = [action]
        states = [state]
        rewards = [0]

        for t in itertools.count():

            if t < T:
                next_state, reward, done, _ = env.step(action)

                states.append(next_state)
                rewards.append(reward)

                if done:
                    T = t + 1

                    if reward == 1:
                        num_wins += 1

                else:
                    # Select the next on-policy action from the newly reached state
                    action = policy(env, q_table, next_state, epsilon)
                    actions.append(action)

                stats["episode_rewards"][i_episode] += reward
                stats["episode_lengths"][i_episode] = t

            # state tau being updated
            tau = t - n_steps + 1

            if tau >= 0:

                G = 0

                for i in range(tau + 1, min(tau + n_steps + 1, T + 1)):
                    G += np.power(discount_factor, i - tau - 1) * rewards[i]

                if tau + n_steps < T:
                    st = states[tau + n_steps]
                    act = actions[tau + n_steps]
                    G += np.power(discount_factor, n_steps) * q_table[st, act]

                # update Q values
                st = states[tau]
                act = actions[tau]
                q_table[st, act] += learning_rate * (G - q_table[st, act])

            if tau == T - 1:
                break

        stats["epsilon"][i_episode] = epsilon
        # Reduce epsilon (because we need less and less exploration)
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(
            -decay_rate * i_episode)

        if i_episode % 5000 == 0 and i_episode > 0:
            print(f"Game won {num_wins} times. Current win ratio is "
                  f"{num_wins / i_episode}, epsilon: {epsilon}")

    print(f"Win ratio: {round(num_wins / num_episodes, 5)}")

    return q_table
class Worker(object):
    def __init__(self,
                 name,
                 globalAC,
                 hard_share=None,
                 soft_sharing_coeff_actor=0.0,
                 soft_sharing_coeff_critic=0.0,
                 gradient_clip_actor=0.0,
                 gradient_clip_critic=0.0,
                 debug=False,
                 max_ep_steps=200,
                 image_shape=None,
                 stack=1):
        self.env = gym.make(GAME).unwrapped
        self.env = TimeLimit(self.env, max_episode_steps=max_ep_steps)
        self.name = name
        self.AC = ACNet(name,
                        globalAC,
                        hard_share=hard_share,
                        soft_sharing_coeff_actor=soft_sharing_coeff_actor,
                        soft_sharing_coeff_critic=soft_sharing_coeff_critic,
                        gradient_clip_actor=gradient_clip_actor,
                        gradient_clip_critic=gradient_clip_critic,
                        image_shape=image_shape,
                        stack=stack)
        self.debug = debug
        self.image_shape = image_shape
        self.stack = stack

    def work(self):
        def get_img(fn, *args):
            img_lock.acquire()
            results = fn(*args)
            img = self.env.render(mode='rgb_array')
            img_lock.release()
            img = rgb2grey(img)
            img = resize(img, self.image_shape)
            return img, results

        def env_reset_obs():
            return self.env.reset()

        def env_reset_img():
            img, _ = get_img(env_reset_obs)
            return img

        def env_step_obs(a):
            return self.env.step(a)

        def env_step_img(a):
            img, results = get_img(env_step_obs, a)
            return img, results[1], results[2], results[3]

        if self.image_shape is not None:
            env_reset_fn = env_reset_img
            env_step_fn = env_step_img
        else:
            env_reset_fn = env_reset_obs
            env_step_fn = env_step_obs

        global GLOBAL_RUNNING_R, GLOBAL_R, GLOBAL_EP, MAX_GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = env_reset_fn()
            buffer_s = [s] * self.stack
            ep_r = 0
            while True:
                a = self.AC.choose_action(buffer_s[-self.stack:])
                s_, r, done, info = env_step_fn(a)
                if done: r = -5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net

                    if done:
                        v_s_ = 0  # terminal
                    else:
                        obs_hist = buffer_s[-(self.stack - 1):] + [
                            s_,
                        ]
                        feed_dict = {
                            var: obs[np.newaxis, :]
                            for var, obs in zip(self.AC.s, obs_hist)
                        }
                        v_s_ = SESS.run(self.AC.v, feed_dict=feed_dict)[0, 0]

                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    if self.image_shape is not None:
                        buffer_s_ = [
                            buffer_s_[np.newaxis, :] for buffer_s_ in buffer_s
                        ]
                    else:
                        buffer_s_ = copy.deepcopy(buffer_s)
                    obs_columns = [
                        np.vstack(buffer_s_[idx:-(self.stack - idx)])
                        for idx in range(self.stack)
                    ]
                    buffer_a, buffer_v_target = np.array(buffer_a), np.vstack(
                        buffer_v_target)
                    feed_dict = {
                        var: obs
                        for var, obs in zip(self.AC.s, obs_columns)
                    }
                    feed_dict[self.AC.a_his] = buffer_a
                    feed_dict[self.AC.v_target] = buffer_v_target
                    if self.debug and self.name == 'W_0':
                        a_loss, c_loss, t_td, c_loss, t_log_prob, t_exp_v, t_entropy, t_exp_v2, a_loss, a_grads, c_grads = self.AC.get_stats(
                            feed_dict)
                        #print("a_loss: ", a_loss.shape, " ", a_loss, "\tc_loss: ", c_loss.shape, " ", c_loss, "\ttd: ", t_td.shape, " ", t_td, "\tlog_prob: ", t_log_prob.shape, " ", t_log_prob, "\texp_v: ", t_exp_v.shape, " ", t_exp_v, "\tentropy: ", t_entropy.shape, " ", t_entropy, "\texp_v2: ", t_exp_v2.shape, " ", t_exp_v2, "\ta_grads: ", [np.sum(weights) for weights in a_grads], "\tc_grads: ", [np.sum(weights) for weights in c_grads])
                        print("a_loss: ", a_loss.shape, " ", a_loss,
                              "\tc_loss: ", c_loss)
                    c_loss, a_loss, entropy = self.AC.update_global(feed_dict)

                    #import ipdb; ipdb.set_trace()
                    buffer_s, buffer_a, buffer_r = buffer_s[-(
                        self.stack):], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    GLOBAL_R.append(ep_r)
                    if len(GLOBAL_RUNNING_R
                           ) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] +
                                                0.01 * ep_r)

                    log_lock.acquire()
                    logger.record_tabular("global_ep", GLOBAL_EP)
                    logger.record_tabular("name", self.name)
                    logger.record_tabular("ep_r", ep_r)
                    logger.record_tabular("ep_r_weighted",
                                          GLOBAL_RUNNING_R[-1])
                    logger.record_tabular("c_loss", c_loss)
                    logger.record_tabular("a_loss", a_loss)
                    logger.record_tabular("entropy", entropy)
                    logger.dump_tabular()
                    log_lock.release()

                    GLOBAL_EP += 1
                    break
    plt.subplot(2, 1, 2)
    plt.plot(np.arange(len(GLOBAL_R)), GLOBAL_R)
    plt.xlabel('step')
    plt.ylabel('Total moving reward')
    if args.log:
        name = 'plot_' + str(MAX_GLOBAL_EP) + '_sharing_'
        if args.hard_share is not None:
            name += 'hard'
        elif soft_sharing_coeff_actor > 0. or soft_sharing_coeff_critic > 0.:
            name += 'soft'
        else:
            name += 'none'
        name += '_lra_' + str(lr_a) + '_lrc_' + str(lr_c) + '.png'
        plt.savefig(name)
    else:
        plt.show()

        env = gym.make(GAME).unwrapped
        env = TimeLimit(env, max_episode_steps=args.max_ep_steps)
        s = env.reset()
        buffer_s = [s] * args.stack
        tidx = 0
        done = False
        while tidx < 1000 and not done:
            a = workers[0].AC.choose_action(buffer_s[-args.stack:])
            env.render()
            s_, r, done, info = env.step(a)
            s = s_
            buffer_s.append(s)
            tidx += 1
Example #15
from test_envs.cartpole_continous import CartPoleContinousEnv
from gym.wrappers.time_limit import TimeLimit

from test_policies.pd import PD
import numpy as np

import time

PD_coeff = np.array([2.0, 1.0, 10.0, 2.0])
policy = PD(PD_coeff)

env = TimeLimit(CartPoleContinousEnv(), max_episode_steps=200)
state = env.reset()

total_reward = total_length = 0

for step in range(10000):
    action = policy(np.array(state))

    state, reward, done, info = env.step(action)
    total_reward += reward
    total_length += 1

    # env.render()
    # time.sleep(0.01)

    if done:
        state = env.reset()

        assert total_reward == 200
        assert total_length == 200
class ObstacleGoalEnv(VanillaGoalEnv):
    def __init__(self, args):
        VanillaGoalEnv.__init__(self, args)
        env_id = {'FetchPush-v1': 'push'}
        assert args.env in env_id.keys()
        MODEL_XML_PATH = os.path.abspath('.') + '/envs/assets/fetch/' + env_id[
            args.env] + '_obstacle.xml'

        if env_id[args.env] in ['push']:
            initial_qpos = {
                'robot0:slide0': 0.405,
                'robot0:slide1': 0.48,
                'robot0:slide2': 0.0,
                'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.],
            }
            self.env = FetchEnv(MODEL_XML_PATH,
                                has_object=True,
                                block_gripper=False,
                                n_substeps=20,
                                gripper_extra_height=0.2,
                                target_in_the_air=True,
                                target_offset=0.0,
                                obj_range=0.15,
                                target_range=0.15,
                                distance_threshold=0.05,
                                initial_qpos=initial_qpos,
                                reward_type='sparse')

        self.env = TimeLimit(
            self.env,
            max_episode_steps=args.timesteps)  # A default wrapper of gym.

        self.render = self.env.render
        self.get_obs = self.env.env._get_obs
        self.reset_sim = self.env.env._reset_sim

        self.env.reset()
        self.reset()

    def reset(self):
        self.reset_ep()
        self.sim.set_state(self.initial_state)

        if self.has_object:
            object_xpos = self.initial_gripper_xpos[:2].copy()
            random_offset = np.random.uniform(
                0.3, 1.0) * self.obj_range * self.args.init_offset
            object_xpos -= np.array([random_offset, self.obj_range])
            object_qpos = self.sim.data.get_joint_qpos('object0:joint')
            assert object_qpos.shape == (7, )
            object_qpos[:2] = object_xpos
            self.sim.data.set_joint_qpos('object0:joint', object_qpos)

        self.sim.forward()
        self.goal = self.generate_goal()
        self.last_obs = (self.get_obs()).copy()
        return self.get_obs()

    def generate_goal(self):
        # Note: this definition is immediately overridden by the one below.
        return self.env.env._sample_goal()

    def generate_goal(self):
        if self.has_object:
            goal = self.initial_gripper_xpos[:3] + self.target_offset
            goal[0] += np.random.uniform(-self.target_range,
                                         -self.target_range * 0.3)
            goal[1] += self.target_range
            goal[2] = self.height_offset + int(self.target_in_the_air) * 0.45
        else:
            goal = self.initial_gripper_xpos[:3] + np.array([
                np.random.uniform(-self.target_range, self.target_range),
                self.target_range, self.target_range
            ])
        return goal.copy()
def main():

    # train the policy, then do some tests to get a sense of how it performs

    for arg in sys.argv:
        if arg.startswith('--job='):
            i = int(arg.split('--job=')[1]) - 1

    # pull in the encoder params
    p_dir = "./experiments/extra_train_exps/{}".format(i)
    proj = np.load(p_dir + "projectors.npz")
    proj = np.row_stack([v for k, v in proj.items()])
    proj = la.svd(proj, full_matrices=False)[2]
    enc_dim = proj.shape[0]
    weights = np.load(p_dir + "weights.npz")
    biases = np.load(p_dir + "biases.npz")
    weights = [v for k, v in weights.items()]
    biases = [v for k, v in biases.items()]

    saveload_path = "./experiments/extra_train_exps/{}".format(i)

    # train the model
    # try a few restarts, keep the best
    best_avg_perf = -np.inf
    perfs = []
    for j in range(5):
        # set up the environment
        env = TimeLimit(
            RestartablePendulumEnv(enc_dim=enc_dim),
            max_episode_steps=200)  # not sure effect of max_episode_steps
        env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])
        env = DummyVecEnv([lambda: env])
        pol = LinearPolicy_MLPCritic
        pol_args = dict(
            layers=[64, 64], layer_norm=False
        )  # this is the architecture for the critic in ddpg, doesn't specify policy

        model = train_policy_ddpg(env,
                                  pol,
                                  pol_args,
                                  300000,
                                  verbose=0,
                                  actor_lr=.5,
                                  critic_lr=.001)

        # clean up
        env.close()

        #model = DDPG.load(saveload_path+"model")

        # now let's test the model
        # specify the test task
        n_test_steps = 100

        # uniform grid over statespace (20 points)
        angs = np.linspace(-np.pi, np.pi, 5)[:-1]
        vels = np.linspace(-1, 1, 5)
        test_states = np.array(list(itertools.product(angs, vels)))
        n_test_states = len(angs) * len(vels)
        performance = np.zeros(n_test_states)

        # restart the env
        env = TimeLimit(RestartablePendulumEnv(), max_episode_steps=200)
        env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])

        # for each test state, start the env in the state, then run forward and collect rewards
        for k in range(n_test_states):
            obs = env.reset(state=test_states[k])
            rewards = []
            for j in range(n_test_steps):
                action, _states = model.predict(obs)
                obs, reward, dones, info = env.step(action)
                rewards.append(reward)
                #env.render()
            performance[k] = np.array(rewards).mean()

        avg_perf = performance.mean()
        perfs.append(avg_perf)
        print("average performance of this model:{}".format(avg_perf))
        if avg_perf > best_avg_perf:
            best_avg_perf = avg_perf
            # specify the path to save the model

            model.save(saveload_path + "model")
            np.savetxt(saveload_path + "test_performance.txt", performance)

        # clean up and save results
        np.savetxt(saveload_path + "avg_per_runs.txt", np.array(perfs))
        env.close()
        del model