def render_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # (~24 fps)

        action_index = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[action_index]
        obs, reward, is_done, info = env.step(action)
        state = get_state(obs)
        env.render()
    env.close()
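A minimal usage sketch for the function above, assuming an Atari-style Gym environment and a trained estimator; the environment id and constructor arguments below are placeholders, not taken from the original source:

import gym

# Hypothetical setup: gym.make already wraps the env in TimeLimit, and
# CNN_DQN / ACTIONS are assumed to come from the surrounding module.
env = gym.make('PongDeterministic-v4')
estimator = CNN_DQN(n_actions=len(ACTIONS))  # a trained model exposing .predict()
render_episode(env, estimator)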
Example #2
def render_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # (~24 fps)

        rgb = env.render('rgb_array')
        upscaled = repeat_upsample(rgb, 3, 4)
        viewer.imshow(upscaled)

        action_index = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[action_index]
        obs, reward, is_done, _ = env.step(action)
        if reward != 0:
            print(reward)
        state = get_state(obs)
    env.close()
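The helpers viewer and repeat_upsample are not defined in the snippet above; a minimal sketch of how they could look, assuming nearest-neighbour upscaling with np.repeat and the SimpleImageViewer bundled with classic Gym (an assumption, not the original implementation):

import numpy as np
from gym.envs.classic_control.rendering import SimpleImageViewer

viewer = SimpleImageViewer()

def repeat_upsample(rgb_array: np.ndarray, k: int = 1, l: int = 1) -> np.ndarray:
    # Repeat every pixel k times vertically and l times horizontally so the
    # small Atari frame becomes visible at a reasonable window size.
    return np.repeat(np.repeat(rgb_array, k, axis=0), l, axis=1)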
Example #3
def collect_fixed_set_of_states(conf: dict, env: TimeLimit) -> list:
    # Collect a fixed set of states by running a random policy before
    # training starts (as in the DQN paper); during training, the average
    # of the maximum predicted Q over these states is tracked as an
    # evaluation metric.
    env.reset()
    exclude = conf['preprocess']['exclude']
    fixed_states = []

    while True:
        action = env.action_space.sample()
        state, reward, done, _ = env.step(action)
        preprocessed_state = preprocess_frame(state, exclude)
        fixed_states.append(preprocessed_state)
        if done:
            break
    env.close()
    print(f'Collected a fixed set of {len(fixed_states)} states!')

    return fixed_states
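A minimal sketch of how the collected states can be used during training to track the average maximum predicted Q value, as the comment above describes; the estimator interface (a .predict() returning per-action Q values as a tensor) is assumed from the earlier examples:

import numpy as np
import torch

def average_max_q(estimator, fixed_states) -> float:
    # Average of max_a Q(s, a) over the fixed evaluation states,
    # the training-progress metric used in the DQN paper.
    max_qs = []
    with torch.no_grad():
        for state in fixed_states:
            max_qs.append(torch.max(estimator.predict(state)).item())
    return float(np.mean(max_qs))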
Example #4
proj = la.svd(proj, full_matrices=False)[2]
enc_dim = proj.shape[0]
weights = np.load(p_dir + "weights.npz")
biases = np.load(p_dir + "biases.npz")
weights = [v for k, v in weights.items()]
biases = [v for k, v in biases.items()]

saveload_path = "./experiments/learned_controllers/pendulum/{}".format(i)
model = DDPG.load(saveload_path + "model")

# now let's test the model
# specify the test task
n_test_steps = 100

# restart the env
env = TimeLimit(RestartablePendulumEnv(), max_episode_steps=200)
env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])

# for each test state, start the env in the state, then run forward and collect rewards
for k in range(3):
    high = np.array([np.pi, 1])
    start_state = np.random.uniform(low=-high, high=high)
    obs = env.reset(state=start_state)
    for j in range(n_test_steps):
        action, _states = model.predict(obs)
        obs, reward, dones, info = env.step(action)
        env.render()

# clean up and save results
env.close()
del model
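EncoderWrapper and mlp_encoder are project-specific and not shown here; a minimal sketch of an observation wrapper in the same spirit, assuming the encoder is a callable applied to each raw observation together with its parameters (hypothetical, not the original class):

import gym
import numpy as np

class EncodedObservationWrapper(gym.ObservationWrapper):
    # Hypothetical stand-in for EncoderWrapper: runs every raw observation
    # through an encoder function before handing it to the agent.
    def __init__(self, env, encoder, encoder_params):
        super().__init__(env)
        self.encoder = encoder
        self.encoder_params = encoder_params

    def observation(self, obs):
        return np.asarray(self.encoder(obs, *self.encoder_params))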
Example #5
max_returns = []

best_avg_return = -10000
result_directory = f'{os.environ["HOME"]}/code/imitation-learning-codebase/src/ai/algorithms/' \
                   f'results/td3'
os.makedirs(result_directory, exist_ok=True)

for epoch in range(epochs):
    evaluation_returns, losses, objectives = train_one_epoch(epoch=epoch, render=(epoch % 20 == 0) and epoch != 0)
    avg_returns.append(np.mean(evaluation_returns))
    min_returns.append(min(evaluation_returns))
    max_returns.append(max(evaluation_returns))
    print(f'epoch: {epoch} \t returns: {np.mean(evaluation_returns): 0.3f} [{np.std(evaluation_returns): 0.3f}] \t'
          f'critic loss: {np.mean(losses): 0.3f} [{np.std(losses): 0.3f}], '
          f'policy objective: {np.mean(objectives): 0.3f} [{np.std(objectives): 0.3f}]')
    #     f'replay buffer reward: {np.mean(replay_buffer["reward"])} '
    #     f'[{np.min(replay_buffer["reward"]): 0.3f}: {np.max(replay_buffer["reward"]): 0.3f}]')
    if (epoch % 20 == 10 or epoch == epochs - 1) and plot:
        plt.fill_between(range(len(avg_returns)), min_returns, max_returns, color='blue', alpha=0.5)
        plt.plot(avg_returns, color='b')
        plt.show()

    if np.mean(evaluation_returns) > best_avg_return:
        store_checkpoint(policy, result_directory + '/policy')
        best_avg_return = np.mean(evaluation_returns)

print(f'total duration : {time.time()-start_time}s')
results = np.asarray([avg_returns, min_returns, max_returns])
np.save(os.path.join(result_directory, result_file_name + '.npy'), results)
environment.close()
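# store_checkpoint (used in the training loop above) is not defined in this
# snippet; a minimal sketch of one possible implementation, assuming the
# policy is a torch.nn.Module (a hypothetical helper, not the codebase's own):
def store_checkpoint(model, path: str) -> None:
    import torch
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save(model.state_dict(), path + '.ckpt')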
def main():

    # train the policy, then do some tests to get a sense of how it performs

    # the 1-based --job argument selects which experiment directory to use
    for arg in sys.argv:
        if arg.startswith('--job='):
            i = int(arg.split('--job=')[1]) - 1

    # pull in the encoder params
    p_dir = "./experiments/extra_train_exps/{}".format(i)
    proj = np.load(p_dir + "projectors.npz")
    proj = np.row_stack([v for k, v in proj.items()])
    proj = la.svd(proj, full_matrices=False)[2]
    enc_dim = proj.shape[0]
    weights = np.load(p_dir + "weights.npz")
    biases = np.load(p_dir + "biases.npz")
    weights = [v for k, v in weights.items()]
    biases = [v for k, v in biases.items()]

    saveload_path = "./experiments/extra_train_exps/{}".format(i)

    # train the model
    # try a few restarts, keep the best
    best_avg_perf = -np.inf
    perfs = []
    for j in range(5):
        # set up the environment
        env = TimeLimit(
            RestartablePendulumEnv(enc_dim=enc_dim),
            max_episode_steps=200)  # not sure effect of max_episode_steps
        env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])
        env = DummyVecEnv([lambda: env])
        pol = LinearPolicy_MLPCritic
        pol_args = dict(
            layers=[64, 64], layer_norm=False
        )  # this is the architecture for the critic in ddpg, doesn't specify policy

        model = train_policy_ddpg(env,
                                  pol,
                                  pol_args,
                                  300000,
                                  verbose=0,
                                  actor_lr=.5,
                                  critic_lr=.001)

        # clean up
        env.close()

        #model = DDPG.load(saveload_path+"model")

        # now let's test the model
        # specify the test task
        n_test_steps = 100

        # uniform grid over statespace (20 points)
        angs = np.linspace(-np.pi, np.pi, 5)[:-1]
        vels = np.linspace(-1, 1, 5)
        test_states = np.array(list(itertools.product(angs, vels)))
        n_test_states = len(angs) * len(vels)
        performance = np.zeros(n_test_states)

        # restart the env
        env = TimeLimit(RestartablePendulumEnv(), max_episode_steps=200)
        env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])

        # for each test state, start the env in the state, then run forward and collect rewards
        for k in range(n_test_states):
            obs = env.reset(state=test_states[k])
            rewards = []
            for step in range(n_test_steps):
                action, _states = model.predict(obs)
                obs, reward, dones, info = env.step(action)
                rewards.append(reward)
                #env.render()
            performance[k] = np.array(rewards).mean()

        avg_perf = performance.mean()
        perfs.append(avg_perf)
        print("average performance of this model:{}".format(avg_perf))
        if avg_perf > best_avg_perf:
            best_avg_perf = avg_perf
            # specify the path to save the model

            model.save(saveload_path + "model")
            np.savetxt(saveload_path + "test_performance.txt", performance)

        # clean up and save results
        np.savetxt(saveload_path + "avg_per_runs.txt", np.array(perfs))
        env.close()
        del model
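A minimal entry-point sketch, assuming the listing above lives in a standalone script launched with a 1-based --job index (the script name below is hypothetical):

if __name__ == '__main__':
    # e.g. python train_pendulum_ddpg.py --job=1
    main()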