from time import sleep

import torch
from gym.wrappers import TimeLimit

# CNN_DQN, get_state and ACTIONS are defined elsewhere in the training script.


def render_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # throttle to ~24 fps
        action_index = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[action_index]
        obs, reward, is_done, info = env.step(action)
        state = get_state(obs)
        env.render()
    env.close()
def render_episode(env: TimeLimit, estimator: CNN_DQN):
    # Variant that renders upscaled frames through an image viewer
    # (viewer is a gym.envs.classic_control.rendering.SimpleImageViewer
    # created elsewhere) and prints non-zero rewards as they arrive.
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # throttle to ~24 fps
        rgb = env.render('rgb_array')
        upscaled = repeat_upsample(rgb, 3, 4)
        viewer.imshow(upscaled)
        action_index = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[action_index]
        obs, reward, is_done, _ = env.step(action)
        if reward != 0:
            print(reward)
        state = get_state(obs)
    env.close()
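# The renderer above calls repeat_upsample, which is not defined in this
# excerpt. A minimal sketch, assuming it simply repeats pixels k times along
# rows and l times along columns with numpy; the name and signature are taken
# from the call site, and the original helper may differ:
import numpy as np


def repeat_upsample(rgb_array: np.ndarray, k: int = 1, l: int = 1) -> np.ndarray:
    # repeat each row k times and each column l times to enlarge the frame
    return np.repeat(np.repeat(rgb_array, k, axis=0), l, axis=1)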
def collect_fixed_set_of_states(conf: dict, env: TimeLimit) -> list:
    # Collect a fixed set of states to evaluate the agent on (DQN paper):
    # run a random policy once before training starts, and later track the
    # average of the maximum predicted Q over these states.
    env.reset()
    exclude = conf['preprocess']['exclude']
    fixed_states = []
    while True:
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        state = next_state
        preprocessed_state = preprocess_frame(state, exclude)
        fixed_states.append(preprocessed_state)
        if done:
            break
    env.close()
    print(f'Collected a fixed set of {len(fixed_states)} states!')
    return fixed_states
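# The comment above references the DQN paper's evaluation metric. A minimal
# sketch of how it could be computed from the collected states, assuming
# estimator.predict(state) returns a tensor of Q-values as in render_episode;
# average_max_q is a hypothetical helper, not part of the original code:
import torch


def average_max_q(estimator, fixed_states: list) -> float:
    # average of max_a Q(s, a) over the fixed evaluation states
    with torch.no_grad():
        q_maxes = [torch.max(estimator.predict(s)).item() for s in fixed_states]
    return sum(q_maxes) / len(q_maxes)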
# Fragment of a test script: p_dir points at the experiment directory and
# proj has been loaded from projectors.npz, as in main() below.
proj = la.svd(proj, full_matrices=False)[2]
enc_dim = proj.shape[0]
weights = np.load(p_dir + "weights.npz")
biases = np.load(p_dir + "biases.npz")
weights = [v for k, v in weights.items()]
biases = [v for k, v in biases.items()]
saveload_path = "./experiments/learned_controllers/pendulum/{}".format(i)
model = DDPG.load(saveload_path + "model")

# now let's test the model
# specify the test task
n_test_steps = 100

# restart the env
env = TimeLimit(RestartablePendulumEnv(), max_episode_steps=200)
env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])

# for each test state, start the env in the state, then run forward and collect rewards
for k in range(3):
    high = np.array([np.pi, 1])
    start_state = np.random.uniform(low=-high, high=high)
    obs = env.reset(state=start_state)
    for j in range(n_test_steps):
        action, _states = model.predict(obs)
        obs, reward, dones, info = env.step(action)
        env.render()

# clean up and save results
env.close()
del model
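# EncoderWrapper applies mlp_encoder (defined elsewhere) to raw observations.
# A minimal sketch under the assumption that it is a plain feed-forward pass
# through the loaded (weights, biases) pairs followed by the SVD projection;
# the actual activation and layer layout in the source code may differ:
import numpy as np


def mlp_encoder(obs: np.ndarray, params: list) -> np.ndarray:
    weights, biases, proj = params
    h = obs
    for W, b in zip(weights, biases):
        h = np.tanh(W @ h + b)  # assumed tanh hidden activations
    return proj @ h  # project into the enc_dim-dimensional encoded space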
avg_returns = []
min_returns = []
max_returns = []
best_avg_return = -10000
result_directory = f'{os.environ["HOME"]}/code/imitation-learning-codebase/src/ai/algorithms/' \
                   f'results/td3'
os.makedirs(result_directory, exist_ok=True)

for epoch in range(epochs):
    evaluation_returns, losses, objectives = train_one_epoch(
        epoch=epoch, render=(epoch % 20 == 0) and epoch != 0)
    avg_returns.append(np.mean(evaluation_returns))
    min_returns.append(min(evaluation_returns))
    max_returns.append(max(evaluation_returns))
    print(f'epoch: {epoch} \t returns: {np.mean(evaluation_returns): 0.3f} '
          f'[{np.std(evaluation_returns): 0.3f}] \t'
          f'critic loss: {np.mean(losses): 0.3f} [{np.std(losses): 0.3f}], '
          f'policy objective: {np.mean(objectives): 0.3f} [{np.std(objectives): 0.3f}]')
    # f'replay buffer reward: {np.mean(replay_buffer["reward"])} '
    # f'[{np.min(replay_buffer["reward"]): 0.3f}: {np.max(replay_buffer["reward"]): 0.3f}]')
    if (epoch % 20 == 10 or epoch == epochs - 1) and plot:
        plt.fill_between(range(len(avg_returns)), min_returns, max_returns,
                         color='blue', alpha=0.5)
        plt.plot(avg_returns, color='b')
        plt.show()
    if np.mean(evaluation_returns) > best_avg_return:
        store_checkpoint(policy, result_directory + '/policy')
        best_avg_return = np.mean(evaluation_returns)

print(f'total duration : {time.time() - start_time}s')
results = np.asarray([avg_returns, min_returns, max_returns])
np.save(os.path.join(result_directory, result_file_name + '.npy'), results)
environment.close()
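# The three learning curves saved above can be read back for later plotting;
# this mirrors the np.save call (result_file_name is defined elsewhere in
# the script):
avg_returns, min_returns, max_returns = np.load(
    os.path.join(result_directory, result_file_name + '.npy'))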
def main():
    # train the policy, then do some tests to get a sense of how it performs
    for arg in sys.argv:
        if arg.startswith('--job='):
            i = int(arg.split('--job=')[1]) - 1

    # pull in the encoder params
    p_dir = "./experiments/extra_train_exps/{}".format(i)
    proj = np.load(p_dir + "projectors.npz")
    proj = np.row_stack([v for k, v in proj.items()])
    proj = la.svd(proj, full_matrices=False)[2]
    enc_dim = proj.shape[0]
    weights = np.load(p_dir + "weights.npz")
    biases = np.load(p_dir + "biases.npz")
    weights = [v for k, v in weights.items()]
    biases = [v for k, v in biases.items()]
    saveload_path = "./experiments/extra_train_exps/{}".format(i)

    # train the model: try a few restarts, keep the best
    best_avg_perf = -np.inf
    perfs = []
    for j in range(5):
        # set up the environment
        env = TimeLimit(RestartablePendulumEnv(enc_dim=enc_dim),
                        max_episode_steps=200)  # not sure effect of max_episode_steps
        env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])
        env = DummyVecEnv([lambda: env])
        pol = LinearPolicy_MLPCritic
        pol_args = dict(layers=[64, 64], layer_norm=False)
        # this is the architecture for the critic in ddpg, doesn't specify policy
        model = train_policy_ddpg(env, pol, pol_args, 300000, verbose=0,
                                  actor_lr=.5, critic_lr=.001)
        # clean up
        env.close()
        # model = DDPG.load(saveload_path + "model")

        # now let's test the model
        # specify the test task
        n_test_steps = 100
        # uniform grid over statespace (20 points)
        angs = np.linspace(-np.pi, np.pi, 5)[:-1]
        vels = np.linspace(-1, 1, 5)
        test_states = np.array(list(itertools.product(angs, vels)))
        n_test_states = len(angs) * len(vels)
        performance = np.zeros(n_test_states)

        # restart the env
        env = TimeLimit(RestartablePendulumEnv(), max_episode_steps=200)
        env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])

        # for each test state, start the env in the state, then run forward
        # and collect rewards
        for k in range(n_test_states):
            obs = env.reset(state=test_states[k])
            rewards = []
            for j in range(n_test_steps):
                action, _states = model.predict(obs)
                obs, reward, dones, info = env.step(action)
                rewards.append(reward)
                # env.render()
            performance[k] = np.array(rewards).mean()

        avg_perf = performance.mean()
        perfs.append(avg_perf)
        print("average performance of this model: {}".format(avg_perf))
        if avg_perf > best_avg_perf:
            best_avg_perf = avg_perf
            # specify the path to save the model
            model.save(saveload_path + "model")
            np.savetxt(saveload_path + "test_performance.txt", performance)

    # clean up and save results
    np.savetxt(saveload_path + "avg_per_runs.txt", np.array(perfs))
    env.close()
    del model
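# Entry point implied by the sys.argv parsing in main(); the --job flag is
# 1-indexed on the command line (e.g. python <script>.py --job=1, where the
# script name is illustrative):
if __name__ == '__main__':
    main()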