def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(5), GridworldEnv(4), GridworldEnv(6)], batch_size=128, gamma=0.999, alpha=0.8, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=5, is_plot=False, num_episodes=200, max_num_steps_per_episode=1000, learning_rate=0.001, memory_replay_size=10000, memory_policy_size=1000): """ Soft Q-learning training routine. Retuns rewards and durations logs. Plot environment screen """ num_actions = list_of_envs[0].action_space.n input_size = list_of_envs[0].observation_space.shape[0] num_envs = len(list_of_envs) policy = PolicyNetwork(input_size, num_actions) models = [DQN(input_size,num_actions) for _ in range(0, num_envs)] ### Add torch.nn.ModuleList (?) memories = [ReplayMemory(memory_replay_size, memory_policy_size) for _ in range(0, num_envs)] use_cuda = torch.cuda.is_available() if use_cuda: policy.cuda() for model in models: model.cuda() optimizers = [optim.Adam(model.parameters(), lr=learning_rate) for model in models] policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate) # optimizer = optim.RMSprop(model.parameters(), ) episode_durations = [[] for _ in range(num_envs)] episode_rewards = [[] for _ in range(num_envs)] steps_done = np.zeros(num_envs) episodes_done = np.zeros(num_envs) current_time = np.zeros(num_envs) distilled_logits_magnitude = np.zeros((num_episodes,num_envs)) policy_logits_magnitude = np.zeros((num_episodes,num_envs)) # keep track of num of times a random action is picked num_rand = np.zeros(num_envs) # Initialize environments states = [] for env in list_of_envs: states.append(torch.from_numpy( env.reset() ).type(torch.FloatTensor).view(-1,input_size)) while np.min(episodes_done) < num_episodes: # TODO: add max_num_steps_per_episode # Optimization is given by alterating minimization scheme: # 1. do the step for each env # 2. do one optimization step for each env using "soft-q-learning". # 3. do one optimization step for the policy for i_env, env in enumerate(list_of_envs): # select an action action, pi_0_norm, pi_i_norm = select_action(states[i_env], policy, models[i_env], num_actions, eps_start, eps_end, eps_decay, episodes_done[i_env], alpha, beta) if episodes_done[i_env] < num_episodes: if pi_0_norm + pi_i_norm == 0: num_rand[i_env] += 1 else: distilled_logits_magnitude[int(episodes_done[i_env]), i_env] += pi_0_norm policy_logits_magnitude[int(episodes_done[i_env]), i_env] += pi_i_norm steps_done[i_env] += 1 current_time[i_env] += 1 next_state_tmp, reward, done, _ = env.step(action[0,0]) reward = Tensor([reward]) # Observe new state next_state = torch.from_numpy( next_state_tmp ).type(torch.FloatTensor).view(-1,input_size) if done: next_state = None # Store the transition in memory time = Tensor([current_time[i_env]]) memories[i_env].push(states[i_env], action, next_state, reward, time) # Perform one step of the optimization (on the target network) optimize_model(policy, models[i_env], optimizers[i_env], memories[i_env], batch_size, alpha, beta, gamma) # Update state states[i_env] = next_state # Check if agent reached target if done or current_time[i_env] >= max_num_steps_per_episode: if episodes_done[i_env] <= num_episodes: print("ENV:", i_env, "iter:", episodes_done[i_env], "\treward:{0:.2f}".format(env.episode_total_reward), "\tit:", current_time[i_env], "\texp_factor:", eps_end + (eps_start - eps_end) * math.exp(-1. 
* episodes_done[i_env] / eps_decay)) if episodes_done[i_env] < num_episodes: # average the cumulative norms distilled_logits_magnitude[int(episodes_done[i_env]), i_env] /= (current_time[i_env] - num_rand[i_env]) policy_logits_magnitude[int(episodes_done[i_env]), i_env] /= (current_time[i_env] -num_rand[i_env]) num_rand[i_env] = 0 episode_rewards[i_env].append(env.episode_total_reward) episodes_done[i_env] += 1 episode_durations[i_env].append(current_time[i_env]) current_time[i_env] = 0 states[i_env] = torch.from_numpy( env.reset() ).type(torch.FloatTensor).view(-1,input_size) if is_plot: plot_rewards(episode_rewards, i_env) optimize_policy(policy, policy_optimizer, memories, batch_size, num_envs, gamma, alpha, beta) print('Complete') env.render(close=True) env.close() if is_plot: plt.ioff() plt.show() ## Store Results np.save(file_name + '-distral-2col-rewards', episode_rewards) np.save(file_name + '-distral-2col-durations', episode_durations) np.save(file_name + '-beta-distilled_logit_norms', distilled_logits_magnitude) np.save(file_name + '-beta-policy_logit_norms', policy_logits_magnitude) return models, policy, episode_rewards, episode_durations
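# For reference, select_action above is expected to sample from the Distral
# task policy (Teh et al., 2017), which mixes the distilled policy pi_0 with
# the task's soft Q-values:
#     pi_i(a|s)  ∝  pi_0(a|s)^alpha * exp(beta * Q_i(s, a)).
# A minimal sketch under that assumption (not this repo's exact helper):
import torch.nn.functional as F

def distral_action_probs(policy_logits, q_values, alpha, beta):
    # log pi_i(a|s) = alpha * log pi_0(a|s) + beta * Q_i(s, a) - log Z
    log_pi_0 = F.log_softmax(policy_logits, dim=-1)
    return F.softmax(alpha * log_pi_0 + beta * q_values, dim=-1)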
def trainSQL(file_name="SQL", env=GridworldEnv(1), batch_size=128, gamma=0.999, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=1000, is_plot=False, num_episodes=500, max_num_steps_per_episode=1000, learning_rate=0.001, memory_replay_size=10000): """ Soft Q-learning training routine. Retuns rewards and durations logs. Plot environment screen """ if is_plot: env.reset() plt.ion() plt.figure() plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(), interpolation='none') plt.draw() plt.pause(0.00001) num_actions = env.action_space.n model = DQN(num_actions) optimizer = optim.Adam(model.parameters(), lr=learning_rate) # optimizer = optim.RMSprop(model.parameters(), ) use_cuda = torch.cuda.is_available() if use_cuda: model.cuda() memory = ReplayMemory(memory_replay_size) episode_durations = [] mean_durations = [] episode_rewards = [] mean_rewards = [] steps_done, t = 0, 0 # plt.ion() for i_episode in range(num_episodes): print("Cur episode:", i_episode, "steps done:", t, "exploration factor:", eps_end + (eps_start - eps_end) * \ math.exp(-1. * steps_done / eps_decay)) # Initialize the environment and state env.reset() # last_screen = env.current_grid_map current_screen = get_screen(env) state = current_screen # - last_screen for t in count(): # Select and perform an action action = select_action(state, model, num_actions, eps_start, eps_end, eps_decay, steps_done) _, reward, done, _ = env.step(action[0, 0]) reward = Tensor([reward]) # Observe new state last_screen = current_screen current_screen = get_screen(env) if not done: next_state = current_screen # - last_screen else: next_state = None # Store the transition in memory memory.push(state, action, next_state, reward) # Move to the next state state = next_state # plot_state(state) # env.render() # Perform one step of the optimization (on the target network) optimize_model(model, optimizer, memory, batch_size, gamma, beta) if done or t + 1 >= max_num_steps_per_episode: episode_durations.append(t + 1) episode_rewards.append(env.episode_total_reward) if is_plot: plot_durations(episode_durations, mean_durations) plot_rewards(episode_rewards, mean_rewards) steps_done += 1 break print('Complete') env.render(close=True) env.close() if is_plot: plt.ioff() plt.show() ## Store Results np.save(file_name + '-sql-rewards', episode_rewards) np.save(file_name + '-sql-durations', episode_durations) return model, episode_rewards, episode_durations
# Initialize the learner and experience-replay objects.
td_learner = TDLearner(env, **mc_params)

# Train the TD learner.
t0 = time()
episode_rewards = []
for idx in range(n_epochs):
    total_reward = td_learner.run_episode(env)
    print('Total reward on epoch {}/{}:\t{}'.format(
        idx + 1, n_epochs, total_reward))
    episode_rewards.append(total_reward)

print('\nTraining took {} mins'.format((time() - t0) / 60.))

env_name = env.spec.id
policy = 'off' if off_policy else 'on'
param_str = '{}Policy_lr{:.3E}_ntilings{}_griddim{}x{}'\
    .format(policy, learning_rate, n_tilings, *grid_dims)
save_path = 'plots/td_learner_{}.png'.format(param_str)
plot_rewards(episode_rewards, save_path, env_name)

print('Executing greedy policy\n')
td_learner.epsilon = 0
for idx in range(10):
    total_reward = td_learner.run_episode(env, render=True)
    print('Total reward on greedy epoch {}/{}:\t{}\n'.format(
        idx + 1, 10, total_reward))
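# TDLearner is assumed to implement a semi-gradient TD update over tile
# codings; a minimal sketch of the tabular special case for reference
# (V is a dict mapping states to value estimates):
def td0_update(V, s, r, s_next, lr, gamma, done):
    # delta = r + gamma * V(s') - V(s);  V(s) <- V(s) + lr * delta
    target = r + (0.0 if done else gamma * V[s_next])
    V[s] += lr * (target - V[s])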
        # Move to the next state.
        state = next_state

    cumulative_rewards.append(cum_reward)
    writer.add_scalar('Training ' + env_name, cum_reward, ep)

    # Update the target network, copying all weights and biases of the DQN.
    # Uncomment for Task 4.
    if ep % TARGET_UPDATE == 0:
        agent.update_target_network()

    # Save the policy.
    # Uncomment for Task 4.
    if ep % 1000 == 0:
        torch.save(agent.policy_net.state_dict(),
                   "weights_%s_%d.mdl" % (env_name, ep))

plot_rewards(cumulative_rewards)
plt.savefig("plots/task-4b.png")
print('Complete')
plt.ioff()
plt.show()

# Task 3 - plot the policy.
# Values used for the discretization.
discr = 16
x_min, x_max = -2.4, 2.4
th_min, th_max = -0.3, 0.3

# Fixed values.
v = 0
av = 0
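# agent.update_target_network above presumably does a hard sync of the target
# network with the online network -- the standard DQN recipe. A minimal sketch:
def hard_update(target_net, policy_net):
    # Copy all weights and biases from the online network into the target.
    target_net.load_state_dict(policy_net.state_dict())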
def main():
    np.random.seed(2)
    tf.set_random_seed(2)  # reproducible

    sess = tf.Session()
    hp = Hyperparameters()

    env = gym.make('CartPole-v0')
    env.seed(1)  # reproducible
    env = env.unwrapped

    actor = Actor(sess, n_features=hp.N_F, n_actions=hp.N_A, lr=hp.LR_A)
    # We need a good teacher, so the critic should learn faster than the actor.
    critic = Critic(sess, n_features=hp.N_F, lr=hp.LR_C, discount=hp.GAMMA)

    sess.run(tf.global_variables_initializer())

    if hp.OUTPUT_GRAPH:
        tf.summary.FileWriter("./logs/", sess.graph)

    running_rewards = []
    for i_episode in range(hp.MAX_EPISODE):
        s = env.reset()
        # Sanity check: the observation should not contain NaNs.
        assert not np.isnan(np.min(s.ravel()))
        t = 0
        track_r = []
        while True:
            if hp.RENDER:
                env.render()

            a, probs = actor.choose_action(s)
            if i_episode == 0:
                write_file('./logs/probs.txt', probs, True)
            else:
                write_file('./logs/probs.txt', probs, False)
            # print('------------------------------------', probs)

            s_, r, done, info = env.step(a)
            assert not np.isnan(np.min(s_.ravel()))

            if done:
                r = -20

            track_r.append(r)

            td_error = critic.learn(s, r, s_)  # gradient = grad[r + gamma * V(s_) - V(s)]
            exp_v = actor.learn(s, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]
            # # debug mode # #
            # exp_v, act_prob, log_prob, l1 = actor.learn(s, a, td_error)
            # # debug mode # #

            s = s_
            t += 1

            if done or t >= hp.MAX_EP_STEPS:
                ep_rs_sum = sum(track_r)

                if 'running_reward' not in globals() and 'running_reward' not in locals():
                    running_reward = ep_rs_sum
                else:
                    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                running_rewards.append(running_reward)
                # print(len(running_rewards))
                if len(running_rewards) % 1000 == 0:
                    write_file('./logs/rewards_' + str(i_episode) + '.txt',
                               running_rewards, True)
                    y_axis_ticks = [0, 100, 200, 300, 400, 500,
                                    600, 700, 800, 900, 1000]
                    plot_rewards(running_rewards, y_axis_ticks,
                                 './logs/' + str(i_episode) + '/')

                if running_reward > hp.DISPLAY_REWARD_THRESHOLD:
                    hp.RENDER = True  # rendering

                # # debug mode # #
                # print('\naction:', a, 'td_error:', td_error, 'exp_v:', exp_v,
                #       'act_prob:', act_prob, 'log_prob:', log_prob, 'l1:', l1)
                # # debug mode # #
                print('episode:', i_episode, ' running reward:', int(running_reward),
                      ' episode reward:', ep_rs_sum, ' td_error:', td_error,
                      ' exp_v:', exp_v)
                break
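# A minimal sketch of the quantities the Actor/Critic learn() calls compute,
# matching the inline gradient comments above (not their exact TF graph):
def actor_critic_losses(log_prob_a, r, v_s, v_s_next, gamma):
    td_error = r + gamma * v_s_next - v_s  # one-step TD error
    critic_loss = td_error ** 2            # critic regresses V(s) toward r + gamma * V(s')
    actor_loss = -log_prob_a * td_error    # ascend log pi(a|s) * td_error
    return actor_loss, critic_loss, td_error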
def trainSQL0(file_name="SQL0", env=GridworldEnv(1), batch_size=128, gamma=0.999, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=1000, is_plot=False, num_episodes=500, max_num_steps_per_episode=1000, learning_rate=0.001, memory_replay_size=10000): """ Soft Q-learning training routine when observation vector is input Retuns rewards and durations logs. Plot environment screen """ if is_plot: env.reset() plt.ion() plt.figure() plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(), interpolation='none') plt.draw() plt.pause(0.00001) num_actions = env.action_space.n input_size = env.observation_space.shape[0] model = DQN(input_size, num_actions) optimizer = optim.Adam(model.parameters(), lr=learning_rate) # optimizer = optim.RMSprop(model.parameters(), ) use_cuda = torch.cuda.is_available() if use_cuda: model.cuda() memory = ReplayMemory(memory_replay_size) episode_durations = [] mean_durations = [] episode_rewards = [] mean_rewards = [] steps_done, t = 0, 0 # plt.ion() for i_episode in range(num_episodes): if i_episode % 20 == 0: clear_output() if i_episode != 0: print("Cur episode:", i_episode, "steps done:", episode_durations[-1], "exploration factor:", eps_end + (eps_start - eps_end) * \ math.exp(-1. * steps_done / eps_decay), "reward:", env.episode_total_reward) # Initialize the environment and state state = torch.from_numpy( env.reset() ).type(torch.FloatTensor).view(-1,input_size) for t in count(): # Select and perform an action action = select_action(state, model, num_actions, eps_start, eps_end, eps_decay, steps_done) next_state_tmp, reward, done, _ = env.step(action[0, 0]) reward = Tensor([reward]) # Observe new state next_state = torch.from_numpy( next_state_tmp ).type(torch.FloatTensor).view(-1,input_size) if done: next_state = None # Store the transition in memory memory.push(state, action, next_state, reward) # Move to the next state state = next_state # plot_state(state) # env.render() # Perform one step of the optimization (on the target network) optimize_model(model, optimizer, memory, batch_size, gamma, beta) #### Difference w.r.t DQN if done or t + 1 >= max_num_steps_per_episode: episode_durations.append(t + 1) episode_rewards.append(env.episode_total_reward) ##### Modify for OpenAI envs such as CartPole if is_plot: plot_durations(episode_durations, mean_durations) plot_rewards(episode_rewards, mean_rewards) steps_done += 1 break print('Complete') env.render(close=True) env.close() if is_plot: plt.ioff() plt.show() ## Store Results np.save(file_name + '-sql0-rewards', episode_rewards) np.save(file_name + '-sql0-durations', episode_durations) return model, episode_rewards, episode_durations