def run_actions(env: gridworld_env.GridworldEnv, actions):
    """Replay a fixed sequence of actions, printing and displaying each step."""
    env.reset()
    for action in actions:
        print(action)
        _, _, is_done, _ = env.update(action)
        env.display()
        time.sleep(1)
        if is_done:
            break
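# Usage sketch (hypothetical): replay a short action sequence in a Gridworld
# instance. The integer action codes below are placeholders; the real encoding is
# whatever gridworld_env.GridworldEnv.update() expects.
#
# if __name__ == '__main__':
#     demo_env = gridworld_env.GridworldEnv(1)
#     run_actions(demo_env, [0, 1, 1, 2])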
def trainA3C(file_name="A3C",
             env=GridworldEnv(1),
             update_global_iter=10,
             gamma=0.999,
             is_plot=False,
             num_episodes=500,
             max_num_steps_per_episode=1000,
             learning_rate=0.0001):
    """
    A3C training routine. Returns the episode rewards log.
    """
    ns = env.observation_space.shape[0]  ## Line to fix for arbitrary environment
    na = env.action_space.n

    gnet = Net(ns, na)   # global network
    gnet.share_memory()  # share the global parameters in multiprocessing
    opt = SharedAdam(gnet.parameters(), lr=learning_rate)  # global optimizer
    global_ep, global_ep_r, res_queue = mp.Value('i', 0), mp.Value('d', 0.), mp.Queue()

    # parallel training
    workers = [
        Worker(gnet, opt, global_ep, global_ep_r, res_queue, i,
               update_global_iter, num_episodes, max_num_steps_per_episode,
               gamma, env, ns, na) for i in range(mp.cpu_count())
    ]
    [w.start() for w in workers]

    episode_rewards = []  # record episode reward to plot
    while True:
        r = res_queue.get()
        if r is not None:
            episode_rewards.append(r)
        else:
            break
    [w.join() for w in workers]

    # Store results
    np.save(file_name + '-a3c-rewards', episode_rewards)

    return episode_rewards
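# The "Line to fix for arbitrary environment" comment above refers to
# ns = env.observation_space.shape[0], which assumes a flat Box observation space.
# A minimal sketch (assuming Gym-style spaces; obs_size is a hypothetical helper,
# not part of this repo) of a more general way to derive the input width:

def obs_size(env):
    """Return a flat observation width for Discrete or Box observation spaces."""
    space = env.observation_space
    if hasattr(space, 'n'):              # Discrete: width of a one-hot encoding
        return space.n
    return int(np.prod(space.shape))     # Box: flatten the observation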
def trainDistral(file_name="Distral_1col_AC",
                 list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
                 batch_size=128,
                 gamma=0.95,
                 alpha=0.8,
                 beta=5,
                 num_episodes=200,
                 max_num_steps_per_episode=1000,
                 learning_rate=0.001,
                 n_step=1):
    """
    Distral training routine (actor-critic variant): trains one policy per task
    plus a shared distilled policy. Saves per-episode rewards and durations.
    """
    # Specify environment conditions
    input_size = list_of_envs[0].observation_space.shape[0]
    num_actions = list_of_envs[0].action_space.n
    tasks = len(list_of_envs)

    # Define our set of policies, including the distilled one
    models = torch.nn.ModuleList(
        [Policy(input_size, num_actions) for _ in range(tasks)])
    distilled = Distilled(input_size, num_actions, tasks)
    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]
    opt_distilled = optim.Adam(distilled.parameters(), lr=learning_rate)

    # Store the total rewards and durations
    episode_rewards = [[] for i in range(num_episodes)]
    episode_duration = [[] for i in range(num_episodes)]

    for i_episode in range(num_episodes):
        task_specific_losses = []
        # For each one of the envs
        for i_env, env in enumerate(list_of_envs):
            # Initialize state of env
            state = env.reset()
            # Store total reward per environment per episode
            total_reward = 0
            # Store duration of each episode per env
            duration = 0

            for t in range(max_num_steps_per_episode):
                # Run our policy
                action = select_action(state, models[i_env], distilled, i_env)
                next_state, reward, done, _ = env.step(action.item())
                models[i_env].rewards.append(reward)
                total_reward += reward
                duration += 1
                if done:
                    break
                # Update state
                state = next_state

            episode_rewards[i_episode].append(total_reward)
            episode_duration[i_episode].append(duration)

            # Get the value estimate of the final state according to
            # equation 7 from the Distral paper
            next_state = torch.from_numpy(np.asarray(next_state)).float()
            _, action_pref_temp = models[i_env](next_state)
            pi_0_temp, _ = distilled(next_state)
            temp_term = beta * action_pref_temp - torch.max(beta * action_pref_temp)
            final_state_value = torch.log(
                (torch.pow(pi_0_temp, alpha) * torch.exp(temp_term)).sum()) / beta
            if done:
                final_state_value = 0

            # Distill for each environment
            task_specific_losses.append(
                task_specific_update(models[i_env], distilled,
                                     optimizers[i_env], alpha, beta, gamma,
                                     final_state_value, i_env, n_step))

        finish_episode(task_specific_losses, models, distilled, opt_distilled,
                       alpha, beta, gamma)

        # if i_episode % args.log_interval == 0:
        for i in range(tasks):
            print('Episode: {}\tEnv: {}\tDuration: {}\tTotal Reward: {:.2f}'.format(
                i_episode, i, episode_duration[i_episode][i],
                episode_rewards[i_episode][i]))

    np.save(file_name + '-rewards', episode_rewards)
    np.save(file_name + '-durations', episode_duration)
    print('Completed')

if __name__ == '__main__':
    trainDistral(list_of_envs=[GridworldEnv(7), GridworldEnv(8)],
                 learning_rate=0.00005,
                 num_episodes=200)
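# For reference, the final-state value computed inside trainDistral above corresponds
# to the soft value estimate V(s) = (1/beta) * log sum_a pi_0(a|s)^alpha * exp(beta * f(s,a))
# (the "equation 7" the inline comment mentions, with f the task-specific action
# preferences). A hedged standalone sketch of that computation (soft_state_value is a
# hypothetical helper, not part of this repo); like the training code, it subtracts
# max(beta * f) for numerical stability without adding the constant back:

def soft_state_value(action_pref, pi_0, alpha, beta):
    shifted = beta * action_pref - torch.max(beta * action_pref)
    return torch.log((torch.pow(pi_0, alpha) * torch.exp(shifted)).sum()) / beta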
def trainSQL0(file_name="SQL0",
              env=GridworldEnv(1),
              batch_size=128,
              gamma=0.999,
              beta=5,
              eps_start=0.9,
              eps_end=0.05,
              eps_decay=1000,
              is_plot=False,
              num_episodes=200,
              max_num_steps_per_episode=1000,
              learning_rate=0.0001,
              memory_replay_size=10000,
              n_step=10,
              target_update=10):
    """
    Soft Q-learning training routine for vector observations.
    Returns the trained model plus rewards and durations logs.
    """
    num_actions = env.action_space.n
    input_size = env.observation_space.shape[0]
    model = DQN(input_size, num_actions)
    target_model = DQN(input_size, num_actions)
    target_model.load_state_dict(model.state_dict())

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size, n_step, gamma)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0

    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        if i_episode != 0:
            print("Cur episode:", i_episode,
                  "steps done:", episode_durations[-1],
                  "exploration factor:",
                  eps_end + (eps_start - eps_end) * math.exp(-1. * steps_done / eps_decay),
                  "reward:", env.episode_total_reward)

        # Initialize the environment and state
        state = torch.from_numpy(env.reset()).type(torch.FloatTensor).view(-1, input_size)

        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)
            if done:
                next_state = None

            # Store the transition in memory
            memory.push(model, target_model, state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, target_model, optimizer, memory, batch_size,
                           gamma, beta)  #### Difference w.r.t DQN

            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)  ##### Modify for OpenAI envs such as CartPole
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break

        if i_episode % target_update == 0 and i_episode != 0:
            target_model.load_state_dict(model.state_dict())

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-sql0-rewards', episode_rewards)
    np.save(file_name + '-sql0-durations', episode_durations)

    return model, episode_rewards, episode_durations

if __name__ == '__main__':
    # trainSQL0(env=GridworldEnv(4), learning_rate=0.00001, max_num_steps_per_episode=100, num_episodes=1000)
    trainSQL0(env=GridworldEnv(8),
              learning_rate=0.001,
              max_num_steps_per_episode=100,
              num_episodes=1000,
              n_step=10,
              target_update=100)
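# The exploration factor printed inside trainSQL0 above decays exponentially with the
# step counter: eps = eps_end + (eps_start - eps_end) * exp(-steps_done / eps_decay).
# A small standalone sketch of that schedule (exploration_factor is a hypothetical
# helper, not part of this repo), handy for inspecting the decay before training:

def exploration_factor(steps_done, eps_start=0.9, eps_end=0.05, eps_decay=1000):
    return eps_end + (eps_start - eps_end) * math.exp(-1. * steps_done / eps_decay)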
def trainD(file_name="Distral_1col",
           list_of_envs=[GridworldEnv(5), GridworldEnv(4), GridworldEnv(6)],
           batch_size=128,
           gamma=0.999,
           alpha=0.8,
           beta=5,
           eps_start=0.9,
           eps_end=0.05,
           eps_decay=5,
           is_plot=False,
           num_episodes=200,
           max_num_steps_per_episode=1000,
           learning_rate=0.001,
           memory_replay_size=10000,
           memory_policy_size=1000):
    """
    Distral training routine based on soft Q-learning. Returns the task-specific
    models, the shared distilled policy, and rewards/durations logs.
    """
    num_actions = list_of_envs[0].action_space.n
    input_size = list_of_envs[0].observation_space.shape[0]
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(input_size, num_actions)
    models = [DQN(input_size, num_actions)
              for _ in range(0, num_envs)]  ### Add torch.nn.ModuleList (?)
    memories = [
        ReplayMemory(memory_replay_size, memory_policy_size)
        for _ in range(0, num_envs)
    ]

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        policy.cuda()
        for model in models:
            model.cuda()

    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    # Initialize environments
    states = []
    for env in list_of_envs:
        states.append(
            torch.from_numpy(env.reset()).type(torch.FloatTensor).view(-1, input_size))

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode
        # Optimization is given by an alternating minimization scheme:
        #   1. take one step in each env
        #   2. do one optimization step for each env using soft Q-learning
        #   3. do one optimization step for the distilled policy
        for i_env, env in enumerate(list_of_envs):
            # Select an action
            action = select_action(states[i_env], policy, models[i_env],
                                   num_actions, eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta)

            steps_done[i_env] += 1
            current_time[i_env] += 1
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)
            if done:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(states[i_env], action, next_state, reward, time)

            # Perform one step of the optimization (on the task-specific network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma)

            # Update state
            states[i_env] = next_state

            # Check if agent reached target
            if done or current_time[i_env] >= max_num_steps_per_episode:
                if episodes_done[i_env] <= num_episodes:
                    print("ENV:", i_env, "iter:", episodes_done[i_env],
                          "\treward:{0:.2f}".format(env.episode_total_reward),
                          "\tit:", current_time[i_env], "\texp_factor:",
                          eps_end + (eps_start - eps_end) *
                          math.exp(-1. * episodes_done[i_env] / eps_decay))

                episode_rewards[i_env].append(env.episode_total_reward)
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0
                states[i_env] = torch.from_numpy(env.reset()).type(
                    torch.FloatTensor).view(-1, input_size)

                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, alpha, beta)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations

if __name__ == '__main__':
    # trainD(list_of_envs=[GridworldEnv(4), GridworldEnv(5), GridworldEnv(6), GridworldEnv(7), GridworldEnv(8)], learning_rate=0.001, max_num_steps_per_episode=100, num_episodes=1000)
    # trainD(list_of_envs=[GridworldEnv(4), GridworldEnv(5), GridworldEnv(6), GridworldEnv(7), GridworldEnv(8)], learning_rate=0.001)
    trainD(list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
           learning_rate=0.00001,
           beta=3)
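# In the Distral formulation that trainD above follows, the task-specific behaviour
# policy combines the distilled prior pi_0 with the task Q-values as
# pi_i(a|s) proportional to pi_0(a|s)^alpha * exp(beta * Q_i(s, a)).
# A hedged sketch of that combination for a single state (distral_task_policy is a
# hypothetical helper and may differ from select_action in this repo); q_values and
# log_pi_0 are assumed to be 1-D tensors over actions:

def distral_task_policy(q_values, log_pi_0, alpha, beta):
    logits = beta * q_values + alpha * log_pi_0
    logits = logits - logits.max()   # numerical stability
    probs = torch.exp(logits)
    return probs / probs.sum()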
                final_state_value = 0

            # Distill for each environment
            task_specific_losses.append(
                task_specific_update(models[i_env], distilled,
                                     optimizers[i_env], alpha, beta, gamma,
                                     final_state_value, i_env))

        finish_episode(task_specific_losses, models, distilled, opt_distilled,
                       alpha, beta, gamma)

        # if i_episode % args.log_interval == 0:
        for i in range(tasks):
            print('Episode: {}\tEnv: {}\tDuration: {}\tTotal Reward: {:.2f}'.format(
                i_episode, i, episode_duration[i_episode][i],
                episode_rewards[i_episode][i]))

    np.save(file_name + '-distral0-rewards', episode_rewards)
    np.save(file_name + '-distral0-durations', episode_duration)
    print('Completed')


if __name__ == '__main__':
    # trainDistral(list_of_envs=[GridworldEnv(4), GridworldEnv(5), GridworldEnv(6), GridworldEnv(7), GridworldEnv(8)], learning_rate=0.0001, num_episodes=200)
    trainDistral(list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
                 learning_rate=0.0001,
                 num_episodes=200,
                 beta=5)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        # Perform one step of the optimization on the distilled policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, alpha, beta)

    print('Complete')
    env.render(close=True)
    env.close()

    ## Store Results
    np.save(file_name + '-rewards', episode_rewards)
    np.save(file_name + '-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations


if __name__ == '__main__':
    trainD(list_of_envs=[
        GridworldEnv(4),
        GridworldEnv(5),
        GridworldEnv(6),
        GridworldEnv(7),
        GridworldEnv(8)
    ],
           learning_rate=0.001,
           max_num_steps_per_episode=100,
           num_episodes=1000,
           alpha=1.)
                episode_rewards[i_env].append(env.episode_total_reward)
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0
                states[i_env] = torch.from_numpy(env.reset()).type(
                    torch.FloatTensor).view(-1, input_size)

        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, alpha, beta)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations


if __name__ == '__main__':
    # trainD(list_of_envs=[GridworldEnv(4), GridworldEnv(5), GridworldEnv(6), GridworldEnv(7), GridworldEnv(8)], learning_rate=0.001, max_num_steps_per_episode=100, num_episodes=1000)
    # trainD(list_of_envs=[GridworldEnv(4), GridworldEnv(5), GridworldEnv(6), GridworldEnv(7), GridworldEnv(8)], learning_rate=0.001)
    trainD(list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
           learning_rate=0.001,
           beta=4)
                    default=543,
                    metavar='N',
                    help='random seed (default: 543)')
parser.add_argument('--render',
                    action='store_true',
                    help='render the environment')
parser.add_argument('--log-interval',
                    type=int,
                    default=10,
                    metavar='N',
                    help='interval between training status logs (default: 10)')
args = parser.parse_args()

# env = gym.make('CartPole-v0')
env = GridworldEnv(6)
env.seed(args.seed)
# torch.manual_seed(args.seed)

SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])


class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(3, 128)       # shared hidden layer (3-dim observation)
        self.action_head = nn.Linear(128, 4)   # action logits (4 discrete actions)
        self.value_head = nn.Linear(128, 1)    # state-value estimate

        self.saved_actions = []
        self.rewards = []
                current_time[i_env] = 0
                states[i_env] = torch.from_numpy(env.reset()).type(
                    torch.FloatTensor).view(-1, input_size)

                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, alpha, beta)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)
    np.save(file_name + '-beta-distilled_logit_norms', distilled_logits_magnitude)
    np.save(file_name + '-beta-policy_logit_norms', policy_logits_magnitude)

    return models, policy, episode_rewards, episode_durations


if __name__ == '__main__':
    # trainD(list_of_envs=[GridworldEnv(4), GridworldEnv(5), GridworldEnv(6), GridworldEnv(7), GridworldEnv(8)], learning_rate=0.001, max_num_steps_per_episode=100, num_episodes=1000)
    trainD(list_of_envs=[
        GridworldEnv(4),
        GridworldEnv(5),
        GridworldEnv(6),
        GridworldEnv(7),
        GridworldEnv(8)
    ],
           learning_rate=0.001)
    # trainD(list_of_envs=[GridworldEnv(4), GridworldEnv(5)], learning_rate=0.001)
            pi_0_temp, _ = distilled(next_state)
            temp_term = beta * action_pref_temp - torch.max(beta * action_pref_temp)
            final_state_value = torch.log(
                (torch.pow(pi_0_temp, alpha) * torch.exp(temp_term)).sum()) / beta
            if done:
                final_state_value = 0

            # Distill for each environment
            task_specific_losses.append(
                task_specific_update(models[i_env], distilled,
                                     optimizers[i_env], alpha, beta, gamma,
                                     final_state_value, i_env))

        finish_episode(task_specific_losses, models, distilled, opt_distilled,
                       alpha, beta, gamma)

        # if i_episode % args.log_interval == 0:
        for i in range(tasks):
            print('Episode: {}\tEnv: {}\tDuration: {}\tTotal Reward: {:.2f}'.format(
                i_episode, i, episode_duration[i_episode][i],
                episode_rewards[i_episode][i]))

    np.save(file_name + '-distral0-rewards', episode_rewards)
    np.save(file_name + '-distral0-durations', episode_duration)
    print('Completed')


if __name__ == '__main__':
    # trainDistral(list_of_envs=[GridworldEnv(4), GridworldEnv(5), GridworldEnv(6), GridworldEnv(7), GridworldEnv(8)], learning_rate=0.0001, num_episodes=200)
    trainDistral(list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
                 learning_rate=0.00025,
                 num_episodes=200,
                 beta=5)