def save_model(self, env_name, suffix="", actor_path=None):
    """
    Save the actor model after training is completed.
    :param env_name: The environment name.
    :param suffix: An optional file-name suffix.
    :param actor_path: The path at which to save the actor.
    :return: None
    """
    if not os.path.exists('models/'):
        os.makedirs('models/')
    if actor_path is None:
        actor_path = "models/actor_{}_{}".format(env_name, suffix)
    tprint('Saving model to {}'.format(actor_path))
    torch.save(self.actor.state_dict(), actor_path)
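
# A minimal sketch of restoring an actor saved by save_model, assuming a
# load_model counterpart is wanted. This helper is hypothetical (not in the
# original module); it only uses standard PyTorch APIs (torch.load,
# Module.load_state_dict) and the same `self.actor` / `tprint` as above.
def load_model(self, actor_path):
    """Load actor weights previously written by save_model (hypothetical)."""
    tprint('Loading model from {}'.format(actor_path))
    self.actor.load_state_dict(torch.load(actor_path))
    self.actor.eval()  # switch to evaluation mode for testing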
def train_dagger(env, args, device):
    debug = args.getboolean('debug')
    memory = ReplayBuffer(max_size=args.getint('buffer_size'))
    learner = DAGGER(device, args)

    n_a = args.getint('n_actions')
    n_agents = args.getint('n_agents')
    batch_size = args.getint('batch_size')
    n_train_episodes = args.getint('n_train_episodes')
    beta_coeff = args.getfloat('beta_coeff')
    test_interval = args.getint('test_interval')
    n_test_episodes = args.getint('n_test_episodes')

    total_numsteps = 0
    updates = 0
    beta = 1
    stats = {'mean': -np.inf, 'std': 0}

    for i in range(n_train_episodes):
        # Decay the probability of acting with the expert, floored at 0.5.
        beta = max(beta * beta_coeff, 0.5)
        state = MultiAgentStateWithDelay(device, args, env.reset(), prev_state=None)
        done = False
        policy_loss_sum = 0
        while not done:
            optimal_action = env.env.controller()
            if np.random.binomial(1, beta) > 0:
                action = optimal_action
            else:
                action = learner.select_action(state)
                action = action.cpu().numpy()

            next_state, reward, done, _ = env.step(action)
            next_state = MultiAgentStateWithDelay(device, args, next_state, prev_state=state)
            total_numsteps += 1

            notdone = torch.Tensor([not done]).to(device)
            reward = torch.Tensor([reward]).to(device)

            # The expert action is (N, nA); the buffer needs (B, 1, nA, N).
            optimal_action = torch.Tensor(optimal_action).to(device)
            optimal_action = optimal_action.transpose(1, 0)
            optimal_action = optimal_action.reshape((1, 1, n_a, n_agents))

            # DAGGER always stores the expert's action, regardless of which
            # action was actually executed.
            memory.insert(Transition(state, optimal_action, notdone, next_state, reward))

            state = next_state

            if memory.curr_size > batch_size:
                for _ in range(args.getint('updates_per_step')):
                    transitions = memory.sample(batch_size)
                    batch = Transition(*zip(*transitions))
                    policy_loss = learner.gradient_step(batch)
                    policy_loss_sum += policy_loss
                    updates += 1

        if i % test_interval == 0 and debug:
            test_rewards = []
            for _ in range(n_test_episodes):
                ep_reward = 0
                state = MultiAgentStateWithDelay(device, args, env.reset(), prev_state=None)
                done = False
                while not done:
                    action = learner.select_action(state)
                    next_state, reward, done, _ = env.step(action.cpu().numpy())
                    next_state = MultiAgentStateWithDelay(device, args, next_state, prev_state=state)
                    ep_reward += reward
                    state = next_state
                    # env.render()
                test_rewards.append(ep_reward)

            mean_reward = np.mean(test_rewards)
            if stats['mean'] < mean_reward:
                stats['mean'] = mean_reward
                stats['std'] = np.std(test_rewards)
                if debug and args.get('fname'):
                    # save the best model
                    learner.save_model(args.get('env'), suffix=args.get('fname'))

            if debug:
                statistics = env.get_stats()
                tprint(
                    "Episode: {}, updates: {}, total numsteps: {}, reward: {}, policy loss: {}, vel_diffs: {}, min_dists: {}"
                    .format(i, updates, total_numsteps, mean_reward, policy_loss_sum,
                            np.mean(statistics['vel_diffs']), np.mean(statistics['min_dists'])))

    # Final evaluation after training.
    test_rewards = []
    for _ in range(n_test_episodes):
        ep_reward = 0
        state = MultiAgentStateWithDelay(device, args, env.reset(), prev_state=None)
        done = False
        while not done:
            action = learner.select_action(state)
            next_state, reward, done, _ = env.step(action.cpu().numpy())
            next_state = MultiAgentStateWithDelay(device, args, next_state, prev_state=state)
            ep_reward += reward
            state = next_state
            # env.render()
        test_rewards.append(ep_reward)

    mean_reward = np.mean(test_rewards)
    stats['mean'] = mean_reward
    stats['std'] = np.std(test_rewards)

    statistics = env.get_stats()
    stats['vel_diffs'] = statistics['vel_diffs']
    stats['min_dists'] = statistics['min_dists']

    if debug and args.get('fname'):
        # save the final model
        learner.save_model(args.get('env'), suffix=args.get('fname'))

    env.close()
    return stats
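
# A minimal usage sketch, not part of the original module: `args` is read via
# .getboolean/.getint/.getfloat, which matches a configparser section proxy.
# `make_env`, the 'dagger.cfg' file name, and this helper are assumptions for
# illustration only.
def run_dagger_from_config(config_path='dagger.cfg'):
    import configparser
    config = configparser.ConfigParser()
    config.read(config_path)
    section = config[config.sections()[0]]  # first section holds the hyperparameters
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    env = make_env(section)  # hypothetical environment constructor
    return train_dagger(env, section, device)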
def train_CTADAGGER(env, args, device):
    debug = args.getboolean('debug')
    memory = ReplayBuffer(max_size=args.getint('buffer_size'))
    learner = CTADAGGER(device, args)

    n_a = args.getint('n_actions')
    n_agents = args.getint('n_agents')
    batch_size = args.getint('batch_size')
    n_train_episodes = args.getint('n_train_episodes')
    beta_coeff = args.getfloat('beta_coeff')
    test_interval = args.getint('test_interval')
    n_test_episodes = args.getint('n_test_episodes')

    total_numsteps = 0
    updates = 0
    beta = 1
    stats = {'mean': -np.inf, 'std': 0}

    for i in range(n_train_episodes):
        # Decay the probability of acting with the expert, floored at 0.5.
        beta = max(beta * beta_coeff, 0.5)
        state = MultiAgentStateWithDelay(device, args, env.reset(), prev_state=None)
        done = False
        policy_loss_sum = 0
        while not done:
            optimal_action = env.env.controller()
            if np.random.binomial(1, beta) > 0:
                action = optimal_action
            else:
                action = learner.select_action(state, True)
                action = action.cpu().numpy()

            next_state, reward, done, _ = env.step(action)
            next_state = MultiAgentStateWithDelay(device, args, next_state, prev_state=state)
            total_numsteps += 1

            notdone = torch.Tensor([not done]).to(device)
            reward = torch.Tensor([reward]).to(device)

            # The expert action is (N, nA); the buffer needs (B, 1, nA, N).
            optimal_action = torch.Tensor(optimal_action).to(device)
            optimal_action = optimal_action.transpose(1, 0)
            optimal_action = optimal_action.reshape((1, 1, n_a, n_agents))

            # Always store the expert's action, regardless of which action
            # was actually executed.
            memory.insert(Transition(state, optimal_action, notdone, next_state, reward))

            state = next_state

            if memory.curr_size > batch_size:
                for _ in range(args.getint('updates_per_step')):
                    transitions = memory.sample(batch_size)
                    batch = Transition(*zip(*transitions))
                    policy_loss = learner.gradient_step(batch, i == 0)
                    policy_loss_sum += policy_loss
                    updates += 1

        if i % test_interval == 0 and debug:
            # Re-initialize the online learner before each evaluation round.
            learner.initialize_online_learning(i == 0)
            test_rewards = []
            for _ in range(n_test_episodes):
                ep_reward = 0
                state = MultiAgentStateWithDelay(device, args, env.reset(), prev_state=None)
                done = False
                while not done:
                    action = learner.select_action_online(state)
                    act_norm = action.cpu().numpy()
                    next_state, reward, done, _ = env.step(act_norm)
                    next_state = MultiAgentStateWithDelay(device, args, next_state, prev_state=state)
                    ep_reward += reward
                    state = next_state
                    # Adapt the online policy after each environment step.
                    learner.online_step(env, action)
                    # env.render()
                test_rewards.append(ep_reward)

            mean_reward = np.mean(test_rewards)
            if stats['mean'] < mean_reward:
                stats['mean'] = mean_reward
                stats['std'] = np.std(test_rewards)
                if debug and args.get('fname'):
                    # save the best model
                    learner.save_model(args.get('env'), suffix=args.get('fname'))

            if debug:
                statistics = env.get_stats()
                tprint(
                    "Episode: {}, updates: {}, total numsteps: {}, reward: {}, policy loss: {}, vel_diffs: {}, min_dists: {}"
                    .format(i, updates, total_numsteps, mean_reward, policy_loss_sum,
                            np.mean(statistics['vel_diffs']), np.mean(statistics['min_dists'])))

    # Final evaluation with online adaptation after training.
    learner.initialize_online_learning(False)
    test_rewards = []
    for _ in range(n_test_episodes):
        ep_reward = 0
        state = MultiAgentStateWithDelay(device, args, env.reset(), prev_state=None)
        done = False
        while not done:
            action = learner.select_action_online(state)
            act_norm = action.cpu().numpy()
            next_state, reward, done, _ = env.step(act_norm)
            next_state = MultiAgentStateWithDelay(device, args, next_state, prev_state=state)
            ep_reward += reward
            state = next_state
            learner.online_step(env, action)
            # env.render()
        test_rewards.append(ep_reward)

    mean_reward = np.mean(test_rewards)
    stats['mean'] = mean_reward
    stats['std'] = np.std(test_rewards)

    statistics = env.get_stats()
    stats['vel_diffs'] = statistics['vel_diffs']
    stats['min_dists'] = statistics['min_dists']

    if debug and args.get('fname'):
        # save the final model
        learner.save_model(args.get('env'), suffix=args.get('fname'))

    env.close()
    return stats
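
# A minimal sketch of consuming the stats dict returned by train_dagger or
# train_CTADAGGER; the keys ('mean', 'std', 'vel_diffs', 'min_dists') are the
# ones populated above. This reporting helper itself is an assumption, not
# part of the original module.
def report_stats(stats):
    tprint('reward: {:.3f} +/- {:.3f}'.format(stats['mean'], stats['std']))
    tprint('mean vel_diffs: {:.3f}'.format(np.mean(stats['vel_diffs'])))
    tprint('mean min_dists: {:.3f}'.format(np.mean(stats['min_dists'])))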