    def save_model(self, env_name, suffix="", actor_path=None):
        """
        Save the Actor Model after training is completed.
        :param env_name: The environment name.
        :param suffix: The optional suffix.
        :param actor_path: The path to save the actor.
        :return: None
        """
        os.makedirs('models/', exist_ok=True)

        if actor_path is None:
            actor_path = "models/actor_{}_{}".format(env_name, suffix)
        tprint('Saving model to {}'.format(actor_path))
        torch.save(self.actor.state_dict(), actor_path)
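

# A minimal sketch (not part of the original module) showing how a checkpoint
# written by save_model() could be restored; `learner` is assumed to expose the
# same `actor` attribute used above, and the path mirrors save_model().
def load_actor_checkpoint(learner, actor_path, device):
    """Load saved actor weights into a learner and switch the actor to eval mode."""
    state_dict = torch.load(actor_path, map_location=device)
    learner.actor.load_state_dict(state_dict)
    learner.actor.eval()  # deterministic behaviour for evaluation rollouts
    return learner
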
def train_dagger(env, args, device):
    """
    Train a multi-agent policy with DAgger: roll out a mixture of the expert
    controller and the learner, aggregate expert-labelled transitions in a
    replay buffer, and fit the learner to the expert's actions.
    """
    debug = args.getboolean('debug')
    memory = ReplayBuffer(max_size=args.getint('buffer_size'))
    learner = DAGGER(device, args)

    n_a = args.getint('n_actions')
    n_agents = args.getint('n_agents')
    batch_size = args.getint('batch_size')

    n_train_episodes = args.getint('n_train_episodes')
    beta_coeff = args.getfloat('beta_coeff')
    test_interval = args.getint('test_interval')
    n_test_episodes = args.getint('n_test_episodes')

    total_numsteps = 0
    updates = 0
    beta = 1

    stats = {'mean': -np.inf, 'std': 0}

    for i in range(n_train_episodes):

        # Decay the probability of taking the expert action, floored at 0.5.
        beta = max(beta * beta_coeff, 0.5)

        state = MultiAgentStateWithDelay(device,
                                         args,
                                         env.reset(),
                                         prev_state=None)

        done = False
        policy_loss_sum = 0
        while not done:

            optimal_action = env.env.controller()
            # With probability beta take the expert controller's action;
            # otherwise roll out the learner's current policy (DAgger mixing).
            if np.random.binomial(1, beta) > 0:
                action = optimal_action
            else:
                action = learner.select_action(state)
                action = action.cpu().numpy()

            next_state, reward, done, _ = env.step(action)

            next_state = MultiAgentStateWithDelay(device,
                                                  args,
                                                  next_state,
                                                  prev_state=state)

            total_numsteps += 1

            # Store the termination flag and reward as tensors on the training device.
            notdone = torch.Tensor([not done]).to(device)
            reward = torch.Tensor([reward]).to(device)

            # The expert action is (N, nA); the buffer stores it as (B=1, 1, nA, N).
            optimal_action = torch.Tensor(optimal_action).to(device)
            optimal_action = optimal_action.transpose(1, 0)
            optimal_action = optimal_action.reshape((1, 1, n_a, n_agents))

            memory.insert(
                Transition(state, optimal_action, notdone, next_state, reward))

            state = next_state

        # Fit the learner to the expert actions accumulated in the replay buffer.
        if memory.curr_size > batch_size:
            for _ in range(args.getint('updates_per_step')):
                transitions = memory.sample(batch_size)
                batch = Transition(*zip(*transitions))
                policy_loss = learner.gradient_step(batch)
                policy_loss_sum += policy_loss
                updates += 1

        if i % test_interval == 0 and debug:
            test_rewards = []
            for _ in range(n_test_episodes):
                ep_reward = 0
                state = MultiAgentStateWithDelay(device,
                                                 args,
                                                 env.reset(),
                                                 prev_state=None)
                done = False
                while not done:
                    action = learner.select_action(state)
                    next_state, reward, done, _ = env.step(
                        action.cpu().numpy())
                    next_state = MultiAgentStateWithDelay(device,
                                                          args,
                                                          next_state,
                                                          prev_state=state)
                    ep_reward += reward
                    state = next_state
                    # env.render()
                test_rewards.append(ep_reward)

            mean_reward = np.mean(test_rewards)
            if stats['mean'] < mean_reward:
                stats['mean'] = mean_reward
                stats['std'] = np.std(test_rewards)

                if debug and args.get('fname'):  # save the best model
                    learner.save_model(args.get('env'),
                                       suffix=args.get('fname'))

            if debug:
                statistics = env.get_stats()
                tprint(
                    "Episode: {}, updates: {}, total numsteps: {}, reward: {}, policy loss: {}, vel_diffs: {}, min_dists: {}"
                    .format(i, updates, total_numsteps, mean_reward,
                            policy_loss_sum, np.mean(statistics['vel_diffs']),
                            np.mean(statistics['min_dists'])))

    # Final evaluation after training.
    test_rewards = []
    for _ in range(n_test_episodes):
        ep_reward = 0
        state = MultiAgentStateWithDelay(device,
                                         args,
                                         env.reset(),
                                         prev_state=None)
        done = False
        while not done:
            action = learner.select_action(state)
            next_state, reward, done, _ = env.step(action.cpu().numpy())
            next_state = MultiAgentStateWithDelay(device,
                                                  args,
                                                  next_state,
                                                  prev_state=state)
            ep_reward += reward
            state = next_state
            # env.render()
        test_rewards.append(ep_reward)

    mean_reward = np.mean(test_rewards)
    stats['mean'] = mean_reward
    stats['std'] = np.std(test_rewards)

    statistics = env.get_stats()

    stats['vel_diffs'] = statistics['vel_diffs']
    stats['min_dists'] = statistics['min_dists']

    if debug and args.get('fname'):  # save the best model
        learner.save_model(args.get('env'), suffix=args.get('fname'))

    env.close()
    return stats
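

# The test rollout above appears twice in train_dagger and again (with online
# adaptation) in train_CTADAGGER below. The helper sketched here is for
# reference only: it is not called by the original code and assumes the same
# env / learner interfaces used above.
def evaluate_policy(env, learner, args, device, n_episodes):
    """Run n_episodes greedy rollouts and return the list of episode rewards."""
    rewards = []
    for _ in range(n_episodes):
        ep_reward = 0
        state = MultiAgentStateWithDelay(device, args, env.reset(), prev_state=None)
        done = False
        while not done:
            action = learner.select_action(state)
            next_state, reward, done, _ = env.step(action.cpu().numpy())
            state = MultiAgentStateWithDelay(device, args, next_state, prev_state=state)
            ep_reward += reward
        rewards.append(ep_reward)
    return rewards
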
def train_CTADAGGER(env, args, device):
    """
    DAgger training for the CTADAGGER learner. Mirrors train_dagger, but
    evaluation rollouts use select_action_online() and adapt the learner
    online via online_step() after every environment step.
    """
    debug = args.getboolean('debug')
    memory = ReplayBuffer(max_size=args.getint('buffer_size'))
    learner = CTADAGGER(device, args)

    n_a = args.getint('n_actions')
    n_agents = args.getint('n_agents')
    batch_size = args.getint('batch_size')

    n_train_episodes = args.getint('n_train_episodes')
    beta_coeff = args.getfloat('beta_coeff')
    test_interval = args.getint('test_interval')
    n_test_episodes = args.getint('n_test_episodes')

    total_numsteps = 0
    updates = 0
    beta = 1

    stats = {'mean': -np.inf, 'std': 0}

    for i in range(n_train_episodes):

        # Decay the probability of taking the expert action, floored at 0.5.
        beta = max(beta * beta_coeff, 0.5)

        state = MultiAgentStateWithDelay(device,
                                         args,
                                         env.reset(),
                                         prev_state=None)

        done = False
        policy_loss_sum = 0
        while not done:

            optimal_action = env.env.controller()
            # With probability beta take the expert controller's action;
            # otherwise roll out the learner's current policy (DAgger mixing).
            if np.random.binomial(1, beta) > 0:
                action = optimal_action
            else:
                action = learner.select_action(state, True)
                action = action.cpu().numpy()

            next_state, reward, done, _ = env.step(action)

            next_state = MultiAgentStateWithDelay(device,
                                                  args,
                                                  next_state,
                                                  prev_state=state)

            total_numsteps += 1

            # Store the termination flag and reward as tensors on the training device.
            notdone = torch.Tensor([not done]).to(device)
            reward = torch.Tensor([reward]).to(device)

            # The expert action is (N, nA); the buffer stores it as (B=1, 1, nA, N).
            optimal_action = torch.Tensor(optimal_action).to(device)
            optimal_action = optimal_action.transpose(1, 0)
            optimal_action = optimal_action.reshape((1, 1, n_a, n_agents))

            memory.insert(
                Transition(state, optimal_action, notdone, next_state, reward))

            state = next_state

        if memory.curr_size > batch_size:
            for _ in range(args.getint('updates_per_step')):
                transitions = memory.sample(batch_size)
                batch = Transition(*zip(*transitions))
                policy_loss = learner.gradient_step(batch, i == 0)
                policy_loss_sum += policy_loss
                updates += 1

        if i % test_interval == 0 and debug:
            # (Re)initialize the learner's online-learning state before the evaluation rollouts.
            learner.initialize_online_learning(i == 0)
            test_rewards = []
            for _ in range(n_test_episodes):
                ep_reward = 0
                state = MultiAgentStateWithDelay(device,
                                                 args,
                                                 env.reset(),
                                                 prev_state=None)
                done = False
                while not done:
                    action = learner.select_action_online(state)
                    act_norm = action.cpu().numpy()
                    next_state, reward, done, _ = env.step(act_norm)
                    next_state = MultiAgentStateWithDelay(device,
                                                          args,
                                                          next_state,
                                                          prev_state=state)
                    ep_reward += reward
                    state = next_state
                    learner.online_step(env, action)
                    # env.render()
                test_rewards.append(ep_reward)

            mean_reward = np.mean(test_rewards)
            if stats['mean'] < mean_reward:
                stats['mean'] = mean_reward
                stats['std'] = np.std(test_rewards)

                if debug and args.get('fname'):  # save the best model
                    learner.save_model(args.get('env'),
                                       suffix=args.get('fname'))

            if debug:
                statistics = env.get_stats()
                tprint(
                    "Episode: {}, updates: {}, total numsteps: {}, reward: {}, policy loss: {}, vel_diffs: {}, min_dists: {}"
                    .format(i, updates, total_numsteps, mean_reward,
                            policy_loss_sum, np.mean(statistics['vel_diffs']),
                            np.mean(statistics['min_dists'])))

    # Final evaluation with online adaptation after training.
    learner.initialize_online_learning(False)
    test_rewards = []
    for _ in range(n_test_episodes):
        ep_reward = 0
        state = MultiAgentStateWithDelay(device,
                                         args,
                                         env.reset(),
                                         prev_state=None)
        done = False
        while not done:
            action = learner.select_action_online(state)
            act_norm = action.cpu().numpy()
            next_state, reward, done, _ = env.step(act_norm)
            next_state = MultiAgentStateWithDelay(device,
                                                  args,
                                                  next_state,
                                                  prev_state=state)
            ep_reward += reward
            state = next_state
            learner.online_step(env, action)
            # env.render()
        test_rewards.append(ep_reward)

    mean_reward = np.mean(test_rewards)
    stats['mean'] = mean_reward
    stats['std'] = np.std(test_rewards)

    statistics = env.get_stats()

    stats['vel_diffs'] = statistics['vel_diffs']
    stats['min_dists'] = statistics['min_dists']

    if debug and args.get('fname'):  # save the best model
        learner.save_model(args.get('env'), suffix=args.get('fname'))

    env.close()
    return stats
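

# A minimal usage sketch (not part of the original module). It assumes the
# hyperparameters are read with configparser (matching the args.getint /
# args.getboolean calls above) from a hypothetical 'cfg/dagger.cfg' file, and
# that a project-specific make_env() helper builds the wrapped environment;
# both names are illustrative only.
def _example_main():
    import configparser

    config = configparser.ConfigParser()
    config.read('cfg/dagger.cfg')             # hypothetical config path
    args = config[config.sections()[0]]       # first section holds the hyperparameters

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    env = make_env(args)                       # hypothetical environment constructor

    stats = train_dagger(env, args, device)    # or train_CTADAGGER(env, args, device)
    tprint('mean reward: {:.3f} +/- {:.3f}'.format(stats['mean'], stats['std']))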