Example 1
    def train_agent(
            self,
            exp_path,  # Path for saving the agent
            model,
            env):  # Can be either single env or vec env

        logger.info("Beginning individual training for {} steps".format(
            self.training_steps))
        model.set_env(env)
        model.learn(self.training_steps)

        logger.info('Finished training agent')
        savepath = self.basicdate + '_pnm_iteration_' + str(self.pnm_iteration)
        agent_filepath, _, _ = helper.save_model_with_env_settings(
            exp_path, model, self.model_type, env, savepath)
        agent_filepath = os.path.dirname(agent_filepath)
        return agent_filepath
Example 2
    def run_pnm(self):

        panther_agent_filepath, pelican_agent_filepath = self.initialAgents()

        # Initialize previous NE values for the stopping criterion
        value_to_pelican = 0.
        mixture_pelicans = np.array([1.])
        mixture_panthers = np.array([1.])

        # Create DataFrames for plotting purposes
        df_cols = [
            "NE_Payoff", "Pelican_BR_Payoff", "Panther_BR_Payoff",
            "Pelican_supp_size", "Panther_supp_size"
        ]
        df = pd.DataFrame(columns=df_cols)
        # second df for periodic rigorous exploitability checks
        exploit_df_cols = [
            "iter", "NE_Payoff", "Pelican_BR_Payoffs", "Panther_BR_Payoffs"
        ]
        exploit_df = pd.DataFrame(columns=exploit_df_cols)

        # Train best responses until a Nash equilibrium is found or max_pnm_iterations is reached
        logger.info('Parallel Nash Memory (PNM)')
        for self.pnm_iteration in range(self.max_pnm_iterations):
            start = time.time()

            logger.info(
                "*********************************************************")
            logger.info('PNM iteration ' + str(self.pnm_iteration + 1) +
                        ' of ' + str(self.max_pnm_iterations))
            logger.info(
                "*********************************************************")

            self.pelicans.append(pelican_agent_filepath)
            self.panthers.append(panther_agent_filepath)

            if self.pnm_iteration == 0:
                self.compute_initial_payoffs()

            # Computing the payoff matrices and solving the corresponding LPs
            # Only compute the payoff matrix for the pelican; the panther's is the negative transpose, since the game is zero-sum (see the LP sketch after this example)
            logger.info('Computing payoffs and mixtures')
            self.compute_payoff_matrix(self.pelicans, self.panthers)
            logger.info("=================================================")
            logger.info("New matrix game:")
            logger.info("As numpy array:")
            logger.info('\n' + str(self.payoffs))
            logger.info("As dataframe:")
            tmp_df = pd.DataFrame(self.payoffs).rename_axis(
                'Pelican', axis=0).rename_axis('Panther', axis=1)
            logger.info('\n' + str(tmp_df))

            # save payoff matrix
            np.save(
                '%s/payoffs_%d.npy' %
                (self.pnm_logs_exp_path, self.pnm_iteration), self.payoffs)

            def get_support_size(mixture):
                # return size of the support of mixed strategy mixture
                return sum([1 if m > 0 else 0 for m in mixture])

            # Check whether we have found a stable NE; if so, we are done (after updating the plotting DataFrame)
            if self.pnm_iteration > 0:
                # Both BR payoffs (against the previous iteration's NE), expressed as payoff to the pelican
                br_value_pelican = np.dot(mixture_pelicans,
                                          self.payoffs[-1, :-1])
                br_value_panther = np.dot(mixture_panthers,
                                          self.payoffs[:-1, -1])

                ssize_pelican = get_support_size(mixture_pelicans)
                ssize_panther = get_support_size(mixture_panthers)

                logger.info(
                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
                logger.info("\nPelican BR payoff: %.3f,\n"
                            "Value of Game: %.3f,\n"
                            "Panther BR payoff: %.3f,\n"
                            "Pelican Supp Size: %d,\n"
                            "Panther Supp Size: %d\n" %
                            (br_value_pelican, value_to_pelican,
                             br_value_panther, ssize_pelican, ssize_panther))
                logger.info(
                    "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
                values = dict(
                    zip(df_cols, [
                        value_to_pelican, br_value_pelican, br_value_panther,
                        ssize_pelican, ssize_panther
                    ]))
                df = df.append(values, ignore_index=True)

                # Write to csv file
                df_path = os.path.join(
                    self.exp_path, 'values_iter_%02d.csv' % self.pnm_iteration)
                df.to_csv(df_path, index=False)
                helper.get_fig(df)
                fig_path = os.path.join(
                    self.exp_path, 'values_iter_%02d.pdf' % self.pnm_iteration)
                plt.savefig(fig_path)
                print("==========================================")
                print("WRITTEN VALUES DF TO CSV: %s" % df_path)
                print("==========================================")

                # here value_to_pelican is from the last time the subgame was solved
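                # If neither best response improves on value_to_pelican by more
                # than stopping_eps, the current mixtures are an approximate
                # Nash equilibrium and the PNM loop can stop.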
                if abs(br_value_pelican - value_to_pelican) < self.stopping_eps and\
                   abs(br_value_panther - value_to_pelican) < self.stopping_eps:

                    print('Stable Nash Equilibrium found')
                    break

            logger.info("SOLVING NEW GAME:")
            # solve game for pelican
            (mixture_pelicans,
             value_to_pelican) = lp_solve.solve_zero_sum_game(self.payoffs)
            # with np.printoptions(precision=3):
            logger.info(mixture_pelicans)
            mixture_pelicans /= np.sum(mixture_pelicans)
            # with np.printoptions(precision=3):
            logger.info("After normalisation:")
            logger.info(mixture_pelicans)
            np.save(
                '%s/mixture_pelicans_%d.npy' %
                (self.pnm_logs_exp_path, self.pnm_iteration), mixture_pelicans)

            # solve game for panther
            (mixture_panthers, value_panthers
             ) = lp_solve.solve_zero_sum_game(-self.payoffs.transpose())
            # with np.printoptions(precision=3):
            logger.info(mixture_panthers)
            mixture_panthers /= np.sum(mixture_panthers)
            # with np.printoptions(precision=3):
            logger.info("After normalisation:")
            logger.info(mixture_panthers)
            np.save(
                '%s/mixture_panthers_%d.npy' %
                (self.pnm_logs_exp_path, self.pnm_iteration), mixture_panthers)

            # end of logging matrix game and solution
            logger.info("=================================================")

            # Train from scratch or retrain an existing model for pelican
            logger.info('Training pelican')

            self.pelican_model = self.bootstrap(self.pelicans,
                                                self.pelican_env,
                                                mixture_pelicans)

            pelican_agent_filepath = self.train_agent_against_mixture(
                'pelican', self.pelicans_tmp_exp_path, self.pelican_model,
                self.pelican_env, self.panthers, mixture_panthers,
                self.training_steps)

            # Train from scratch or retrain an existing model for panther
            logger.info('Training panther')

            self.panther_model = self.bootstrap(self.panthers,
                                                self.panther_env,
                                                mixture_panthers)

            panther_agent_filepath = self.train_agent_against_mixture(
                'panther', self.panthers_tmp_exp_path, self.panther_model,
                self.panther_env, self.pelicans, mixture_pelicans,
                self.training_steps)

            logger.info("PNM iteration lasted: %d seconds" %
                        (time.time() - start))

            if self.pnm_iteration > 0 and self.pnm_iteration % self.testing_interval == 0:
                # Find best pelican (protagonist) against panther (opponent) mixture
                candidate_pelican_rbbr_fpaths, candidate_pelican_rbbr_win_percentages = self.iter_train_against_mixture(
                    self.exploit_n_rbbrs,  # number of resource bounded best responses
                    self.pelicans_tmp_exp_path,
                    self.pelican_model,  # driving agent: the agent that we train
                    self.pelican_env,  # can be a single env or a SubprocVecEnv
                    self.pelicans,  # filepaths to existing pelican models
                    mixture_pelicans,  # mixture for bootstrapping
                    self.panthers,  # policies of the driving agent's opponent
                    mixture_panthers)  # opponent mixture

                logger.info("################################################")
                logger.info(
                    'candidate_pelican_rbbr_win_percentages: %s' %
                    np.round(candidate_pelican_rbbr_win_percentages, 2))
                logger.info("################################################")
                br_values_pelican = np.round(
                    candidate_pelican_rbbr_win_percentages, 2).tolist()

                candidate_panther_rbbr_fpaths, candidate_panther_rbbr_win_percentages = self.iter_train_against_mixture(
                    self.exploit_n_rbbrs,  # number of resource bounded best responses
                    self.panthers_tmp_exp_path,
                    self.panther_model,  # driving agent: the agent that we train
                    self.panther_env,  # can be a single env or a SubprocVecEnv
                    self.panthers,  # filepaths to existing panther models
                    mixture_panthers,  # mixture for bootstrapping
                    self.pelicans,  # policies of the driving agent's opponent
                    mixture_pelicans)  # opponent mixture

                logger.info("################################################")
                logger.info(
                    'candidate_panther_rbbr_win_percentages: %s' %
                    np.round(candidate_panther_rbbr_win_percentages, 2))
                logger.info("################################################")
                br_values_panther = [
                    1 - p for p in np.round(
                        candidate_panther_rbbr_win_percentages, 2)
                ]

                values = dict(
                    zip(exploit_df_cols, [
                        self.pnm_iteration, value_to_pelican,
                        br_values_pelican, br_values_panther
                    ]))
                exploit_df = exploit_df.append(values, ignore_index=True)

                # add medians
                exploit_df['pelican_median'] = exploit_df[
                    'Pelican_BR_Payoffs'].apply(np.median)
                exploit_df['panther_median'] = exploit_df[
                    'Panther_BR_Payoffs'].apply(np.median)

                # Write to csv file
                df_path = os.path.join(
                    self.exp_path,
                    'exploit_iter_%02d.csv' % self.pnm_iteration)

                tmp_df = exploit_df.set_index('iter')
                tmp_df.to_csv(df_path, index=True)

                helper.get_fig_with_exploit(df, tmp_df)
                fig_path = os.path.join(
                    self.exp_path,
                    'values_with_exploit_iter_%02d.pdf' % self.pnm_iteration)
                plt.savefig(fig_path)
                print("==========================================")
                print("WRITTEN EXPLOIT DF TO CSV: %s" % df_path)
                print("==========================================")

                if self.video_flag:
                    # occasionally output useful things along the way
                    # Make videos
                    verbose = False
                    video_path = os.path.join(
                        self.exp_path,
                        'pelican_pnm_iter_%02d.mp4' % self.pnm_iteration)
                    basewidth, hsize = helper.make_video_VEC_ENV(
                        self.pelican_model,
                        self.pelican_env,
                        video_path,
                        fps=self.fps,
                        basewidth=self.basewidth,
                        n_steps=self.video_steps,
                        verbose=verbose)

                    video_path = os.path.join(
                        self.exp_path,
                        'panther_pnm_iter_%02d.mp4' % self.pnm_iteration)
                    basewidth, hsize = helper.make_video_VEC_ENV(
                        self.panther_model,
                        self.panther_env,
                        video_path,
                        fps=self.fps,
                        basewidth=self.basewidth,
                        n_steps=self.video_steps,
                        verbose=verbose)

        # Saving final mixture and corresponding agents
        logger.info("################################################")
        logger.info("Saving final pelican mixtures and agents:")
        support_pelicans = np.nonzero(mixture_pelicans)[0]
        mixture_pelicans = mixture_pelicans[support_pelicans]
        np.save(self.exp_path + '/final_mixture_pelicans.npy',
                mixture_pelicans)
        logger.info("Final pelican mixture saved to: %s" % self.exp_path +
                    '/final_mixture_pelicans.npy')
        print("mixture:")
        print(mixture_pelicans)
        for i, idx in enumerate(support_pelicans):
            self.pelican_model = helper.loadAgent(
                glob.glob(self.pelicans[idx] + "/*.zip")[0], self.model_type)
            self.pelican_model.set_env(self.pelican_env)
            agent_filepath, _, _ = helper.save_model_with_env_settings(
                self.pelicans_tmp_exp_path, self.pelican_model,
                self.model_type, self.pelican_env,
                self.basicdate + "_ps_" + str(i))
            logger.info("Saving pelican %d to %s" % (i, agent_filepath))
        support_panthers = np.nonzero(mixture_panthers)[0]
        mixture_panthers = mixture_panthers[support_panthers]
        np.save(self.exp_path + '/final_mixture_panthers.npy',
                mixture_panthers)
        logger.info("Final panther mixture saved to: %s" % self.exp_path +
                    '/final_mixture_panthers.npy')
        for i, idx in enumerate(support_panthers):
            self.panther_model = helper.loadAgent(
                glob.glob(self.panthers[idx] + "/*.zip")[0], self.model_type)
            self.panther_model.set_env(self.panther_env)
            agent_filepath, _, _ = helper.save_model_with_env_settings(
                self.panthers_tmp_exp_path, self.panther_model,
                self.model_type, self.panther_env,
                self.basicdate + "_ps_" + str(i))

            logger.info("Saving panther %d to %s" % (i, agent_filepath))
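
The calls to lp_solve.solve_zero_sum_game above use the project's own LP helper. As a minimal sketch of the underlying idea (an illustration under assumed tooling, not the project's implementation), the row player's optimal mixture and the game value can be recovered from the payoff matrix with a single linear program, here via scipy.optimize.linprog; the panther's mixture is then obtained by solving the same LP on -payoffs.transpose(), exactly as run_pnm does.

import numpy as np
from scipy.optimize import linprog

def solve_zero_sum_game(payoffs):
    # Variables are [x_1, ..., x_m, v]: x is the row player's mixed strategy,
    # v is the value of the game to the row player (to be maximised).
    n_rows, n_cols = payoffs.shape
    c = np.zeros(n_rows + 1)
    c[-1] = -1.0  # linprog minimises, so minimise -v
    # For every opponent column j: v - sum_i payoffs[i, j] * x_i <= 0
    A_ub = np.hstack([-payoffs.T, np.ones((n_cols, 1))])
    b_ub = np.zeros(n_cols)
    # Probabilities sum to one (v gets coefficient 0)
    A_eq = np.ones((1, n_rows + 1))
    A_eq[0, -1] = 0.0
    b_eq = np.array([1.0])
    bounds = [(0.0, 1.0)] * n_rows + [(None, None)]
    res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds)
    return res.x[:-1], res.x[-1]  # (mixture, game value)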
Example 3
    def train_agent_against_mixture(
            self,
            driving_agent,  # agent that we train
            exp_path,
            model,
            env,  # Can be a single env or a SubprocVecEnv
            opponent_policy_fpaths,  # policies of the opponent of the driving agent
            opponent_mixture,  # mixture of the opponent of the driving agent
            training_steps,
            filepath_addon=''):

        ################################################################
        # Heuristic to compute the number of opponents to sample from the
        # mixture (a worked example follows this snippet)
        ################################################################
        # Min positive probability
        min_prob = min([pr for pr in opponent_mixture if pr > 0])
        target_n_opponents = self.num_parallel_envs * int(1.0 / min_prob)
        n_opponents = min(target_n_opponents, self.max_n_opponents_to_sample)

        if self.parallel:
            # Ensure that n_opponents is a multiple of num_parallel_envs
            n_opponents = self.num_parallel_envs * round(
                n_opponents / self.num_parallel_envs)

        logger.info("=============================================")
        logger.info("Sampling %d opponents" % n_opponents)
        logger.info("=============================================")

        # Sample n_opponents
        opponents = np.random.choice(opponent_policy_fpaths,
                                     size=n_opponents,
                                     p=opponent_mixture)

        logger.info("=============================================")
        logger.info("Opponents list has %d elements" % len(opponents))
        logger.info("=============================================")

        # If we use parallel envs, we run all the training against different sampled opponents in parallel
        if self.parallel:
            # Method to load new opponents via filepath
            setter = 'set_panther_using_path' if driving_agent == 'pelican' else 'set_pelican_using_path'
            for i, opponent in enumerate(opponents):
                # Stick this in the right slot, looping back after self.num_parallel_envs
                env.env_method(setter,
                               opponent,
                               indices=[i % self.num_parallel_envs])
                # When we have filled all self.num_parallel_envs, then train
                if i > 0 and (i + 1) % self.num_parallel_envs == 0:
                    logger.info(
                        "Beginning parallel training for {} steps".format(
                            self.training_steps))
                    model.set_env(env)
                    model.learn(training_steps)

        # Otherwise we sample different opponents and we train against each of them separately
        else:
            for opponent in opponents:
                if driving_agent == 'pelican':
                    env.set_panther_using_path(opponent)
                else:
                    env.set_pelican_using_path(opponent)
                logger.info(
                    "Beginning sequential training for {} steps".format(
                        self.training_steps))
                model.set_env(env)
                model.learn(self.training_steps)

        # Save agent
        logger.info('Finished training agent')
        savepath = self.basicdate + '_pnm_iteration_' + str(
            self.pnm_iteration) + filepath_addon
        agent_filepath, _, _ = helper.save_model_with_env_settings(
            exp_path, model, self.model_type, env, savepath)
        agent_filepath = os.path.dirname(agent_filepath)
        return agent_filepath
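
A worked example of the opponent-sampling heuristic at the top of train_agent_against_mixture, using made-up numbers (num_parallel_envs = 4, max_n_opponents_to_sample = 64 and the mixture below are purely illustrative):

import numpy as np

num_parallel_envs = 4
max_n_opponents_to_sample = 64
opponent_mixture = np.array([0.6, 0.3, 0.1, 0.0])  # hypothetical NE mixture
opponent_policy_fpaths = ['opp_a', 'opp_b', 'opp_c', 'opp_d']  # placeholders

# The smallest positive probability decides how many samples are needed so
# that even the rarest supported opponent is likely to appear in the batch.
min_prob = min(pr for pr in opponent_mixture if pr > 0)           # 0.1
target_n_opponents = num_parallel_envs * int(1.0 / min_prob)      # 4 * 10 = 40
n_opponents = min(target_n_opponents, max_n_opponents_to_sample)  # 40
# In the parallel case, round to a multiple of num_parallel_envs so that every
# vectorised env slot receives an opponent.
n_opponents = num_parallel_envs * round(n_opponents / num_parallel_envs)  # 40

opponents = np.random.choice(opponent_policy_fpaths,
                             size=n_opponents,
                             p=opponent_mixture)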