def run(self):
    # Reparse the flags for this process.
    FLAGS = flags.FLAGS
    FLAGS(sys.argv)

    # Reload gin configurations for this process.
    gin_files = [
        osp.join(settings.SRC_DIR, "configs", f"{x}.gin")
        for x in FLAGS.config_files
    ]
    gin.parse_config_files_and_bindings(
        config_files=gin_files,
        bindings=FLAGS.config_overrides,
        skip_unknown=False)

    for job in iter(self.job_queue.get, None):
        game = job()

        writer = SummaryWriter(
            logdir=osp.join(settings.get_run_dir(), f"epoch_{self.epoch}"))

        best_deviation, report = training.train(
            game=game,
            identity=self.is_attacker,
            opponent_mix_str=self.opponent_mixed_strategy,
            epoch=self.epoch,
            writer=writer)

        self.result_queue.put((self.is_attacker, best_deviation, report))
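# A minimal launch sketch for the worker above (illustrative, not from the
# source; assumes `LearnerWorker` and `CloudpickleWrapper` are in scope, as in
# the EGTA driver below). The workers run under the "spawn" start method,
# which is why `run()` re-parses the absl flags and gin bindings: a spawned
# child begins in a fresh interpreter and inherits no module-level state.
def _example_worker_launch(game, mix_str_def, epoch):
    import multiprocessing
    job_queue = multiprocessing.SimpleQueue()
    result_queue = multiprocessing.SimpleQueue()
    # Train an attacker best-response against the defender's mixed strategy.
    worker = LearnerWorker(job_queue, result_queue, 1, mix_str_def, epoch)
    worker.start()
    job_queue.put(CloudpickleWrapper(game))  # One training job.
    job_queue.put(None)                      # Sentinel for clean shutdown.
    worker.join()
    return result_queue.get()  # (is_attacker, best_deviation, report).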
def _get_gambit_nash_path():
    """ Get path to the Nash equilibrium file output by Gambit.

    :return: Filepath.
    :rtype: str
    """
    gambit_path = osp.join(settings.get_run_dir(), "nash.txt")
    return gambit_path
def test_dqn_cartpole():
    """ Test DQN.

    References:
     - https://github.com/google/dopamine/blob/master/dopamine/agents/dqn/configs/dqn_cartpole.gin
    """
    from attackgraph.rl.dqn import DQN
    import wandb

    flags.DEFINE_string("run_name", "test_dqn_cartpole", "")
    FLAGS = flags.FLAGS
    FLAGS(sys.argv)

    wandb.init(
        project="test_dqn_cartpole",
        dir=settings.get_run_dir(),
        resume=False)

    def _policy_factory(*args, **kwargs):
        """ Generate a new policy. """
        return DQN(
            is_attacker=True,
            state_dim=4,
            hidden_sizes=[8, 4],
            action_dim=2,
            lr=0.0001)

    env = gym.make("CartPole-v0")
    env.seed(0)
    env = EnvToAttackGraph(env)

    trainer = Learner(
        seed=0,
        # Policy.
        get_new_policy=_policy_factory,
        exploration_fraction=0.4,
        exploration_final_eps=0.01,
        # Time.
        total_timesteps=400000,
        learning_starts=500,
        train_freq=4,
        target_network_update_freq=10000,
        gamma=0.9,
        # Replay buffer.
        batch_size=512,
        buffer_size=1000000)

    trainer.learn_multi_nets(env, epoch=0)
def run(self):
    # Because we are "spawning" the process instead of "forking" the process,
    # we need to reimport the run's configurations.

    # Reparse the flags for this process.
    FLAGS = flags.FLAGS
    FLAGS(sys.argv)

    # Reload gin configurations for this process.
    gin_files = [
        osp.join(settings.SRC_DIR, "configs", f"{x}.gin")
        for x in FLAGS.config_files
    ]
    gin.parse_config_files_and_bindings(
        config_files=gin_files,
        bindings=FLAGS.config_overrides,
        skip_unknown=False)

    policy_name = "attacker" if self.train_attacker else "defender"

    for job in iter(self.job_queue.get, None):
        # The game we're given has no policies and has not been initialized.
        game, opponent = job
        game = game()  # Unpickle game.

        # Register the opponent we will be playing as the opponent's only policy.
        if self.train_attacker:
            game.add_def_str(opponent)
        else:
            game.add_att_str(opponent)

        # The opponent sampling is done from the result directory, so we need
        # to copy any model we use into the policy set.
        if self.train_attacker:
            opponent_dir = settings.get_defender_strategy_dir()
        else:
            opponent_dir = settings.get_attacker_strategy_dir()
        new_filepath = osp.join(opponent_dir, osp.basename(opponent))
        shutil.copyfile(src=opponent, dst=new_filepath)

        save_path = osp.join(settings.get_run_dir(), osp.basename(opponent))
        save_path = save_path[:-4]  # Remove ".pkl".

        training.train(
            game=game,
            identity=int(self.train_attacker),
            opponent_mix_str=np.array([1.0]),
            epoch=osp.basename(opponent),
            writer=SummaryWriter(logdir=save_path),
            save_path=osp.join(save_path, f"{policy_name}.pkl"))
def _train(policy_save_path, opponent, writer):
    env = GridWorldSoccer()
    env = MultiToSingleAgentWrapper(
        env=env,
        agent_id=1,
        opponents={2: opponent})

    save_path = osp.join(settings.get_run_dir(), osp.basename(policy_save_path))
    save_path = save_path[:-4]  # Remove ".pkl".

    trainer = Trainer(policy_ctor=DQN)
    best_response, _, replay_buffer, _ = trainer.run(
        env=env,
        name=osp.basename(policy_save_path),
        writer=writer)

    # Save data to the results folder for QMixture.
    torch.save(best_response, f"{save_path}.pkl", pickle_module=dill)
    fp.save_pkl(replay_buffer, f"{save_path}.replay_buffer.pkl")

    return best_response, replay_buffer
def _train_classifier(classifier, buffer_paths, mixture, env,
                      test_split: float, training_attacker: bool):
    """ Train an opponent classifier. """
    # Load all the replay buffers and merge/split them.
    logger.info("Loading replay buffers from:")
    labels = []
    replay_buffers = []
    for buffer_i, path in enumerate(buffer_paths):
        logger.info(f"  - {path}")
        replay_buffers += [fp.load_pkl(path)]
        labels += [np.ones([len(replay_buffers[-1])]) * buffer_i]
    replay_buffer = merge_replay_buffers(replay_buffers)

    # We only want the state.
    replay_buffer = [x[0] for x in replay_buffer._storage]
    replay_buffer = np.array(replay_buffer)
    labels = np.ravel(labels)
    assert replay_buffer.shape[0] == labels.shape[0]

    # Shuffle the data.
    new_indices = np.random.permutation(len(labels))
    replay_buffer = replay_buffer[new_indices]
    labels = labels[new_indices]

    # Train/test split.
    n_test_data = int(len(labels) * test_split)

    # Train the opponent classifier.
    classifier = supervised_learning(
        net=classifier,
        train_X=replay_buffer[:-n_test_data],
        train_Y=labels[:-n_test_data],
        test_X=replay_buffer[-n_test_data:],
        test_Y=labels[-n_test_data:],
        criterion=gin.REQUIRED,
        n_epochs=gin.REQUIRED,
        eval_freq=gin.REQUIRED,
        batch_size=gin.REQUIRED,
        log_dir=settings.get_run_dir())
    return classifier
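# A standalone worked example (not from the source) of the labeling scheme in
# `_train_classifier`: every stored state is tagged with the index of the
# replay buffer, and therefore the opponent, it was collected against.
def _label_scheme_example():
    buffers = [np.zeros([2, 4]), np.ones([2, 4])]  # States from two opponents.
    labels = np.ravel([np.ones([len(b)]) * i for i, b in enumerate(buffers)])
    states = np.concatenate(buffers)
    assert labels.tolist() == [0.0, 0.0, 1.0, 1.0]
    return states, labels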
def main(argv):
    """ Run evaluation script.

    :param argv: Command line arguments.
    """
    # Configure information displayed to terminal.
    np.set_printoptions(precision=2)
    warnings.filterwarnings("ignore")

    # Set-up the result directory.
    run_dir = settings.get_run_dir()
    if osp.exists(run_dir):
        print("Cannot resume a previously saved run; overwriting data.")
    else:
        os.mkdir(run_dir)

    # Set-up logging.
    logger = logging.getLogger("attackgraph")
    logger.setLevel(logging.INFO)
    logger.propagate = False
    logger.handlers = []  # absl has a default handler that we need to remove.
    formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s")
    # Log to terminal.
    terminal_handler = logging.StreamHandler()
    terminal_handler.setFormatter(formatter)
    # Log to file.
    file_handler = logging.FileHandler(osp.join(run_dir, "out.log"))
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    # Debug output.
    debug_handler = logging.FileHandler(osp.join(run_dir, "debug.log"))
    debug_handler.setLevel(logging.DEBUG)
    debug_handler.setFormatter(formatter)
    # Register handlers.
    logger.addHandler(terminal_handler)
    logger.addHandler(file_handler)
    logger.addHandler(debug_handler)

    logger.info(f"Saving results to: {run_dir}")

    # Set-up gin configuration.
    gin_files = [
        osp.join(settings.SRC_DIR, "configs", f"{x}.gin")
        for x in FLAGS.config_files
    ]
    gin.parse_config_files_and_bindings(
        config_files=gin_files,
        bindings=FLAGS.config_overrides,
        skip_unknown=False)

    # Save program flags.
    with open(osp.join(run_dir, "flags.txt"), "w") as flag_file:
        # We want only flags relevant to this module to be saved, no extra flags.
        # See: https://github.com/abseil/abseil-py/issues/92
        key_flags = FLAGS.get_key_flags_for_module(argv[0])
        key_flags = "\n".join(flag.serialize() for flag in key_flags)
        flag_file.write(key_flags)
    with open(osp.join(run_dir, "config.txt"), "w") as config_file:
        config_file.write(gin.config_str())

    # Properly restrict pytorch to not consume extra resources.
    #  - https://github.com/pytorch/pytorch/issues/975
    #  - https://github.com/ray-project/ray/issues/3609
    torch.set_num_threads(1)
    os.environ["OMP_NUM_THREADS"] = "1"

    # NOTE: `evaluate_qmix`'s `mixture` argument is presumably bound via the
    # gin configuration parsed above.
    evaluate_qmix([
        player2_policies.Player2v0(),
        player2_policies.Player2v1(),
        player2_policies.Player2v2(),
        player2_policies.Player2v3(),
        player2_policies.Player2v4()])
def evaluate_qmix(opponents: typing.List, mixture: typing.List):
    """ Evaluate a Q-mixture policy against a set of pure-strategy opponents.

    :param opponents: Pure-strategy opponent policies.
    :param mixture: Probability of playing each opponent policy.
    """
    assert len(opponents) == len(mixture)
    name = "player1"
    env = GridWorldSoccer()

    # -------------------------------------------------------------------------
    # Train best-response to each pure-strategy opponent.
    logger.info("Training best-response against each pure-strategy.")
    best_responses = []
    replay_buffers = []
    best_response_paths = []
    for opponent_i, opponent in enumerate(opponents):
        logger.info(f"  - Training against opponent {opponent_i}")
        br_path = osp.join(settings.get_run_dir(), f"v{opponent_i}.best_response.pkl")
        best_response_paths += [br_path]
        with gin.config_scope("pure"):
            response, replay_buffer = _train(
                br_path,
                opponent,
                SummaryWriter(logdir=osp.join(settings.get_run_dir(), f"br_vs_{opponent_i}")))
        best_responses += [response]
        replay_buffers += [replay_buffer]

    # -------------------------------------------------------------------------
    # Simulate the performance of QMixture.
    logger.info("Simulating the performance of the QMixture.")
    qmix = QMixture(mixture=mixture, q_funcs=best_responses)

    # Save policy, for future evaluation.
    qmix_path = osp.join(settings.get_run_dir(), "qmix.pkl")
    torch.save(qmix, qmix_path, pickle_module=dill)

    qmix_rewards = []
    mixed_reward = 0.0
    reward_std = 0.0
    for opponent_i, opponent in enumerate(opponents):
        rewards, _ = simulate_profile(
            env=env,
            nn_att=qmix,
            nn_def=opponent,
            n_episodes=250,
            save_dir=None,
            summary_writer=None,
            raw_rewards=True)

        logger.info(f"  - Opponent {opponent_i} vs. QMix: {np.mean(rewards)}, {np.std(rewards)}")
        qmix_rewards += [rewards]
        mixed_reward += mixture[opponent_i] * np.mean(rewards)
        reward_std += mixture[opponent_i]**2 * np.std(rewards)**2
    reward_std = np.sqrt(reward_std)
    logger.info(f"Expected reward against mixture opponent: {mixed_reward}, {reward_std}")
    dill.dump(
        mixed_reward,
        open(osp.join(settings.get_run_dir(), "qmix.simulated_reward.pkl"), "wb"))

    # -------------------------------------------------------------------------
    # Simulate the performance of QMixture with state frequencies.
    """
    logger.info("Simulating the performance of the QMixture with State-Frequency weighting.")
    qmix_statefreq = QMixtureStateFreq(mixture=mixture, q_funcs=best_responses, replay_buffers=replay_buffers)

    # Save policy, for future evaluation.
    qmix_statefreq_path = osp.join(settings.get_run_dir(), "qmix_statefreq.pkl")
    torch.save(qmix_statefreq, qmix_statefreq_path, pickle_module=dill)

    qmix_statefreq_rewards = []
    mixed_statefreq_reward = 0.0
    for opponent_i, opponent in enumerate(opponents):
        rewards, _ = simulate_profile(
            env=env,
            nn_att=qmix_statefreq,
            nn_def=opponent,
            n_episodes=250,
            save_dir=None,
            summary_writer=SummaryWriter(logdir=osp.join(settings.get_run_dir(), f"simulate_statefreq_vs_{opponent_i}")),
            raw_rewards=True)

        logger.info(f"  - Opponent {opponent_i}: {np.mean(rewards)}, {np.std(rewards)}")
        with open(osp.join(settings.get_run_dir(), f"qmix_statefreq.rewards_v{opponent_i}.pkl"), "wb") as outfile:
            dill.dump(rewards, outfile)
        qmix_statefreq_rewards += [rewards]
        mixed_statefreq_reward += mixture[opponent_i] * np.mean(rewards)
    logger.info(f"Expected reward against mixture opponent: {mixed_statefreq_reward}")
    dill.dump(mixed_statefreq_reward, open(osp.join(settings.get_run_dir(), "qmix_statefreq.simulated_reward.pkl"), "wb"))
    """

    # -------------------------------------------------------------------------
    # Train best-response to opponent mixture.
    logger.info("Training a best-response against the mixture opponent.")
    mixture_br_path = osp.join(settings.get_run_dir(), "mixture.best_response.pkl")
    opponent_agent = Agent(mixture=mixture, policies=opponents)
    with gin.config_scope("mix"):
        mixture_br, _ = _train(
            mixture_br_path,
            opponent_agent,
            SummaryWriter(logdir=osp.join(settings.get_run_dir(), "br_vs_mixture")))

    # -------------------------------------------------------------------------
    # Evaluate the mixture best-response against the individual opponent strategies.
    logger.info("Evaluating the best-response trained against the mixture opponent on pure-strategy opponents.")
    mix_br_reward = 0.0
    reward_std = 0.0
    for opponent_i, opponent in enumerate(opponents):
        rewards, _ = simulate_profile(
            env=env,
            nn_att=mixture_br,
            nn_def=opponent,
            n_episodes=250,
            save_dir=None,
            summary_writer=None,
            raw_rewards=True)

        logger.info(f"  - Opponent {opponent_i} vs. MixtureBR: {np.mean(rewards)}, {np.std(rewards)}")
        mix_br_reward += mixture[opponent_i] * np.mean(rewards)
        reward_std += mixture[opponent_i]**2 * np.std(rewards)**2
    reward_std = np.sqrt(reward_std)
    logger.info(f"Expected reward for mixture best-response: {mix_br_reward}, {reward_std}")

    # -------------------------------------------------------------------------
    # Evaluate pure-strategy-best-response policies against all opponents (all pure strategy + mixture).
    logger.info("Evaluating pure-strategy-best-responses against all opponent policies.")
    response_rewards = {}
    response_std = {}
    for opponent_i, opponent in enumerate(opponents):
        for response_i, best_response in enumerate(best_responses):
            rewards, _ = simulate_profile(
                env=env,
                nn_att=best_response,
                nn_def=opponent,
                n_episodes=250,
                save_dir=None,
                summary_writer=None,
                raw_rewards=True)

            logger.info(f"  - Opponent {opponent_i} vs. Best-Response {response_i}: {np.mean(rewards)}, {np.std(rewards)}")
            if response_i not in response_rewards:
                response_rewards[response_i] = 0.0
                response_std[response_i] = 0.0
            response_rewards[response_i] += mixture[opponent_i] * np.mean(rewards)
            response_std[response_i] += mixture[opponent_i]**2 * np.std(rewards)**2

    for key, value in response_rewards.items():
        logger.info(f"Expected reward of response {key} against mixture: {value}, {np.sqrt(response_std[key])}")

    logger.info("Finished.")
def initialize(load_env=None, env_name=None, n_processes: int = 1):
    logger.info("=======================================================")
    logger.info("=======Begin Initialization and first epoch============")
    logger.info("=======================================================")

    # Create environment.
    if isinstance(load_env, str):
        path = osp.join(settings.get_env_data_dir(), "{}.pkl".format(load_env))
        if not fp.isExist(path):
            raise ValueError("The env being loaded does not exist.")
        env = fp.load_pkl(path)
    else:
        # The env is created and saved.
        env = dag.env_rand_gen_and_save(env_name)

    # Save graph copy.
    env.save_graph_copy()
    env.save_mask_copy()  # TODO: change transfer

    # Create players and point them to their env.
    env.create_players()
    env.create_action_space()

    # Log root nodes and OR edges.
    roots = env.get_Roots()
    logger.info(f"Root Nodes: {roots}")
    ed = env.get_ORedges()
    logger.info(f"Or edges: {ed}")

    # Initialize game data.
    game = empirical_game.EmpiricalGame(env)
    game.env.defender.set_env_belong_to(game.env)
    game.env.attacker.set_env_belong_to(game.env)

    # TODO: this appears redundant.
    env.defender.set_env_belong_to(env)
    env.attacker.set_env_belong_to(env)

    # The uniform strategy has been produced ahead of time.
    logger.info("Epoch 1")
    epoch = 1
    epoch_dir = osp.join(settings.get_results_dir(), f"epoch_{epoch}")
    writer = SummaryWriter(logdir=epoch_dir)

    act_att = 'att_str_epoch1.pkl'
    act_def = 'def_str_epoch1.pkl'
    game.add_att_str(act_att)
    game.add_def_str(act_def)

    logger.info('Begin simulation for uniform strategy.')
    aReward, dReward = simulation.simulate_profile(
        env=game.env,
        game=game,
        nn_att=act_att,
        nn_def=act_def,
        n_episodes=game.num_episodes,
        n_processes=n_processes,
        save_dir=epoch_dir,
        summary_writer=writer)
    logger.info('Done simulation for uniform strategy.')

    game.init_payoffmatrix(dReward, aReward)
    ne = {}
    ne[0] = np.array([1], dtype=np.float32)
    ne[1] = np.array([1], dtype=np.float32)
    game.add_nasheq(epoch, ne)

    # Save a copy of game data.
    game_path = osp.join(settings.get_run_dir(), "game.pkl")
    fp.save_pkl(game, game_path)

    sys.stdout.flush()
    return game
def _run(env, game, meta_method_name, epoch: int = 1, game_path: str = None, n_processes: int = 1):
    assert n_processes > 0, "Invalid number of processors."
    if game_path is None:
        game_path = osp.join(settings.get_run_dir(), "game.pkl")

    logger.info("=======================================================")
    logger.info("===============Begin Running DO-EGTA===================")
    logger.info("=======================================================")

    proc = psutil.Process(os.getpid())
    result_dir = settings.get_run_dir()

    selector = meta_method_selector(meta_method_name)

    count = 80
    while count != 0:
        mem0 = proc.memory_info().rss

        # Fix opponent strategy.
        mix_str_def, mix_str_att = selector.sample(game, epoch)

        # Save mixed strategies.
        # with open(osp.join(result_dir, f"mix_defender.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(mix_str_def, outfile)
        # with open(osp.join(result_dir, f"mix_attacker.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(mix_str_att, outfile)
        # with open(osp.join(result_dir, f"payoff_defender.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(game.payoffmatrix_def, outfile)
        # with open(osp.join(result_dir, f"payoff_attacker.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(game.payoffmatrix_att, outfile)

        # Equilibrium pay-off.
        aPayoff, dPayoff = util.payoff_mixed_NE(game, epoch)
        game.att_payoff.append(aPayoff)
        game.def_payoff.append(dPayoff)

        # Increase epoch.
        epoch += 1
        logger.info("Epoch " + str(epoch))
        epoch_dir = osp.join(result_dir, f"epoch_{epoch}")

        # Summary writer for each epoch.
        writer = SummaryWriter(logdir=epoch_dir)

        # Train and save new best-response policies.
        if n_processes > 1:
            logger.info("Beginning training attacker and defender in parallel.")
            time_training = time.time()

            job_queue = multiprocessing.SimpleQueue()
            result_queue = multiprocessing.SimpleQueue()

            attacker_trainer = LearnerWorker(job_queue, result_queue, 1, mix_str_def, epoch)
            defender_trainer = LearnerWorker(job_queue, result_queue, 0, mix_str_att, epoch)

            attacker_trainer.start()
            defender_trainer.start()

            # Submit training jobs on our game.
            for _ in range(2):
                job_queue.put(CloudpickleWrapper(game))
            # Send sentinel values to tell processes to cleanly shutdown (1 per worker).
            for _ in range(2):
                job_queue.put(None)

            attacker_trainer.join()
            defender_trainer.join()

            # Collect and report results. We need to sort the results because they
            # may appear in any order; the first element of each result tuple is
            # the worker's is_attacker flag.
            results = []
            for _ in range(2):
                results += [result_queue.get()]
            results = results if not results[0][0] else results[::-1]  # Put defender first, then attacker.

            # Process results into the variables expected by the non-distributed path.
            a_BD = results[1][1]
            d_BD = results[0][1]

            logger.info("Done training attacker and defender.")
            logger.info(f"Defender training report: \n{results[0][2]}")
            logger.info(f"Attacker training report: \n{results[1][2]}")
            time_training = time.time() - time_training

        else:
            logger.info("Begin training attacker......")
            time_train_attacker = time.time()
            a_BD, report = training.train(game, 1, mix_str_def, epoch, writer)
            time_train_attacker = time.time() - time_train_attacker
            logger.info(f"\n{report}")
            logger.info("Attacker training done......")

            logger.info("Begin training defender......")
            time_train_defender = time.time()
            d_BD, report = training.train(game, 0, mix_str_att, epoch, writer)
            time_train_defender = time.time() - time_train_defender
            logger.info(f"\n{report}")
            logger.info("Defender training done......")

        mem1 = proc.memory_info().rss

        game.att_BD_list.append(a_BD)
        game.def_BD_list.append(d_BD)

        mem2 = proc.memory_info().rss

        game.add_att_str("att_str_epoch" + str(epoch) + ".pkl")
        game.add_def_str("def_str_epoch" + str(epoch) + ".pkl")

        # Simulate and extend the payoff matrix.
        time_extend_game = time.time()
        game = simulation.simulate_expanded_game(
            game=game,
            n_processes=n_processes,
            save_dir=epoch_dir,
            summary_writer=writer)
        time_extend_game = time.time() - time_extend_game
        mem3 = proc.memory_info().rss

        # Find a Nash equilibrium using Gambit analysis.
        time_gambit = time.time()
        payoffmatrix_def = game.payoffmatrix_def
        payoffmatrix_att = game.payoffmatrix_att
        logger.info("Begin Gambit analysis.")
        nash_att, nash_def = ga.do_gambit_analysis(payoffmatrix_def, payoffmatrix_att)
        ga.add_new_NE(game, nash_att, nash_def, epoch)
        game.env.attacker.nn_att = None
        game.env.defender.nn_def = None
        fp.save_pkl(game, game_path)
        time_gambit = time.time() - time_gambit

        logger.info("RESULTS:")
        logger.info('  - a_BD_list: {}'.format(game.att_BD_list))
        logger.info('  - aPayoff: {}'.format(game.att_payoff))
        logger.info('  - d_BD_list: {}'.format(game.def_BD_list))
        logger.info('  - dPayoff: {}'.format(game.def_payoff))
        logger.info("MEM: {}, {}, {}.".format(
            (mem1 - mem0) / mem0,
            (mem2 - mem0) / mem0,
            (mem3 - mem0) / mem0))
        logger.info("TIME: ")
        if n_processes == 1:
            logger.info(f"  - Training attacker: {time_train_attacker}")
            logger.info(f"  - Training defender: {time_train_defender}")
        else:
            logger.info(f"  - Training: {time_training}")
        logger.info(f"  - Extend game: {time_extend_game}")
        logger.info(f"  - Gambit: {time_gambit}")
        logger.info("Round_" + str(epoch) + " is done and the game was saved.")
        logger.info("=======================================================")

        count -= 1
        sys.stdout.flush()  # TODO: make sure this is correct.

    logger.info("END: " + str(epoch))
    os._exit(os.EX_OK)
def run(self, env, name, writer, **network_kwargs):
    """ Train a deepq model.

    :param env: Environment.
    :param name: Name of the training run, to save data separately.
    :param writer: SummaryWriter for logging metrics.
    """
    time_init = time.time()

    # Create the new agent that we are going to train to best respond.
    best_responder = self.policy_ctor()

    # Set-up experience replay buffer.
    replay_buffer = ReplayBuffer(self.buffer_size)
    assert not self.prioritized_replay, "Prioritized replay is not implemented in the PyTorch recreation."

    # Create exploration schedule.
    exploration = LinearSchedule(
        schedule_timesteps=int(self.exploration_fraction * self.total_timesteps),
        initial_p=self.exploration_initial_eps,
        final_p=self.exploration_final_eps)

    # Set-up training variables.
    mean_rewards = []
    episode_rewards = [0.0]
    saved_mean_reward = None

    # Begin episode.
    obs = env.reset()
    reset = True

    # Establish temporary directory to hold checkpoints of our agent from throughout training.
    # We do this so we can return the best version of our agent throughout training.
    temp_dir = tempfile.TemporaryDirectory()
    best_model_path = osp.join(temp_dir.name, "model.pytorch")

    # Time metrics.
    time_init = time.time() - time_init
    t_transitions = []
    t_actions = []
    t_steps = []
    t_samples = []
    t_updates = []
    n_updates = 0.0

    # Environment training loop.
    time_training = time.time()
    for t in range(self.total_timesteps):
        time_transition = time.time()

        # Check termination conditions.
        if self.callback is not None and self.callback(locals(), globals()):
            break

        # Collect meta-data the agent may need to compute its action.
        time_action = time.time()
        action_kwargs = {}

        # Update exploration strategy.
        if self.param_noise:
            update_eps = 0.0
            # Compute the threshold such that the KL divergence between perturbed and non-perturbed
            # policy is comparable to eps-greedy exploration with eps=exploration.value(t).
            # See Appendix C.1 in `Parameter Space Noise for Exploration`, Plappert et al., 2017.
            update_param_noise_threshold = -1.0 * np.log(
                1.0 - exploration.value(t) +
                exploration.value(t) / float(env.action_space.n))
            action_kwargs["reset"] = reset
            action_kwargs["update_param_noise_threshold"] = update_param_noise_threshold
            action_kwargs["update_param_noise_scale"] = True
        else:
            update_eps = exploration.value(t)
            update_param_noise_threshold = 0.0

        # Step agent.
        writer.add_scalar(f"{name}/epsilon", update_eps, t)
        action = best_responder.act(
            observation=np.array(obs)[None],
            stochastic=True,
            update_eps=update_eps,
            mask=None,
            training_attacker=False,
            **action_kwargs)[0]
        t_actions += [time.time() - time_action]

        # Step environment.
        time_step = time.time()
        new_obs, reward, done, _ = env.step(action)
        t_steps += [time.time() - time_step]

        # Store transition data.
        replay_buffer.add(obs, action, reward, new_obs, float(done))
        obs = new_obs
        episode_rewards[-1] += reward

        # If the environment finished, reset the environment and sample from opponent's meta-strategy.
        if done:
            obs = env.reset()
            # Log the environment reset.
            episode_rewards.append(0.0)
            reset = True

        # Periodically train our policy.
        if (t > self.learning_starts) and (t % self.train_freq == 0):
            n_updates += 1.0
            time_sample = time.time()

            # Collect batch (b) of experiences.
            b_o, b_a, b_r, b_op, b_d = replay_buffer.sample(self.batch_size)
            b_weights = np.ones_like(b_r)
            t_samples += [time.time() - time_sample]

            time_update = time.time()
            best_responder.update(
                observations=b_o,
                actions=b_a,
                rewards=b_r,
                next_observations=b_op,
                done_mask=b_d,
                importance_weights=b_weights,
                summary_writer=writer,
                mask=None,
                training_attacker=False,
                t=t)
            t_updates += [time.time() - time_update]

        # Periodically update target network.
        if (t > self.learning_starts) and (t % self.target_network_update_freq == 0):
            best_responder.update_target_network()

        # Record results.
        n_episodes = len(episode_rewards)
        if t > self.learning_starts:
            # Note: despite the name, this averages over the last 250 completed episodes.
            mean_100ep_reward = round(np.mean(episode_rewards[-251:-1]), 1)
            mean_rewards.append(mean_100ep_reward)
            writer.add_scalar(f"{name}/mean_reward", np.nan_to_num(mean_100ep_reward), t)

        # Periodically save a snapshot of our best-responder.
        if (self.checkpoint_freq is not None) and (t > self.learning_starts) and \
                (n_episodes > 100) and (t % self.checkpoint_freq == 0):
            # Save checkpoints of only the best-performing model we have encountered.
            if (saved_mean_reward is None) or (mean_100ep_reward > saved_mean_reward):
                torch.save(best_responder, best_model_path, pickle_module=dill)
                saved_mean_reward = mean_100ep_reward

        t_transitions += [time.time() - time_transition]

    # Load the best-performing encountered policy as our resulting best-responder.
    BD = None
    if osp.exists(best_model_path):
        best_responder = torch.load(best_model_path)
        BD = saved_mean_reward if saved_mean_reward is not None else mean_100ep_reward

    # Clean-up temporary directory.
    temp_dir.cleanup()

    # Save data to generate learning curves.
    data_path = osp.join(settings.get_run_dir(), f"mean_rewards.{name}.pkl")
    fp.save_pkl(mean_rewards, data_path)

    # Log timing statistics.
    # We put this together into a string to send back to have the main process print it.
    # This is to prevent potential multiprocessing errors.
    report = ""
    report += "  - n_transitions: {}\n".format(len(t_transitions))
    report += "  - n_updates: {}\n".format(len(t_updates))
    report += "  - t_init: {}\n".format(time_init)
    report += "  - t_transitions: {}\n".format(np.mean(t_transitions))
    report += "  - t_actions: {}\n".format(np.mean(t_actions))
    report += "  - t_steps: {}\n".format(np.mean(t_steps))
    report += "  - t_samples: {}\n".format(np.mean(t_samples))
    report += "  - t_updates: {}\n".format(np.mean(t_updates))

    return best_responder, BD, replay_buffer, report
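# A reference sketch (assumption: this mirrors the OpenAI Baselines
# `LinearSchedule` that this PyTorch recreation is based on) of the epsilon
# schedule consumed by the training loops above: epsilon anneals linearly from
# `initial_p` to `final_p` over the first `schedule_timesteps` steps, then
# holds at `final_p`. E.g., with total_timesteps=400000 and
# exploration_fraction=0.4, the floor is reached at t=160000.
def _linear_eps_value(t, schedule_timesteps, initial_p=1.0, final_p=0.01):
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)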
def learn_multi_nets(self, env, epoch, writer, **network_kwargs):
    """ Train a deepq model.

    :param env: Environment.
    :param epoch: Current EGTA epoch. This is only used for saving results.
    :param writer: SummaryWriter for logging metrics.
    """
    time_init = time.time()

    # If the training flag is 1 we're training the attacker, or the defender if the flag is 0.
    training_attacker = env.training_flag
    assert training_attacker == 0 or training_attacker == 1, f"Invalid training flag: {training_attacker}."
    log_prefix = "attacker" if training_attacker else "defender"

    # Select parameters based off attacker/defender.
    # Note: these locals are presumably consumed by `get_new_policy` via `locals_`.
    n_actions = env.act_dim_att() if training_attacker else env.act_dim_def()
    observation_space = env.obs_dim_att() if training_attacker else env.obs_dim_def()

    # Create the new agent that we are going to train to best respond.
    best_responder = self.get_new_policy(locals_=locals(), globals_=globals())

    # Set-up experience replay buffer.
    replay_buffer = ReplayBuffer(self.buffer_size)
    assert not self.prioritized_replay, "Prioritized replay is not implemented in the PyTorch recreation."

    # Create exploration schedule.
    exploration = LinearSchedule(
        schedule_timesteps=int(self.exploration_fraction * self.total_timesteps),
        initial_p=self.exploration_initial_eps,
        final_p=self.exploration_final_eps)

    # Set-up training variables.
    mean_rewards = []
    episode_rewards = [0.0]
    saved_mean_reward = None

    # Begin episode.
    obs = env.reset_everything_with_return()
    reset = True

    # Sample our initial opponent's strategy.
    opponent_sampler = OpponentSampler(
        env=env,
        opponent_identity=0 if training_attacker else 1)
    opponent_sampler.sample()

    # Establish temporary directory to hold checkpoints of our agent from throughout training.
    # We do this so we can return the best version of our agent throughout training.
    temp_dir = tempfile.TemporaryDirectory()
    best_model_path = osp.join(temp_dir.name, "model.pytorch")

    # Time metrics.
    time_init = time.time() - time_init
    t_transitions = []
    t_actions = []
    t_steps = []
    t_samples = []
    t_updates = []
    n_updates = 0.0

    # Reward shaping buffer: transitions are staged here until shaped rewards arrive.
    temp_buffer = []

    # Environment training loop.
    time_training = time.time()
    for t in range(self.total_timesteps):
        time_transition = time.time()

        # Check termination conditions.
        if self.callback is not None and self.callback(locals(), globals()):
            break

        # Collect meta-data the agent may need to compute its action.
        time_action = time.time()
        action_kwargs = {}

        # Update exploration strategy.
        if self.param_noise:
            update_eps = 0.0
            # Compute the threshold such that the KL divergence between perturbed and non-perturbed
            # policy is comparable to eps-greedy exploration with eps=exploration.value(t).
            # See Appendix C.1 in `Parameter Space Noise for Exploration`, Plappert et al., 2017.
            update_param_noise_threshold = -1.0 * np.log(
                1.0 - exploration.value(t) +
                exploration.value(t) / float(env.action_space.n))
            action_kwargs["reset"] = reset
            action_kwargs["update_param_noise_threshold"] = update_param_noise_threshold
            action_kwargs["update_param_noise_scale"] = True
        else:
            update_eps = exploration.value(t)
            update_param_noise_threshold = 0.0

        # If we are the attacker, apply a mask to our action space.
        if training_attacker:
            mask = mask_generator_att(env, np.array(obs)[None])
        else:
            mask = None

        # Step agent.
        writer.add_scalar(f"{log_prefix}/epsilon", update_eps, t)
        action = best_responder.act(
            observation=np.array(obs)[None],
            stochastic=True,
            update_eps=update_eps,
            mask=mask,
            training_attacker=training_attacker,
            **action_kwargs)[0]
        t_actions += [time.time() - time_action]

        # Step environment.
        time_step = time.time()
        new_obs, reward, done = env.step(action)
        t_steps += [time.time() - time_step]

        # Store transition data.
        # Reward shaping: stage transitions until the env reports shaped rewards.
        if self.reward_shaping:
            pass_flag = False

            if training_attacker == 0:
                rewards_shaping = env.rewards()
                if rewards_shaping['pass_flag']:
                    for transition in temp_buffer:
                        obs0, action0, rew0, new_obs0, done0 = transition
                        rew_new = rewards_shaping[str(action0)].v
                        episode_rewards[-1] += rew_new
                        replay_buffer.add(obs0, action0, rew_new, new_obs0, done0)
                    temp_buffer = []
                    env.reset_reward_shaping()
                    pass_flag = True
            elif training_attacker == 1:
                rewards_shaping = env.rewards()
                if rewards_shaping['pass_flag']:
                    for transition in temp_buffer:
                        obs1, action1, rew1, new_obs1, done1 = transition
                        rew_new = rewards_shaping[str(action1)].v
                        episode_rewards[-1] += rew_new
                        replay_buffer.add(obs1, action1, rew_new, new_obs1, done1)
                    temp_buffer = []
                    env.reset_reward_shaping()
                    pass_flag = True

            if pass_flag:
                episode_rewards[-1] += reward
                replay_buffer.add(obs, action, reward, new_obs, float(done))
            else:
                temp_buffer.append((obs, action, reward, new_obs, float(done)))

            obs = new_obs
            if done:
                obs = env.reset_everything_with_return()
                episode_rewards.append(0.0)
                reset = True
                # Sample a new strategy from the meta-strategy solver.
                opponent_sampler.sample()

        # No reward shaping.
        else:
            replay_buffer.add(obs, action, reward, new_obs, float(done))
            obs = new_obs
            episode_rewards[-1] += reward

            # If the environment finished, reset the environment and sample from opponent's meta-strategy.
            if done:
                obs = env.reset_everything_with_return()
                opponent_sampler.sample()

                # Log the environment reset.
                episode_rewards.append(0.0)
                reset = True

        # Periodically train our policy.
        if (t > self.learning_starts) and (t % self.train_freq == 0):
            n_updates += 1.0
            time_sample = time.time()

            # Collect batch (b) of experiences.
            b_o, b_a, b_r, b_op, b_d = replay_buffer.sample(self.batch_size)
            b_weights = np.ones_like(b_r)

            # Generate action masks.
            if training_attacker:
                b_mask = mask_generator_att(env, b_op)
            else:
                b_mask = None
            t_samples += [time.time() - time_sample]

            time_update = time.time()
            best_responder.update(
                observations=b_o,
                actions=b_a,
                rewards=b_r,
                next_observations=b_op,
                done_mask=b_d,
                importance_weights=b_weights,
                mask=b_mask,
                training_attacker=training_attacker,
                summary_writer=writer,
                t=t)
            t_updates += [time.time() - time_update]

        # Periodically update target network.
        if (t > self.learning_starts) and (t % self.target_network_update_freq == 0):
            best_responder.update_target_network()

        # Record results.
        n_episodes = len(episode_rewards)
        if t > self.learning_starts:
            # Note: despite the name, this averages over the last 250 completed episodes.
            mean_100ep_reward = round(np.mean(episode_rewards[-251:-1]), 1)
            mean_rewards.append(mean_100ep_reward)
            writer.add_scalar(f"{log_prefix}/mean_reward", np.nan_to_num(mean_100ep_reward), t)

        # Periodically save a snapshot of our best-responder.
        if (self.checkpoint_freq is not None) and (t > self.learning_starts) and \
                (n_episodes > 100) and (t % self.checkpoint_freq == 0):
            # Save checkpoints of only the best-performing model we have encountered.
            if (saved_mean_reward is None) or (mean_100ep_reward > saved_mean_reward):
                torch.save(best_responder, best_model_path, pickle_module=dill)
                saved_mean_reward = mean_100ep_reward

        t_transitions += [time.time() - time_transition]

    # Load the best-performing encountered policy as our resulting best-responder.
    BD = None
    if osp.exists(best_model_path):
        best_responder = torch.load(best_model_path)
        BD = saved_mean_reward if saved_mean_reward is not None else mean_100ep_reward

    # Clean-up temporary directory.
    temp_dir.cleanup()

    # Save data to generate learning curves.
    name = "attacker" if training_attacker else "defender"
    data_path = osp.join(settings.get_run_dir(), f"mean_rewards.{name}.{epoch}.pkl")
    fp.save_pkl(mean_rewards, data_path)

    # Log timing statistics.
    # We put this together into a string to send back to have the main process print it.
    # This is to prevent potential multiprocessing errors.
    report = ""
    report += "  - n_transitions: {}\n".format(len(t_transitions))
    report += "  - n_updates: {}\n".format(len(t_updates))
    report += "  - t_init: {}\n".format(time_init)
    report += "  - t_transitions: {}\n".format(np.mean(t_transitions))
    report += "  - t_actions: {}\n".format(np.mean(t_actions))
    report += "  - t_steps: {}\n".format(np.mean(t_steps))
    report += "  - t_samples: {}\n".format(np.mean(t_samples))
    report += "  - t_updates: {}\n".format(np.mean(t_updates))

    return best_responder, BD, replay_buffer, report