def env_rand_gen_and_save(env_name, num_attr_N=11, num_attr_E=4, T=10, graphid=1,
                          numNodes=30, numEdges=100, numRoot=4, numGoals=6, history=3):
    env = Environment(num_attr_N=num_attr_N, num_attr_E=num_attr_E, T=T, graphid=graphid,
                      numNodes=numNodes, numEdges=numEdges, numRoot=numRoot,
                      numGoals=numGoals, history=history)
    env.randomDAG()
    path = osp.join(settings.get_env_data_dir(), "{}.pkl".format(env_name))
    print("env path is ", path)
    # if fp.isExist(path):
    #     raise ValueError("Env with such name already exists.")
    fp.save_pkl(env, path)
    print(env_name + " has been saved.")
    return env
def env_rand_gen(env_name, num_attr_N=11, num_attr_E=4, T=10, graphid=1,
                 numNodes=30, numEdges=100, numRoot=5, numGoals=6, history=3):
    env = dag.Environment(num_attr_N=num_attr_N, num_attr_E=num_attr_E, T=T, graphid=graphid,
                          numNodes=numNodes, numEdges=numEdges, numRoot=numRoot,
                          numGoals=numGoals, history=history)
    # env.randomDAG()
    env.load_graph()
    path = os.getcwd() + "/env_data/" + env_name + ".pkl"
    print("env path is ", path)
    fp.save_pkl(env, path)
    print(env_name + " has been saved.")
    return env
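

# Illustrative helper (not in the original module): a minimal sketch showing how the two
# generators above might be combined. The environment names and sizes here are arbitrary
# example values, not ones used by the project.
def _example_generate_envs():
    # Sample a fresh random DAG and persist it under the configured env-data directory.
    random_env = env_rand_gen_and_save("example_random_env", numNodes=30, numEdges=100)
    # Rebuild an environment from a pre-defined graph and save it under ./env_data.
    loaded_env = env_rand_gen("example_loaded_env")
    return random_env, loaded_env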
def run(load_env, env_name, n_processes):
    """Run the double-oracle algorithm."""
    # Create the initial (uniform) policies for both players.
    fp.save_pkl(
        uniform_str_init.act_att,
        osp.join(settings.get_attacker_strategy_dir(), "att_str_epoch1.pkl"))
    fp.save_pkl(
        uniform_str_init.act_def,
        osp.join(settings.get_defender_strategy_dir(), "def_str_epoch1.pkl"))

    # NOTE: the load_env/env_name arguments are currently ignored in favor of FLAGS.
    game = initialize(load_env=FLAGS.env, env_name=None, n_processes=n_processes)
    _run(game.env, game, meta_method_name=FLAGS.meta_method, n_processes=n_processes)
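

# Hedged sketch (not part of the original module): one possible entry point wiring run() to
# the command line, assuming the FLAGS object referenced above comes from absl and that
# FLAGS.env and FLAGS.meta_method are declared elsewhere in this module. The app.run usage
# and the n_processes value are assumptions for illustration.
from absl import app


def _main(argv):
    del argv  # Unused.
    run(load_env=FLAGS.env, env_name=None, n_processes=1)

# if __name__ == "__main__":
#     app.run(_main)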
def _train(policy_save_path, opponent, writer):
    env = GridWorldSoccer()
    env = MultiToSingleAgentWrapper(env=env, agent_id=1, opponents={2: opponent})

    save_path = osp.join(settings.get_run_dir(), osp.basename(policy_save_path))
    save_path = save_path[:-4]  # Remove ".pkl".

    trainer = Trainer(policy_ctor=DQN)
    best_response, _, replay_buffer, _ = trainer.run(
        env=env, name=osp.basename(policy_save_path), writer=writer)

    # Save data to the results folder for QMixture.
    torch.save(best_response, f"{save_path}.pkl", pickle_module=dill)
    fp.save_pkl(replay_buffer, f"{save_path}.replay_buffer.pkl")

    return best_response, replay_buffer
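

# Hedged sketch (not part of the original module): reloading the artifacts that _train()
# writes, e.g. to build a QMixture over several best responses later. The run-directory
# layout and the use of dill for deserialization mirror the save calls above; the helper
# name is hypothetical.
def _load_best_response(policy_save_path):
    save_path = osp.join(settings.get_run_dir(), osp.basename(policy_save_path))
    save_path = save_path[:-4]  # Remove ".pkl", matching _train().
    best_response = torch.load(f"{save_path}.pkl", pickle_module=dill)
    replay_buffer = fp.load_pkl(f"{save_path}.replay_buffer.pkl")
    return best_response, replay_buffer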
def initialize(load_env=None, env_name=None, n_processes: int = 1):
    logger.info("=======================================================")
    logger.info("=======Begin Initialization and first epoch============")
    logger.info("=======================================================")

    # Create the environment.
    if isinstance(load_env, str):
        path = osp.join(settings.get_env_data_dir(), "{}.pkl".format(load_env))
        if not fp.isExist(path):
            raise ValueError("The env being loaded does not exist.")
        env = fp.load_pkl(path)
    else:
        # The env is created and saved.
        env = dag.env_rand_gen_and_save(env_name)

    # Save graph copy.
    env.save_graph_copy()
    env.save_mask_copy()  # TODO: change transfer

    # Create players and point them to their env.
    env.create_players()
    env.create_action_space()

    # Print root nodes and OR-edges.
    roots = env.get_Roots()
    logger.info(f"Root Nodes: {roots}")
    ed = env.get_ORedges()
    logger.info(f"Or edges: {ed}")

    # Initialize game data.
    game = empirical_game.EmpiricalGame(env)
    game.env.defender.set_env_belong_to(game.env)
    game.env.attacker.set_env_belong_to(game.env)

    # NOTE: these appear redundant with the calls on game.env above.
    env.defender.set_env_belong_to(env)
    env.attacker.set_env_belong_to(env)

    # The uniform strategy has been produced ahead of time.
    logger.info("Epoch 1")
    epoch = 1
    epoch_dir = osp.join(settings.get_results_dir(), f"epoch_{epoch}")
    writer = SummaryWriter(logdir=epoch_dir)

    act_att = 'att_str_epoch1.pkl'
    act_def = 'def_str_epoch1.pkl'
    game.add_att_str(act_att)
    game.add_def_str(act_def)

    logger.info('Begin simulation for uniform strategy.')
    aReward, dReward = simulation.simulate_profile(
        env=game.env,
        game=game,
        nn_att=act_att,
        nn_def=act_def,
        n_episodes=game.num_episodes,
        n_processes=n_processes,
        save_dir=epoch_dir,
        summary_writer=writer)
    logger.info('Done simulation for uniform strategy.')

    game.init_payoffmatrix(dReward, aReward)

    # The first epoch's equilibrium is the single uniform strategy for each player.
    ne = {}
    ne[0] = np.array([1], dtype=np.float32)
    ne[1] = np.array([1], dtype=np.float32)
    game.add_nasheq(epoch, ne)

    # Save a copy of the game data.
    game_path = osp.join(settings.get_run_dir(), "game.pkl")
    fp.save_pkl(game, game_path)

    sys.stdout.flush()
    return game
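

# Hedged sketch (not part of the original module): resuming the double oracle from the
# game.pkl snapshot that initialize() writes. The epoch and meta-method defaults here are
# illustrative assumptions and would need to match the saved game's state.
def _resume(n_processes: int = 1, meta_method_name: str = "nash", epoch: int = 1):
    game_path = osp.join(settings.get_run_dir(), "game.pkl")
    game = fp.load_pkl(game_path)
    _run(game.env, game, meta_method_name=meta_method_name,
         epoch=epoch, game_path=game_path, n_processes=n_processes)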
def _run(env, game, meta_method_name, epoch: int = 1, game_path: str = None, n_processes: int = 1):
    assert n_processes > 0, "Invalid number of processes."

    if game_path is None:
        game_path = osp.join(settings.get_run_dir(), "game.pkl")

    logger.info("=======================================================")
    logger.info("===============Begin Running DO-EGTA===================")
    logger.info("=======================================================")

    proc = psutil.Process(os.getpid())
    result_dir = settings.get_run_dir()

    selector = meta_method_selector(meta_method_name)

    count = 80
    while count != 0:
        mem0 = proc.memory_info().rss

        # Fix the opponent strategy.
        mix_str_def, mix_str_att = selector.sample(game, epoch)

        # Save mixed strategies.
        # with open(osp.join(result_dir, f"mix_defender.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(mix_str_def, outfile)
        # with open(osp.join(result_dir, f"mix_attacker.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(mix_str_att, outfile)
        # with open(osp.join(result_dir, f"payoff_defender.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(game.payoffmatrix_def, outfile)
        # with open(osp.join(result_dir, f"payoff_attacker.{epoch}.pkl"), "wb") as outfile:
        #     pickle.dump(game.payoffmatrix_att, outfile)

        # Equilibrium pay-off.
        aPayoff, dPayoff = util.payoff_mixed_NE(game, epoch)
        game.att_payoff.append(aPayoff)
        game.def_payoff.append(dPayoff)

        # Increase epoch.
        epoch += 1
        logger.info("Epoch " + str(epoch))
        epoch_dir = osp.join(result_dir, f"epoch_{epoch}")

        # Summary writer for each epoch.
        writer = SummaryWriter(logdir=epoch_dir)

        # Train and save new best-response policies.
        if n_processes > 1:
            logger.info("Beginning training attacker and defender in parallel.")
            time_training = time.time()
            job_queue = multiprocessing.SimpleQueue()
            result_queue = multiprocessing.SimpleQueue()

            attacker_trainer = LearnerWorker(job_queue, result_queue, 1, mix_str_def, epoch)
            defender_trainer = LearnerWorker(job_queue, result_queue, 0, mix_str_att, epoch)

            attacker_trainer.start()
            defender_trainer.start()

            # Submit training jobs on our game.
            for _ in range(2):
                job_queue.put(CloudpickleWrapper(game))

            # Send sentinel values to tell processes to cleanly shutdown (1 per worker).
            for _ in range(2):
                job_queue.put(None)

            attacker_trainer.join()
            defender_trainer.join()

            # Collect and report results. We need to sort the results because they may appear in any order.
            results = []
            for _ in range(2):
                results += [result_queue.get()]
            results = results if not results[0][0] else results[::-1]  # Put defender first, then attacker.

            # Process results into the variables expected by the non-distributed path.
            a_BD = results[1][1]
            d_BD = results[0][1]

            logger.info("Done training attacker and defender.")
            logger.info(f"Defender training report: \n{results[0][2]}")
            logger.info(f"Attacker training report: \n{results[1][2]}")
            time_training = time.time() - time_training

        else:
            logger.info("Begin training attacker......")
            time_train_attacker = time.time()
            a_BD, report = training.train(game, 1, mix_str_def, epoch, writer)
            time_train_attacker = time.time() - time_train_attacker
            logger.info(f"\n{report}")
            logger.info("Attacker training done......")

            logger.info("Begin training defender......")
            time_train_defender = time.time()
            d_BD, report = training.train(game, 0, mix_str_att, epoch, writer)
            time_train_defender = time.time() - time_train_defender
            logger.info(f"\n{report}")
            logger.info("Defender training done......")

        mem1 = proc.memory_info().rss

        game.att_BD_list.append(a_BD)
        game.def_BD_list.append(d_BD)

        mem2 = proc.memory_info().rss

        game.add_att_str("att_str_epoch" + str(epoch) + ".pkl")
        game.add_def_str("def_str_epoch" + str(epoch) + ".pkl")

        # Simulate and extend the payoff matrix.
        time_extend_game = time.time()
        game = simulation.simulate_expanded_game(
            game=game,
            n_processes=n_processes,
            save_dir=epoch_dir,
            summary_writer=writer)
        time_extend_game = time.time() - time_extend_game
        mem3 = proc.memory_info().rss

        # Find a Nash equilibrium using Gambit analysis.
        time_gambit = time.time()
        payoffmatrix_def = game.payoffmatrix_def
        payoffmatrix_att = game.payoffmatrix_att
        logger.info("Begin Gambit analysis.")
        nash_att, nash_def = ga.do_gambit_analysis(payoffmatrix_def, payoffmatrix_att)
        ga.add_new_NE(game, nash_att, nash_def, epoch)
        game.env.attacker.nn_att = None
        game.env.defender.nn_def = None
        fp.save_pkl(game, game_path)
        time_gambit = time.time() - time_gambit

        logger.info("RESULTS:")
        logger.info(' - a_BD_list: {}'.format(game.att_BD_list))
        logger.info(' - aPayoff: {}'.format(game.att_payoff))
        logger.info(' - d_BD_list: {}'.format(game.def_BD_list))
        logger.info(' - dPayoff: {}'.format(game.def_payoff))
        logger.info("MEM: {}, {}, {}.".format(
            (mem1 - mem0) / mem0, (mem2 - mem0) / mem0, (mem3 - mem0) / mem0))
        logger.info("TIME: ")
        if n_processes == 1:
            logger.info(f" - Training attacker: {time_train_attacker}")
            logger.info(f" - Training defender: {time_train_defender}")
        else:
            logger.info(f" - Training: {time_training}")
        logger.info(f" - Extend game: {time_extend_game}")
        logger.info(f" - Gambit: {time_gambit}")
        logger.info("Round_" + str(epoch) + " is done and the game was saved.")
        logger.info("=======================================================")

        count -= 1
        sys.stdout.flush()  # TODO: make sure this is correct.

    logger.info("END: " + str(epoch))
    os._exit(os.EX_OK)
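

# Hedged sketch (not part of the original module): the job/result protocol that _run() assumes
# of LearnerWorker. Each worker consumes CloudpickleWrapper-wrapped games from job_queue until
# it receives a None sentinel, trains its side, and pushes a (training_attacker, best_response
# payoff, report) tuple onto result_queue, matching how _run() indexes the results. The real
# LearnerWorker lives elsewhere; the .x payload attribute and the per-epoch writer directory
# are assumptions for illustration.
class _SketchLearnerWorker(multiprocessing.Process):

    def __init__(self, job_queue, result_queue, training_attacker, opponent_mix_str, epoch):
        super().__init__()
        self.job_queue = job_queue
        self.result_queue = result_queue
        self.training_attacker = training_attacker
        self.opponent_mix_str = opponent_mix_str
        self.epoch = epoch

    def run(self):
        while True:
            job = self.job_queue.get()
            if job is None:  # Sentinel: shut down cleanly.
                break
            game = job.x  # Assumed CloudpickleWrapper payload attribute.
            writer = SummaryWriter(
                logdir=osp.join(settings.get_run_dir(), f"epoch_{self.epoch}"))
            bd, report = training.train(game, self.training_attacker,
                                        self.opponent_mix_str, self.epoch, writer)
            self.result_queue.put((self.training_attacker, bd, report))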
def run(self, env, name, writer, **network_kwargs):
    """
    Train a deepq model.

    :param env: Environment.
    :param name: Name of the training run, to save data separately.
    :param writer: SummaryWriter for logging metrics.
    """
    time_init = time.time()

    # Create the new agent that we are going to train to best respond.
    best_responder = self.policy_ctor()

    # Set up the experience replay buffer.
    replay_buffer = ReplayBuffer(self.buffer_size)
    assert not self.prioritized_replay, "Prioritized replay is not implemented in the PyTorch recreation."

    # Create the exploration schedule.
    exploration = LinearSchedule(
        schedule_timesteps=int(self.exploration_fraction * self.total_timesteps),
        initial_p=self.exploration_initial_eps,
        final_p=self.exploration_final_eps)

    # Set up training variables.
    mean_rewards = []
    episode_rewards = [0.0]
    saved_mean_reward = None

    # Begin episode.
    obs = env.reset()
    reset = True

    # Establish a temporary directory to hold checkpoints of our agent throughout training.
    # We do this so we can return the best version of our agent encountered during training.
    temp_dir = tempfile.TemporaryDirectory()
    best_model_path = osp.join(temp_dir.name, "model.pytorch")

    # Time metrics.
    time_init = time.time() - time_init
    t_transitions = []
    t_actions = []
    t_steps = []
    t_samples = []
    t_updates = []
    n_updates = 0.0

    # Environment training loop.
    time_training = time.time()
    for t in range(self.total_timesteps):
        time_transition = time.time()

        # Check termination conditions.
        if self.callback is not None and self.callback(locals(), globals()):
            break

        # Collect meta-data the agent may need to compute its action.
        time_action = time.time()
        action_kwargs = {}

        # Update the exploration strategy.
        if self.param_noise:
            update_eps = 0.0
            # Compute the threshold such that the KL divergence between perturbed and non-perturbed
            # policy is comparable to eps-greedy exploration with eps=exploration.value(t).
            # See Appendix C.1 in `Parameter Space Noise for Exploration`, Plappert et al., 2017.
            update_param_noise_threshold = -1.0 * np.log(
                1.0 - exploration.value(t) +
                exploration.value(t) / float(env.action_space.n))
            action_kwargs["reset"] = reset
            action_kwargs["update_param_noise_threshold"] = update_param_noise_threshold
            action_kwargs["update_param_noise_scale"] = True
        else:
            update_eps = exploration.value(t)
            update_param_noise_threshold = 0.0

        # Step the agent.
        writer.add_scalar(f"{name}/epsilon", update_eps, t)
        action = best_responder.act(
            observation=np.array(obs)[None],
            stochastic=True,
            update_eps=update_eps,
            mask=None,
            training_attacker=False,
            **action_kwargs)[0]
        t_actions += [time.time() - time_action]

        # Step the environment.
        time_step = time.time()
        new_obs, reward, done, _ = env.step(action)
        t_steps += [time.time() - time_step]

        # Store transition data.
        replay_buffer.add(obs, action, reward, new_obs, float(done))
        obs = new_obs
        episode_rewards[-1] += reward

        # If the episode finished, reset the environment and sample from the opponent's meta-strategy.
        if done:
            obs = env.reset()
            # Log the environment reset.
            episode_rewards.append(0.0)
            reset = True

        # Periodically train our policy.
        if (t > self.learning_starts) and (t % self.train_freq == 0):
            n_updates += 1.0
            time_sample = time.time()

            # Collect a batch (b) of experiences.
            b_o, b_a, b_r, b_op, b_d = replay_buffer.sample(self.batch_size)
            b_weights = np.ones_like(b_r)
            t_samples += [time.time() - time_sample]

            time_update = time.time()
            best_responder.update(
                observations=b_o,
                actions=b_a,
                rewards=b_r,
                next_observations=b_op,
                done_mask=b_d,
                importance_weights=b_weights,
                summary_writer=writer,
                mask=None,
                training_attacker=False,
                t=t)
            t_updates += [time.time() - time_update]

        # Periodically update the target network.
        if (t > self.learning_starts) and (t % self.target_network_update_freq == 0):
            best_responder.update_target_network()

        # Record results.
        n_episodes = len(episode_rewards)
        if t > self.learning_starts:
            mean_100ep_reward = round(np.mean(episode_rewards[-251:-1]), 1)
            mean_rewards.append(mean_100ep_reward)
            writer.add_scalar(f"{name}/mean_reward", np.nan_to_num(mean_100ep_reward), t)

        # Periodically save a snapshot of our best-responder.
        if (self.checkpoint_freq is not None) and (t > self.learning_starts) and \
                (n_episodes > 100) and (t % self.checkpoint_freq == 0):
            # Only checkpoint the best-performing model we have encountered so far.
            if (saved_mean_reward is None) or (mean_100ep_reward > saved_mean_reward):
                torch.save(best_responder, best_model_path, pickle_module=dill)
                saved_mean_reward = mean_100ep_reward

        t_transitions += [time.time() - time_transition]

    # Load the best-performing policy encountered as our resulting best-responder.
    BD = None
    if osp.exists(best_model_path):
        best_responder = torch.load(best_model_path)
        BD = saved_mean_reward if saved_mean_reward is not None else mean_100ep_reward

    # Clean up the temporary directory.
    temp_dir.cleanup()

    # Save data to generate learning curves.
    data_path = osp.join(settings.get_run_dir(), f"mean_rewards.{name}.pkl")
    fp.save_pkl(mean_rewards, data_path)

    # Log timing statistics.
    # We assemble this into a string to send back for the main process to print.
    # This is to prevent potential multiprocessing errors.
    report = ""
    report += " - n_transitions: {}\n".format(len(t_transitions))
    report += " - n_updates: {}\n".format(len(t_updates))
    report += " - t_init: {}\n".format(time_init)
    report += " - t_transitions: {}\n".format(np.mean(t_transitions))
    report += " - t_actions: {}\n".format(np.mean(t_actions))
    report += " - t_steps: {}\n".format(np.mean(t_steps))
    report += " - t_samples: {}\n".format(np.mean(t_samples))
    report += " - t_updates: {}\n".format(np.mean(t_updates))

    return best_responder, BD, replay_buffer, report
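

# Hedged sketch (not part of the original module): the exploration-schedule semantics assumed
# by the training loop above, modeled on the OpenAI Baselines LinearSchedule. value(t) anneals
# linearly from initial_p to final_p over schedule_timesteps and stays at final_p afterwards.
class _SketchLinearSchedule:

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule completed, clipped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)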
def learn_multi_nets(self, env, epoch, writer, **network_kwargs):
    """
    Train a deepq model.

    :param env: Environment.
    :param epoch: Current EGTA epoch. This is only used for saving results.
    :param writer: SummaryWriter for logging metrics.
    """
    time_init = time.time()

    # If the training flag is 1 we are training the attacker; if it is 0, the defender.
    training_attacker = env.training_flag
    assert training_attacker == 0 or training_attacker == 1, f"Invalid training flag: {training_attacker}."
    log_prefix = "attacker" if training_attacker else "defender"

    # Select parameters based on attacker/defender.
    n_actions = env.act_dim_att() if training_attacker else env.act_dim_def()
    observation_space = env.obs_dim_att() if training_attacker else env.obs_dim_def()

    # Create the new agent that we are going to train to best respond.
    best_responder = self.get_new_policy(locals_=locals(), globals_=globals())

    # Set up the experience replay buffer.
    replay_buffer = ReplayBuffer(self.buffer_size)
    assert not self.prioritized_replay, "Prioritized replay is not implemented in the PyTorch recreation."

    # Create the exploration schedule.
    exploration = LinearSchedule(
        schedule_timesteps=int(self.exploration_fraction * self.total_timesteps),
        initial_p=self.exploration_initial_eps,
        final_p=self.exploration_final_eps)

    # Set up training variables.
    mean_rewards = []
    episode_rewards = [0.0]
    saved_mean_reward = None

    # Begin episode.
    obs = env.reset_everything_with_return()
    reset = True

    # Sample our initial opponent's strategy.
    opponent_sampler = OpponentSampler(
        env=env, opponent_identity=0 if training_attacker else 1)
    opponent_sampler.sample()

    # Establish a temporary directory to hold checkpoints of our agent throughout training.
    # We do this so we can return the best version of our agent encountered during training.
    temp_dir = tempfile.TemporaryDirectory()
    best_model_path = osp.join(temp_dir.name, "model.pytorch")

    # Time metrics.
    time_init = time.time() - time_init
    t_transitions = []
    t_actions = []
    t_steps = []
    t_samples = []
    t_updates = []
    n_updates = 0.0

    # Reward shaping.
    temp_buffer = []

    # Environment training loop.
    time_training = time.time()
    for t in range(self.total_timesteps):
        time_transition = time.time()

        # Check termination conditions.
        if self.callback is not None and self.callback(locals(), globals()):
            break

        # Collect meta-data the agent may need to compute its action.
        time_action = time.time()
        action_kwargs = {}

        # Update the exploration strategy.
        if self.param_noise:
            update_eps = 0.0
            # Compute the threshold such that the KL divergence between perturbed and non-perturbed
            # policy is comparable to eps-greedy exploration with eps=exploration.value(t).
            # See Appendix C.1 in `Parameter Space Noise for Exploration`, Plappert et al., 2017.
            update_param_noise_threshold = -1.0 * np.log(
                1.0 - exploration.value(t) +
                exploration.value(t) / float(env.action_space.n))
            action_kwargs["reset"] = reset
            action_kwargs["update_param_noise_threshold"] = update_param_noise_threshold
            action_kwargs["update_param_noise_scale"] = True
        else:
            update_eps = exploration.value(t)
            update_param_noise_threshold = 0.0

        # If we are the attacker, apply a mask to our action space.
        if training_attacker:
            mask = mask_generator_att(env, np.array(obs)[None])
        else:
            mask = None

        # Step the agent.
        writer.add_scalar(f"{log_prefix}/epsilon", update_eps, t)
        action = best_responder.act(
            observation=np.array(obs)[None],
            stochastic=True,
            update_eps=update_eps,
            mask=mask,
            training_attacker=training_attacker,
            **action_kwargs)[0]
        t_actions += [time.time() - time_action]

        # Step the environment.
        time_step = time.time()
        new_obs, reward, done = env.step(action)
        t_steps += [time.time() - time_step]

        # Store transition data.
        if self.reward_shaping:
            # Reward shaping: hold transitions in a temporary buffer until the environment
            # publishes the shaped rewards, then commit them to the replay buffer.
            pass_flag = False
            if training_attacker == 0:
                rewards_shaping = env.rewards()
                if rewards_shaping['pass_flag']:
                    for transition in temp_buffer:
                        obs0, action0, rew0, new_obs0, done0 = transition
                        rew_new = rewards_shaping[str(action0)].v
                        episode_rewards[-1] += rew_new
                        replay_buffer.add(obs0, action0, rew_new, new_obs0, done0)
                    temp_buffer = []
                    env.reset_reward_shaping()
                    pass_flag = True
            elif training_attacker == 1:
                rewards_shaping = env.rewards()
                if rewards_shaping['pass_flag']:
                    for transition in temp_buffer:
                        obs1, action1, rew1, new_obs1, done1 = transition
                        rew_new = rewards_shaping[str(action1)].v
                        episode_rewards[-1] += rew_new
                        replay_buffer.add(obs1, action1, rew_new, new_obs1, done1)
                    temp_buffer = []
                    env.reset_reward_shaping()
                    pass_flag = True

            if pass_flag:
                episode_rewards[-1] += reward
                replay_buffer.add(obs, action, reward, new_obs, float(done))
            else:
                temp_buffer.append((obs, action, reward, new_obs, float(done)))

            obs = new_obs

            if done:
                obs = env.reset_everything_with_return()
                episode_rewards.append(0.0)
                reset = True
                # Sample a new strategy from the meta-strategy solver.
                opponent_sampler.sample()

        # No reward shaping.
        else:
            replay_buffer.add(obs, action, reward, new_obs, float(done))
            obs = new_obs
            episode_rewards[-1] += reward

            # If the episode finished, reset the environment and sample from the opponent's meta-strategy.
            if done:
                obs = env.reset_everything_with_return()
                opponent_sampler.sample()
                # Log the environment reset.
                episode_rewards.append(0.0)
                reset = True

        # Periodically train our policy.
        if (t > self.learning_starts) and (t % self.train_freq == 0):
            n_updates += 1.0
            time_sample = time.time()

            # Collect a batch (b) of experiences.
            b_o, b_a, b_r, b_op, b_d = replay_buffer.sample(self.batch_size)
            b_weights = np.ones_like(b_r)

            # Generate action masks.
            if training_attacker:
                b_mask = mask_generator_att(env, b_op)
            else:
                b_mask = None
            t_samples += [time.time() - time_sample]

            time_update = time.time()
            best_responder.update(
                observations=b_o,
                actions=b_a,
                rewards=b_r,
                next_observations=b_op,
                done_mask=b_d,
                importance_weights=b_weights,
                mask=b_mask,
                training_attacker=training_attacker,
                summary_writer=writer,
                t=t)
            t_updates += [time.time() - time_update]

        # Periodically update the target network.
        if (t > self.learning_starts) and (t % self.target_network_update_freq == 0):
            best_responder.update_target_network()

        # Record results.
        n_episodes = len(episode_rewards)
        if t > self.learning_starts:
            mean_100ep_reward = round(np.mean(episode_rewards[-251:-1]), 1)
            mean_rewards.append(mean_100ep_reward)
            writer.add_scalar(f"{log_prefix}/mean_reward", np.nan_to_num(mean_100ep_reward), t)

        # Periodically save a snapshot of our best-responder.
        if (self.checkpoint_freq is not None) and (t > self.learning_starts) and \
                (n_episodes > 100) and (t % self.checkpoint_freq == 0):
            # Only checkpoint the best-performing model we have encountered so far.
            if (saved_mean_reward is None) or (mean_100ep_reward > saved_mean_reward):
                torch.save(best_responder, best_model_path, pickle_module=dill)
                saved_mean_reward = mean_100ep_reward

        t_transitions += [time.time() - time_transition]

    # Load the best-performing policy encountered as our resulting best-responder.
    BD = None
    if osp.exists(best_model_path):
        best_responder = torch.load(best_model_path)
        BD = saved_mean_reward if saved_mean_reward is not None else mean_100ep_reward

    # Clean up the temporary directory.
    temp_dir.cleanup()

    # Save data to generate learning curves.
    name = "attacker" if training_attacker else "defender"
    data_path = osp.join(settings.get_run_dir(), f"mean_rewards.{name}.{epoch}.pkl")
    fp.save_pkl(mean_rewards, data_path)

    # Log timing statistics.
    # We assemble this into a string to send back for the main process to print.
    # This is to prevent potential multiprocessing errors.
    report = ""
    report += " - n_transitions: {}\n".format(len(t_transitions))
    report += " - n_updates: {}\n".format(len(t_updates))
    report += " - t_init: {}\n".format(time_init)
    report += " - t_transitions: {}\n".format(np.mean(t_transitions))
    report += " - t_actions: {}\n".format(np.mean(t_actions))
    report += " - t_steps: {}\n".format(np.mean(t_steps))
    report += " - t_samples: {}\n".format(np.mean(t_samples))
    report += " - t_updates: {}\n".format(np.mean(t_updates))

    return best_responder, BD, replay_buffer, report
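

# Hedged sketch (not part of the original module): the minimal ReplayBuffer contract that both
# training loops above rely on -- add() stores one transition and sample() returns five aligned
# arrays (observations, actions, rewards, next observations, done flags). The real buffer may
# differ (e.g. ring-buffer storage, uniform sampling details); this only documents the assumed
# interface.
import random


class _SketchReplayBuffer:

    def __init__(self, size):
        self._storage = []
        self._maxsize = size

    def __len__(self):
        return len(self._storage)

    def add(self, obs, action, reward, new_obs, done):
        self._storage.append((obs, action, reward, new_obs, done))
        if len(self._storage) > self._maxsize:
            self._storage.pop(0)  # Drop the oldest transition.

    def sample(self, batch_size):
        batch = [random.choice(self._storage) for _ in range(batch_size)]
        obs, actions, rewards, next_obs, dones = zip(*batch)
        return (np.array(obs), np.array(actions), np.array(rewards),
                np.array(next_obs), np.array(dones))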