def learn_with_selfplay(max_agents, num_learn_steps, num_learn_steps_pre_training, num_eval_eps,
                        num_skip_steps=0, model_name='dqn', only_rule_based_op=False, patience=5,
                        image_observations=True, output_folder="output", fine_tune_on=None,
                        opponent_pred_obs=False, adversarial_training=None, save_freq=None):
    """
    Train an agent with regular self-play. If there are checkpoints from previous training, continue training from
    those checkpoints.

    :param max_agents: Stop after max_agents intermediate agents have been trained. An intermediate agent is saved
        whenever training successfully produced an improved agent.
    :param num_learn_steps: Number of frames / steps for every learning iteration
    :param num_learn_steps_pre_training: Number of frames / steps for pre-training against the rule-based agent
    :param num_eval_eps: Number of episodes for intermediate evaluation. Intermediate evaluation determines whether
        the trained agent improved compared to the previous version.
    :param num_skip_steps: Skip num_skip_steps frames, repeating the action from the previous step
    :param model_name: Name for saving the model. If there are already checkpoints with this name, training is
        continued. Checkpoints are saved as model_name{i}, where i is the training iteration.
    :param only_rule_based_op: If True, training is only performed against the rule-based agent.
    :param patience: Patience parameter for evaluation
    :param image_observations: Use image observations instead of feature observations
    :param output_folder: Root folder for outputs
    :param fine_tune_on: If not None, instead of self-play training, train an adversarial policy against the victim
        whose name is passed as a string in this parameter.
    :param opponent_pred_obs: If True, the opponent's predictions for the current state are concatenated to the
        observations of the main agent. This was an attempt to create a stronger adversarial policy that could use
        this information; in our experiments, however, it did not improve the adversarial policy.
    :param adversarial_training: If set to True, perform adversarial training using FGSM during training.
    :param save_freq: If not None, save intermediate checkpoints during training with the given frequency
    :return:
    """
    eval_env, eval_env_rule_based, eval_op, train_env, train_env_rule_based = _init_envs(
        image_observations, num_skip_steps, opponent_pred_obs, adversarial_training)

    # If fine-tuning, load the model to fine-tune against from its path
    if fine_tune_on is not None:
        path = Path(output_folder) / 'models' / fine_tune_on
        fine_tune_model = DQN.load(path)
        fine_tune_model.tensorboard_log = None
        if opponent_pred_obs:
            # We can't eval on agents that don't have a q_net, so we change eval_op to the original model that is
            # being fine-tuned against, instead of the rule-based agent
            eval_op = fine_tune_model
            eval_env_rule_based.set_opponent(eval_op)
            eval_env_rule_based = OpponentPredictionObs(eval_env_rule_based)
            eval_env.set_opponent(eval_op)
            eval_env = OpponentPredictionObs(eval_env)
    else:
        fine_tune_model = None

    # Initialize first agent
    pre_train_agent = SimpleRuleBasedAgent(train_env_rule_based)
    previous_models = [pre_train_agent]

    # Load potentially saved previous models
    for opponent_id in range(1, max_agents):
        path = _make_model_path(output_folder, model_name, opponent_id)
        if os.path.isfile(path):
            model = DQN.load(path)
            previous_models.append(model)
        else:
            break

    # Initialize first round
    last_agent_id = len(previous_models) - 1
    prev_num_steps = 0
    patience_counter = 0
    tb_path = Path(output_folder) / "tb-log"
    if last_agent_id == 0:
        # main_model = A2C('MlpPolicy', policy_kwargs=dict(optimizer_class=RMSpropTFLike, optimizer_kwargs=dict(eps=1e-5)), env=train_env, verbose=0,
        #                  tensorboard_log="output/tb-log")
        # main_model = A2C('MlpPolicy', train_env, verbose=0, tensorboard_log="output/tb-log")  # , exploration_fraction=0.3)
        main_model = DQN('MlpPolicy', train_env_rule_based, verbose=0, tensorboard_log=tb_path)  # , exploration_fraction=0.3)
    else:
        main_model = copy.deepcopy(previous_models[last_agent_id])
        main_model.set_env(train_env)
        main_model.tensorboard_log = tb_path

    # Start training with self-play over several rounds
    opponent_id = last_agent_id
    while opponent_id < max_agents - 1:
        print(f"Running training round {opponent_id + 1}")

        if fine_tune_on is None:
            # Choose opponent based on setting
            if only_rule_based_op:
                current_train_env = train_env_rule_based
                # Use rule-based agent as opponent
                current_train_env.set_opponent(SimpleRuleBasedAgent(current_train_env))
            else:
                if opponent_id == 0:
                    current_train_env = train_env_rule_based
                else:
                    current_train_env = train_env
                # Take opponent from the previous version of the model
                current_train_env.set_opponent(previous_models[opponent_id])
        else:
            # Use passed fine-tune agent as opponent
            current_train_env = train_env
            current_train_env.set_opponent(fine_tune_model)

        # Train the model
        current_train_env.set_opponent_right_side(True)
        chosen_n_steps = num_learn_steps_pre_training if opponent_id == 0 else num_learn_steps  # Iteration 0 is pre-training
        # In order to generate adversarial examples, the adversarial training wrapper needs a reference to the model
        # that is currently being trained
        if adversarial_training is not None:
            current_train_env.env.victim_model = main_model

        # Optionally add a callback to save intermediate checkpoints
        if save_freq is not None:
            checkpoint_callback = CheckpointCallback(save_freq=save_freq, save_path='./output/intermediate/',
                                                     name_prefix=model_name + str(opponent_id + 1) + '_interm')
        else:
            checkpoint_callback = None

        # === LEARNING ===
        main_model.learn(total_timesteps=chosen_n_steps, tb_log_name=model_name, callback=checkpoint_callback)

        # Do evaluation for this training round
        eval_env_rule_based.set_opponent(eval_op)
        avg_round_reward, num_steps = evaluate(main_model, eval_env_rule_based, num_eps=num_eval_eps)
        print(model_name)
        print(f"Average round reward after training: {avg_round_reward}")
        print(f"Average number of steps per episode: {num_steps / num_eval_eps}")

        # Check if there was improvement
        if num_steps > prev_num_steps:
            # Model improved compared to the last one
            print('Model improved')
            prev_num_steps = num_steps
            # Reset patience counter
            patience_counter = 0
            # Save the further-trained model to disk
            main_model.save(_make_model_path(output_folder, model_name, opponent_id + 1))
            # Make a copy of the just-saved model by loading it
            copy_of_model = DQN.load(_make_model_path(output_folder, model_name, opponent_id + 1))
            # Save the copy to the list
            previous_models.append(copy_of_model)
            # From here we continue training the same main_model against itself
            opponent_id += 1
        else:
            print('Model did not improve')
            patience_counter += 1
            # Do not save the model
            if patience_counter > patience:
                print('Stopping early due to patience')
                break
            # Because our model did not improve compared to the previous one, we reset main_model to the previous one
            main_model = DQN.load(_make_model_path(output_folder, model_name, opponent_id))
            main_model.set_env(train_env)
            # Opponent does not change

    if not opponent_pred_obs:
        # Evaluate the last model against each of its previous iterations
        # evaluate_against_predecessors(previous_models, env_rule_based=eval_env_rule_based, env_normal=eval_env, num_eval_eps=num_eval_eps)
        pass  # Not useful right now
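
# Example invocation (illustrative sketch only; the argument values below are assumptions, not the settings used in
# the original experiments):
# learn_with_selfplay(max_agents=10, num_learn_steps=500_000, num_learn_steps_pre_training=1_000_000,
#                     num_eval_eps=100, model_name='dqn', image_observations=True, output_folder='output')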
# game = 'YarsRevenge-ram-v0'
# game = 'Zaxxon-v0'
# game = 'Zaxxon-ram-v0'
# env = gym.make('Pong-v0')
env = gym.make(game)
# save_file = 'dqn_pong'; save_file = 'dqn_' + game
print(env.action_space)
print(env.get_action_meanings())

# Train a DQN agent on the selected game
model = DQN(MlpPolicy, env, verbose=1)
# model = DQN.load(save_file)
model.set_env(env)
# model = DQN(CnnPolicy, env, verbose=1)
model.learn(total_timesteps=50000, log_interval=10)
# model.save(save_file)

# Roll out the trained policy and render the environment
obs = env.reset()
score = 0
rewards_sum = 0
while True:
    # print(score)
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    score = score + 1
def train_adril(env, n=0, balanced=False):
    num_trajs = 20
    expert_data = make_sa_dataset(env, max_trajs=num_trajs)
    n_expert = len(expert_data["obs"])
    expert_sa = np.concatenate((expert_data["obs"],
                                np.reshape(expert_data["acts"], (n_expert, -1))), axis=1)

    for i in range(0, n):
        venv = AdRILWrapper(gym.make(env))
        mean_rewards = []
        std_rewards = []
        # Create model
        if isinstance(venv.action_space, Discrete):
            model = DQN(SQLPolicy, venv, verbose=1, policy_kwargs=dict(net_arch=[64, 64]), learning_starts=1)
        else:
            model = SAC('MlpPolicy', venv, verbose=1, policy_kwargs=dict(net_arch=[256, 256]), ent_coef='auto',
                        learning_rate=linear_schedule(7.3e-4), train_freq=64, gradient_steps=64, gamma=0.98,
                        tau=0.02)
        model.replay_buffer = AdRILReplayBuffer(model.buffer_size, model.observation_space, model.action_space,
                                                model.device, 1, model.optimize_memory_usage,
                                                expert_data=expert_data, N_expert=num_trajs, balanced=balanced)
        if not balanced:
            for j in range(len(expert_sa)):
                obs = expert_data["obs"][j]
                act = expert_data["acts"][j]
                next_obs = expert_data["next_obs"][j]
                done = expert_data["dones"][j]
                model.replay_buffer.add(obs, next_obs, act, -1, done)

        for train_steps in range(400):
            # Train policy
            if train_steps > 0:
                if 'Bullet' in env:
                    model.learn(total_timesteps=1250, log_interval=1000)
                else:
                    model.learn(total_timesteps=25000, log_interval=1000)
                if train_steps % 1 == 0:  # written to support more complex update schemes
                    model.replay_buffer.set_iter(train_steps)
                    model.replay_buffer.set_n_learner(venv.num_trajs)

            # Evaluate policy
            if train_steps % 20 == 0:
                model.set_env(gym.make(env))
                mean_reward, std_reward = evaluate_policy(model, model.env, n_eval_episodes=10)
                mean_rewards.append(mean_reward)
                std_rewards.append(std_reward)
                print("{0} Steps: {1}".format(int(train_steps * 1250), mean_reward))
                np.savez(os.path.join("learners", env, "adril_rewards_{0}".format(i)),
                         means=mean_rewards, stds=std_rewards)

            # Update env
            if train_steps > 0:
                if train_steps % 1 == 0:
                    venv.set_iter(train_steps + 1)
            model.set_env(venv)
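
# Example invocation (illustrative sketch; the environment id and number of runs are assumptions):
# train_adril('HalfCheetahBulletEnv-v0', n=5, balanced=False)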