def __init__(self, name, network_config, session, restore=True, learning_rate=0.001):
    super(CriticModel, self).__init__()
    self.name = name.replace(" ", "_")
    self.network_config = network_config
    self.collections = []
    self.restore = restore
    # TODO: make the learning rate configurable from network_config.
    self.learning_rate = learning_rate
    self.summaries = []
    self.session = session

    logger.info("Building network for %s" % self.name)
    self.build_network()
    self.saver = tf.train.Saver()
    self.session.run(tf.global_variables_initializer())

    # TODO: add an option to disable summaries.
    clear_summary_path(self.network_config.summaries_path + "/" + self.name)
    self.summaries_writer = tf.summary.FileWriter(
        self.network_config.summaries_path + "/" + self.name)

    logger.info("Created network for %s" % self.name)
    self.restore_network()
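# `clear_summary_path` is called throughout this module but defined elsewhere.
# A minimal sketch of what it presumably does -- wipe and recreate a summary
# directory so stale TensorBoard event files don't mix with a fresh run. The
# body below is an assumption, not the actual implementation.
import os
import shutil

def clear_summary_path(summary_path):
    """Remove the directory at summary_path (if any) and recreate it empty."""
    if os.path.isdir(summary_path):
        shutil.rmtree(summary_path, ignore_errors=True)
    os.makedirs(summary_path, exist_ok=True)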
def __init__(self, name, input_len, output_len, network_config, use_cuda,
             restore=True, learning_rate=0.0005):
    self.name = name
    model = _TransModel(input_len, output_len)
    self.use_cuda = use_cuda
    if use_cuda:
        logger.info("Network %s is using cuda" % self.name)
        model = model.cuda()
    super(TransModel, self).__init__(model, name, network_config, restore)
    self.network_config = network_config
    # NOTE: the learning_rate argument is currently unused; the optimizer
    # reads its rate from network_config instead.
    self.optimizer = Adam(self.model.parameters(), lr=self.network_config.learning_rate)
    self.loss_fn = nn.MSELoss(reduction='mean')

    summaries_path = self.network_config.summaries_path + "/" + self.name
    if not network_config.restore_network:
        clear_summary_path(summaries_path)
    self.summary = SummaryWriter(log_dir=summaries_path)
    logger.info("Created network for %s" % self.name)
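# How a transition model like the one above is typically fitted: regress the
# next state from the current state with the MSE loss and Adam optimizer the
# constructor declares. `fit_transition_batch` is a hypothetical helper built
# only from attributes defined above (model, loss_fn, optimizer, use_cuda),
# not the module's actual training API.
import torch

def fit_transition_batch(trans_model, states, next_states):
    """One gradient step: predict next_states from states (illustrative sketch)."""
    states = torch.as_tensor(states, dtype=torch.float32)
    next_states = torch.as_tensor(next_states, dtype=torch.float32)
    if trans_model.use_cuda:
        states, next_states = states.cuda(), next_states.cuda()
    prediction = trans_model.model(states)
    loss = trans_model.loss_fn(prediction, next_states)
    trans_model.optimizer.zero_grad()
    loss.backward()
    trans_model.optimizer.step()
    return loss.item()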
def __init__(self, name, state_length, network_config, reinforce_config, feature_len,
             combine_decomposed_func, is_sigmoid=False, memory_restore=True):
    super(SADQ_GQF, self).__init__()
    self.name = name
    # self.choices = choices
    self.network_config = network_config
    self.reinforce_config = reinforce_config
    self.memory = ReplayBuffer_decom(self.reinforce_config.memory_size)
    self.learning = True
    self.explanation = False
    self.state_length = state_length
    self.feature_len = feature_len
    self.features = None

    # Global bookkeeping
    self.steps = 0
    self.reward_history = []
    self.episode_time_history = []
    self.best_reward_mean = -maxsize
    self.episode = 0
    self.reset()
    self.memory_restore = memory_restore

    reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name
    if not self.network_config.restore_network:
        clear_summary_path(reinforce_summary_path)
    else:
        self.restore_state()
    self.summary = SummaryWriter(log_dir=reinforce_summary_path)

    self.eval_model = feature_q_model(name, state_length, self.feature_len,
                                      self.network_config.output_shape, network_config)
    self.target_model = feature_q_model(name, state_length, self.feature_len,
                                        self.network_config.output_shape, network_config)
    # self.target_model.eval_mode()

    self.beta_schedule = LinearSchedule(self.reinforce_config.beta_timesteps,
                                        initial_p=self.reinforce_config.beta_initial,
                                        final_p=self.reinforce_config.beta_final)
    self.epsilon_schedule = LinearSchedule(self.reinforce_config.epsilon_timesteps,
                                           initial_p=self.reinforce_config.starting_epsilon,
                                           final_p=self.reinforce_config.final_epsilon)
def __init__(self, name, network_config, use_cuda, restore=True, learning_rate=0.001):
    self.name = name
    model = _DQNModel(network_config)
    model = nn.DataParallel(model)
    self.use_cuda = use_cuda
    if use_cuda:
        logger.info("Network %s is using cuda" % self.name)
        model = model.cuda()
    super(DQNModel, self).__init__(model, name, network_config, restore)
    self.network_config = network_config
    self.optimizer = Adam(self.model.parameters(), lr=self.network_config.learning_rate)

    # Huber loss is the default; CrossEntropyLoss was tried as an alternative.
    self.loss_fn = nn.SmoothL1Loss()
    self.is_SmoothL1Loss = True
    # self.loss_fn = nn.CrossEntropyLoss()
    # self.is_SmoothL1Loss = False

    summaries_path = self.network_config.summaries_path + "/" + self.name
    if not network_config.restore_network:
        clear_summary_path(summaries_path)
    self.summary = SummaryWriter(log_dir=summaries_path)
    logger.info("Created network for %s" % self.name)
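# Why SmoothL1 (Huber) rather than MSE for TD errors: MSE's gradient grows
# without bound on large, occasional TD errors, while Huber becomes linear
# beyond |error| > 1 and so caps the gradient. A self-contained illustration
# (not part of the original module):
import torch
import torch.nn as nn

q_estimate = torch.tensor([1.0, 2.0, 3.0])
q_target = torch.tensor([1.5, 2.0, 13.0])        # one outlier target

mse = nn.MSELoss()(q_estimate, q_target)         # dominated by the outlier: ~33.4
huber = nn.SmoothL1Loss()(q_estimate, q_target)  # linear in the outlier: ~3.2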
def __init__(self, name, network_config, use_cuda, restore=True):
    self.network_config = network_config
    self.name = name
    summaries_path = self.network_config.summaries_path + "/" + self.name
    model = _HRAModel(network_config)
    if use_cuda:
        logger.info("Network %s is using cuda" % self.name)
        model = model.cuda()
    Model.__init__(self, model, name, network_config, restore)
    logger.info("Created network for %s" % self.name)
    self.optimizer = Adam(self.model.parameters(), lr=self.network_config.learning_rate)
    self.loss_fn = nn.SmoothL1Loss()

    if not network_config.restore_network:
        clear_summary_path(summaries_path)
        self.summary = SummaryWriter(log_dir=summaries_path)
        # Log the model graph once, using a dummy forward pass.
        dummy_input = torch.rand(network_config.input_shape).unsqueeze(0)
        if use_cuda:
            dummy_input = dummy_input.cuda()
        self.summary.add_graph(self.model, dummy_input)
    else:
        self.summary = SummaryWriter(log_dir=summaries_path)
def __init__(self, name, choices, network_config, reinforce_config):
    super(A3CAdaptive, self).__init__()
    self.name = name
    self.choices = choices
    self.network_config = network_config
    self.reinforce_config = reinforce_config
    self.update_frequency = reinforce_config.update_frequency
    self.replay_memory = Memory(self.reinforce_config.memory_size)
    self.learning = True

    self.steps = 0
    self.previous_state = None
    self.previous_action = None
    self.reward_types = len(self.network_config.networks)
    self.current_reward = 0
    self.total_reward = 0

    self.session = tf.Session()
    self.critic_model = CriticModel(self.name + "_critic", self.network_config, self.session)
    self.actor_model = ActorModel(self.name + "_actor", self.network_config, self.session)

    # TODO:
    # * Add more information/summaries related to reinforcement learning.
    # * Option to disable summaries?
    clear_summary_path(self.reinforce_config.summaries_path + "/" + self.name)
    self.summaries_writer = tf.summary.FileWriter(
        self.reinforce_config.summaries_path + "/" + self.name, graph=self.session.graph)

    self.episode = 0
def __init__(self, name, state_length, network_config, reinforce_config, reward_num,
             combine_decomposed_func, memory_restore=True):
    super(SADQAdaptive, self).__init__()
    self.name = name
    # self.choices = choices
    self.network_config = network_config
    self.reinforce_config = reinforce_config
    if self.reinforce_config.use_prior_memory:
        self.memory = PrioritizedReplayBuffer(self.reinforce_config.memory_size, 0.6)
    else:
        self.memory = ReplayBuffer(self.reinforce_config.memory_size)
    self.learning = True
    self.state_length = state_length

    # Global bookkeeping
    self.steps = 0
    self.best_reward_mean = 0
    self.episode = 0
    self.combine_decomposed_reward = combine_decomposed_func
    self.reward_num = reward_num
    self.reset()
    self.memory_restore = memory_restore

    reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name
    if not self.network_config.restore_network:
        clear_summary_path(reinforce_summary_path)
    else:
        self.restore_state()
    self.summary = SummaryWriter(log_dir=reinforce_summary_path)

    # use_cuda is expected to be a module-level flag (e.g. torch.cuda.is_available()).
    self.target_model = DQNModel(self.name + "_target", self.network_config, use_cuda)
    self.eval_model = DQNModel(self.name + "_eval", self.network_config, use_cuda)
    # self.target_model.eval_mode()

    self.beta_schedule = LinearSchedule(self.reinforce_config.beta_timesteps,
                                        initial_p=self.reinforce_config.beta_initial,
                                        final_p=self.reinforce_config.beta_final)
    self.epsilon_schedule = LinearSchedule(self.reinforce_config.epsilon_timesteps,
                                           initial_p=self.reinforce_config.starting_epsilon,
                                           final_p=self.reinforce_config.final_epsilon)
def __init__(self, name, choices, network_config, reinforce_config):
    super(DQNAdaptive, self).__init__()
    self.name = name
    self.choices = choices
    self.network_config = network_config
    self.reinforce_config = reinforce_config
    self.memory = PrioritizedReplayBuffer(self.reinforce_config.memory_size, 0.6)
    self.learning = True
    self.explanation = False

    # Global bookkeeping
    self.steps = 0
    self.reward_history = []
    self.episode_time_history = []
    self.best_reward_mean = -maxsize
    self.episode = 0
    self.reset()

    reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name
    if not self.network_config.restore_network:
        clear_summary_path(reinforce_summary_path)
    else:
        self.restore_state()
    self.summary = SummaryWriter(log_dir=reinforce_summary_path)

    self.target_model = DQNModel(self.name + "_target", self.network_config, use_cuda)
    self.eval_model = DQNModel(self.name + "_eval", self.network_config, use_cuda)

    self.beta_schedule = LinearSchedule(self.reinforce_config.beta_timesteps,
                                        initial_p=self.reinforce_config.beta_initial,
                                        final_p=self.reinforce_config.beta_final)
    self.epsilon_schedule = LinearSchedule(self.reinforce_config.epsilon_timesteps,
                                           initial_p=self.reinforce_config.starting_epsilon,
                                           final_p=self.reinforce_config.final_epsilon)
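# How the two schedules above are typically consumed in an update step: epsilon
# drives exploration, beta drives the importance-sampling correction when
# sampling from the prioritized buffer. The sketch below assumes the buffer
# follows the OpenAI Baselines PrioritizedReplayBuffer API (sample(n, beta)
# returning weights and indices, update_priorities(idxes, prios)); q_eval and
# q_target stand in for the eval/target networks' forward functions. It is not
# this module's actual update routine.
import numpy as np
import torch

def prioritized_update_sketch(memory, q_eval, q_target, steps, beta_schedule,
                              gamma=0.99, batch_size=32):
    """One Q-update against a Baselines-style prioritized buffer (sketch)."""
    beta = beta_schedule.value(steps)
    states, actions, rewards, next_states, dones, weights, idxes = \
        memory.sample(batch_size, beta)
    states = torch.as_tensor(np.asarray(states), dtype=torch.float32)
    next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32)
    actions = torch.as_tensor(actions, dtype=torch.int64)
    q = q_eval(states).gather(1, actions.unsqueeze(1)).squeeze(1)  # Q(s, a)
    with torch.no_grad():
        best_next = q_target(next_states).max(1)[0]
        target = torch.as_tensor(rewards, dtype=torch.float32) + \
            gamma * best_next * (1 - torch.as_tensor(dones, dtype=torch.float32))
    td_error = q - target
    loss = (torch.as_tensor(weights, dtype=torch.float32) * td_error ** 2).mean()
    # New priorities: |TD error| plus a small constant keeps every priority positive.
    memory.update_priorities(idxes, (td_error.abs() + 1e-6).detach().numpy())
    return loss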
def __init__(self, name, choices, reward_types, network_config, reinforce_config):
    super(HRAAdaptive, self).__init__()
    self.name = name
    self.choices = choices
    self.network_config = network_config
    self.reinforce_config = reinforce_config
    self.update_frequency = reinforce_config.update_frequency
    self.replay_memory = PrioritizedReplayBuffer(self.reinforce_config.memory_size, 0.6)
    self.learning = True
    self.explanation = False

    self.steps = 0
    self.previous_state = None
    self.previous_action = None
    self.reward_types = reward_types
    self.clear_rewards()
    self.total_reward = 0

    self.eval_model = HRAModel(self.name + "_eval", self.network_config)
    self.target_model = HRAModel(self.name + "_target", self.network_config)

    clear_summary_path(self.reinforce_config.summaries_path + "/" + self.name)
    self.summary = SummaryWriter(
        log_dir=self.reinforce_config.summaries_path + "/" + self.name)

    self.episode = 0
    # Anneal beta from 0.2 to 1.0 over 10,000 steps.
    self.beta_schedule = LinearSchedule(10 * 1000, initial_p=0.2, final_p=1.0)
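# LinearSchedule is used by several adaptives above but defined elsewhere. A
# minimal sketch of the usual (Baselines-style) semantics, assuming value(t)
# interpolates linearly from initial_p to final_p over schedule_timesteps and
# then stays at final_p:
class LinearSchedule(object):
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.1):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        """Interpolated value at step t; clamped to final_p past the horizon."""
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# e.g. LinearSchedule(10 * 1000, initial_p=0.2, final_p=1.0).value(5000) == 0.6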
def run_task(evaluation_config, network_config, reinforce_config):
    env = gym.make(evaluation_config.env)
    state = env.reset(state_representation="linear")
    LEFT, RIGHT, UP, DOWN = [0, 1, 2, 3]
    choices = [LEFT, RIGHT, UP, DOWN]

    agent = DQNAdaptive(name="FruitCollecter",
                        choices=choices,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset(state_representation="linear")
        total_reward = 0
        done = False
        steps = 0
        while not done:
            steps += 1
            action, q_values = agent.predict(state)
            state, reward, done, info = env.step(action)
            agent.reward(reward)
            total_reward += reward
        agent.end_episode(state)

        train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                        scalar_value=total_reward,
                                        global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Steps to collect all Fruits",
                                        scalar_value=steps + 1,
                                        global_step=episode + 1)

    agent.disable_learning()

    # Test episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset(state_representation="linear")
        total_reward = 0
        done = False
        steps = 0
        while not done:
            steps += 1
            action, q_values = agent.predict(state)
            if evaluation_config.render:
                env.render()
                time.sleep(0.5)
            state, reward, done, info = env.step(action)
            total_reward += reward
        agent.end_episode(state)

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all Fruits",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)
    env.close()
def run_task(evaluation_config, network_config, reinforce_config):
    env = gym.make(evaluation_config.env)
    state = env.reset(state_representation="rgb")
    LEFT, RIGHT, UP, DOWN = [0, 1, 2, 3]
    choices = [LEFT, RIGHT, UP, DOWN]
    pdx_explanation = PDX()

    reward_types = env.reward_types
    agent = HRAAdaptive(name="FruitCollecter",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset(state_representation="rgb")
        total_reward = 0
        done = False
        steps = 0
        while not done:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state)
            state, rewards, done, info = env.step(action, decompose_reward=True)
            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])
            total_reward += sum(rewards.values())
        agent.end_episode(state)

        train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                        scalar_value=total_reward,
                                        global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Episode Steps",
                                        scalar_value=steps + 1,
                                        global_step=episode + 1)

    agent.disable_learning()

    # Test episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset(state_representation="rgb")
        total_reward = 0
        done = False
        steps = 0
        while not done:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state)
            if evaluation_config.render:
                env.render()
                pdx_explanation.render_decomposed_rewards(
                    action, combined_q_values.data.numpy(), q_values.data.numpy(),
                    env.action_names, env.reward_types)
                pdx_explanation.render_all_pdx(
                    action, env.action_space, q_values.data,
                    env.action_names, env.reward_types)
                time.sleep(evaluation_config.sleep)
            state, reward, done, info = env.step(action)
            total_reward += reward
        agent.end_episode(state)

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Episode Steps",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)
    env.close()
def run_task(evaluation_config, network_config, reinforce_config):
    import absl
    absl.flags.FLAGS(sys.argv[:1])

    env = FourTowersSequentialEnvironment()
    max_episode_steps = 100
    state = env.reset()
    print('Initial state is: {}'.format(state))

    choices = [0, 1, 2, 3]
    pdx_explanation = PDX()
    reward_types = ['roach', 'zergling', 'damageByRoach', 'damageByZergling',
                    'damageToRoach', 'damageToZergling']

    # Assumed meaning of the bare constant 200 from the old TODO: the maximum
    # possible damage per wave, used to scale damage rewards to roughly [-1, 1].
    DAMAGE_NORMALIZER = 200

    agent = HRAAdaptive(name="FourTowerSequential",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        dead = False
        deciding = True
        running = True
        steps = 0
        rewards = []
        initial_state = np.array(state)

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            state, reward, done, dead, info = env.step(action)
            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            # The last entry holds this wave's decomposed rewards; if the agent
            # died, the final entry is the death step, so use the one before it.
            last_rewards = env.decomposed_rewards[-1 if not dead else -2]
            rewards = {
                'roach': last_rewards[0],
                'zergling': last_rewards[1],
                'damageByRoach': -last_rewards[2] / DAMAGE_NORMALIZER,
                'damageByZergling': -last_rewards[3] / DAMAGE_NORMALIZER,
                'damageToRoach': last_rewards[4] / DAMAGE_NORMALIZER,
                'damageToZergling': last_rewards[5] / DAMAGE_NORMALIZER,
            }

            for reward_type in rewards.keys():
                agent.reward(reward_type, rewards[reward_type])
                total_reward += rewards[reward_type]

            if dead:
                break

        agent.end_episode(state[0])

        train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                        scalar_value=total_reward,
                                        global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Episode Steps",
                                        scalar_value=steps + 1,
                                        global_step=episode + 1)
        print("EPISODE REWARD {}".format(rewards['roach'] + rewards['zergling']))
        print("EPISODE {}".format(episode))

    # TODO: Display XDAPS

    agent.disable_learning()

    # Test episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        dead = False
        steps = 0
        deciding = True
        running = True

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            print(action)
            print(q_values)
            if evaluation_config.render:
                # env.render()
                pdx_explanation.render_all_pdx(
                    action, 4, q_values,
                    ['Top_Left', 'Top_Right', 'Bottom_Left', 'Bottom_Right'],
                    ['roach', 'zergling', 'damageByRoach',
                     'damageByZergling', 'damageToRoach', 'damageToZergling'])
                time.sleep(evaluation_config.sleep)
            # This renders an image of the game and saves it to test.jpg:
            # imutil.show(self.last_timestep.observation['rgb_screen'], filename="test.jpg")
            state, reward, done, dead, info = env.step(action)
            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            # Accumulate this wave's decomposed reward, mirroring the training loop.
            last_rewards = env.decomposed_rewards[-1 if not dead else -2]
            total_reward += (last_rewards[0] + last_rewards[1]
                             + (-last_rewards[2] - last_rewards[3]
                                + last_rewards[4] + last_rewards[5]) / DAMAGE_NORMALIZER)

            if dead:
                break

        agent.end_episode(state[0])

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Episode Steps",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)
def run_task(evaluation_config, network_config, reinforce_config, map_name=None, train_forever=False):
    if use_cuda:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     USING CUDA         |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|    NOT USING CUDA      |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    flags.FLAGS(sys.argv[:1])

    max_episode_steps = 40
    replay_dimension = evaluation_config.xai_replay_dimension
    env = TugOfWar(map_name=map_name,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)
    combine_sa = env.combine_sa
    state_1, state_2 = env.reset()

    models_path = "abp/examples/pysc2/tug_of_war/models_mb/"
    agent_1 = MBTSAdaptive(name="TugOfWar",
                           state_length=len(state_1),
                           network_config=network_config,
                           reinforce_config=reinforce_config,
                           models_path=models_path,
                           depth=2,
                           action_ranking=4,
                           env=env)

    agents_2 = []
    if not reinforce_config.is_random_agent_2:
        agent_2 = SADQAdaptive(name="TugOfWar",
                               state_length=len(state_2),
                               network_config=network_config,
                               reinforce_config=reinforce_config,
                               is_sigmoid=True,
                               memory_restore=False)
        agent_2.eval_model.replace(agent_1.q_model)
        agents_2.append(agent_2)
        print("sadq agent 2")
    else:
        print("random agent 2")

    path = './saved_models/tug_of_war/agents'

    if evaluation_config.generate_xai_replay and not reinforce_config.is_random_agent_2:
        # r = root, d = directories, f = files
        for r, d, f in os.walk(path):
            if len(d) == 3:
                for file in f:
                    if '.p' in file:
                        new_weights = torch.load(path + "/" + file)
                        new_agent_2 = SADQAdaptive(name=file,
                                                   state_length=len(state_1),
                                                   network_config=network_config,
                                                   reinforce_config=reinforce_config)
                        new_agent_2.load_weight(new_weights)
                        new_agent_2.disable_learning(is_save=False)
                        agents_2.append(new_agent_2)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    random_enemy = False
    while True:
        # Test episodes
        print("======================================================================")
        print("===============================Now testing============================")
        print("======================================================================")
        print("There are {} enemies".format(len(agents_2)))

        for agent_2 in agents_2:
            print(agent_2.name)
            average_state = np.zeros(len(state_1))
            total_rewards_list = []
            for episode in tqdm(range(evaluation_config.test_episodes)):
                state = env.reset()
                total_reward_1 = 0
                done = False
                skiping = True
                steps = 0
                if evaluation_config.generate_xai_replay:
                    recorder = XaiReplayRecorder2LaneNexus(
                        env.sc2_env, episode, evaluation_config.env,
                        action_component_names, replay_dimension)

                while skiping:
                    state_1, state_2, done, dp = env.step([], 0)
                    if evaluation_config.generate_xai_replay:
                        recorder.record_game_clock_tick(env.decomposed_reward_dict)
                    if dp or done:
                        break

                while not done and steps < max_episode_steps:
                    steps += 1
                    # Decision point
                    actions_1 = env.get_big_A(state_1[env.miner_index], state_1[env.pylon_index])
                    actions_2 = env.get_big_A(state_2[env.miner_index], state_2[env.pylon_index])

                    actions_p1, node = agent_1.predict(state_1, state_2[env.miner_index], dp=steps)

                    if evaluation_config.generate_xai_replay:
                        path_whole_tree = recorder.json_pathname[:-5] + "_whole_tree/"
                        print(path_whole_tree)
                        path_partial_tree = recorder.json_pathname[:-5] + "_partial_tree/"
                        print(path_partial_tree)
                        if not os.path.exists(path_whole_tree):
                            os.mkdir(path_whole_tree)
                        if not os.path.exists(path_partial_tree):
                            os.mkdir(path_partial_tree)
                        node.save_into_json(path=path_whole_tree, dp=steps)
                        node.save_into_json(path=path_partial_tree, dp=steps, is_partial=True)

                    combine_states_2 = combine_sa(state_2, actions_2)
                    if not reinforce_config.is_random_agent_2 and not random_enemy:
                        choice_2, _ = agent_2.predict(env.normalization(combine_states_2))
                    else:
                        choice_2 = randint(0, len(actions_2) - 1)

                    if evaluation_config.generate_xai_replay:
                        recorder.record_decision_point(actions_p1, actions_2[choice_2],
                                                       state_1, state_2,
                                                       env.decomposed_reward_dict)

                    env.step(list(actions_p1), 1)
                    env.step(list(actions_2[choice_2]), 2)
                    # human play alternative:
                    # env.step(list(get_human_action()), 2)

                    while skiping:
                        state_1, state_2, done, dp = env.step([], 0)
                        if evaluation_config.generate_xai_replay:
                            recorder.record_game_clock_tick(env.decomposed_reward_dict)
                        if dp or done:
                            break

                    if steps == max_episode_steps or done:
                        if evaluation_config.generate_xai_replay:
                            recorder.done_recording()
                        win_lose = player_1_win_condition(
                            state_1[27], state_1[28], state_1[29], state_1[30])
                        if win_lose == 1:
                            env.decomposed_rewards[4] = 10000
                            env.decomposed_rewards[5] = 0
                        elif win_lose == -1:
                            env.decomposed_rewards[4] = 0
                            env.decomposed_rewards[5] = 10000

                    reward_1, reward_2 = env.sperate_reward(env.decomposed_rewards)
                    total_reward_1 += sum(reward_1)

                average_state += state_1
                total_rewards_list.append(total_reward_1)
                test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                               scalar_value=total_reward_1,
                                               global_step=episode + 1)
                test_summary_writer.add_scalar(tag="Test/Steps to choosing Enemies",
                                               scalar_value=steps + 1,
                                               global_step=episode + 1)

            tr = sum(total_rewards_list) / evaluation_config.test_episodes
            print("total reward:")
            print(tr)

            f = open("result_model_based_v3.txt", "a+")
            f.write(agent_2.name + "\n")
            f.write(str(tr) + "\n")
            f.write(np.array2string(average_state / evaluation_config.test_episodes,
                                    precision=2, separator=',', suppress_small=True) + "\n")
            f.close()
def run_task(evaluation_config, network_config, reinforce_config, map_name=None, train_forever=False):
    if use_cuda:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     USING CUDA         |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|    NOT USING CUDA      |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    flags.FLAGS(sys.argv[:1])

    max_episode_steps = 40
    replay_dimension = evaluation_config.xai_replay_dimension
    env = TugOfWar(map_name=map_name,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)
    reward_types = env.reward_types
    combine_sa = env.combine_sa
    state_1, state_2 = env.reset()

    if network_config.output_shape == 4:
        reward_num = 4
        combine_decomposed_func = combine_decomposed_func_4
        player_1_end_vector = player_1_end_vector_4
    if network_config.output_shape == 8:
        reward_num = 8
        combine_decomposed_func = combine_decomposed_func_8
        player_1_end_vector = player_1_end_vector_8
    if network_config.output_shape == 1:
        reward_num = 1
        combine_decomposed_func = combine_decomposed_func_1
        player_1_end_vector = player_1_end_vector_1

    if not reinforce_config.is_random_agent_1:
        agent_1 = SADQAdaptive(name="TugOfWar",
                               state_length=len(state_1),
                               network_config=network_config,
                               reinforce_config=reinforce_config,
                               reward_num=reward_num,
                               combine_decomposed_func=combine_decomposed_func)
        print("sadq agent 1")
    else:
        print("random agent 1")

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    agents_2 = ["random", "random_2"]
    round_num = 0
    previous_result = []
    update_wins_waves = 10
    all_experiences = []
    path = './saved_models/tug_of_war/agents/grid'
    exp_save_path = 'abp/examples/pysc2/tug_of_war/rand_v_rand.pt'

    if reinforce_config.collecting_experience and not reinforce_config.is_random_agent_2:
        agent_1_model = "TugOfWar_eval.pupdate_240"
        exp_save_path = 'abp/examples/pysc2/tug_of_war/all_experiences.pt'
        for r, d, f in os.walk(path):
            for file in f:
                if '.p' in file:
                    new_weights = torch.load(path + "/" + file)
                    new_agent_2 = SADQAdaptive(name=file,
                                               state_length=len(state_1),
                                               network_config=network_config,
                                               reinforce_config=reinforce_config,
                                               memory_restore=False,
                                               reward_num=reward_num,
                                               combine_decomposed_func=combine_decomposed_func)
                    new_agent_2.load_weight(new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)
                    if agent_1_model == file:
                        print("********agent_1_model", file)
                        agent_1.load_model(new_agent_2.eval_model)
    elif network_config.restore_network:
        restore_path = network_config.network_path
        for r, d, f in os.walk(restore_path):
            f = sorted(f)
            for file in f:
                if 'eval.pupdate' in file or 'eval.p_the_best' in file:
                    new_weights = torch.load(restore_path + "/" + file)
                    new_agent_2 = SADQAdaptive(name=file,
                                               state_length=len(state_1),
                                               network_config=network_config,
                                               reinforce_config=reinforce_config,
                                               memory_restore=False,
                                               reward_num=reward_num,
                                               combine_decomposed_func=combine_decomposed_func)
                    new_agent_2.load_weight(new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)
                    print("loaded agent:", file)
        # agent_1.steps = reinforce_config.epsilon_timesteps / 2

    if evaluation_config.generate_xai_replay:
        agent_1_model = "TugOfWar_eval.pupdate_600"
        agent_2_model = "TugOfWar_eval.pupdate_560"
        agents_2 = []
        if use_cuda:
            weights_1 = torch.load(path + "/" + agent_1_model)
            weights_2 = torch.load(path + "/" + agent_2_model)
        else:
            weights_1 = torch.load(path + "/" + agent_1_model,
                                   map_location=lambda storage, loc: storage)
            weights_2 = torch.load(path + "/" + agent_2_model,
                                   map_location=lambda storage, loc: storage)
        new_agent_2 = SADQAdaptive(name="record",
                                   state_length=len(state_1),
                                   network_config=network_config,
                                   reinforce_config=reinforce_config,
                                   memory_restore=False,
                                   reward_num=reward_num,
                                   combine_decomposed_func=combine_decomposed_func)
        agent_1.load_weight(weights_1)
        new_agent_2.load_weight(weights_2)
        new_agent_2.disable_learning(is_save=False)
        agents_2.append(new_agent_2)

    if reinforce_config.is_use_sepcific_enemy:
        specific_SADQ_enemy_weights = torch.load(reinforce_config.enemy_path)
        specific_network_config = NetworkConfig.load_from_yaml(
            "./tasks/tug_of_war/sadq_2p_2l_decom/v2_8/network.yml")
        specific_network_config.restore_network = False
        specific_SADQ_enemy = SADQAdaptive(name="specific enemy",
                                           state_length=len(state_1),
                                           network_config=specific_network_config,
                                           reinforce_config=reinforce_config,
                                           memory_restore=False,
                                           reward_num=specific_network_config.output_shape,
                                           combine_decomposed_func=combine_decomposed_func_8)
        specific_SADQ_enemy.load_weight(specific_SADQ_enemy_weights)
        specific_SADQ_enemy.disable_learning(is_save=False)
        agents_2 = [specific_SADQ_enemy]

    while True:
        print(sum(np.array(previous_result) >= 0.9))
        if len(previous_result) >= update_wins_waves and \
                sum(np.array(previous_result) >= 0.9) >= update_wins_waves and \
                not reinforce_config.is_random_agent_2 and \
                not reinforce_config.is_use_sepcific_enemy:
            previous_result = []
            print("replace enemy agent's weight with self agent")
            f = open(evaluation_config.result_path, "a+")
            f.write("Update agent\n")
            f.close()
            new_agent_2 = SADQAdaptive(name="TugOfWar_" + str(round_num),
                                       state_length=len(state_2),
                                       network_config=network_config,
                                       reinforce_config=reinforce_config,
                                       memory_restore=False,
                                       reward_num=reward_num,
                                       combine_decomposed_func=combine_decomposed_func)
            new_agent_2.load_model(agent_1.eval_model)
            new_agent_2.disable_learning(is_save=False)
            agents_2.append(new_agent_2)
            agent_1.steps = reinforce_config.epsilon_timesteps / 2
            agent_1.best_reward_mean = 0
            agent_1.save(force=True, appendix="update_" + str(round_num))
            round_num += 1

        print("=======================================================================")
        print("===============================Now training============================")
        print("=======================================================================")
        print("Now have {} enemy".format(len(agents_2)))

        for idx_enemy, enemy_agent in enumerate(agents_2):
            if reinforce_config.collecting_experience:
                break
            if type(enemy_agent) == type("random"):
                print(enemy_agent)
            else:
                print(enemy_agent.name)

            # Train longest against the most recent enemy; briefly against older ones.
            if idx_enemy == len(agents_2) - 1:
                training_num = evaluation_config.training_episodes
            else:
                training_num = 10

            for episode in tqdm(range(training_num)):
                state_1, state_2 = env.reset()
                total_reward = 0
                skiping = True
                done = False
                steps = 0

                while skiping:
                    state_1, state_2, done, dp = env.step([], 0)
                    if dp or done:
                        break

                last_mineral = state_1[env.miner_index]
                while not done and steps < max_episode_steps:
                    steps += 1
                    # Decision point
                    if agent_1.steps < reinforce_config.epsilon_timesteps:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index], is_train=1)
                    else:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index], is_train=0)
                    actions_2 = env.get_big_A(state_2[env.miner_index],
                                              state_2[env.pylon_index], is_train=1)

                    assert state_1[-1] == state_2[-1] == steps, print(state_1, state_2, steps)

                    if not reinforce_config.is_random_agent_1:
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1, _ = agent_1.predict(env.normalization(combine_states_1))
                    else:
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1 = randint(0, len(actions_1) - 1)

                    if not reinforce_config.is_random_agent_2 and type(enemy_agent) != type("random"):
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2, _ = enemy_agent.predict(env.normalization(combine_states_2))
                    else:
                        if enemy_agent == "random_2":
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index])
                        choice_2 = randint(0, len(actions_2) - 1)

                    env.step(list(actions_1[choice_1]), 1)
                    env.step(list(actions_2[choice_2]), 2)

                    last_mineral = combine_states_1[choice_1][env.miner_index]
                    l_m_1 = state_1[env.miner_index]
                    l_m_2 = state_2[env.miner_index]

                    while skiping:
                        state_1, state_2, done, dp = env.step([], 0)
                        if dp or done:
                            break

                    # Sanity check (disabled): verify next-wave minerals equal
                    # current minerals + 100 + pylons * 75, capped at 1500.

                    reward = [0] * reward_num
                    if steps == max_episode_steps or done:
                        reward = player_1_end_vector(state_1[63], state_1[64],
                                                     state_1[65], state_1[66], is_done=done)

                    if not reinforce_config.is_random_agent_1:
                        agent_1.reward(reward)

                if not reinforce_config.is_random_agent_1:
                    agent_1.end_episode(env.normalization(state_1))
                # train_summary_writer.add_scalar(tag="Train/Episode Reward",
                #                                 scalar_value=total_reward, global_step=episode + 1)
                # train_summary_writer.add_scalar(tag="Train/Steps to choosing Enemies",
                #                                 scalar_value=steps + 1, global_step=episode + 1)

        if not reinforce_config.is_random_agent_1:
            agent_1.disable_learning(is_save=not reinforce_config.collecting_experience
                                     and not evaluation_config.generate_xai_replay)

        total_rewards_list = []
        # Test episodes
        print("======================================================================")
        print("===============================Now testing============================")
        print("======================================================================")

        tied_lose = 0
        for idx_enemy, enemy_agent in enumerate(agents_2):
            average_end_state = np.zeros(len(state_1))
            if type(enemy_agent) == type("random"):
                print(enemy_agent)
            else:
                print(enemy_agent.name)

            if idx_enemy == len(agents_2) - 1 and not reinforce_config.collecting_experience:
                test_num = evaluation_config.test_episodes
            else:
                test_num = 5

            for episode in tqdm(range(test_num)):
                env.reset()
                total_reward_1 = 0
                done = False
                skiping = True
                steps = 0
                previous_state_1 = None
                previous_state_2 = None
                previous_action_1 = None
                previous_action_2 = None

                if evaluation_config.generate_xai_replay:
                    recorder = XaiReplayRecorder2LaneNexus(
                        env.sc2_env, episode, evaluation_config.env,
                        action_component_names, replay_dimension)

                while skiping:
                    state_1, state_2, done, dp = env.step([], 0)
                    if evaluation_config.generate_xai_replay:
                        recorder.record_game_clock_tick(env.decomposed_reward_dict)
                    if dp or done:
                        break

                while not done and steps < max_episode_steps:
                    steps += 1
                    # Decision point
                    if not reinforce_config.is_random_agent_1:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index])
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1, _ = agent_1.predict(env.normalization(combine_states_1))
                    else:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index], is_train=1)
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1 = randint(0, len(actions_1) - 1)

                    if not reinforce_config.is_random_agent_2 and type(enemy_agent) != type("random"):
                        actions_2 = env.get_big_A(state_2[env.miner_index],
                                                  state_2[env.pylon_index])
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2, _ = enemy_agent.predict(env.normalization(combine_states_2))
                    else:
                        if enemy_agent == "random_2":
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index], is_train=0)
                        else:
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index], is_train=1)
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2 = randint(0, len(actions_2) - 1)

                    if evaluation_config.generate_xai_replay:
                        recorder.record_decision_point(actions_1[choice_1], actions_2[choice_2],
                                                       state_1, state_2,
                                                       env.decomposed_reward_dict)

                    # Experience collecting
                    if reinforce_config.collecting_experience:
                        if previous_state_1 is not None and previous_state_2 is not None \
                                and previous_action_1 is not None and previous_action_2 is not None:
                            previous_state_1[8:14] = previous_state_2[1:7]  # include player 2's action
                            previous_state_1[env.miner_index] += \
                                previous_state_1[env.pylon_index] * 75 + 100
                            previous_state_1[-1] += 1
                            experience = [previous_state_1,
                                          np.append(state_1, previous_reward_1)]
                            all_experiences.append(experience)
                            if len(all_experiences) % 100 == 0 and reinforce_config.collecting_experience:
                                torch.save(all_experiences, exp_save_path)
                        previous_state_1 = deepcopy(combine_states_1[choice_1])
                        previous_state_2 = deepcopy(combine_states_2[choice_2])
                        previous_action_1 = deepcopy(actions_1[choice_1])
                        previous_action_2 = deepcopy(actions_2[choice_2])

                    env.step(list(actions_1[choice_1]), 1)
                    env.step(list(actions_2[choice_2]), 2)
                    # human play alternative:
                    # pretty_print(state_2, text="state:")
                    # env.step(list(get_human_action()), 2)

                    while skiping:
                        state_1, state_2, done, dp = env.step([], 0)
                        if evaluation_config.generate_xai_replay:
                            recorder.record_game_clock_tick(env.decomposed_reward_dict)
                        if dp or done:
                            break

                    reward = [0] * reward_num
                    if steps == max_episode_steps or done:
                        reward = player_1_end_vector(state_1[63], state_1[64],
                                                     state_1[65], state_1[66], is_done=done)

                    if reward_num == 4:
                        current_reward_1 = sum(reward[2:])
                    elif reward_num == 8:
                        current_reward_1 = reward[2] + reward[3] + reward[6] + reward[7]
                    elif reward_num == 1:
                        current_reward_1 = sum(reward)

                    total_reward_1 += current_reward_1
                    previous_reward_1 = current_reward_1

                if reinforce_config.collecting_experience:
                    previous_state_1[8:14] = previous_state_2[1:7]  # include player 2's action
                    previous_state_1[env.miner_index] += \
                        previous_state_1[env.pylon_index] * 75 + 100
                    previous_state_1[-1] += 1
                    experience = [previous_state_1, np.append(state_1, previous_reward_1)]
                    all_experiences.append(experience)
                    if len(all_experiences) % 100 == 0 and reinforce_config.collecting_experience:
                        torch.save(all_experiences, exp_save_path)

                average_end_state += state_1
                total_rewards_list.append(total_reward_1)
                test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                               scalar_value=total_reward_1,
                                               global_step=episode + 1)
                test_summary_writer.add_scalar(tag="Test/Steps to choosing Enemies",
                                               scalar_value=steps + 1,
                                               global_step=episode + 1)

            total_rewards_list_np = np.array(total_rewards_list)
            tied = np.sum(total_rewards_list_np[-test_num:] == 0)
            wins = np.sum(total_rewards_list_np[-test_num:] > 0)
            lose = np.sum(total_rewards_list_np[-test_num:] <= 0)
            tied_lose += (tied + lose)
            print("wins/lose/tied")
            print(str(wins / test_num * 100) + "% \t",
                  str(lose / test_num * 100) + "% \t")
            # str(tied / test_num * 100) + "% \t")
            pretty_print(average_end_state / test_num)

        tr = sum(total_rewards_list) / len(total_rewards_list)
        print("total reward:")
        print(tr)
        previous_result.append(tr)
        if len(previous_result) > update_wins_waves:
            del previous_result[0]
        f = open(evaluation_config.result_path, "a+")
        f.write(str(tr) + "\n")
        f.close()

        if tied_lose == 0 and not reinforce_config.is_random_agent_1:
            agent_1.save(force=True, appendix="_the_best")
        if not reinforce_config.is_random_agent_1:
            agent_1.enable_learning()
def run_task(evaluation_config, network_config, reinforce_config):
    flags.FLAGS(sys.argv[:1])  # TODO: fix this!

    env = sc2_env.SC2Env(
        map_name="CollectMineralShards",
        step_mul=8,
        visualize=False,
        save_replay_episodes=0,
        replay_dir='replay',
        game_steps_per_episode=10000,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=32, minimap=32),
            use_feature_units=True),
    )

    choices = ["Up", "Down", "Left", "Right"]
    agent = DQNAdaptive(name="ShardsCollector",
                        choices=choices,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    if evaluation_config.training_episodes > 0:
        clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        actions = ActionWrapper(state, grid_size=32).select(["SelectMarine1"])
        state = env.step(actions)
        total_reward = 0
        done = False
        steps = 0
        model_time = 0
        while not done:
            steps += 1
            model_start_time = time.time()
            action, q_values = agent.predict(state[0].observation.feature_screen)
            model_time += (time.time() - model_start_time)
            actions = ActionWrapper(state, grid_size=32).select([action])
            state = env.step(actions)
            agent.reward(state[0].reward)
            total_reward += state[0].reward
            done = state[0].step_type == environment.StepType.LAST
        agent.end_episode(state[0].observation.feature_screen)

        train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                        scalar_value=total_reward,
                                        global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Steps to collect all shards",
                                        scalar_value=steps + 1,
                                        global_step=episode + 1)

    agent.disable_learning()

    # Test episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        actions = ActionWrapper(state, grid_size=32).select(["SelectMarine1"])
        state = env.step(actions)
        total_reward = 0
        done = False
        steps = 0
        model_time = 0
        while steps < 1000 and not done:
            steps += 1
            model_start_time = time.time()
            action, q_values = agent.predict(state[0].observation.feature_screen)
            if evaluation_config.render:
                time.sleep(evaluation_config.sleep)
            model_time += (time.time() - model_start_time)
            actions = ActionWrapper(state, grid_size=32).select([action])
            state = env.step(actions)
            total_reward += state[0].reward
            done = state[0].step_type == environment.StepType.LAST

        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all shards",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)
    env.close()
def run_task(evaluation_config, network_config, reinforce_config,
             map_name=None, train_forever=False, agent_model=None):
    if use_cuda:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     USING CUDA         |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|    NOT USING CUDA      |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    flags.FLAGS(sys.argv[:1])

    max_episode_steps = 40
    replay_dimension = evaluation_config.xai_replay_dimension
    env = TugOfWar(map_name=None,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)
    reward_types = env.reward_types
    combine_sa = env.combine_sa
    state = env.reset()

    if not reinforce_config.is_random_agent_1:
        agent = SADQAdaptive(name="TugOfWar",
                             state_length=len(state),
                             network_config=network_config,
                             reinforce_config=reinforce_config)
        print("sadq agent 1")
        # The model-based tree-search agent below immediately replaces the
        # SADQ agent constructed above.
        models_path = "abp/examples/pysc2/tug_of_war/models_mb/"
        agent = MBTSAdaptive(name="TugOfWar",
                             state_length=len(state),
                             network_config=network_config,
                             reinforce_config=reinforce_config,
                             models_path=models_path,
                             depth=2,
                             action_ranking=float('inf'),
                             env=env)
    else:
        print("random agent 1")

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    round_num = 0
    all_experiences = []
    path = './saved_models/tug_of_war/agents/'
    # if agent_model is not None and not reinforce_config.is_random_agent_1:
    #     new_weights = torch.load(path + "/" + agent_model)
    #     agent.load_weight(new_weights)
    #     agent.disable_learning(is_save=False)
    #     evaluation_config.training_episodes = 0

    while True:
        round_num += 1
        print("=======================================================================")
        print("===============================Now training============================")
        print("=======================================================================")

        for episode in tqdm(range(evaluation_config.training_episodes)):
            state = env.reset()
            total_reward = 0
            skiping = True
            done = False
            steps = 0
            print(list(state))

            while skiping:
                state, done, dp = env.step([])
                if dp or done:
                    break

            while not done and steps < max_episode_steps:
                steps += 1
                # Decision point
                print('state:')
                print("=======================================================================")
                pretty_print(state, text="state")
                actions = env.get_big_A(state[env.miner_index],
                                        state[env.pylon_index], is_train=True)
                # assert state[-1] == steps, print(state, steps)
                if not reinforce_config.is_random_agent_1:
                    combine_states = combine_sa(state, actions)
                    choice, _ = agent.predict(env.normalization(combine_states))
                    input()
                    for cs in combine_states:
                        print(cs.tolist())
                else:
                    choice = randint(0, len(actions) - 1)

                print("action list:")
                print(actions)
                print("choice:")
                print(actions[choice])
                pretty_print(combine_states[choice], text="after state:")
                input("pause")

                env.step(list(actions[choice]))

                while skiping:
                    state, done, dp = env.step([])
                    if dp or done:
                        break

                if steps == max_episode_steps or done:
                    win_lose = agent_win_condition(state[27], state[28], state[29], state[30])
                    if win_lose == 1:
                        env.decomposed_rewards[4] = 10000
                        env.decomposed_rewards[5] = 0
                    elif win_lose == -1:
                        env.decomposed_rewards[4] = 0
                        env.decomposed_rewards[5] = -10000

                print("reward:")
                print(env.decomposed_rewards)
                if not reinforce_config.is_random_agent_1:
                    agent.reward(sum(env.decomposed_rewards))

            if not reinforce_config.is_random_agent_1:
                agent.end_episode(state)

            train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                            scalar_value=total_reward,
                                            global_step=episode + 1)
            train_summary_writer.add_scalar(tag="Train/Steps to choosing Enemies",
                                            scalar_value=steps + 1,
                                            global_step=episode + 1)

        # if not reinforce_config.is_random_agent_1:
        #     agent.disable_learning(is_save=not reinforce_config.collecting_experience)

        total_rewards_list = []
        # Test episodes
        print("======================================================================")
        print("===============================Now testing============================")
        print("======================================================================")
        tied_lose = 0
        test_num = evaluation_config.test_episodes
        average_end_state = np.zeros(len(state))

        for episode in tqdm(range(test_num)):
            state = env.reset()
            total_reward = 0
            skiping = True
            done = False
            steps = 0

            while skiping:
                state, done, dp = env.step([])
                if dp or done:
                    break

            while not done and steps < max_episode_steps:
                steps += 1
                # Decision point
                print('state:')
                print("=======================================================================")
                pretty_print(state, text="state")
                actions = env.get_big_A(state[env.miner_index],
                                        state[env.pylon_index], is_train=True)
                combine_states = combine_sa(state, actions)
                # Model-free alternative (disabled):
                # choice, _ = agent.predict(env.normalization(combine_states))
                # env.step(list(actions[choice]))

                # Model-based agent:
                action_model_base = agent.predict(state, int(env.data['P1Minerals']) - 1)
                print(action_model_base)
                env.step(action_model_base)

                while skiping:
                    state, done, dp = env.step([])
                    if dp or done:
                        break

                if steps == max_episode_steps or done:
                    win_lose = agent_win_condition(state[27], state[28], state[29], state[30])
                    if win_lose == 1:
                        env.decomposed_rewards[4] = 10000
                        env.decomposed_rewards[5] = 0
                    elif win_lose == -1:
                        env.decomposed_rewards[4] = 0
                        env.decomposed_rewards[5] = -10000

                total_reward += sum(env.decomposed_rewards)

            average_end_state += state
            total_rewards_list.append(total_reward)
            test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                           scalar_value=total_reward,
                                           global_step=episode + 1)
            test_summary_writer.add_scalar(tag="Test/Steps to choosing Enemies",
                                           scalar_value=steps + 1,
                                           global_step=episode + 1)

        total_rewards_list_np = np.array(total_rewards_list)
        tied = np.sum(total_rewards_list_np[-test_num:] == 0)
        wins = np.sum(total_rewards_list_np[-test_num:] > 0)
        lose = np.sum(total_rewards_list_np[-test_num:] < 0)
        tied_lose += (tied + lose)
        print("wins/lose/tied")
        print(str(wins / test_num * 100) + "% \t",
              str(lose / test_num * 100) + "% \t",
              str(tied / test_num * 100) + "% \t")
        pretty_print(average_end_state / test_num)

        tr = sum(total_rewards_list) / len(total_rewards_list)
        print("total reward:")
        print(tr)
        f = open("result_self_play_2l_human_play.txt", "a+")
        f.write(str(tr) + "\n")
        f.close()
def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample()

    reward_types = sorted(env.reward_types())
    decomposed_rewards = {}
    for reward_type in reward_types:
        decomposed_rewards[reward_type] = 0

    max_episode_steps = 10000
    state = env.reset()

    actions = env.actions()['actions']
    actions = sorted(actions.items(), key=operator.itemgetter(1))
    choice_descriptions = list(map(lambda x: x[0], actions))
    choices = list(map(lambda x: x[1], actions))

    choose_tower = HRAAdaptive(name="Tower",
                               choices=choices,
                               reward_types=reward_types,
                               network_config=network_config,
                               reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = tf.summary.FileWriter(training_summaries_path)

    # Training episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        episode_summary = tf.Summary()
        step = 1
        while not state.is_terminal():
            step += 1
            tower_to_kill, q_values = choose_tower.predict(state.state)
            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = True
            state = env.act(action)
            for reward_type, reward in state.typed_reward.items():
                choose_tower.reward(reward_type, reward)
            total_reward += state.reward
        choose_tower.end_episode(state.state)

        logger.info("Episode %d : %d, Step: %d" % (episode + 1, total_reward, step))
        episode_summary.value.add(tag="Train/Reward", simple_value=total_reward)
        train_summary_writer.add_summary(episode_summary, episode + 1)
        train_summary_writer.flush()

    choose_tower.disable_learning()

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = tf.summary.FileWriter(test_summaries_path)

    # Test episodes
    for episode in range(evaluation_config.test_episodes):
        contrastive = True
        explanation = SkyExplanation("Tower Capture", (40, 40))
        layer_names = ["HP", "Agent Location", "Small Towers", "Big Towers",
                       "Friend", "Enemy"]
        adaptive_explanation = Explanation(choose_tower)

        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        episode_summary = tf.Summary()

        while not state.is_terminal():
            tower_to_kill, q_values = choose_tower.predict(state.state)
            combined_q_values = np.sum(q_values, axis=0)
            saliencies = adaptive_explanation.generate_saliencies(state.state, contrastive)
            charts = []
            decomposed_q_chart = BarChart("Q Values", "Actions", "QVal By Reward Type")
            for choice_idx, choice in enumerate(choices):
                key = choice_descriptions[choice_idx]
                explanation.add_layers(layer_names, saliencies[choice]["all"], key=key)
                group = BarGroup("Attack {}".format(key), saliency_key=key)
                for reward_index, reward_type in enumerate(reward_types):
                    key = "{}_{}".format(choice, reward_type)
                    bar = Bar(reward_type, q_values[reward_index][choice_idx],
                              saliency_key=key)
                    explanation.add_layers(layer_names,
                                           saliencies[choice][reward_type], key=key)
                    group.add_bar(bar)
                decomposed_q_chart.add_bar_group(group)
            explanation.with_bar_chart(decomposed_q_chart)

            action = env.new_action()
            action.attack_quadrant(tower_to_kill)
            action.skip = not evaluation_config.render
            state = env.act(action, explanation=explanation)
            time.sleep(0.5)
            total_reward += state.reward

        logger.info("Episode %d : %d, Step: %d" % (episode + 1, total_reward, step))
        episode_summary.value.add(tag="Test/Episode Reward", simple_value=total_reward)
        test_summary_writer.add_summary(episode_summary, episode + 1)
        test_summary_writer.flush()
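# A condensed view of the explanation structure assembled in the test loop
# above: one BarGroup per action, one Bar per reward type, all attached to a
# single BarChart. Illustrative only, with made-up Q-values; it reuses the same
# Bar/BarGroup/BarChart calls that appear in the loop.
def build_example_chart():
    chart = BarChart("Q Values", "Actions", "QVal By Reward Type")
    example_q_values = {"Q1": {"kill": 0.7, "damage": -0.1}}
    for action_name, typed_q in example_q_values.items():
        group = BarGroup("Attack {}".format(action_name), saliency_key=action_name)
        for reward_type, q in typed_q.items():
            group.add_bar(Bar(reward_type, q,
                              saliency_key="{}_{}".format(action_name, reward_type)))
        chart.add_bar_group(group)
    return chart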
def run_task(evaluation_config, network_config, reinforce_config,
             map_name=None, train_forever=False):
    if use_cuda:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|       USING CUDA       |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     NOT USING CUDA     |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    flags.FLAGS(sys.argv[:1])

    max_episode_steps = 40
    replay_dimension = evaluation_config.xai_replay_dimension
    env = TugOfWar(map_name=map_name,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)
    reward_types = env.reward_types
    combine_sa = env.combine_sa
    state_1, state_2 = env.reset()

    if network_config.output_shape == 4:
        reward_num = 4
        combine_decomposed_func = combine_decomposed_func_4
        player_1_end_vector = player_1_end_vector_4
    if network_config.output_shape == 8:
        reward_num = 8
        combine_decomposed_func = combine_decomposed_func_8
        player_1_end_vector = player_1_end_vector_8

    trans_model = TransAdaptive("Tug_of_war",
                                network_config=network_config,
                                reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    agents_1 = ["random", "random_2"]
    agents_2 = ["random", "random_2"]
    all_experiences = []

    path = './saved_models/tug_of_war/agents/grid_decom'
    if reinforce_config.collecting_experience:
        exp_save_path = 'abp/examples/pysc2/tug_of_war/all_experiences.pt'

    # Load every saved evaluation checkpoint as a frozen agent for both sides.
    for r, d, f in os.walk(path):
        for file in f:
            if '_eval' not in file:
                continue
            new_agent_1 = SADQAdaptive(
                name=file,
                state_length=len(state_1),
                network_config=network_config,
                reinforce_config=reinforce_config,
                memory_resotre=False,  # (sic) parameter name as defined by SADQAdaptive
                reward_num=reward_num,
                combine_decomposed_func=combine_decomposed_func)

            # The checkpoint may have been saved under different key names,
            # so remap its values onto the current model's keys in order.
            new_weights = torch.load(path + "/" + file, map_location=device)
            new_state_dict = OrderedDict()
            weight_values = list(new_weights.values())
            new_keys = list(new_agent_1.eval_model.model.state_dict().keys())
            for i in range(len(weight_values)):
                new_state_dict[new_keys[i]] = weight_values[i]

            new_agent_1.load_weight(new_state_dict)
            new_agent_1.disable_learning(is_save=False)
            agents_1.append(new_agent_1)

            new_agent_2 = SADQAdaptive(
                name=file,
                state_length=len(state_1),
                network_config=network_config,
                reinforce_config=reinforce_config,
                memory_resotre=False,
                reward_num=reward_num,
                combine_decomposed_func=combine_decomposed_func)
            new_agent_2.load_weight(new_state_dict)
            new_agent_2.disable_learning(is_save=False)
            agents_2.append(new_agent_2)
            print(file)

    while True:
        print("======================================================================")
        print("===============================Now testing============================")
        print("======================================================================")
        tied_lose = 0
        for self_agent in agents_1:
            agent_1_name = self_agent if isinstance(self_agent, str) else self_agent.name

            for enemy_agent in agents_2:
                print(agent_1_name)
                print("vs")
                print(enemy_agent if isinstance(enemy_agent, str) else enemy_agent.name)

                average_end_state = np.zeros(len(state_1))
                total_rewards_list = []

                for episode in tqdm(range(evaluation_config.test_episodes)):
                    env.reset()
                    total_reward_1 = 0
                    done = False
                    skipping = True
                    steps = 0
                    previous_state_1 = None
                    previous_state_2 = None

                    while skipping:
                        state_1, state_2, done, dp = env.step([], 0)
                        if dp or done:
                            break

                    while not done and steps < max_episode_steps:
                        steps += 1
                        # Decision point for player 1
                        if not isinstance(self_agent, str):
                            actions_1 = env.get_big_A(state_1[env.miner_index],
                                                      state_1[env.pylon_index])
                            combine_states_1 = combine_sa(state_1, actions_1)
                            choice_1, _ = self_agent.predict(
                                env.normalization(combine_states_1))
                        else:
                            is_train = 0 if self_agent == "random_2" else 1
                            actions_1 = env.get_big_A(state_1[env.miner_index],
                                                      state_1[env.pylon_index],
                                                      is_train=is_train)
                            combine_states_1 = combine_sa(state_1, actions_1)
                            choice_1 = randint(0, len(actions_1) - 1)

                        # Decision point for player 2
                        if not isinstance(enemy_agent, str):
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index])
                            combine_states_2 = combine_sa(state_2, actions_2)
                            choice_2, _ = enemy_agent.predict(
                                env.normalization(combine_states_2))
                        else:
                            is_train = 0 if enemy_agent == "random_2" else 1
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index],
                                                      is_train=is_train)
                            combine_states_2 = combine_sa(state_2, actions_2)
                            choice_2 = randint(0, len(actions_2) - 1)

                        # Experience collecting: stitch player 2's action and the
                        # deterministic economy update into the previous state so
                        # (previous_state_1, state_1) forms a valid transition pair.
                        if previous_state_1 is not None and previous_state_2 is not None:
                            previous_state_1[8:15] = previous_state_2[1:8].copy()  # include player 2's action
                            previous_state_1[env.miner_index] += \
                                previous_state_1[env.pylon_index] * 75 + 100
                            if previous_state_1[env.miner_index] > 1500:
                                previous_state_1[env.miner_index] = 1500
                            previous_state_1[-1] += 1

                            # Sanity checks: the stitched previous state must agree
                            # with the observed current state.
                            if np.sum(previous_state_1[0:15] == state_1[0:15]) != 15:
                                pretty_print(previous_state_1, text="previous state")
                                pretty_print(state_1, text="current state")
                                input()
                            if np.sum(previous_state_1[-1] == state_1[-1]) != 1:
                                pretty_print(previous_state_1, text="previous state")
                                pretty_print(state_1, text="current state")
                                input()

                            trans_model.add_memory(env.normalization(previous_state_1),
                                                   env.normalization(state_1))
                            if reinforce_config.collecting_experience:
                                all_experiences.append([previous_state_1, state_1])
                                if len(all_experiences) % 100 == 0:
                                    torch.save(all_experiences, exp_save_path)

                        previous_state_1 = combine_states_1[choice_1].copy()
                        previous_state_2 = combine_states_2[choice_2].copy()

                        env.step(list(actions_1[choice_1]), 1)
                        env.step(list(actions_2[choice_2]), 2)
                        # Human play alternative:
                        # pretty_print(state_2, text="state:")
                        # env.step(list(get_human_action()), 2)

                        while skipping:
                            state_1, state_2, done, dp = env.step([], 0)
                            if dp or done:
                                break

                        reward = [0] * reward_num
                        if steps == max_episode_steps or done:
                            reward = player_1_end_vector(state_1[63], state_1[64],
                                                         state_1[65], state_1[66],
                                                         is_done=done)
                        if reward_num == 4:
                            current_reward_1 = sum(reward[2:])
                        elif reward_num == 8:
                            current_reward_1 = reward[2] + reward[3] + reward[6] + reward[7]
                        total_reward_1 += current_reward_1

                    # Stitch and store the final transition of the episode.
                    if previous_state_1 is not None and previous_state_2 is not None:
                        previous_state_1[8:15] = previous_state_2[1:8].copy()
                        previous_state_1[env.miner_index] += \
                            previous_state_1[env.pylon_index] * 75 + 100
                        if previous_state_1[env.miner_index] > 1500:
                            previous_state_1[env.miner_index] = 1500
                        previous_state_1[-1] += 1
                        if reinforce_config.collecting_experience:
                            all_experiences.append([previous_state_1, state_1])
                            if len(all_experiences) % 100 == 0:
                                torch.save(all_experiences, exp_save_path)

                    average_end_state += state_1
                    total_rewards_list.append(total_reward_1)

                total_rewards_list_np = np.array(total_rewards_list)
                print(total_rewards_list_np)

                tied = np.sum(total_rewards_list_np[-evaluation_config.test_episodes:] == 0)
                wins = np.sum(total_rewards_list_np[-evaluation_config.test_episodes:] > 0)
                lose = np.sum(total_rewards_list_np[-evaluation_config.test_episodes:] <= 0)
                tied_lose += (tied + lose)

                print("wins/lose/tied")
                print(str(wins / evaluation_config.test_episodes * 100) + "% \t",
                      str(lose / evaluation_config.test_episodes * 100) + "% \t")

                pretty_print(average_end_state / evaluation_config.test_episodes)

                tr = sum(total_rewards_list) / len(total_rewards_list)
                print("total reward:")
                print(tr)
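# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the task above). The experience-collection
# path appends [previous_state, state] pairs and checkpoints the whole buffer
# to disk every 100 entries. That pattern in isolation (the save path and the
# helper name are illustrative, not the ABP API):
# ---------------------------------------------------------------------------
import torch

SAVE_EVERY = 100
exp_save_path = "all_experiences.pt"  # illustrative path

all_experiences = []

def add_transition(prev_state, state):
    """Store an (s, s') pair and periodically checkpoint the buffer."""
    all_experiences.append([prev_state, state])
    if len(all_experiences) % SAVE_EVERY == 0:
        # Overwrites the file with the full buffer, so a crash loses at
        # most SAVE_EVERY transitions.
        torch.save(all_experiences, exp_save_path)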
def run_task(evaluation_config, network_config, reinforce_config,
             map_name=None, train_forever=False):
    if use_cuda:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|       USING CUDA       |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     NOT USING CUDA     |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    flags.FLAGS(sys.argv[:1])

    max_episode_steps = 40
    replay_dimension = evaluation_config.xai_replay_dimension
    env = TugOfWar(map_name=map_name,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)
    reward_types = env.reward_types
    combine_sa = env.combine_sa
    state_1, state_2 = env.reset()

    if not reinforce_config.is_random_agent_1:
        agent_1 = SADQAdaptive(name="TugOfWar",
                               state_length=len(state_1),
                               network_config=network_config,
                               reinforce_config=reinforce_config,
                               is_sigmoid=True)
        print("sadq agent 1")
    else:
        print("random agent 1")

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    agents_2 = ["random"]

    round_num = 0
    previous_result = []
    update_wins_waves = 10
    all_experiences = []
    exp_save_path = 'abp/examples/pysc2/tug_of_war/rand_v_rand.pt'
    path = './saved_models/tug_of_war/agents/'

    if reinforce_config.collecting_experience and not reinforce_config.is_random_agent_2:
        agent_1_model = "TugOfWar_eval.pupdate_429"
        exp_save_path = 'abp/examples/pysc2/tug_of_war/all_experiences.pt'
        # r=root, d=directories, f=files
        for r, d, f in os.walk(path):
            for file in f:
                if '.p' in file:
                    new_weights = torch.load(path + "/" + file)
                    new_agent_2 = SADQAdaptive(name=file,
                                               state_length=len(state_1),
                                               network_config=network_config,
                                               reinforce_config=reinforce_config)
                    new_agent_2.load_weight(new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)
                    if agent_1_model == file:
                        print("********agent_1_model", file)
                        agent_1.load_model(new_agent_2.eval_model)
    elif network_config.restore_network:
        agents_2 = []
        restore_path = network_config.network_path
        for r, d, f in os.walk(restore_path):
            f = sorted(f)
            for file in f:
                if 'eval.pupdate' in file or 'eval.p_the_best' in file:
                    new_weights = torch.load(restore_path + "/" + file)
                    new_agent_2 = SADQAdaptive(name=file,
                                               state_length=len(state_1),
                                               network_config=network_config,
                                               reinforce_config=reinforce_config,
                                               memory_resotre=False)
                    new_agent_2.load_weight(new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)
                    print("loaded agent:", file)

    if evaluation_config.generate_xai_replay:
        agent_1_model = "TugOfWar_eval.pupdate_240"
        agent_2_model = "TugOfWar_eval.pupdate_429_one_agent_top"
        agents_2 = []
        weights_1 = torch.load(path + "/" + agent_1_model)
        weights_2 = torch.load(path + "/" + agent_2_model)
        new_agent_2 = SADQAdaptive(name="record",
                                   state_length=len(state_1),
                                   network_config=network_config,
                                   reinforce_config=reinforce_config)
        agent_1.load_weight(weights_1)
        new_agent_2.load_weight(weights_2)
        new_agent_2.disable_learning(is_save=False)
        agents_2.append(new_agent_2)

    while True:
        # Promote a frozen copy of the learner into the opponent pool once it
        # beats the pool consistently (mean result over the last
        # update_wins_waves evaluations above 0.95), then re-warm exploration.
        if len(previous_result) >= update_wins_waves and \
                sum(previous_result) / update_wins_waves > 0.95 and \
                not reinforce_config.is_random_agent_2:
            previous_result = []
            print("replace enemy agent's weight with self agent")
            with open("result_self_play_2l_deexp.txt", "a+") as f:
                f.write("Update agent\n")
            new_agent_2 = SADQAdaptive(name="TugOfWar_" + str(round_num),
                                       state_length=len(state_2),
                                       network_config=network_config,
                                       reinforce_config=reinforce_config)
            new_agent_2.load_model(agent_1.eval_model)
            new_agent_2.disable_learning(is_save=False)
            agents_2.append(new_agent_2)
            agent_1.steps = reinforce_config.epsilon_timesteps / 2
            agent_1.best_reward_mean = 0
            agent_1.save(force=True, appendix="update_" + str(round_num))
        round_num += 1

        print("=======================================================================")
        print("===============================Now training============================")
        print("=======================================================================")
        print("Now have {} enemy".format(len(agents_2)))

        for idx_enemy, enemy_agent in enumerate(agents_2[::-1]):
            if reinforce_config.collecting_experience or evaluation_config.training_episodes == 0:
                break
            print(enemy_agent if enemy_agent == "random" else enemy_agent.name)

            # Train mostly against the newest opponent, briefly against the rest.
            training_num = evaluation_config.training_episodes if idx_enemy == 0 else 10

            for episode in tqdm(range(training_num)):
                state_1, state_2 = env.reset()
                total_reward = 0
                skipping = True
                done = False
                steps = 0

                while skipping:
                    state_1, state_2, done, dp = env.step([], 0)
                    if dp or done:
                        break

                while not done and steps < max_episode_steps:
                    steps += 1
                    # Decision point
                    actions_1 = env.get_big_A(state_1[env.miner_index],
                                              state_1[env.pylon_index], is_train=True)
                    actions_2 = env.get_big_A(state_2[env.miner_index],
                                              state_2[env.pylon_index], is_train=True)
                    assert state_1[-1] == state_2[-1] == steps, (state_1, state_2, steps)

                    if not reinforce_config.is_random_agent_1:
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1, _ = agent_1.predict(env.normalization(combine_states_1))
                    else:
                        choice_1 = randint(0, len(actions_1) - 1)

                    if not reinforce_config.is_random_agent_2 and enemy_agent != "random":
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2, _ = enemy_agent.predict(env.normalization(combine_states_2))
                    else:
                        choice_2 = randint(0, len(actions_2) - 1)

                    env.step(list(actions_1[choice_1]), 1)
                    env.step(list(actions_2[choice_2]), 2)

                    while skipping:
                        state_1, state_2, done, dp = env.step([], 0)
                        if dp or done:
                            break

                    # Terminal win/lose signal only.
                    reward = []
                    if steps == max_episode_steps or done:
                        win_lose = player_1_win_condition(state_1[27], state_1[28],
                                                          state_1[29], state_1[30])
                        if win_lose == 1:
                            reward = [1]
                        elif win_lose == -1:
                            reward = [0]
                    total_reward += sum(reward)
                    if not reinforce_config.is_random_agent_1:
                        agent_1.reward(sum(reward))

                if not reinforce_config.is_random_agent_1:
                    agent_1.end_episode(state_1)

                train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                                scalar_value=total_reward,
                                                global_step=episode + 1)
                train_summary_writer.add_scalar(tag="Train/Steps to choosing Enemies",
                                                scalar_value=steps + 1,
                                                global_step=episode + 1)

        if not reinforce_config.is_random_agent_1:
            agent_1.disable_learning(
                is_save=not reinforce_config.collecting_experience
                and not evaluation_config.training_episodes)

        total_rewards_list = []

        # Test Episodes
        print("======================================================================")
        print("===============================Now testing============================")
        print("======================================================================")
        tied_lose = 0
        for idx_enemy, enemy_agent in enumerate(agents_2[::-1]):
            average_end_state = np.zeros(len(state_1))
            print(enemy_agent if enemy_agent == "random" else enemy_agent.name)

            if idx_enemy == 0 and not reinforce_config.collecting_experience:
                test_num = evaluation_config.test_episodes
            else:
                test_num = 5

            for episode in tqdm(range(test_num)):
                env.reset()
                total_reward_1 = 0
                done = False
                skipping = True
                steps = 0
                previous_state_1 = None
                previous_state_2 = None
                previous_action_1 = None
                previous_action_2 = None
                previous_reward_1 = 0

                if evaluation_config.generate_xai_replay:
                    recorder = XaiReplayRecorder2LaneNexus(env.sc2_env, episode,
                                                           evaluation_config.env,
                                                           action_component_names,
                                                           replay_dimension)

                while skipping:
                    state_1, state_2, done, dp = env.step([], 0)
                    if evaluation_config.generate_xai_replay:
                        recorder.record_game_clock_tick(env.decomposed_reward_dict)
                    if dp or done:
                        break

                while not done and steps < max_episode_steps:
                    steps += 1
                    # Decision point. While collecting experience, act greedily
                    # only 95% of the time so the dataset keeps some exploration.
                    choose_rand = 1
                    if reinforce_config.collecting_experience:
                        choose_rand = 0.95

                    if not reinforce_config.is_random_agent_1 and random() <= choose_rand:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index])
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1, _ = agent_1.predict(env.normalization(combine_states_1))
                    else:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index], is_train=True)
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1 = randint(0, len(actions_1) - 1)

                    if not reinforce_config.is_random_agent_2 and enemy_agent != "random" \
                            and random() <= choose_rand:
                        actions_2 = env.get_big_A(state_2[env.miner_index],
                                                  state_2[env.pylon_index])
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2, _ = enemy_agent.predict(env.normalization(combine_states_2))
                    else:
                        actions_2 = env.get_big_A(state_2[env.miner_index],
                                                  state_2[env.pylon_index], is_train=True)
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2 = randint(0, len(actions_2) - 1)

                    if evaluation_config.generate_xai_replay:
                        recorder.record_decision_point(actions_1[choice_1],
                                                       actions_2[choice_2],
                                                       state_1, state_2,
                                                       env.decomposed_reward_dict)

                    # Experience collecting
                    if reinforce_config.collecting_experience:
                        if previous_state_1 is not None and previous_state_2 is not None \
                                and previous_action_1 is not None \
                                and previous_action_2 is not None:
                            previous_state_1[8:14] = previous_state_2[1:7]  # include player 2's action
                            previous_state_1[env.miner_index] += \
                                previous_state_1[env.pylon_index] * 75 + 100
                            previous_state_1[-1] += 1
                            all_experiences.append(
                                [previous_state_1, np.append(state_1, previous_reward_1)])
                            if len(all_experiences) % 100 == 0:
                                torch.save(all_experiences, exp_save_path)
                        previous_state_1 = deepcopy(combine_states_1[choice_1])
                        previous_state_2 = deepcopy(combine_states_2[choice_2])
                        previous_action_1 = deepcopy(actions_1[choice_1])
                        previous_action_2 = deepcopy(actions_2[choice_2])

                    env.step(list(actions_1[choice_1]), 1)
                    env.step(list(actions_2[choice_2]), 2)
                    # Human play alternative:
                    # pretty_print(state_2, text="state:")
                    # env.step(list(get_human_action()), 2)

                    while skipping:
                        state_1, state_2, done, dp = env.step([], 0)
                        if evaluation_config.generate_xai_replay:
                            recorder.record_game_clock_tick(env.decomposed_reward_dict)
                        if dp or done:
                            break

                    reward = []
                    if steps == max_episode_steps or done:
                        win_lose = player_1_win_condition(state_1[27], state_1[28],
                                                          state_1[29], state_1[30])
                        if win_lose == 1:
                            reward = [1]
                        elif win_lose == -1:
                            reward = [0]

                    current_reward_1 = sum(reward)
                    total_reward_1 += current_reward_1
                    previous_reward_1 = current_reward_1

                # Stitch and store the final transition of the episode.
                if reinforce_config.collecting_experience and previous_state_1 is not None:
                    previous_state_1[8:14] = previous_state_2[1:7]  # include player 2's action
                    previous_state_1[env.miner_index] += \
                        previous_state_1[env.pylon_index] * 75 + 100
                    previous_state_1[-1] += 1
                    all_experiences.append(
                        [previous_state_1, np.append(state_1, previous_reward_1)])
                    if len(all_experiences) % 100 == 0:
                        torch.save(all_experiences, exp_save_path)

                average_end_state += state_1
                total_rewards_list.append(total_reward_1)
                test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                               scalar_value=total_reward_1,
                                               global_step=episode + 1)
                test_summary_writer.add_scalar(tag="Test/Steps to choosing Enemies",
                                               scalar_value=steps + 1,
                                               global_step=episode + 1)

            total_rewards_list_np = np.array(total_rewards_list)
            tied = 0
            wins = np.sum(total_rewards_list_np[-test_num:] > 0)
            lose = np.sum(total_rewards_list_np[-test_num:] <= 0)
            tied_lose += (tied + lose)

            print("wins/lose/tied")
            print(str(wins / test_num * 100) + "% \t",
                  str(lose / test_num * 100) + "% \t")
            pretty_print(average_end_state / test_num)

            tr = sum(total_rewards_list) / len(total_rewards_list)
            print("total reward:")
            print(tr)

            previous_result.append(tr)
            if len(previous_result) > update_wins_waves:
                del previous_result[0]
            with open("result_self_play_2l_deexp.txt", "a+") as f:
                f.write(str(tr) + "\n")

        if tied_lose == 0 and not reinforce_config.is_random_agent_1:
            agent_1.save(force=True, appendix="_the_best")
        if not reinforce_config.is_random_agent_1:
            agent_1.enable_learning()
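# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the task above). The self-play loop gates
# opponent-pool promotion on the mean result over the last update_wins_waves
# evaluations exceeding 0.95. The same gating logic stripped to its essentials
# (window size and threshold copied from above; the snapshot callable and the
# synthetic win rates are placeholders):
# ---------------------------------------------------------------------------
from collections import deque

UPDATE_WINDOW = 10        # matches update_wins_waves above
PROMOTE_THRESHOLD = 0.95

recent_results = deque(maxlen=UPDATE_WINDOW)
opponent_pool = ["random"]

def record_result_and_maybe_promote(result, snapshot_fn):
    """Append a result; promote a frozen snapshot when the window is hot."""
    recent_results.append(result)
    if (len(recent_results) == UPDATE_WINDOW
            and sum(recent_results) / UPDATE_WINDOW > PROMOTE_THRESHOLD):
        opponent_pool.append(snapshot_fn())  # frozen copy of the learner
        recent_results.clear()               # restart the window

# Usage with placeholder results: promotion fires once the window is hot.
for result in [0.9] * 5 + [1.0] * 10:
    record_result_and_maybe_promote(result, lambda: "agent_snapshot")
print(len(opponent_pool))  # 2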
def run_task(evaluation_config, network_config, reinforce_config):
    flags.FLAGS(sys.argv[:1])  # TODO Fix this!

    env = sc2_env.SC2Env(
        map_name="CollectMineralShards",
        step_mul=8,
        visualize=False,
        save_replay_episodes=0,
        replay_dir='replay',
        game_steps_per_episode=10000,
        agent_interface_format=features.AgentInterfaceFormat(
            feature_dimensions=features.Dimensions(screen=10, minimap=10),
            use_feature_units=True),
    )

    choices = ["Up", "Down", "Left", "Right"]
    pdx_explanation = PDX()

    # One reward type per screen cell: the reward is decomposed by location.
    reward_types = [(x, y) for x in range(10) for y in range(10)]
    reward_names = ["loc (%d, %d)" % (x, y) for x, y in reward_types]

    # Configure one small network per reward type.
    networks = []
    for reward_type in reward_types:
        layers = [{"type": "FC", "neurons": 32}]
        networks.append({"name": reward_type, "layers": layers})
    network_config.networks = networks

    agent = HRAAdaptive(name="ShardsCollector",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    if evaluation_config.training_episodes > 0:
        clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        actions = ActionWrapper(state).select(["SelectMarine1"])
        reward_wrapper = RewardWrapper(state, reward_types)
        state = env.step(actions)
        total_reward = 0
        done = False
        steps = 0
        model_time = 0
        env_time = 0

        while not done:
            steps += 1
            model_start_time = time.time()
            action, q_values, combined_q_values = agent.predict(
                state[0].observation.feature_screen.player_relative.flatten())
            model_time += (time.time() - model_start_time)

            actions = ActionWrapper(state).select([action])
            env_time -= time.time()
            state = env.step(actions)
            env_time += time.time()

            decomposed_reward = reward_wrapper.reward(state)
            for reward_type in reward_types:
                agent.reward(reward_type, decomposed_reward[reward_type])
            total_reward += sum(decomposed_reward.values())
            done = state[0].step_type == environment.StepType.LAST

        agent.end_episode(
            state[0].observation.feature_screen.player_relative.flatten())
        train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                        scalar_value=total_reward,
                                        global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Steps to collect all shards",
                                        scalar_value=steps + 1,
                                        global_step=episode + 1)

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        actions = ActionWrapper(state).select(["SelectMarine1"])
        reward_wrapper = RewardWrapper(state, reward_types)
        state = env.step(actions)
        total_reward = 0
        done = False
        steps = 0
        model_time = 0

        while steps < 1000 and not done:
            steps += 1
            model_start_time = time.time()
            action, q_values, combined_q_values = agent.predict(
                state[0].observation.feature_screen.player_relative.flatten())

            if evaluation_config.render:
                action_index = choices.index(action)
                combined_q_values = combined_q_values.cpu().data.numpy()
                q_values = q_values.cpu().data.numpy()
                pdx_explanation.render_decomposed_rewards(
                    action_index, combined_q_values, q_values, choices,
                    reward_names)
                pdx_explanation.render_all_pdx(
                    action_index, len(choices), q_values, choices, reward_names)

            model_time += (time.time() - model_start_time)
            actions = ActionWrapper(state).select([action])
            state = env.step(actions)

            decomposed_reward = reward_wrapper.reward(state)
            total_reward += sum(decomposed_reward.values())
            done = state[0].step_type == environment.StepType.LAST

        print("Episode", episode + 1, total_reward)
        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Steps to collect all shards",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)

    env.close()
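# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the task above). Each of the 100 screen
# cells is its own reward type, so the decomposed reward is a dict keyed by
# (x, y). A hedged guess at what a wrapper like RewardWrapper computes,
# assuming a reward of 1 whenever a shard at a cell is collected; the real
# wrapper's internals may differ:
# ---------------------------------------------------------------------------
import numpy as np

GRID = [(x, y) for x in range(10) for y in range(10)]

def decompose_reward(prev_shards, shards):
    """Per-cell reward: 1 where a shard disappeared (was collected)."""
    collected = (prev_shards == 1) & (shards == 0)
    return {(x, y): int(collected[y, x]) for x, y in GRID}

prev = np.zeros((10, 10), dtype=int)
prev[3, 4] = 1                        # one shard at x=4, y=3
curr = np.zeros((10, 10), dtype=int)  # ...now collected
r = decompose_reward(prev, curr)
print(sum(r.values()), r[(4, 3)])  # 1 1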
def run_task(evaluation_config, network_config, reinforce_config, log=True):
    env = gym.make(evaluation_config.env)
    max_episode_steps = env._max_episode_steps
    state = env.reset()

    threshold_angle = 0.087266463  # ~5 degrees in radians
    threshold_x = 1.5
    LEFT, RIGHT = [0, 1]

    agent = DQNAdaptive(name="cartpole",
                        choices=[LEFT, RIGHT],
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    if log:
        training_summaries_path = evaluation_config.summaries_path + "/train"
        clear_summary_path(training_summaries_path)
        train_summary_writer = SummaryWriter(training_summaries_path)

        test_summaries_path = evaluation_config.summaries_path + "/test"
        clear_summary_path(test_summaries_path)
        test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        for steps in range(max_episode_steps):
            action, q_values = agent.predict(state)
            state, reward, done, info = env.step(action)
            cart_position, cart_velocity, pole_angle, pole_velocity = state

            agent.reward(reward)  # Reward for every step

            # Reward for keeping the pole close to upright
            if -threshold_angle < pole_angle < threshold_angle:
                agent.reward(1)
            else:
                agent.reward(-1)

            # Penalty for terminating before the time limit; steps runs up to
            # max_episode_steps - 1, so the comparison needs the - 1.
            if steps < max_episode_steps - 1 and done:
                agent.reward(-40)

            # Reward for keeping the cart near the center
            if -threshold_x < cart_position < threshold_x:
                agent.reward(1)
            else:
                agent.reward(-1)

            total_reward += reward
            if done:
                agent.end_episode(state)
                if log:
                    train_summary_writer.add_scalar(tag="Episode Reward",
                                                    scalar_value=total_reward,
                                                    global_step=episode + 1)
                break

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        for step in range(max_episode_steps):
            if evaluation_config.render:
                env.render()
            action, q_values = agent.predict(state)
            state, reward, done, info = env.step(action)
            total_reward += reward
            if done:
                if log:
                    test_summary_writer.add_scalar(tag="Episode Reward",
                                                   scalar_value=total_reward,
                                                   global_step=episode + 1)
                print('Episode Reward:', total_reward)
                break

    env.close()
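# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the task above). The training loop mixes
# the environment reward with angle and position shaping terms. Pulled out as
# a pure function (thresholds copied from the code above; the function name
# is illustrative):
# ---------------------------------------------------------------------------
THRESHOLD_ANGLE = 0.087266463   # ~5 degrees in radians
THRESHOLD_X = 1.5

def shaped_step_rewards(cart_position, pole_angle, terminated_early):
    """Return the list of shaping terms the loop above applies each step."""
    rewards = [1]  # base environment reward for surviving the step
    # Bonus for a near-upright pole, penalty otherwise.
    rewards.append(1 if -THRESHOLD_ANGLE < pole_angle < THRESHOLD_ANGLE else -1)
    if terminated_early:
        rewards.append(-40)  # large penalty for dropping the pole early
    # Bonus for a centered cart, penalty otherwise.
    rewards.append(1 if -THRESHOLD_X < cart_position < THRESHOLD_X else -1)
    return rewards

print(sum(shaped_step_rewards(0.1, 0.01, False)))  # 3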
def run_task(evaluation_config, network_config, reinforce_config):
    env = TowerExample()
    max_episode_steps = 10000
    state = env.reset()

    TOWER_BR, TOWER_BL, TOWER_TR, TOWER_TL = [1, 2, 3, 4]
    choose_tower = DQNAdaptive(
        name="tower",
        choices=[TOWER_BR, TOWER_BL, TOWER_TR, TOWER_TL],
        network_config=network_config,
        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = tf.summary.FileWriter(training_summaries_path)

    # Training Episodes: a single decision (which tower to attack) per episode.
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        episode_summary = tf.Summary()

        start_time = time.time()
        tower_to_kill, _ = choose_tower.predict(state.state)
        end_time = time.time()

        action = env.new_action()
        env_start_time = time.time()
        action.attack_quadrant(tower_to_kill)
        state = env.act(action)

        choose_tower.reward(state.reward)
        total_reward += state.reward

        if state.is_terminal():
            logger.info("End Episode of episode %d!" % (episode + 1))
            logger.info("Total Reward %d!" % (total_reward))

        env_end_time = time.time()
        logger.debug("Neural Network Time: %.2f" % (end_time - start_time))
        logger.debug("Env Time: %.2f" % (env_end_time - env_start_time))

        choose_tower.end_episode(state.state)

        episode_summary.value.add(tag="Reward", simple_value=total_reward)
        train_summary_writer.add_summary(episode_summary, episode + 1)

    train_summary_writer.flush()
    choose_tower.disable_learning()

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = tf.summary.FileWriter(test_summaries_path)

    choose_tower.explanation = True
    explanation = Explanation("Tower Capture", (40, 40))
    chart = BarChart("Move Explanation", "Actions", "QVal By Reward Type")
    layer_names = ["HP", "Type 1", "Type 2", "Type 3", "Friend", "Enemy"]

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset(visualize=evaluation_config.render, record=True)
        total_reward = 0
        episode_summary = tf.Summary()

        tower_to_kill, q_values, saliencies = choose_tower.predict(state.state)

        choices = env.actions()['actions']
        for choice, action_value in choices.items():
            key = choice
            explanation.add_layers(layer_names, saliencies[action_value - 1],
                                   key=key)
            group = BarGroup("Attack {}".format(choice), saliency_key=key)

            key = choice + "_Overall"
            explanation.add_layers(layer_names, saliencies[action_value - 1],
                                   key=key)
            bar = Bar("Attack {}".format(choice), q_values[action_value - 1],
                      saliency_key=key)
            group.add_bar(bar)
            chart.add_bar_group(group)

        explanation.with_bar_chart(chart)

        action = env.new_action()
        action.attack_quadrant(tower_to_kill)
        action.skip = not evaluation_config.render
        state = env.act(action, explanation=explanation)

        while not state.is_terminal():
            time.sleep(0.5)
            action = env.new_action()
            action.skip = False
            state = env.act(action, explanation=explanation)

        total_reward += state.reward
        time.sleep(10)

        logger.info("End Episode of episode %d!" % (episode + 1))
        logger.info("Total Reward %d!" % (total_reward))

        episode_summary.value.add(tag="Reward", simple_value=total_reward)
        test_summary_writer.add_summary(episode_summary, episode + 1)

    test_summary_writer.flush()
def run_task(evaluation_config, network_config, reinforce_config):
    env = FourTowersSequentialMultiUnitEnvironment()

    max_episode_steps = 100
    state = env.reset()

    choices = [0, 1, 2, 3]
    pdx_explanation = PDX()

    reward_types = ['damageToZealot', 'damageToZergling', 'damageToRoach',
                    'damageToStalker', 'damageToMarine', 'damageToHydralisk']

    agent = HRAAdaptive(name="FourTowerSequential",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Running totals of damage dealt, per reward type, across all episodes.
    total_damage = {reward_type: 0 for reward_type in reward_types}

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        dead = False
        deciding = True
        running = True
        steps = 0

        while deciding:
            steps += 1
            action, q_values, _ = agent.predict(state)
            state, reward, done, dead, info = env.step(action)

            # No-op (action 4) until the environment reaches the next
            # decision point.
            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            # The last decomposed-reward row reflects the post-death frame,
            # so use the one before it when the agent died (see the helper
            # sketch after this function).
            row = env.decomposed_rewards[-1] if not dead else env.decomposed_rewards[-2]
            rewards = dict(zip(reward_types, row))

            for reward_type, reward_value in rewards.items():
                agent.reward(reward_type, reward_value)
                total_reward += reward_value

            if dead:
                break

        for reward_type in reward_types:
            total_damage[reward_type] += rewards[reward_type]
            print("{}: {}".format(reward_type, total_damage[reward_type]))

        agent.end_episode(state)
        train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                        scalar_value=total_reward,
                                        global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Episode Steps",
                                        scalar_value=steps + 1,
                                        global_step=episode + 1)
        print("EPISODE REWARD {}".format(total_reward))
        print("EPISODE {}".format(episode))

    agent.disable_learning()

    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        deciding = True
        running = True

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state)
            print(action)
            print(q_values)

            if evaluation_config.render:
                pdx_explanation.render_all_pdx(
                    action, 4, q_values,
                    ['Top_Left', 'Top_Right', 'Bottom_Left', 'Bottom_Right'],
                    reward_types)
                time.sleep(evaluation_config.sleep)

            state, reward, done, dead, info = env.step(action)
            # Accumulate the scalar reward so the logged value is meaningful.
            total_reward += reward

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            if dead:
                break

        agent.end_episode(state)
        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Episode Steps",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)
def run_task(evaluation_config, network_config, reinforce_config,
             map_name=None, train_forever=False):
    if use_cuda:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|       USING CUDA       |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     NOT USING CUDA     |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    flags.FLAGS(sys.argv[:1])

    max_episode_steps = 40
    replay_dimension = evaluation_config.xai_replay_dimension
    env = TugOfWar(map_name=map_name,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)
    combine_sa = env.combine_sa
    state_1, state_2 = env.reset()

    if network_config.output_shape == 4:
        reward_num = 4
        combine_decomposed_func = combine_decomposed_func_4
        player_1_end_vector = player_1_end_vector_4
    if network_config.output_shape == 8:
        reward_num = 8
        combine_decomposed_func = combine_decomposed_func_8
        player_1_end_vector = player_1_end_vector_8

    models_path = "abp/examples/pysc2/tug_of_war/models_mb/"
    agent_1 = MBTSAdaptive(name="TugOfWar",
                           state_length=len(state_1),
                           network_config=network_config,
                           reinforce_config=reinforce_config,
                           models_path=models_path,
                           depth=2,
                           action_ranking=[20, 10, 5, 3],
                           env=env,
                           is_F_all_unit=True)

    if not reinforce_config.is_random_agent_2:
        agent_2 = SADQAdaptive(name="self_model_free",
                               state_length=len(state_2),
                               network_config=network_config,
                               reinforce_config=reinforce_config,
                               memory_resotre=False,
                               reward_num=reward_num,
                               combine_decomposed_func=combine_decomposed_func)
        agent_2.eval_model.replace(agent_1.q_model)
        agent_2.disable_learning(is_save=False)
        print("sadq agent 2")
    else:
        print("random agent 2")

    path = './saved_models/tug_of_war/agents/grid_decom_test'
    agents_2 = []
    if not reinforce_config.is_random_agent_2:
        agents_2.append(agent_2)

    test_performance = True
    network_config.restore_network = False

    if (evaluation_config.generate_xai_replay
            and not reinforce_config.is_random_agent_2) or test_performance:
        # r=root, d=directories, f=files
        for r, d, f in os.walk(path):
            for file in f:
                if '.p' in file:
                    print(file)
                    new_weights = torch.load(path + "/" + file, map_location=device)
                    new_agent_2 = SADQAdaptive(
                        name=file,
                        state_length=len(state_2),
                        network_config=network_config,
                        reinforce_config=reinforce_config,
                        memory_resotre=False,
                        reward_num=reward_num,
                        combine_decomposed_func=combine_decomposed_func)
                    # Checkpoints saved from a DataParallel model carry a
                    # "module." prefix on their keys; load accordingly.
                    if "module" in list(new_weights.keys())[0]:
                        new_agent_2.load_weight(new_weights)
                    else:
                        new_agent_2.eval_model.model.module.load_state_dict(new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    random_enemy = False
    count_num = 0
    while True:
        # Test Episodes
        print("======================================================================")
        print("===============================Now testing============================")
        print("======================================================================")
        print("There are {} enemies".format(len(agents_2)))
        if count_num > 7:
            break
        for agent_2 in agents_2:
            print(agent_2.name)
            average_state = np.zeros(len(state_1))
            total_rewards_list = []
            evaluation_config.test_episodes = 1
            count_num += 1
            if count_num > 7:
                break
            for episode in tqdm(range(evaluation_config.test_episodes)):
                state_1, state_2 = env.reset()
                total_reward_1 = 0
                done = False
                skipping = True
                steps = 0
                # Only construct the recorder when replays are requested.
                if evaluation_config.generate_xai_replay:
                    recorder = XaiReplayRecorder2LaneNexus(
                        env.sc2_env, episode, evaluation_config.env,
                        action_component_names, replay_dimension)

                while skipping:
                    state_1, state_2, done, dp = env.step([], 0)
                    if evaluation_config.generate_xai_replay:
                        recorder.save_jpg()
                        recorder.record_game_clock_tick(env.decomposed_reward_dict)
                    if dp or done:
                        break

                while not done and steps < max_episode_steps:
                    steps += 1
                    # Decision point
                    actions_2 = env.get_big_A(state_2[env.miner_index],
                                              state_2[env.pylon_index])

                    # Model-based tree search picks player 1's action directly
                    # and returns the search tree's root node for export.
                    chosen_action_1, node = agent_1.predict(
                        state_1, state_2[env.miner_index], dp=steps)
                    # node.print_tree(p_best_q_value=True, p_action=True,
                    #                 p_after_q_value=True)

                    if evaluation_config.generate_xai_replay:
                        path_whole_tree = recorder.json_pathname[:-5] + "_whole_tree/"
                        path_partial_tree = recorder.json_pathname[:-5] + "_partial_tree/"
                        if not os.path.exists(path_whole_tree):
                            os.mkdir(path_whole_tree)
                        if not os.path.exists(path_partial_tree):
                            os.mkdir(path_partial_tree)
                        node.save_into_json(path=path_whole_tree, dp=steps)
                        node.save_into_json(path=path_partial_tree, dp=steps,
                                            is_partial=True)

                    combine_states_2 = combine_sa(state_2, actions_2)
                    if not reinforce_config.is_random_agent_2 and not random_enemy:
                        choice_2, _ = agent_2.predict(
                            env.normalization(combine_states_2))
                    else:
                        choice_2 = randint(0, len(actions_2) - 1)

                    if evaluation_config.generate_xai_replay:
                        recorder.save_jpg()
                        recorder.record_decision_point(chosen_action_1,
                                                       actions_2[choice_2],
                                                       state_1, state_2,
                                                       env.decomposed_reward_dict)

                    env.step(list(chosen_action_1), 1)
                    env.step(list(actions_2[choice_2]), 2)
                    # Human play alternative:
                    # env.step(list(get_human_action()), 2)

                    while skipping:
                        state_1, state_2, done, dp = env.step([], 0)
                        if evaluation_config.generate_xai_replay:
                            recorder.save_jpg()
                            recorder.record_game_clock_tick(env.decomposed_reward_dict)
                        if dp or done:
                            break

                    reward = [0] * reward_num
                    if steps == max_episode_steps or done:
                        reward = player_1_end_vector(state_1[63], state_1[64],
                                                     state_1[65], state_1[66],
                                                     is_done=done)

                    current_reward_1 = 0
                    if steps == max_episode_steps or done:
                        if evaluation_config.generate_xai_replay:
                            recorder.done_recording()
                        if reward_num == 4:
                            current_reward_1 = sum(reward[2:])
                        elif reward_num == 8:
                            current_reward_1 = reward[2] + reward[3] + reward[6] + reward[7]

                    total_reward_1 += current_reward_1

                average_state += state_1
                total_rewards_list.append(total_reward_1)
                test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                               scalar_value=total_reward_1,
                                               global_step=episode + 1)
                test_summary_writer.add_scalar(tag="Test/Steps to choosing Enemies",
                                               scalar_value=steps + 1,
                                               global_step=episode + 1)

            tr = sum(total_rewards_list) / evaluation_config.test_episodes
            print("total reward:")
            print(tr)

            with open("result_model_based_final_results_mf.txt", "a+") as f:
                f.write(agent_2.name + "\n")
                f.write(str(tr) + "\n")
                f.write(np.array2string(average_state / evaluation_config.test_episodes,
                                        precision=2, separator=',',
                                        suppress_small=True) + "\n")
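# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the task above). The evaluation blocks in
# this file reduce a per-episode reward list to win/loss percentages by sign.
# The same reduction in isolation, with made-up outcomes:
# ---------------------------------------------------------------------------
import numpy as np

episode_rewards = np.array([1, 0, 1, 1, 0])  # made-up episode outcomes
test_num = len(episode_rewards)

wins = np.sum(episode_rewards > 0)
losses = np.sum(episode_rewards <= 0)  # ties count as losses here
print("wins: %.0f%%  losses: %.0f%%" % (wins / test_num * 100,
                                        losses / test_num * 100))
# wins: 60%  losses: 40%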
def run_task(evaluation_config, network_config, reinforce_config):
    import absl
    absl.flags.FLAGS(sys.argv[:1])

    env = FourTowerSequential()
    max_episode_steps = 100
    state = env.reset()
    print('Initial state is: {}'.format(state))

    choice_descriptions = ['Q4', 'Q1', 'Q3', 'Q2']
    choices = [0, 1, 2, 3]
    pdx_explanation = PDX()

    reward_types = ['roach', 'zergling', 'damageByRoach', 'damageByZergling',
                    'damageToRoach', 'damageToZergling']

    agent = HRAAdaptive(name="FourTowerSequential",
                        choices=choices,
                        reward_types=reward_types,
                        network_config=network_config,
                        reinforce_config=reinforce_config)

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    # Training Episodes
    for episode in range(evaluation_config.training_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        dead = False
        deciding = True
        running = True
        steps = 0

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            state, reward, done, dead, info = env.step(action)

            # No-op (action 4) until the next decision point.
            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            # TODO: Explain the meaning of the numerical constant 200 here,
            # e.g. MaxPossibleDamage = 200 or RoachZerglingRatio = 200.
            # The last decomposed-reward row reflects the post-death frame,
            # so use the one before it when the agent died. The scaling is
            # factored in the sketch after this function.
            row = env.decomposed_rewards[-1] if not dead else env.decomposed_rewards[-2]
            rewards = {
                'roach': row[0],
                'zergling': row[1],
                'damageByRoach': -row[2] / 200,
                'damageByZergling': -row[3] / 200,
                'damageToRoach': row[4] / 200,
                'damageToZergling': row[5] / 200,
            }

            for reward_type, reward_value in rewards.items():
                agent.reward(reward_type, reward_value)
                total_reward += reward_value

            if dead:
                break

        agent.end_episode(state[0])
        train_summary_writer.add_scalar(tag="Train/Episode Reward",
                                        scalar_value=total_reward,
                                        global_step=episode + 1)
        train_summary_writer.add_scalar(tag="Train/Episode Steps",
                                        scalar_value=steps + 1,
                                        global_step=episode + 1)
        print("EPISODE REWARD {}".format(rewards['roach'] + rewards['zergling']))
        print("EPISODE {}".format(episode))

    # TODO: Display XDAPS
    agent.disable_learning()

    # TODO: Start a new env that has rgb enabled for visualization
    # Test Episodes
    for episode in range(evaluation_config.test_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        deciding = True
        running = True

        layer_names = [
            "height_map", "visibility_map", "creep", "power", "player_id",
            "player_relative", "unit_type", "selected", "unit_hit_points",
            "unit_hit_points_ratio", "unit_energy", "unit_energy_ratio",
            "unit_shields", "unit_shields_ratio", "unit_density",
            "unit_density_aa", "effects"
        ]
        saliency_explanation = Saliency(agent)

        while deciding:
            steps += 1
            action, q_values, combined_q_values = agent.predict(state[0])
            print(action)
            print(q_values)
            print('STATE SHAPE')
            print(state.shape)

            saliencies = saliency_explanation.generate_saliencies(
                steps, state[0], choice_descriptions, layer_names,
                reshape=state.shape)

            if evaluation_config.render:
                pdx_explanation.render_all_pdx(
                    action, 4, q_values,
                    ['Top_Left', 'Top_Right', 'Bottom_Left', 'Bottom_Right'],
                    reward_types)
                time.sleep(evaluation_config.sleep)

            state, reward, done, dead, info = env.step(action)
            # Accumulate the scalar reward so the logged value is meaningful.
            total_reward += reward

            while running:
                action = 4
                state, reward, done, dead, info = env.step(action)
                if done:
                    break

            if dead:
                break

        agent.end_episode(state[0])
        test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                       scalar_value=total_reward,
                                       global_step=episode + 1)
        test_summary_writer.add_scalar(tag="Test/Episode Steps",
                                       scalar_value=steps + 1,
                                       global_step=episode + 1)