def simulated(self, state_now, action_now, v_preds_now, dyna_steps, append_to_buffer=True):
    """Roll the learned dynamics model forward and store the imagined trace.

    Starting from ``state_now`` / ``action_now``, the model wrapped by
    ``GameState`` is stepped ``dyna_steps`` times. After every step the next
    action is drawn from ``self.net.policy``. The collected transitions are
    handed to ``self.global_buffer`` with ``add_return=False``.

    Args:
        state_now: state the rollout starts from; it is indexed with ``[1]``
            to compute the per-step reward.
        action_now: first action to apply in the simulation.
        v_preds_now: value prediction that belongs to ``state_now``.
        dyna_steps: number of simulated steps.
        append_to_buffer: when false, no transition is recorded (the buffer
            handed to ``global_buffer`` then stays as created).
    """
    sim_state = GameState(dynamic_net=self.dynamic_net, state=state_now)
    rollout = Buffer()

    current_state = state_now
    current_action = action_now
    current_value = v_preds_now

    for _ in range(dyna_steps):
        # Advance the model by one step with the pending action.
        successor = sim_state.play(current_action, verbose=False)
        next_state = successor.obs()

        # Value of the successor state, passed through the end-of-game
        # adjustment implemented by self.get_values.
        next_value = self.get_values(self.net.policy.get_values(next_state))

        # Reward is the change in component 1 of the state vector.
        step_reward = next_state[1] - current_state[1]

        if append_to_buffer:
            rollout.append(current_state, current_action, next_state,
                           step_reward, current_value, next_value)

        # The policy's own value estimate replaces the adjusted one for the
        # following iteration.
        current_action, current_value = self.net.policy.get_action(
            next_state, verbose=False)
        current_state = next_state
        sim_state = successor

    self.global_buffer.add(rollout, add_return=False)
class MiniSourceAgent(base_agent.BaseAgent):
    """Agent for source game of starcraft.

    The agent reads the full-game observation, reduces it to a 20-element
    feature vector (``mapping_source_to_mini_by_rule``), asks ``net.policy``
    for a macro action and executes it through the helpers in ``M``.
    Transitions are collected in ``local_buffer`` and, when ``rl_training``
    is set, pushed into ``global_buffer`` at the end of an episode.
    """

    def __init__(self, index=0, rl_training=False, restore_model=False,
                 global_buffer=None, net=None, strategy_agent=None,
                 greedy_action=False, extract_save_dir=None):
        """Store the collaborators and initialise per-episode state.

        Args:
            index: identifier stored on the agent.
            rl_training: when true, finished episodes are added to
                ``global_buffer``.
            restore_model: when true, ``init_network`` restores the policy.
            global_buffer: shared buffer receiving finished episodes.
            net: network wrapper exposing ``policy`` and the update methods.
            strategy_agent: optional sub-agent; only ``reset`` is called on
                it in this class.
            greedy_action: when true, ``play_train_mini`` takes the argmax
                action instead of the action returned by ``get_action``.
            extract_save_dir: directory prefix used by ``sample`` for the
                ``.npz`` recordings.
        """
        super(MiniSourceAgent, self).__init__()
        self.net = net
        self.index = index
        self.global_buffer = global_buffer
        self.restore_model = restore_model

        # model in brain
        self.strategy_agent = strategy_agent
        self.strategy_act = None

        # count num
        self.step = 0

        # Both wait values are passed to C.time_wait in set_flag to derive
        # the step period at which the corresponding flag is raised.
        self.strategy_wait_secs = 2
        self.strategy_flag = False
        self.policy_wait_secs = 2
        self.policy_flag = True

        self.env = None
        self.obs = None

        # buffer
        self.local_buffer = Buffer()

        self.num_players = 2
        self.on_select = None
        self._result = None
        self._gases = None
        self.is_end = False

        self.greedy_action = greedy_action
        self.rl_training = rl_training
        self.extract_save_dir = extract_save_dir

    def reset(self):
        """Clear all per-episode state so a new game can start."""
        super(MiniSourceAgent, self).reset()
        self.step = 0
        self.obs = None
        self._result = None
        self._gases = None
        self.is_end = False

        self.strategy_flag = False
        self.policy_flag = True

        self.local_buffer.reset()

        if self.strategy_agent is not None:
            self.strategy_agent.reset()

    def set_env(self, env):
        """Attach the environment that ``safe_action`` / ``select`` step."""
        self.env = env

    def init_network(self):
        """Initialise the network and optionally restore the saved policy."""
        self.net.initialize()
        if self.restore_model:
            self.net.restore_policy()

    def reset_old_network(self):
        """Forward to ``net.reset_old_network``."""
        self.net.reset_old_network()

    def save_model(self):
        """Forward to ``net.save_policy``."""
        self.net.save_policy()

    def update_policy(self):
        """Run a policy update on the contents of ``global_buffer``."""
        self.net.Update_policy(self.global_buffer)

    def update_network(self, result_list):
        """Update the policy from ``global_buffer`` and report results."""
        self.net.Update_policy(self.global_buffer)
        self.net.Update_result(result_list)

    def update_summary(self, counter):
        """Forward to ``net.Update_summary`` and return its result."""
        return self.net.Update_summary(counter)

    def mini_step(self, action):
        """Execute one macro action given as a ``ProtossAction`` value."""
        if action == ProtossAction.Build_probe.value:
            M.mineral_worker(self)

        elif action == ProtossAction.Build_zealot.value:
            M.train_army(self, C._TRAIN_ZEALOT)

        elif action == ProtossAction.Build_Stalker.value:
            M.train_army(self, C._TRAIN_STALKER)

        elif action == ProtossAction.Build_pylon.value:
            no_unit_index = U.get_unit_mask_screen(self.obs, size=2)
            pos = U.get_pos(no_unit_index)
            M.build_by_idle_worker(self, C._BUILD_PYLON_S, pos)

        elif action == ProtossAction.Build_gateway.value:
            power_index = U.get_power_mask_screen(self.obs, size=5)
            pos = U.get_pos(power_index)
            M.build_by_idle_worker(self, C._BUILD_GATEWAY_S, pos)

        elif action == ProtossAction.Build_Assimilator.value:
            if self._gases is not None:
                gas_1 = self._gases[0]
                gas_2 = self._gases[1]

                # Build on the first geyser that has no assimilator yet;
                # do nothing when both are taken or unknown.
                if gas_1 is not None and not U.is_assimilator_on_gas(
                        self.obs, gas_1):
                    gas_1_pos = T.world_to_screen_pos(self.env.game_info,
                                                      gas_1.pos, self.obs)
                    M.build_by_idle_worker(self, C._BUILD_ASSIMILATOR_S,
                                           gas_1_pos)

                elif gas_2 is not None and not U.is_assimilator_on_gas(
                        self.obs, gas_2):
                    gas_2_pos = T.world_to_screen_pos(self.env.game_info,
                                                      gas_2.pos, self.obs)
                    M.build_by_idle_worker(self, C._BUILD_ASSIMILATOR_S,
                                           gas_2_pos)

        elif action == ProtossAction.Build_CyberneticsCore.value:
            power_index = U.get_power_mask_screen(self.obs, size=3)
            pos = U.get_pos(power_index)
            M.build_by_idle_worker(self, C._BUILD_CYBER_S, pos)

        elif action == ProtossAction.Attack.value:
            M.attack_step(self)

        elif action == ProtossAction.Retreat.value:
            M.retreat_step(self)

        elif action == ProtossAction.Do_nothing.value:
            self.safe_action(C._NO_OP, 0, [])

    def get_the_input(self):
        """Return the concatenated controller input for ``self.obs``."""
        return self.get_the_input_right(self.obs)

    def get_the_input_right(self, obs):
        """Return the three parts of ``U.get_input(obs)`` concatenated."""
        high_input, tech_cost, pop_num = U.get_input(obs)
        controller_input = np.concatenate([high_input, tech_cost, pop_num],
                                          axis=0)
        return controller_input

    def mapping_source_to_mini_by_rule(self, source_state):
        """Map the full-game input vector to the 20-slot mini-game state.

        The result is an ``int16`` array; each slot is copied (or summed)
        from fixed positions of ``source_state`` as listed below.
        """
        simple_input = np.zeros([20], dtype=np.int16)
        simple_input[0] = 0  # self.time_seconds
        simple_input[1] = source_state[28]  # self.mineral_worker_nums
        simple_input[2] = source_state[30] + source_state[32]  # self.gas_worker_nums
        simple_input[3] = source_state[2]  # self.mineral
        simple_input[4] = source_state[3]  # self.gas
        simple_input[5] = source_state[6]  # self.food_cup
        simple_input[6] = source_state[7]  # self.food_used
        simple_input[7] = source_state[10]  # self.army_nums

        simple_input[8] = source_state[16]  # self.gateway_num
        simple_input[9] = source_state[14]  # self.pylon_num
        simple_input[10] = source_state[15]  # self.Assimilator_num
        simple_input[11] = source_state[17]  # self.CyberneticsCore_num

        simple_input[12] = source_state[12]  # self.zealot_num
        simple_input[13] = source_state[13]  # self.Stalker_num
        simple_input[14] = source_state[11]  # self.probe_num

        simple_input[15] = source_state[4] + source_state[2]  # self.collected_mineral
        simple_input[16] = source_state[4]  # self.spent_mineral
        simple_input[17] = source_state[5] + source_state[3]  # self.collected_gas
        simple_input[18] = source_state[5]  # self.spent_gas
        simple_input[19] = 1  # self.Nexus_num

        return simple_input

    def play(self, verbose=False):
        """Play one episode with ``play_train_mini``."""
        self.play_train_mini(verbose=verbose)

    def sample(self, verbose=False, use_image=False):
        """Play one episode and record (state, action[, image]) to disk.

        A random integer is drawn, used as the NumPy / TensorFlow seed and as
        the file name ``<extract_save_dir>/<int>.npz``.

        Args:
            verbose: print every recorded transition.
            use_image: additionally record ``U.get_simple_map_data`` for
                every decision step and store it under the key ``img``.
        """
        state_last = None

        random_generated_int = random.randint(0, 2**31 - 1)
        filename = self.extract_save_dir + "/" + str(
            random_generated_int) + ".npz"
        recording_obs = []
        recording_img = []
        recording_action = []

        np.random.seed(random_generated_int)
        tf.set_random_seed(random_generated_int)

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        while True:
            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])

            if self.policy_flag and (not self.is_end):
                state_now = self.mapping_source_to_mini_by_rule(
                    self.get_the_input())
                recording_obs.append(state_now)
                if use_image:
                    recording_img.append(U.get_simple_map_data(self.obs))

                action, v_preds = self.net.policy.get_action(state_now,
                                                             verbose=False)
                recording_action.append(action)
                self.mini_step(action)

                if state_last is not None:
                    if verbose:
                        print('state_last:', state_last, ', action_last:',
                              action_last, ', state_now:', state_now)
                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    reward = 0
                    # NOTE(review): v_preds was predicted for state_now, not
                    # for state_last; play_right stores the previous value
                    # instead. Left unchanged here — confirm before relying
                    # on these value targets.
                    self.local_buffer.append(state_last, action_last,
                                             state_now, reward, v_preds,
                                             v_preds_next)

                state_last = state_now
                action_last = action
                self.policy_flag = False

            if self.is_end:
                # NOTE: observations are stored as uint16 and actions as
                # uint8, so values outside those ranges do not survive the
                # cast. (The original note mentioned minerals larger than
                # 256 — TODO confirm which bound was meant.)
                recording_obs = np.array(recording_obs, dtype=np.uint16)
                recording_action = np.array(recording_action, dtype=np.uint8)
                if not use_image:
                    np.savez_compressed(filename, obs=recording_obs,
                                        action=recording_action)
                else:
                    recording_img = np.array(recording_img, dtype=np.float16)
                    np.savez_compressed(filename, obs=recording_obs,
                                        img=recording_img,
                                        action=recording_action)
                break

    def play_train_mini(self, continues_attack=False, verbose=False):
        """Play one episode and collect transitions with zero step reward.

        The episode's result reward is added to the last stored reward when
        ``rl_training`` is set, and the local buffer is then pushed to
        ``global_buffer``.

        Args:
            continues_attack: once the policy has chosen ``Attack``, repeat
                the attack step after every later decision.
            verbose: print every stored transition.
        """
        is_attack = False
        state_last = None

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        while True:
            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])

            if self.policy_flag and (not self.is_end):
                state_now = self.mapping_source_to_mini_by_rule(
                    self.get_the_input())

                if self.greedy_action:
                    action_prob, v_preds = self.net.policy.get_action_probs(
                        state_now, verbose=False)
                    action = np.argmax(action_prob)
                else:
                    action, v_preds = self.net.policy.get_action(
                        state_now, verbose=False)

                self.mini_step(action)

                if state_last is not None:
                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    reward = 0
                    if verbose:
                        print('state_last:', state_last, ', action_last:',
                              action_last, ', state_now:', state_now)
                        print(state_last, action_last, state_now, reward,
                              v_preds, v_preds_next)
                    # NOTE(review): same value pairing as in sample() —
                    # v_preds belongs to state_now; see play_right.
                    self.local_buffer.append(state_last, action_last,
                                             state_now, reward, v_preds,
                                             v_preds_next)

                # continuous attack, consistent with mind-game
                if continues_attack:
                    if action == ProtossAction.Attack.value:
                        is_attack = True
                    if is_attack:
                        self.mini_step(ProtossAction.Attack.value)

                state_last = state_now
                action_last = action
                self.policy_flag = False

            if self.is_end:
                if self.rl_training:
                    self.local_buffer.rewards[-1] += 1 * self.result['reward']
                    self.global_buffer.add(self.local_buffer)
                    print("add %d buffer!" % (len(self.local_buffer.rewards)))
                break

    def play_right(self, verbose=False):
        """Play one episode storing (s, a, s', r, V(s), V(s')) consistently.

        Unlike ``play_train_mini`` the reward comes from ``self.obs.reward``
        and the stored value for a transition is the one predicted for its
        start state. The final transition uses ``get_values_right`` for the
        terminal state's value.
        """
        prev_state = None
        prev_action = None
        prev_value = None

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        while True:
            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])

            if self.policy_flag and (not self.is_end):
                # get the state
                state = self.mapping_source_to_mini_by_rule(
                    self.get_the_input_right(self.obs))

                # get the action and value according to state
                action, value = self.net.policy.get_action(state,
                                                           verbose=verbose)

                # if this is not the first state, store things to buffer
                if prev_state is not None:
                    reward = self.obs.reward
                    if verbose:
                        print(prev_state, prev_action, state, reward,
                              prev_value, value)
                    self.local_buffer.append(prev_state, prev_action, state,
                                             reward, prev_value, value)

                # the env steps to new states
                self.mini_step(action)

                prev_state = state
                prev_action = action
                prev_value = value
                self.policy_flag = False

            if self.is_end:
                # get the last state and reward
                state = self.mapping_source_to_mini_by_rule(
                    self.get_the_input_right(self.obs))

                # the value of the last state is defined somewhat different
                value = self.net.policy.get_values(state)
                value = self.get_values_right(value)

                # if this is not the first state, store things to buffer
                if prev_state is not None:
                    reward = self.obs.reward
                    if verbose:
                        print(prev_state, prev_action, state, reward,
                              prev_value, value)
                    self.local_buffer.append(prev_state, prev_action, state,
                                             reward, prev_value, value)
                break

        if self.rl_training:
            if verbose:
                print(self.local_buffer.values)
                print(self.local_buffer.values_next)
            self.global_buffer.add(self.local_buffer)

    def set_flag(self):
        """Raise the strategy / policy flags at their step periods."""
        if self.step % C.time_wait(self.strategy_wait_secs) == 1:
            self.strategy_flag = True

        if self.step % C.time_wait(self.policy_wait_secs) == 1:
            self.policy_flag = True

    def safe_action(self, action, unit_type, args):
        """Step the environment with ``action`` if ``M.check_params`` allows.

        When the check fails nothing happens: the environment is not stepped
        and the counters and flags stay untouched.
        """
        if M.check_params(self, action, unit_type, args, 1):
            obs = self.env.step([sc2_actions.FunctionCall(action, args)])[0]
            self.obs = obs
            self.step += 1
            self.update_result()
            self.set_flag()

    def select(self, action, unit_type, args):
        """Issue a selection action and remember the selected unit type."""
        # safe select
        if M.check_params(self, action, unit_type, args, 0):
            self.obs = self.env.step(
                [sc2_actions.FunctionCall(action, args)])[0]
            self.on_select = unit_type
            # update_result may reset self.step, so it runs before the
            # increment (same order as the original code).
            self.update_result()
            self.step += 1
            self.set_flag()

    @property
    def result(self):
        """Result dict of the finished episode, or ``None`` before the end."""
        return self._result

    def update_result(self, result_list=None):
        """Record the end of the episode, or forward results to the net.

        The class body originally defined ``update_result`` twice; the
        second, zero-argument definition replaced the first, so the variant
        taking ``result_list`` could never be called. Both roles are served
        here.

        Args:
            result_list: when given, it is forwarded to the network and no
                episode bookkeeping is done. When omitted (the way
                ``safe_action`` and ``select`` call it) the current
                observation is checked for the end of the game.
        """
        if result_list is not None:
            # NOTE(review): the shadowed original called
            # ``self.net.update_result`` (lower case); ``Update_result`` is
            # used here to match update_network — confirm against the net.
            self.net.Update_result(result_list)
            return

        if self.obs is None:
            return
        if self.obs.last() or self.env.state == environment.StepType.LAST:
            self.is_end = True
            outcome = 0
            o = self.obs.raw_observation
            player_id = o.observation.player_common.player_id
            for r in o.player_result:
                if r.player_id == player_id:
                    outcome = sc2_env._possible_results.get(r.result, 0)
            frames = o.observation.game_loop
            result = {}
            result['outcome'] = outcome
            result['reward'] = self.obs.reward
            result['frames'] = frames
            self._result = result
            print('play end, total return', self.obs.reward)
            self.step = 0

    def get_values(self, values):
        """Return 0 once the game ended with a non-zero result reward."""
        # check if the game is end
        if self.is_end and self.result['reward'] != 0:
            return 0
        else:
            return values

    def get_values_right(self, values):
        """Value of the final state as used by ``play_right``.

        If the game ended with a non-zero result reward the value is 0;
        if it ended with a result reward of 0, or has not ended, ``values``
        is returned unchanged.
        """
        if self.is_end and self.result['reward'] != 0:
            return 0
        else:
            return values
class MiniSourceAgent(base_agent.BaseAgent):
    """Agent for source game of starcraft.

    Converts the full-game observation into a 20-slot feature vector, lets
    ``net.policy`` choose a ``ProtossAction`` and carries it out through the
    helpers in ``M``. Transitions are gathered in ``local_buffer``.
    """

    # (target slot, source slot) pairs that are copied one-to-one by
    # mapping_source_to_mini_by_rule.
    _COPIED_SLOTS = (
        (1, 28),   # mineral_worker_nums
        (3, 2),    # mineral
        (4, 3),    # gas
        (5, 6),    # food_cup
        (6, 7),    # food_used
        (7, 10),   # army_nums
        (8, 16),   # gateway_num
        (9, 14),   # pylon_num
        (10, 15),  # Assimilator_num
        (11, 17),  # CyberneticsCore_num
        (12, 12),  # zealot_num
        (13, 13),  # Stalker_num
        (14, 11),  # probe_num
        (16, 4),   # spent_mineral
        (18, 5),   # spent_gas
    )

    def __init__(self, index=0, rl_training=False, restore_model=False,
                 global_buffer=None, net=None, strategy_agent=None,
                 greedy_action=False):
        """Keep references to the collaborators and zero the episode state."""
        super(MiniSourceAgent, self).__init__()

        # collaborators
        self.net = net
        self.index = index
        self.global_buffer = global_buffer
        self.restore_model = restore_model
        self.strategy_agent = strategy_agent
        self.strategy_act = None

        # step counter and the periodic flags derived from it in set_flag
        self.step = 0
        self.strategy_wait_secs = 2
        self.strategy_flag = False
        self.policy_wait_secs = 2
        self.policy_flag = True

        # environment handle and latest observation
        self.env = None
        self.obs = None

        # transitions of the running episode
        self.local_buffer = Buffer()

        self.num_players = 2
        self.on_select = None
        self._result = None
        self._gases = None
        self.is_end = False

        self.greedy_action = greedy_action
        self.rl_training = rl_training

    def reset(self):
        """Return to the start-of-episode state."""
        super(MiniSourceAgent, self).reset()

        self.step = 0
        self.obs = None
        self._result = None
        self._gases = None
        self.is_end = False
        self.strategy_flag = False
        self.policy_flag = True
        self.local_buffer.reset()

        if self.strategy_agent is not None:
            self.strategy_agent.reset()

    def set_env(self, env):
        """Remember the environment used for stepping."""
        self.env = env

    def init_network(self):
        """Initialise ``net``; restore the policy when requested."""
        self.net.initialize()
        if self.restore_model:
            self.net.restore_policy()

    def reset_old_network(self):
        """Delegate to ``net.reset_old_network``."""
        self.net.reset_old_network()

    def save_model(self):
        """Delegate to ``net.save_policy``."""
        self.net.save_policy()

    def update_network(self, result_list):
        """Update the policy from ``global_buffer``, then report results."""
        self.net.Update_policy(self.global_buffer)
        self.net.Update_result(result_list)

    def update_summary(self, counter):
        """Return whatever ``net.Update_summary(counter)`` returns."""
        return self.net.Update_summary(counter)

    def mini_step(self, action):
        """Carry out the macro action identified by a ``ProtossAction`` value.

        Unknown action values are ignored.
        """
        if action == ProtossAction.Build_probe.value:
            M.mineral_worker(self)
            return

        if action == ProtossAction.Build_zealot.value:
            M.train_army(self, C._TRAIN_ZEALOT)
            return

        if action == ProtossAction.Build_Stalker.value:
            M.train_army(self, C._TRAIN_STALKER)
            return

        if action == ProtossAction.Build_pylon.value:
            free_mask = U.get_unit_mask_screen(self.obs, size=2)
            M.build_by_idle_worker(self, C._BUILD_PYLON_S,
                                   U.get_pos(free_mask))
            return

        if action == ProtossAction.Build_gateway.value:
            powered_mask = U.get_power_mask_screen(self.obs, size=5)
            M.build_by_idle_worker(self, C._BUILD_GATEWAY_S,
                                   U.get_pos(powered_mask))
            return

        if action == ProtossAction.Build_Assimilator.value:
            if self._gases is None:
                return
            first_gas = self._gases[0]
            second_gas = self._gases[1]
            # Take the first geyser that is known and still unoccupied.
            if first_gas is not None and not U.is_assimilator_on_gas(self.obs, first_gas):
                target = T.world_to_screen_pos(self.env.game_info,
                                               first_gas.pos, self.obs)
                M.build_by_idle_worker(self, C._BUILD_ASSIMILATOR_S, target)
            elif second_gas is not None and not U.is_assimilator_on_gas(self.obs, second_gas):
                target = T.world_to_screen_pos(self.env.game_info,
                                               second_gas.pos, self.obs)
                M.build_by_idle_worker(self, C._BUILD_ASSIMILATOR_S, target)
            return

        if action == ProtossAction.Build_CyberneticsCore.value:
            powered_mask = U.get_power_mask_screen(self.obs, size=3)
            M.build_by_idle_worker(self, C._BUILD_CYBER_S,
                                   U.get_pos(powered_mask))
            return

        if action == ProtossAction.Attack.value:
            M.attack_step(self)
            return

        if action == ProtossAction.Retreat.value:
            M.retreat_step(self)
            return

        if action == ProtossAction.Do_nothing.value:
            self.safe_action(C._NO_OP, 0, [])

    def get_the_input(self):
        """Concatenate the three parts of ``U.get_input(self.obs)``."""
        parts = U.get_input(self.obs)
        high_input, tech_cost, pop_num = parts
        return np.concatenate([high_input, tech_cost, pop_num], axis=0)

    def mapping_source_to_mini_by_rule(self, source_state):
        """Build the 20-slot mini-game state from the full-game input.

        Slot 0 (time_seconds) stays 0 and slot 19 (Nexus_num) is fixed to 1;
        the remaining slots are copied or summed from ``source_state``.
        """
        mini_state = np.zeros([20])

        for target, origin in self._COPIED_SLOTS:
            mini_state[target] = source_state[origin]

        # slots that combine two source entries
        mini_state[2] = source_state[30] + source_state[32]   # gas_worker_nums
        mini_state[15] = source_state[4] + source_state[2]    # collected_mineral
        mini_state[17] = source_state[5] + source_state[3]    # collected_gas

        mini_state[19] = 1  # Nexus_num
        return mini_state

    def play(self, verbose=False):
        """Play one episode via ``play_train_mini``."""
        self.play_train_mini(verbose=verbose)

    def _choose_action(self, state):
        """Return ``(action, value)`` for ``state``, greedy if configured."""
        if self.greedy_action:
            action_prob, v_preds = self.net.policy.get_action_probs(state, verbose=False)
            return np.argmax(action_prob), v_preds
        return self.net.policy.get_action(state, verbose=False)

    def play_train_mini(self, verbose=False):
        """Play one episode, storing transitions with a step reward of 0.

        After the policy has chosen ``Attack`` once, the attack step is
        repeated after every later decision. When the episode ends and
        ``rl_training`` is set, the result reward is added to the last
        stored reward and the local buffer is pushed to ``global_buffer``.
        """
        attacking = False
        state_last = None

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        while True:
            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])

            if self.policy_flag and not self.is_end:
                state_now = self.mapping_source_to_mini_by_rule(self.get_the_input())
                action, v_preds = self._choose_action(state_now)
                self.mini_step(action)

                if state_last is not None:
                    v_preds_next = self.get_values(
                        self.net.policy.get_values(state_now))
                    self.local_buffer.append(state_last, action_last, state_now,
                                             0, v_preds, v_preds_next)

                # continuous attack, consistent with mind-game
                if action == ProtossAction.Attack.value:
                    attacking = True
                if attacking:
                    self.mini_step(ProtossAction.Attack.value)

                state_last, action_last = state_now, action
                self.policy_flag = False

            if not self.is_end:
                continue

            if self.rl_training:
                self.local_buffer.rewards[-1] += 1 * self.result['reward']
                print(self.local_buffer.rewards)
                self.global_buffer.add(self.local_buffer)
                print("add %d buffer!" % (len(self.local_buffer.rewards)))
            break

    def set_flag(self):
        """Raise the strategy and policy flags at their step periods."""
        current = self.step
        if current % C.time_wait(self.strategy_wait_secs) == 1:
            self.strategy_flag = True
        if current % C.time_wait(self.policy_wait_secs) == 1:
            self.policy_flag = True

    def safe_action(self, action, unit_type, args):
        """Step the environment with ``action`` if ``M.check_params`` passes."""
        if not M.check_params(self, action, unit_type, args, 1):
            return
        call = sc2_actions.FunctionCall(action, args)
        self.obs = self.env.step([call])[0]
        self.step += 1
        self.update_result()
        self.set_flag()

    def select(self, action, unit_type, args):
        """Issue a selection and remember which unit type is selected."""
        if not M.check_params(self, action, unit_type, args, 0):
            return
        call = sc2_actions.FunctionCall(action, args)
        self.obs = self.env.step([call])[0]
        self.on_select = unit_type
        # update_result may zero the step counter, so it runs first.
        self.update_result()
        self.step += 1
        self.set_flag()

    @property
    def result(self):
        """Result dict of the finished episode (``None`` until then)."""
        return self._result

    def update_result(self):
        """Detect the end of the episode and store its result."""
        if self.obs is None:
            return
        finished = self.obs.last() or self.env.state == environment.StepType.LAST
        if not finished:
            return

        self.is_end = True
        raw = self.obs.raw_observation
        own_id = raw.observation.player_common.player_id

        outcome = 0
        for entry in raw.player_result:
            if entry.player_id == own_id:
                outcome = sc2_env._possible_results.get(entry.result, 0)

        frames = raw.observation.game_loop
        self._result = {
            'outcome': outcome,
            'reward': self.obs.reward,
            'frames': frames,
        }
        print('play end, total return', self.obs.reward)
        self.step = 0

    def get_values(self, values):
        """Return 0 once the game ended with a non-zero result reward."""
        ended_with_result = self.is_end and self.result['reward'] != 0
        if ended_with_result:
            return 0
        return values
class MultiAgent(base_agent.BaseAgent):
    """My first agent for starcraft.

    Chooses one of three macro actions (nothing / worker / pylon) from a
    simple state vector. Optionally the action is replaced by the result of
    an MCTS search over a learned dynamics model (``use_mcts``), and
    imagined rollouts from that model are added to the global buffer
    (``use_dyna``).
    """

    def __init__(self, index=0, rl_training=False, restore_model=False,
                 restore_internal_model=False, global_buffer=None, net=None,
                 use_mcts=False, num_reads=0, policy_in_mcts=None,
                 dynamic_net=None, use_dyna=False, dyna_steps_fisrt=0,
                 dyna_decrese_counter=0):
        """Store the collaborators and initialise per-episode state.

        Args:
            index: identifier stored on the agent.
            rl_training: when true, finished episodes go to
                ``global_buffer``.
            restore_model: restore the policy in ``init_network``.
            restore_internal_model: restore the dynamics model in
                ``init_network``.
            global_buffer: shared buffer receiving episodes and rollouts.
            net: network wrapper exposing ``policy`` and update methods.
            use_mcts: replace the policy's action by ``UCT_search``'s.
            num_reads: passed to ``UCT_search``.
            policy_in_mcts: policy object handed to ``UCT_search``.
            dynamic_net: dynamics model wrapped by ``GameState``.
            use_dyna: add simulated rollouts at every decision.
            dyna_steps_fisrt: initial rollout length.
            dyna_decrese_counter: the rollout length drops by one for every
                this many global steps; 0 disables the decay.
        """
        super(MultiAgent, self).__init__()
        self.net = net
        self.index = index
        self.global_buffer = global_buffer
        self.restore_model = restore_model
        self.restore_dynamic = restore_internal_model

        # count num
        self.step = 0
        self.policy_wait_secs = 2
        self.policy_flag = True

        self.env = None
        self.obs = None

        # buffer
        self.local_buffer = Buffer()

        self.num_players = 2
        self.on_select = None
        self._result = None
        self.is_end = False

        self.rl_training = rl_training
        # 0: step rewards are forced to 0 and the final mineral count is
        # added to the last reward (see play).
        self.reward_type = 0

        # mcts about
        self.use_mcts = use_mcts
        self.num_reads = num_reads
        self.policy_in_mcts = policy_in_mcts
        self.dynamic_net = dynamic_net

        # dyna about
        self.use_dyna = use_dyna
        self.dyna_steps_fisrt = dyna_steps_fisrt
        self.dyna_decrese_counter = dyna_decrese_counter
        self.dyna_steps = dyna_steps_fisrt

    def reset(self):
        """Clear all per-episode state."""
        super(MultiAgent, self).reset()
        self.step = 0
        self.obs = None
        self._result = None
        self.is_end = False

        self.policy_flag = True
        self.local_buffer.reset()

    def set_env(self, env):
        """Attach the environment that ``safe_action`` / ``select`` step."""
        self.env = env

    def init_network(self):
        """Initialise the network and restore the requested models."""
        self.net.initialize()
        if self.restore_model:
            self.net.restore_policy()
        if self.restore_dynamic:
            self.net.restore_dynamic("")

    def reset_old_network(self):
        """Forward to ``net.reset_old_network``."""
        self.net.reset_old_network()

    def save_model(self):
        """Forward to ``net.save_policy``."""
        self.net.save_policy()

    def update_network(self, result_list):
        """Update the policy from ``global_buffer`` and report results."""
        self.net.Update_policy(self.global_buffer)
        self.net.Update_result(result_list)

    def update_policy_in_mcts(self):
        """Push min/max and mean/std of the buffered values to the MCTS policy."""
        values = self.global_buffer.values
        values_array = np.array(values).astype(dtype=np.float32).reshape(-1)
        print('values_array:', values_array)

        min_v = np.min(values_array)
        print('min_v:', min_v)
        max_v = np.max(values_array)
        print('max_v:', max_v)
        self.policy_in_mcts.update_min_max_v(min_v, max_v)

        mean_v = np.mean(values_array)
        print('mean_v:', mean_v)
        std_v = np.std(values_array)
        print('std_v:', std_v)
        self.policy_in_mcts.update_mean_std_v(mean_v, std_v)

    def update_summary(self, counter):
        """Forward to ``net.Update_summary``."""
        self.net.Update_summary(counter)

    def get_policy_input(self, obs):
        """Return the three parts of ``U.get_input(obs)`` concatenated."""
        high_input, tech_cost, pop_num = U.get_input(obs)
        policy_input = np.concatenate([high_input, tech_cost, pop_num],
                                      axis=0)
        return policy_input

    def tech_step(self, tech_action):
        """Execute macro action 0 (no-op), 1 (worker) or 2 (pylon).

        Any other value is ignored.
        """
        if tech_action == 0:  # nothing
            self.safe_action(C._NO_OP, 0, [])

        elif tech_action == 1:  # worker
            M.mineral_worker(self)

        elif tech_action == 2:  # pylon
            no_unit_index = U.get_unit_mask_screen(self.obs, size=2)
            pos = U.get_pos(no_unit_index)
            M.build_by_idle_worker(self, C._BUILD_PYLON_S, pos)

    def get_simple_state(self, obs):
        """Return ``U.get_simple_state(obs)``."""
        simple_state = U.get_simple_state(obs)
        return simple_state

    def set_dyna_steps(self):
        """Recompute the rollout length from the network's global step.

        The length starts at ``dyna_steps_fisrt`` and drops by one for every
        ``dyna_decrese_counter`` global steps, never going below 0. A
        counter of 0 (the constructor default) means "no decay"; the
        original expression divided by it and raised ``ZeroDivisionError``.
        """
        global_steps = self.net.get_global_steps()

        if self.dyna_decrese_counter:
            decay = global_steps // self.dyna_decrese_counter
        else:
            decay = 0

        self.dyna_steps = max(self.dyna_steps_fisrt - decay, 0)
        # ``logging`` is called as a function here, so it is presumably a
        # project helper rather than the stdlib module — verify the import.
        logging("global_update_count: %d, dyna_steps: %d" %
                (global_steps, self.dyna_steps))

    def play(self, verbose=False):
        """Play one episode.

        Each time ``policy_flag`` is raised the previous transition is
        stored, a new action is chosen (policy, optionally overridden by
        MCTS), optional Dyna rollouts are generated, and the action is
        executed. At the end of the game the local buffer is pushed to
        ``global_buffer`` when ``rl_training`` is set.
        """
        M.set_source(self)
        if self.use_dyna:
            self.set_dyna_steps()

        tech_act, v_preds = np.zeros(2)
        last_obs, state_last = None, None
        action_last, state_now = None, None

        while True:
            self.safe_action(C._NO_OP, 0, [])

            # only one second do one thing
            if self.policy_flag:
                now_obs = self.obs
                state_now = self.get_simple_state(now_obs)

                # (s_last, action) -> s_now
                if last_obs is not None:
                    if verbose:
                        print('state_last:', state_last, ', action_last:',
                              action_last, ', state_now:', state_now)

                    # add data to buffer
                    reward = self.get_mineral_reward(last_obs, now_obs)
                    if self.reward_type == 0:
                        reward = 0
                    if verbose:
                        print("reward: ", reward)

                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    # v_preds still holds the value predicted for
                    # state_last at the previous decision.
                    self.local_buffer.append(state_last, action_last,
                                             state_now, reward, v_preds,
                                             v_preds_next)

                # predict action
                tech_act, v_preds = self.net.policy.get_action(state_now,
                                                               verbose=False)

                # mcts chooses the action instead of the policy
                if self.use_mcts:
                    game_state = GameState(dynamic_net=self.dynamic_net,
                                           state=state_now)
                    mcts_act = UCT_search(game_state=game_state,
                                          num_reads=self.num_reads,
                                          policy_in_mcts=self.policy_in_mcts)
                    print('mcts_act: ', mcts_act)
                    print('\n')
                    tech_act = mcts_act[0]

                # use dyna to add predicted trace
                if self.use_dyna:
                    self.simulated(state_now, tech_act, v_preds,
                                   self.dyna_steps)

                # do action
                self.tech_step(tech_act)

                # finish
                last_obs = now_obs
                state_last = state_now
                action_last = tech_act
                self.policy_flag = False

            if self.is_end:
                if self.rl_training:
                    if self.reward_type == 0:
                        final_mineral = now_obs.raw_observation.observation.player_common.minerals
                        self.local_buffer.rewards[-1] += final_mineral
                        print('final_mineral:', final_mineral)
                    if verbose:
                        print('final_reward:', self.local_buffer.rewards[-1])
                    self.global_buffer.add(self.local_buffer)
                break

    def simulated(self, state_now, action_now, v_preds_now, dyna_steps,
                  append_to_buffer=True):
        """Roll the dynamics model forward and store the imagined trace.

        Args:
            state_now: state the rollout starts from; indexed with ``[1]``
                to compute the per-step reward.
            action_now: first action applied in the simulation.
            v_preds_now: value prediction belonging to ``state_now``.
            dyna_steps: number of simulated steps.
            append_to_buffer: when false, no transition is recorded.
        """
        game_state = GameState(dynamic_net=self.dynamic_net, state=state_now)
        sim_buffer = Buffer()

        for _ in range(dyna_steps):
            # simulate next state
            next_game_state = game_state.play(action_now, verbose=False)

            state_last = state_now
            action_last = action_now
            state_now = next_game_state.obs()

            v_preds_last = v_preds_now
            v_preds_now = self.net.policy.get_values(state_now)
            v_preds_now = self.get_values(v_preds_now)

            reward = state_now[1] - state_last[1]

            if append_to_buffer:
                sim_buffer.append(state_last, action_last, state_now, reward,
                                  v_preds_last, v_preds_now)

            # the policy's own value estimate is carried into the next step
            action_now, v_preds_now = self.net.policy.get_action(
                state_now, verbose=False)
            game_state = next_game_state

        self.global_buffer.add(sim_buffer, add_return=False)

    def get_mineral_reward(self, old_obs, now_obs):
        """Return the change in component 1 of the simple state."""
        state_last = self.get_simple_state(old_obs)
        state_now = self.get_simple_state(now_obs)

        mineral_reward = state_now[1] - state_last[1]
        return mineral_reward

    def set_flag(self):
        """Raise the policy flag at its step period."""
        if self.step % C.time_wait(self.policy_wait_secs) == 1:
            self.policy_flag = True

    def safe_action(self, action, unit_type, args):
        """Step the environment with ``action`` if ``M.check_params`` allows."""
        if M.check_params(self, action, unit_type, args, 1):
            obs = self.env.step([sc2_actions.FunctionCall(action, args)])[0]
            self.obs = obs
            self.step += 1
            self.update_result()
            self.set_flag()

    def select(self, action, unit_type, args):
        """Issue a selection action and remember the selected unit type."""
        # safe select
        if M.check_params(self, action, unit_type, args, 0):
            self.obs = self.env.step(
                [sc2_actions.FunctionCall(action, args)])[0]
            self.on_select = unit_type
            self.update_result()
            self.step += 1
            self.set_flag()

    @property
    def result(self):
        """Result dict of the finished episode, or ``None`` before the end."""
        return self._result

    def update_result(self):
        """Detect the end of the episode and store its result."""
        if self.obs is None:
            return
        if self.obs.last() or self.env.state == environment.StepType.LAST:
            self.is_end = True
            outcome = 0
            o = self.obs.raw_observation
            player_id = o.observation.player_common.player_id
            for r in o.player_result:
                if r.player_id == player_id:
                    outcome = sc2_env._possible_results.get(r.result, 0)
            frames = o.observation.game_loop
            result = {}
            result['outcome'] = outcome
            result['reward'] = self.obs.reward
            result['frames'] = frames
            self._result = result

    def get_values(self, values):
        """Return 0 once the game ended with a non-zero result reward."""
        # check if the game is end
        if self.is_end and self.result['reward'] != 0:
            return 0
        else:
            return values
class SourceAgent(base_agent.BaseAgent):
    """Agent for source game of starcraft.

    This variant drives the game with ``ZergAction`` macro actions chosen by
    ``self.net.policy`` and records transitions in ``self.local_buffer`` so
    they can be pushed to ``self.global_buffer`` at the end of an episode.

    NOTE(review): another class with the same name is defined later in this
    file and will shadow this one at import time -- confirm which is wanted.
    """

    def __init__(self,
                 index=0,
                 rl_training=False,
                 restore_model=False,
                 global_buffer=None,
                 net=None,
                 strategy_agent=None):
        super(SourceAgent, self).__init__()
        self.net = net
        self.index = index
        self.global_buffer = global_buffer
        self.restore_model = restore_model

        # model in brain
        self.strategy_agent = strategy_agent
        self.strategy_act = None

        # count num
        self.step = 0
        self.strategy_wait_secs = 4
        self.strategy_flag = False
        self.policy_wait_secs = 2
        self.policy_flag = True

        self.env = None
        self.obs = None

        # buffer
        self.local_buffer = Buffer()
        self.mini_state = []
        self.mini_state_mapping = []

        self.num_players = 2
        self.on_select = None
        self._result = None
        self.is_end = False
        self.is_attack = False
        self._gases = None

        self.rl_training = rl_training
        self.reward_type = 0

    def reset(self):
        """Clear all per-episode state (counters, flags, local buffer)."""
        super(SourceAgent, self).reset()
        self.step = 0
        self.obs = None
        self._result = None
        self.is_end = False
        self.is_attack = False
        self._gases = None
        self.policy_flag = True

        self.local_buffer.reset()
        if self.strategy_agent is not None:
            self.strategy_agent.reset()

    def set_env(self, env):
        """Attach the environment that ``safe_action``/``select`` step."""
        self.env = env

    def init_network(self):
        """Initialize the network and optionally restore a saved policy."""
        self.net.initialize()
        if self.restore_model:
            self.net.restore_policy()

    def reset_old_network(self):
        self.net.reset_old_network()

    def save_model(self):
        self.net.save_policy()

    def update_network(self, result_list):
        """Train the policy on the global buffer, then report results."""
        self.net.Update_policy(self.global_buffer)
        self.net.Update_result(result_list)

    def update_summary(self, counter):
        """Forward to ``net.Update_summary`` and hand back whatever it returns.

        The original discarded the return value; propagating it is
        backward-compatible and matches ``MiniSourceAgent.update_summary``.
        """
        return self.net.Update_summary(counter)

    def get_policy_input(self, obs):
        """Concatenate the three feature groups of ``U.get_input(obs)``."""
        high_input, tech_cost, pop_num = U.get_input(obs)
        policy_input = np.concatenate([high_input, tech_cost, pop_num],
                                      axis=0)
        return policy_input

    def tech_step(self, tech_action):
        """Execute a tech (building) action.

        Action ids as implemented below: 0 = pylon, 1 = assimilator (on the
        first geyser, or on the second when the first is already taken),
        2 = gateway, 3 = cybernetics core; anything else is a no-op.
        """
        if tech_action == 0:  # pylon
            no_unit_index = U.get_unit_mask_screen(self.obs, size=2)
            pos = U.get_pos(no_unit_index)
            M.build_by_idle_worker(self, C._BUILD_PYLON_S, pos)
        elif tech_action == 1 and not U.find_gas(self.obs, 1):  # gas_1
            gas_1 = U.find_geyser_pos(self.obs, 1)
            gas_1_pos = T.world_to_screen_pos(self.env.game_info, gas_1.pos,
                                              self.obs)
            M.build_by_idle_worker(self, C._BUILD_ASSIMILATOR_S, gas_1_pos)
        elif tech_action == 1 and not U.find_gas(self.obs, 2):  # gas_2
            gas_2 = U.find_geyser_pos(self.obs, 2)
            gas_2_pos = T.world_to_screen_pos(self.env.game_info, gas_2.pos,
                                              self.obs)
            M.build_by_idle_worker(self, C._BUILD_ASSIMILATOR_S, gas_2_pos)
        elif tech_action == 2:  # gateway
            power_index = U.get_power_mask_screen(self.obs, size=5)
            pos = U.get_pos(power_index)
            M.build_by_idle_worker(self, C._BUILD_GATEWAY_S, pos)
        elif tech_action == 3:  # cyber
            power_index = U.get_power_mask_screen(self.obs, size=3)
            pos = U.get_pos(power_index)
            M.build_by_idle_worker(self, C._BUILD_CYBER_S, pos)
        else:
            self.safe_action(C._NO_OP, 0, [])

    def pop_step(self, pop_action):
        """Execute a population action: 0 = probe, 1 = zealot, 2 = stalker."""
        if pop_action == 0:  # mineral_probe
            M.mineral_worker(self)
        elif pop_action == 1:  # zealot
            M.train_army(self, C._TRAIN_ZEALOT)
        elif pop_action == 2:  # stalker
            M.train_army(self, C._TRAIN_STALKER)
        else:
            self.safe_action(C._NO_OP, 0, [])

    def battle_step(self, battle_action):
        """Execute a battle action: 0 = attack, 1 = retreat, else no-op."""
        if battle_action == 0:  # attack
            M.attack_step(self)
        elif battle_action == 1:  # retreat
            M.retreat_step(self)
        else:
            self.safe_action(C._NO_OP, 0, [])

    def _build_extractor(self):
        """Build an extractor on the first initial geyser still lacking one."""
        if self._gases is None:
            return
        gas_1 = self._gases[0]
        gas_2 = self._gases[1]
        for gas in (gas_1, gas_2):
            if gas is not None and not U.is_extractor_on_gas(self.obs, gas):
                gas_pos = T.world_to_screen_pos(self.env.game_info, gas.pos,
                                                self.obs)
                M.build_by_idle_worker(self, C._BUILD_EXTRACTOR_S, gas_pos)
                return

    def _build_on_creep(self, build_action):
        """Place a building at a position picked from the creep mask."""
        creep_index = U.get_creep_mask_screen(self.obs, size=2)
        pos = U.get_pos(creep_index)
        M.build_by_idle_worker(self, build_action, pos)

    def mini_step(self, action):
        """Execute one ``ZergAction`` macro action.

        After the chosen action, the army keeps attacking once an ``Attack``
        has been issued (``self.is_attack`` is only cleared by ``reset``), and
        a larva injection is attempted on every call.

        NOTE(review): the source formatting was lost; the trailing attack and
        inject steps are assumed to run after every action -- confirm.
        """
        if action == ZergAction.Build_drone.value:
            M.mineral_worker(self)
        elif action == ZergAction.Build_extractor.value:
            self._build_extractor()
        elif action == ZergAction.Gather_gas.value:
            M.gather_resource(self, 'gas')
        elif action == ZergAction.Gather_mineral.value:
            M.gather_resource(self, 'mineral')
        elif action == ZergAction.Build_queen.value:
            M.train_army(self, C._TRAIN_QUEEN)
        elif action == ZergAction.Build_zergling.value:
            M.train_army(self, C._TRAIN_ZERGLING)
        elif action == ZergAction.Build_roach.value:
            M.train_army(self, C._TRAIN_ROACH)
        elif action == ZergAction.Build_overlord.value:
            M.train_army(self, C._TRAIN_OVERLORD)
        elif action == ZergAction.Build_spawningpool.value:
            self._build_on_creep(C._BUILD_SPAWNINGPOOL_S)
        elif action == ZergAction.Build_roachwarren.value:
            self._build_on_creep(C._BUILD_ROACHWARREN_S)
        elif action == ZergAction.Build_evolutionchamber.value:
            self._build_on_creep(C._BUILD_EVOLUTIONCHAMBER_S)
        elif action == ZergAction.Build_spinecrawler.value:
            self._build_on_creep(C._BUILD_SPINECRAWLER_S)
        elif action == ZergAction.Attack.value:
            self.is_attack = True
        elif action == ZergAction.Defend.value:
            M.retreat_step(self)
        else:
            self.safe_action(C._NO_OP, 0, [])

        if self.is_attack:
            M.attack_step(self)

        # if any queen exists, try to inject larva to hatchery.
        M.inject_larva(self)

    def get_the_input(self):
        """Return the concatenated feature vector for the current ``obs``."""
        high_input, tech_cost, pop_num = U.get_input(self.obs)
        controller_input = np.concatenate([high_input, tech_cost, pop_num],
                                          axis=0)
        return controller_input

    def combine_state_and_mini_action(self, state, strategy_act):
        """Append a one-hot encoding of ``strategy_act`` to ``state``."""
        act = np.zeros((1, 1))
        act[0, 0] = strategy_act
        action_array = self.one_hot_label(act, C._SIZE_MINI_ACTIONS)[0]
        combined_state = np.concatenate([state, action_array], axis=0)
        return combined_state

    def mapping_source_to_mini(self, source_state):
        """Map a source-game state to a mini state with the learned mapper."""
        mini_state = self.net.mapping.predict_func(source_state,
                                                   use_transform=False)
        return mini_state

    def mapping_source_to_mini_by_rule(self, source_state):
        """Build the 17-element mini state by picking source-state entries.

        The trailing comments name the mini-game feature each slot feeds, as
        given by the original author.
        """
        simple_input = np.zeros([17])
        simple_input[0] = 0  # self.time_seconds
        simple_input[1] = source_state[31]  # self.mineral_worker_nums
        simple_input[2] = (source_state[33]
                           + source_state[35])  # self.gas_worker_nums
        simple_input[3] = source_state[2]  # self.mineral
        simple_input[4] = source_state[3]  # self.gas
        simple_input[5] = source_state[6]  # self.food_cap
        simple_input[6] = source_state[7]  # self.food_used
        simple_input[7] = source_state[10]  # self.army_nums
        simple_input[8] = source_state[11]  # self.larva_num
        simple_input[9] = source_state[15]  # self.overlord_num
        simple_input[10] = source_state[17]  # self.spawningpool_num
        simple_input[11] = source_state[18]  # self.roachwarren_num
        simple_input[12] = source_state[13]  # self.zergling_num
        simple_input[13] = source_state[14]  # self.roach_num
        simple_input[14] = source_state[16]  # self.extractor_num
        simple_input[15] = source_state[19]  # self.evolutionchamber_num
        simple_input[16] = source_state[20]  # self.queen_num
        return simple_input

    def _finish_episode(self):
        """Add the final game reward and push the episode to the global buffer.

        Skips the push when no transition was recorded, because indexing
        ``rewards[-1]`` on an empty buffer would raise ``IndexError``.
        """
        if len(self.local_buffer.rewards) == 0:
            print("add %d buffer!" % 0)
            return
        self.local_buffer.rewards[-1] += 1 * self.result['reward']
        print(self.local_buffer.rewards)
        self.global_buffer.add(self.local_buffer)
        print("add %d buffer!" % (len(self.local_buffer.rewards)))

    def play(self, verbose=False):
        """Play one episode, recording transitions for training.

        Each time ``policy_flag`` is raised a macro action is sampled from the
        policy and executed; from the second decision on, the transition
        (previous state/action -> current state) is appended to the local
        buffer with reward 0. The game reward is added to the last transition
        when the episode ends and ``rl_training`` is set.
        """
        is_attack = False
        state_last = None
        action_last = None
        v_preds_last = None

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        while True:
            self.safe_action(C._NO_OP, 0, [])

            if self.policy_flag and (not self.is_end):
                state_now = self.mapping_source_to_mini_by_rule(
                    self.get_the_input())
                action, v_preds = self.net.policy.get_action(state_now,
                                                             verbose=False)
                self.mini_step(action)

                if state_last is not None:
                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    reward = 0
                    # Fix: store the value predicted for ``state_last``. The
                    # original passed ``v_preds`` (the value of ``state_now``)
                    # here, so both value slots described the same state.
                    self.local_buffer.append(state_last, action_last,
                                             state_now, reward, v_preds_last,
                                             v_preds_next)

                # once attacking, keep re-issuing the attack every decision
                if action == ZergAction.Attack.value:
                    is_attack = True
                if is_attack:
                    self.mini_step(ZergAction.Attack.value)

                state_last = state_now
                action_last = action
                v_preds_last = v_preds
                self.policy_flag = False

            if self.is_end:
                if self.rl_training:
                    self._finish_episode()
                break

    def set_flag(self):
        """Raise the strategy/policy flags on their step-count schedule."""
        if self.step % C.time_wait(self.strategy_wait_secs) == 1:
            self.strategy_flag = True

        if self.step % C.time_wait(self.policy_wait_secs) == 1:
            self.policy_flag = True

    def safe_action(self, action, unit_type, args):
        """Step the env with ``action`` only if ``M.check_params`` accepts it."""
        if M.check_params(self, action, unit_type, args, 1):
            self.obs = self.env.step(
                [sc2_actions.FunctionCall(action, args)])[0]
            self.step += 1
            self.update_result()
            self.set_flag()

    def select(self, action, unit_type, args):
        """Issue a selection action if valid and remember the selected type."""
        # safe select
        if M.check_params(self, action, unit_type, args, 0):
            self.obs = self.env.step(
                [sc2_actions.FunctionCall(action, args)])[0]
            self.on_select = unit_type
            self.update_result()
            self.step += 1
            self.set_flag()

    @property
    def result(self):
        """Result dict of the finished episode, or ``None`` while playing."""
        return self._result

    def update_result(self):
        """Detect the end of the episode and record its result.

        When the last observation is reached, sets ``is_end``, stores a dict
        with ``outcome``, ``reward``, ``frames`` and ``win`` in ``_result``
        and resets the step counter.
        """
        if self.obs is None:
            return
        if self.obs.last() or self.env.state == environment.StepType.LAST:
            self.is_end = True
            outcome = 0
            o = self.obs.raw_observation
            player_id = o.observation.player_common.player_id
            for r in o.player_result:
                if r.player_id == player_id:
                    outcome = sc2_env._possible_results.get(r.result, 0)
            frames = o.observation.game_loop

            result = {}
            result['outcome'] = outcome
            result['reward'] = self.obs.reward
            result['frames'] = frames
            result['win'] = 1 if result['reward'] == 1 else 0

            self._result = result
            print('play end, total return', self.obs.reward)
            self.step = 0

    def one_hot_label(self, action_type_array, action_max_num):
        """Return a ``(rows, action_max_num)`` one-hot matrix.

        ``action_type_array`` holds one action id per row, shaped ``(rows,)``
        or ``(rows, 1)``. Ids are flattened explicitly because calling
        ``int()`` on a one-element array row is deprecated in recent NumPy.
        """
        rows = action_type_array.shape[0]
        cols = action_max_num
        data = np.zeros((rows, cols))
        indices = np.asarray(action_type_array).reshape(rows).astype(int)
        data[np.arange(rows), indices] = 1
        return data

    def get_values(self, values):
        """Return 0 for a finished game with non-zero reward, else ``values``."""
        # check if the game is end
        if self.is_end and self.result['reward'] != 0:
            return 0
        else:
            return values
class SourceAgent(base_agent.BaseAgent):
    """Agent for source game of starcraft.

    This variant drives the game with ``ProtossAction`` macro actions. In
    ``play`` the action comes from ``strategy_agent``; ``play_bak`` is an older
    loop that additionally records transitions for ``self.net.policy``.

    NOTE(review): a class with the same name is defined earlier in this file;
    this definition shadows it at import time -- confirm that is intended.
    """

    def __init__(self,
                 index=0,
                 rl_training=False,
                 restore_model=False,
                 global_buffer=None,
                 net=None,
                 strategy_agent=None):
        super(SourceAgent, self).__init__()
        self.net = net
        self.index = index
        self.global_buffer = global_buffer
        self.restore_model = restore_model

        # model in brain
        self.strategy_agent = strategy_agent
        self.strategy_act = None

        # count num
        self.step = 0
        self.strategy_wait_secs = 4
        self.strategy_flag = False
        self.policy_wait_secs = 2
        self.policy_flag = True

        self.env = None
        self.obs = None

        # buffer
        self.local_buffer = Buffer()
        self.mini_state = []
        self.mini_state_mapping = []

        self.num_players = 2
        self.on_select = None
        self._result = None
        self.is_end = False

        self.rl_training = rl_training
        self.reward_type = 0

    def reset(self):
        """Clear all per-episode state (counters, flags, local buffer)."""
        super(SourceAgent, self).reset()
        self.step = 0
        self.obs = None
        self._result = None
        self.is_end = False
        self.policy_flag = True

        self.local_buffer.reset()
        # Fix: ``strategy_agent`` defaults to None in ``__init__``; the
        # unguarded call raised AttributeError for agents built without one.
        if self.strategy_agent is not None:
            self.strategy_agent.reset()

    def set_env(self, env):
        """Attach the environment that ``safe_action``/``select`` step."""
        self.env = env

    def init_network(self):
        """Initialize the network and optionally restore a saved policy."""
        self.net.initialize()
        if self.restore_model:
            self.net.restore_policy()

    def reset_old_network(self):
        self.net.reset_old_network()

    def save_model(self):
        self.net.save_policy()

    def update_network(self, result_list):
        """Train the policy on the global buffer, then report results."""
        self.net.Update_policy(self.global_buffer)
        self.net.Update_result(result_list)

    def update_summary(self, counter):
        """Forward to ``net.Update_summary`` and hand back whatever it returns.

        The original discarded the return value; propagating it is
        backward-compatible.
        """
        return self.net.Update_summary(counter)

    def get_policy_input(self, obs):
        """Concatenate the three feature groups of ``U.get_input(obs)``."""
        high_input, tech_cost, pop_num = U.get_input(obs)
        policy_input = np.concatenate([high_input, tech_cost, pop_num],
                                      axis=0)
        return policy_input

    def tech_step(self, tech_action):
        """Execute a tech (building) action.

        Action ids as implemented below: 0 = pylon, 1 = assimilator (on the
        first gas, or on the second when the first is already taken),
        2 = gateway, 3 = cybernetics core; anything else is a no-op.
        """
        if tech_action == 0:  # pylon
            no_unit_index = U.get_unit_mask_screen(self.obs, size=2)
            pos = U.get_pos(no_unit_index)
            M.build_by_idle_worker(self, C._BUILD_PYLON_S, pos)
        elif tech_action == 1 and not U.find_gas(self.obs, 1):  # gas_1
            gas_1 = U.find_gas_pos(self.obs, 1)
            gas_1_pos = T.world_to_screen_pos(self.env.game_info, gas_1.pos,
                                              self.obs)
            M.build_by_idle_worker(self, C._BUILD_ASSIMILATOR_S, gas_1_pos)
        elif tech_action == 1 and not U.find_gas(self.obs, 2):  # gas_2
            gas_2 = U.find_gas_pos(self.obs, 2)
            gas_2_pos = T.world_to_screen_pos(self.env.game_info, gas_2.pos,
                                              self.obs)
            M.build_by_idle_worker(self, C._BUILD_ASSIMILATOR_S, gas_2_pos)
        elif tech_action == 2:  # gateway
            power_index = U.get_power_mask_screen(self.obs, size=5)
            pos = U.get_pos(power_index)
            M.build_by_idle_worker(self, C._BUILD_GATEWAY_S, pos)
        elif tech_action == 3:  # cyber
            power_index = U.get_power_mask_screen(self.obs, size=3)
            pos = U.get_pos(power_index)
            M.build_by_idle_worker(self, C._BUILD_CYBER_S, pos)
        else:
            self.safe_action(C._NO_OP, 0, [])

    def pop_step(self, pop_action):
        """Execute a population action: 0 = probe, 1 = zealot, 2 = stalker."""
        if pop_action == 0:  # mineral_probe
            M.mineral_worker(self)
        elif pop_action == 1:  # zealot
            M.train_army(self, C._TRAIN_ZEALOT)
        elif pop_action == 2:  # stalker
            M.train_army(self, C._TRAIN_STALKER)
        else:
            self.safe_action(C._NO_OP, 0, [])

    def battle_step(self, battle_action):
        """Execute a battle action: 0 = attack, 1 = retreat, else no-op."""
        if battle_action == 0:  # attack
            M.attack_step(self)
        elif battle_action == 1:  # retreat
            M.retreat_step(self)
        else:
            self.safe_action(C._NO_OP, 0, [])

    def mini_step(self, action):
        """Execute one ``ProtossAction`` macro action.

        ``Build_sub_base`` and ``Build_cannon`` are accepted but currently
        only issue a no-op, like any unrecognised action.
        """
        if action == ProtossAction.Build_worker.value:
            M.mineral_worker(self)
        elif action == ProtossAction.Build_zealot.value:
            M.train_army(self, C._TRAIN_ZEALOT)
        elif action == ProtossAction.Build_pylon.value:
            no_unit_index = U.get_unit_mask_screen(self.obs, size=2)
            pos = U.get_pos(no_unit_index)
            M.build_by_idle_worker(self, C._BUILD_PYLON_S, pos)
        elif action == ProtossAction.Build_gateway.value:
            power_index = U.get_power_mask_screen(self.obs, size=5)
            pos = U.get_pos(power_index)
            M.build_by_idle_worker(self, C._BUILD_GATEWAY_S, pos)
        elif action == ProtossAction.Attack.value:
            M.attack_step(self)
        elif action == ProtossAction.Defend.value:
            M.retreat_step(self)
        elif action == ProtossAction.Build_sub_base.value:
            self.safe_action(C._NO_OP, 0, [])
        elif action == ProtossAction.Build_cannon.value:
            self.safe_action(C._NO_OP, 0, [])
        else:
            self.safe_action(C._NO_OP, 0, [])

    def get_the_input(self):
        """Return the concatenated feature vector for the current ``obs``."""
        high_input, tech_cost, pop_num = U.get_input(self.obs)
        controller_input = np.concatenate([high_input, tech_cost, pop_num],
                                          axis=0)
        return controller_input

    def combine_state_and_mini_action(self, state, strategy_act):
        """Append a one-hot encoding of ``strategy_act`` to ``state``."""
        act = np.zeros((1, 1))
        act[0, 0] = strategy_act
        action_array = self.one_hot_label(act, C._SIZE_MINI_ACTIONS)[0]
        combined_state = np.concatenate([state, action_array], axis=0)
        return combined_state

    def mapping_source_to_mini(self, source_state):
        """Map a source-game state to a mini state with the learned mapper."""
        mini_state = self.net.mapping.predict_func(source_state,
                                                   use_transform=False)
        return mini_state

    def mapping_source_to_mini_by_rule(self, source_state):
        """Build the 11-element mini state by picking source-state entries.

        The trailing comments name the mini-game feature each slot feeds, as
        given by the original author.
        """
        simple_input = np.zeros([11])
        simple_input[0] = 0  # self.time_seconds
        simple_input[1] = source_state[28]  # self.mineral_worker_nums
        simple_input[2] = (source_state[30]
                           + source_state[32])  # self.gas_worker_nums
        simple_input[3] = source_state[2]  # self.mineral
        simple_input[4] = source_state[3]  # self.gas
        simple_input[5] = source_state[6]  # self.food_cup
        simple_input[6] = source_state[7]  # self.food_used
        simple_input[7] = source_state[10]  # self.army_nums
        simple_input[8] = source_state[16]  # self.gateway_num
        simple_input[9] = source_state[14]  # self.pylon_num
        simple_input[10] = source_state[12]  # self.zealot_num
        return simple_input

    def _finish_episode(self):
        """Add the final game reward and push the episode to the global buffer.

        Skips the push when no transition was recorded, because indexing
        ``rewards[-1]`` on an empty buffer would raise ``IndexError``.
        """
        if len(self.local_buffer.rewards) == 0:
            print("add %d buffer!" % 0)
            return
        self.local_buffer.rewards[-1] += 1 * self.result['reward']
        print(self.local_buffer.rewards)
        self.global_buffer.add(self.local_buffer)
        print("add %d buffer!" % (len(self.local_buffer.rewards)))

    def play_bak(self, verbose=False):
        """Older play loop: strategy agent picks the action, policy is logged.

        The outer loop asks ``strategy_agent`` for a macro action; the inner
        loop executes it on every policy tick until the next strategy tick,
        recording (state, action, next state) transitions with reward 0.
        Debug state dumps that were hard-disabled (``if 0:``) are now printed
        only when ``verbose`` is true.
        """
        state_last = None
        action_last = None
        v_preds_last = None
        # return value unused; call kept in case it has side effects
        self.strategy_agent.obs()

        while True:
            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])

            source_state = self.get_the_input()
            mini_state_mapping = self.mapping_source_to_mini_by_rule(
                source_state)
            if verbose:
                print('source_state:', source_state)
                print('mini_state_mapping:', mini_state_mapping)

            # test use mini_state_mapping
            strategy_state = mini_state_mapping
            mini_act = self.strategy_agent.get_action_by_policy(
                strategy_state)[0]
            self.strategy_agent.set_obs(strategy_state)
            # return value unused; call kept in case it has side effects
            self.strategy_agent.get_next_state(mini_act)

            self.strategy_act = mini_act
            self.strategy_flag = False

            while (not self.strategy_flag) and (not self.is_end):
                self.safe_action(C._NO_OP, 0, [])

                if self.policy_flag and (not self.is_end):
                    state_now = self.combine_state_and_mini_action(
                        self.get_the_input(), self.strategy_act)
                    action, v_preds = self.net.policy.get_action(
                        state_now, verbose=False)
                    print('action:', self.strategy_act)
                    self.mini_step(self.strategy_act)

                    if state_last is not None:
                        if verbose:
                            print('state_last:', state_last,
                                  ', action_last:', action_last,
                                  ', state_now:', state_now)
                        v_preds_next = self.net.policy.get_values(state_now)
                        v_preds_next = self.get_values(v_preds_next)
                        reward = 0
                        # Fix: store the value predicted for ``state_last``;
                        # the original passed the value of ``state_now``.
                        self.local_buffer.append(state_last, action_last,
                                                 state_now, reward,
                                                 v_preds_last, v_preds_next)

                    state_last = state_now
                    action_last = action
                    v_preds_last = v_preds
                    self.policy_flag = False

            if self.is_end:
                if self.rl_training:
                    self._finish_episode()
                break

    def play(self, verbose=False):
        """Play one episode using only ``strategy_agent`` to pick actions.

        Once an ``Attack`` has been chosen, the attack is re-issued on every
        subsequent policy tick. Nothing is recorded for training here.
        """
        is_attack = False
        while True:
            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])

            if self.policy_flag and (not self.is_end):
                mini_state_mapping = self.mapping_source_to_mini_by_rule(
                    self.get_the_input())
                mini_act = self.strategy_agent.get_action_by_policy(
                    mini_state_mapping)[0]
                print('action:', mini_act)
                self.mini_step(mini_act)

                if mini_act == ProtossAction.Attack.value:
                    is_attack = True
                if is_attack:
                    self.mini_step(ProtossAction.Attack.value)

                self.policy_flag = False

            if self.is_end:
                break

    def set_flag(self):
        """Raise the strategy/policy flags on their step-count schedule."""
        if self.step % C.time_wait(self.strategy_wait_secs) == 1:
            self.strategy_flag = True

        if self.step % C.time_wait(self.policy_wait_secs) == 1:
            self.policy_flag = True

    def safe_action(self, action, unit_type, args):
        """Step the env with ``action`` only if ``M.check_params`` accepts it."""
        if M.check_params(self, action, unit_type, args, 1):
            obs = self.env.step([sc2_actions.FunctionCall(action, args)])[0]
            self.obs = obs
            self.step += 1
            self.update_result()
            self.set_flag()

    def select(self, action, unit_type, args):
        """Issue a selection action if valid and remember the selected type."""
        # safe select
        if M.check_params(self, action, unit_type, args, 0):
            self.obs = self.env.step(
                [sc2_actions.FunctionCall(action, args)])[0]
            self.on_select = unit_type
            self.update_result()
            self.step += 1
            self.set_flag()

    @property
    def result(self):
        """Result dict of the finished episode, or ``None`` while playing."""
        return self._result

    def update_result(self):
        """Detect the end of the episode and record its result.

        When the last observation is reached, sets ``is_end``, stores a dict
        with ``outcome``, ``reward``, ``frames`` and ``win`` in ``_result``
        and resets the step counter.
        """
        if self.obs is None:
            return
        if self.obs.last() or self.env.state == environment.StepType.LAST:
            self.is_end = True
            outcome = 0
            o = self.obs.raw_observation
            player_id = o.observation.player_common.player_id
            for r in o.player_result:
                if r.player_id == player_id:
                    outcome = sc2_env._possible_results.get(r.result, 0)
            frames = o.observation.game_loop

            result = {}
            result['outcome'] = outcome
            result['reward'] = self.obs.reward
            result['frames'] = frames
            result['win'] = 1 if result['reward'] == 1 else 0

            self._result = result
            print('play end, total return', self.obs.reward)
            self.step = 0

    def one_hot_label(self, action_type_array, action_max_num):
        """Return a ``(rows, action_max_num)`` one-hot matrix.

        ``action_type_array`` holds one action id per row, shaped ``(rows,)``
        or ``(rows, 1)``. Ids are flattened explicitly because calling
        ``int()`` on a one-element array row is deprecated in recent NumPy.
        """
        rows = action_type_array.shape[0]
        cols = action_max_num
        data = np.zeros((rows, cols))
        indices = np.asarray(action_type_array).reshape(rows).astype(int)
        data[np.arange(rows), indices] = 1
        return data

    def get_values(self, values):
        """Return 0 for a finished game with non-zero reward, else ``values``."""
        # check if the game is end
        if self.is_end and self.result['reward'] != 0:
            return 0
        else:
            return values
class MiniSourceAgent(base_agent.BaseAgent): """Agent for source game of starcraft.""" def __init__(self, index=0, rl_training=False, restore_model=False, global_buffer=None, net=None, strategy_agent=None, greedy_action=False): super(MiniSourceAgent, self).__init__() self.net = net self.index = index self.global_buffer = global_buffer self.restore_model = restore_model # model in brain self.strategy_agent = strategy_agent self.strategy_act = None # count num self.step = 0 self.strategy_wait_secs = 3 self.strategy_flag = False self.policy_wait_secs = 2 self.policy_flag = True self.env = None self.obs = None # buffer self.local_buffer = Buffer() self.num_players = 2 self.on_select = None self._result = None self._gases = None self.is_end = False self.greedy_action = greedy_action self.rl_training = rl_training self.reset_tc() def reset(self): super(MiniSourceAgent, self).reset() self.step = 0 self.obs = None self._result = None self._gases = None self.is_end = False self.strategy_flag = False self.policy_flag = True self.local_buffer.reset() if self.strategy_agent is not None: self.strategy_agent.reset() self.reset_tc() def reset_tc(self): self.num_pylon = 0 self.num_gateway = 0 self.num_cyber = 0 self.max_pylon = 20 self.max_gateway = 5 self.max_cyber = 3 self.enemy_pos = None self.retreat_pos = None self.rally_pos = [455, 165] self.base = None self.resourceUnits = [] self.vespeneUnits = [] self.initial_scout = False self.initial = False def set_env(self, env): self.env = env def set_obs(self, state): self.obs = state def init_network(self): self.net.initialize() if self.restore_model: self.net.restore_policy() def reset_old_network(self): self.net.reset_old_network() def save_model(self): self.net.save_policy() def update_network(self, result_list): self.net.Update_policy(self.global_buffer) self.net.Update_result(result_list) def update_summary(self, counter): return self.net.Update_summary(counter) def mini_step(self, action): if action == 
ProtossAction.Build_probe.value: M.train_unit(self, T.Protoss_Nexus, T.Protoss_Probe) elif action == ProtossAction.Build_zealot.value: M.train_unit(self, T.Protoss_Gateway, T.Protoss_Zealot) elif action == ProtossAction.Build_Stalker.value: M.train_unit(self, T.Protoss_Gateway, T.Protoss_Dragoon) elif action == ProtossAction.Build_pylon.value: if not self.initial_scout: M.scout_manager(self) self.initial_scout = True pos = self.placer_manager(self.base, T.Protoss_Pylon) M.build_by_worker(self, T.Protoss_Probe, T.Protoss_Pylon, pos) elif action == ProtossAction.Build_gateway.value: pos = self.placer_manager(self.base, T.Protoss_Gateway) M.build_by_worker(self, T.Protoss_Probe, T.Protoss_Gateway, pos) elif action == ProtossAction.Build_Assimilator.value: pos = self.placer_manager(self.base, T.Protoss_Assimilator) M.build_by_worker(self, T.Protoss_Probe, T.Protoss_Assimilator, pos) elif action == ProtossAction.Build_CyberneticsCore.value: pos = self.placer_manager(self.base, T.Protoss_Cybernetics_Core) M.build_by_worker(self, T.Protoss_Probe, T.Protoss_Cybernetics_Core, pos) elif action == ProtossAction.Attack.value: M.attack_step(self, [T.Protoss_Zealot, T.Protoss_Dragoon], self.enemy_pos) elif action == ProtossAction.Retreat.value: M.retreat_step(self, [T.Protoss_Zealot, T.Protoss_Dragoon], self.retreat_pos) #pass elif action == ProtossAction.Do_nothing.value: M.no_op(self) def calculate_features(self): state = self.obs myunits = state.units[state.player_id] self.mineral_worker_nums = 0 self.gas_worker_nums = 0 self.spent_mineral = 0 self.spent_gas = 0 self.probe_num = 0 self.zealot_num = 0 self.Stalker_num = 0 self.army_nums = 0 self.gateway_num = 0 self.pylon_num = 0 self.Assimilator_num = 0 self.CyberneticsCore_num = 0 for unit in myunits: if unit.type == T.Protoss_Probe: self.spent_mineral += P.Probe().mineral_price if unit.completed: self.probe_num += 1 if unit.gathering_minerals: self.mineral_worker_nums += 1 if unit.gathering_gas: self.gas_worker_nums += 1 if 
unit.type == T.Protoss_Zealot: self.spent_mineral += P.Zealot().mineral_price if unit.completed: if unit.visible: self.zealot_num += 1 self.army_nums += 1 else: print('find invisible Zealot') if unit.type == T.Protoss_Dragoon: self.spent_mineral += P.Stalker().mineral_price self.spent_gas += P.Stalker().gas_price if unit.completed: if unit.visible: self.Stalker_num += 1 self.army_nums += 1 else: print('find invisible Dragoon') if unit.type == T.Protoss_Pylon: self.spent_mineral += P.Pylon().mineral_price if unit.completed: self.pylon_num += 1 if unit.type == T.Protoss_Gateway: self.spent_mineral += P.Gateway().mineral_price if unit.completed: self.gateway_num += 1 if unit.type == T.Protoss_Assimilator: self.spent_mineral += P.Assimilator().mineral_price if unit.completed: self.Assimilator_num += 1 if unit.type == T.Protoss_Cybernetics_Core: self.spent_mineral += P.CyberneticsCore().mineral_price if unit.completed: self.CyberneticsCore_num += 1 self.mineral = state.frame.resources[state.player_id].ore self.gas = state.frame.resources[state.player_id].gas self.food_used = state.frame.resources[state.player_id].used_psi self.food_cup = state.frame.resources[state.player_id].total_psi def mapping_source_to_mini_by_rule(self, state): simple_input = np.zeros([20]) simple_input[0] = 0 # self.time_seconds simple_input[1] = self.mineral_worker_nums # self.mineral_worker_nums simple_input[2] = self.gas_worker_nums # self.gas_worker_nums simple_input[3] = self.mineral # self.mineral simple_input[4] = self.gas # self.gas simple_input[5] = self.food_cup # self.food_cup simple_input[6] = self.food_used # self.food_used simple_input[7] = self.army_nums # self.army_nums simple_input[8] = self.gateway_num # self.gateway_num simple_input[9] = self.pylon_num # self.pylon_num simple_input[10] = self.Assimilator_num # self.Assimilator_num simple_input[11] = self.CyberneticsCore_num # self.CyberneticsCore_num simple_input[12] = self.zealot_num # self.zealot_num simple_input[13] = 
self.Stalker_num # self.Stalker_num simple_input[14] = self.probe_num # self.probe_num simple_input[ 15] = self.mineral + self.spent_mineral # self.collected_mineral simple_input[16] = self.spent_mineral # self.spent_mineral simple_input[17] = self.gas + self.spent_gas # self.collected_gas simple_input[18] = self.spent_gas # self.spent_gas simple_input[19] = 1 # self.Nexus_num return simple_input def play(self, verbose=False): self.play_train_mini(verbose=verbose) def play_train_mini(self, verbose=False): is_attack = False state_last = None while not self.obs.game_ended: #print('self.step:', self.step) #print('self.frame_from_bwapi:', self.obs.frame_from_bwapi) if self.obs.game_ended: break if self.step >= C.time_wait_sc1(900): self.env.send([[tcc.restart]]) self.obs = self.env.recv() self.update_result(time_out=True) continue if not self.initial: self.initial_manager() self.initial = True self.safe_action([]) if self.policy_flag and (not self.is_end): self.calculate_features() state_now = self.mapping_source_to_mini_by_rule(self.obs) if self.greedy_action: action_prob, v_preds = self.net.policy.get_action_probs( state_now, verbose=False) action = np.argmax(action_prob) else: action, v_preds = self.net.policy.get_action(state_now, verbose=False) #print(ProtossAction(action).name) self.mini_step(action) if self.is_end: #self.env.send([[tcc.restart]]) #self.obs = self.env.recv() break if state_last is not None: if 0: print('state_last:', state_last, ', action_last:', action_last, ', state_now:', state_now) v_preds_next = self.net.policy.get_values(state_now) v_preds_next = self.get_values(v_preds_next) reward = 0 self.local_buffer.append(state_last, action_last, state_now, reward, v_preds, v_preds_next) # continuous attack, consistent with mind-game if action == ProtossAction.Attack.value: is_attack = True if is_attack: self.mini_step(ProtossAction.Attack.value) if self.is_end: break state_last = state_now action_last = action self.policy_flag = False if 
self.strategy_flag and (not self.is_end): #print('self.strategy_flag:', self.strategy_flag) M.worker_manager(self) self.strategy_flag = False if self.rl_training: self.local_buffer.rewards[-1] += 1 * self.result[ 'reward'] # self.result['win'] #print(self.local_buffer.rewards) self.global_buffer.add(self.local_buffer) #print("add %d buffer!" % (len(self.local_buffer.rewards))) def set_flag(self): if self.step % C.time_wait_sc1(self.strategy_wait_secs) == 1: self.strategy_flag = True if self.step % C.time_wait_sc1(self.policy_wait_secs) == 1: self.policy_flag = True def safe_action(self, actions): if len(actions) > 0: pass #print("Sending actions: " + str(actions)) self.env.send(actions) self.obs = self.env.recv() self.step += 1 self.update_result() self.set_flag() @property def result(self): return self._result def judge_if_we_win_or_draw(self): enemy_untis = None for i in self.obs.units: if i != self.obs.player_id and i != self.obs.neutral_id: enemy_untis = self.obs.units[i] if enemy_untis is None: return True else: if len(enemy_untis) <= 4: army = M.selectArmy(self, [T.Protoss_Zealot, T.Protoss_Dragoon]) if army is not None: if len(army) >= 6: return True else: print('len(army):', len(army)) print('len(enemy_untis):', len(enemy_untis)) return False def update_result(self, time_out=False): if self.obs is None: return if self.obs.waiting_for_restart: print("WAITING FOR RESTART...") if self.obs.game_ended: self.is_end = True frames = self.obs.frame_from_bwapi outcome = self.obs.game_won reward = self.obs.game_won if time_out: if self.judge_if_we_win_or_draw(): reward = 1 outcome = 1 else: reward = 0 outcome = 0 elif not self.obs.game_won: reward = -1 outcome = -1 result = {} result['outcome'] = outcome result['reward'] = reward result['frames'] = frames self._result = result #print('play end, total result', self._result) self.step = 0 def get_values(self, values): # check if the game is end if self.is_end and self.result['reward'] != 0: return 0 else: return values 
def placer_manager(self, base, unit_type):
    """Return the next ``[x, y]`` build position for ``unit_type``.

    Positions are fixed offsets from ``base`` (or from the first known
    geyser for an assimilator). Pylons, gateways and cybernetics cores each
    advance their own slot counter on ``self``, wrapping at the matching
    ``self.max_*`` value. ``[-1, -1]`` is returned when the unit type is not
    handled or no geyser is known.
    """
    pylon_step = 8
    gateway_step = 16
    cyber_step = 16

    if unit_type == T.Protoss_Pylon:
        origin_x = base.x - 16
        origin_y = base.y + 8
        # Slots past the half-way mark go into a second column.
        half_capacity = 0.5 * self.max_pylon
        if self.num_pylon > half_capacity:
            row = self.num_pylon - half_capacity
            column = 1
        else:
            row = self.num_pylon
            column = 0
        position = [origin_x - int(column * 8),
                    origin_y + int(row * pylon_step)]
        self.num_pylon = (self.num_pylon + 1) % self.max_pylon
        return position

    if unit_type == T.Protoss_Gateway:
        origin_x = base.x - 8
        origin_y = base.y + 10
        position = [origin_x, origin_y + self.num_gateway * gateway_step]
        self.num_gateway = (self.num_gateway + 1) % self.max_gateway
        return position

    if unit_type == T.Protoss_Cybernetics_Core:
        origin_x = base.x - 36
        origin_y = base.y + 10
        position = [origin_x, origin_y + self.num_cyber * cyber_step]
        self.num_cyber = (self.num_cyber + 1) % self.max_cyber
        return position

    if unit_type == T.Protoss_Assimilator:
        if len(self.vespeneUnits) == 0:
            return [-1, -1]
        geyser = self.vespeneUnits[0]
        return [geyser.x - 8, geyser.y - 4]

    return [-1, -1]


def initial_manager(self):
    """Read the first observation and set up base, resources and workers.

    Stores the first Nexus found as ``self.base``, appends visible mineral
    fields and geysers to ``self.resourceUnits`` / ``self.vespeneUnits``,
    and sends every idle, completed probe to its closest mineral field in a
    single ``safe_action`` call.
    """
    self.obs = self.env.recv()
    state = self.obs
    frame_no = state.frame_from_bwapi  # only read; handy when debugging
    own_units = state.units[state.player_id]

    # initial base: the first Nexus, if there is one
    nexus = next((u for u in own_units if u.type == T.Protoss_Nexus), None)
    if nexus is not None:
        self.base = nexus

    # initial mineral and gas
    mineral_types = (
        T.Resource_Mineral_Field,
        T.Resource_Mineral_Field_Type_2,
        T.Resource_Mineral_Field_Type_3,
    )
    for neutral in state.units[state.neutral_id]:
        if neutral.type in mineral_types and neutral.visible:
            self.resourceUnits.append(neutral)
        if neutral.type == T.Resource_Vespene_Geyser and neutral.visible:
            self.vespeneUnits.append(neutral)

    # initial workers
    commands = []
    for worker in own_units:
        if not (worker.type == T.Protoss_Probe and worker.completed):
            continue
        if not worker.idle:
            continue
        mineral = M.get_closest(worker.x, worker.y, self.resourceUnits)
        commands.append([
            tcc.command_unit,
            worker.id,
            tcc.unitcommandtypes.Right_Click_Unit,
            mineral.id,
        ])
    self.safe_action(commands)
class SourceAgent(base_agent.BaseAgent):
    """Agent for source game of starcraft.

    On every policy tick the agent builds a 17-element state from the game
    observation, asks ``self.net.policy`` for a macro action, executes it
    through ``mini_step`` and records the transition in ``local_buffer``.
    """

    def __init__(self, index=0, rl_training=False, restore_model=False,
                 global_buffer=None, net=None, strategy_agent=None):
        super(SourceAgent, self).__init__()
        self.net = net
        self.index = index
        self.global_buffer = global_buffer
        self.restore_model = restore_model

        # model in brain
        self.strategy_agent = strategy_agent
        self.strategy_act = None

        # count num
        self.step = 0
        self.strategy_wait_secs = 4
        self.strategy_flag = False
        self.policy_wait_secs = 2
        self.policy_flag = True

        self.env = None
        self.obs = None

        # buffer
        self.local_buffer = Buffer()
        self.mini_state = []
        self.mini_state_mapping = []

        self.num_players = 2
        self.on_select = None
        self._result = None
        self.is_end = False
        self.is_attack = False
        self._gases = None
        self.rl_training = rl_training
        self.reward_type = 0

    def reset(self):
        """Clear per-episode state, the local buffer and the strategy agent."""
        super(SourceAgent, self).reset()
        self.step = 0
        self.obs = None
        self._result = None
        self.is_end = False
        self.is_attack = False
        self._gases = None
        self.policy_flag = True
        self.local_buffer.reset()
        if self.strategy_agent is not None:
            self.strategy_agent.reset()

    def set_env(self, env):
        """Attach the environment that the agent steps."""
        self.env = env

    def init_network(self):
        """Initialize the network and optionally restore a saved policy."""
        self.net.initialize()
        if self.restore_model:
            self.net.restore_policy()

    def reset_old_network(self):
        self.net.reset_old_network()

    def save_model(self):
        self.net.save_policy()

    def update_network(self, result_list):
        """Update the policy from the global buffer, then log the results."""
        self.net.Update_policy(self.global_buffer)
        self.net.Update_result(result_list)

    def update_summary(self, counter):
        return self.net.Update_summary(counter)

    def get_policy_input(self, obs):
        """Concatenate the three feature groups of ``U.get_input(obs)``."""
        high_input, tech_cost, pop_num = U.get_input(obs)
        policy_input = np.concatenate([high_input, tech_cost, pop_num], axis=0)
        return policy_input

    def mini_step(self, action):
        """Execute one macro ``action`` (a ``TerranAction`` value).

        Unknown actions fall back to a no-op. Once ``Attack`` has been
        chosen, ``self.is_attack`` stays set and an attack step is issued
        after every later macro action.
        """
        if action == TerranAction.Build_SCV.value:
            M.mineral_worker(self)
        elif action == TerranAction.Build_Refinery.value:
            if self._gases is not None:
                gas_1 = self._gases[0]
                gas_2 = self._gases[1]
                # Build on the first geyser that has no refinery yet.
                if gas_1 is not None and not U.is_Refinery_on_gas(self.obs, gas_1):
                    gas_1_pos = T.world_to_screen_pos(self.env.game_info, gas_1.pos, self.obs)
                    M.build_by_idle_worker(self, C._BUILD_REFINERY_S, gas_1_pos)
                elif gas_2 is not None and not U.is_Refinery_on_gas(self.obs, gas_2):
                    gas_2_pos = T.world_to_screen_pos(self.env.game_info, gas_2.pos, self.obs)
                    M.build_by_idle_worker(self, C._BUILD_REFINERY_S, gas_2_pos)
        elif action == TerranAction.Gather_gas.value:
            M.gather_resource(self, 'gas')
        elif action == TerranAction.Gather_mineral.value:
            M.gather_resource(self, 'mineral')
        elif action == TerranAction.Build_Marine.value:
            M.train_army(self, C._TRAIN_MARINE)
        elif action == TerranAction.Build_Reaper.value:
            M.train_army(self, C._TRAIN_REAPER)
        elif action == TerranAction.Build_SupplyDepot.value:
            valid_index = U.get_valid_mask_screen(self.obs, size=3)
            pos = U.get_pos(valid_index)
            M.build_by_idle_worker(self, C._BUILD_SUPPLYDEPOT_S, pos)
        elif action == TerranAction.Build_Barracks.value:
            valid_index = U.get_valid_mask_screen(self.obs, size=3)
            pos = U.get_pos(valid_index)
            M.build_by_idle_worker(self, C._BUILD_BARRACKS_S, pos)
        elif action == TerranAction.Attack.value:
            self.is_attack = True
        elif action == TerranAction.Defend.value:
            M.retreat_step(self)
        else:
            self.safe_action(C._NO_OP, 0, [])

        if self.is_attack:
            M.attack_step(self)

    def get_the_input(self):
        """Return the concatenated input features for the current obs."""
        high_input, tech_cost, pop_num = U.get_input(self.obs)
        controller_input = np.concatenate([high_input, tech_cost, pop_num], axis=0)
        return controller_input

    def combine_state_and_mini_action(self, state, strategy_act):
        """Append the one-hot encoding of ``strategy_act`` to ``state``."""
        act = np.zeros((1, 1))
        act[0, 0] = strategy_act
        action_array = self.one_hot_label(act, C._SIZE_MINI_ACTIONS)[0]
        combined_state = np.concatenate([state, action_array], axis=0)
        return combined_state

    def mapping_source_to_mini(self, source_state):
        """Map a source state to a mini state with the learned mapping."""
        mini_state = self.net.mapping.predict_func(source_state, use_transform=False)
        return mini_state

    def mapping_source_to_mini_by_rule(self, source_state):
        """Map a source state to the 17-element mini state by fixed indices."""
        simple_input = np.zeros([17])
        simple_input[0] = 0  # self.time_seconds
        simple_input[1] = source_state[27]  # self.mineral_worker_nums
        simple_input[2] = source_state[29] + source_state[31]  # self.gas_worker_nums
        simple_input[3] = source_state[2]  # self.mineral
        simple_input[4] = source_state[3]  # self.gas
        simple_input[5] = source_state[6]  # self.food_cap
        simple_input[6] = source_state[7]  # self.food_used
        simple_input[7] = source_state[10]  # self.army_nums
        simple_input[8] = source_state[11]  # self.Refinery_num
        simple_input[9] = source_state[12]  # self.SupplyDepot_num
        simple_input[10] = source_state[13]  # self.Barracks_num
        simple_input[11] = source_state[15]  # self.Marine_num
        simple_input[12] = source_state[16]  # self.Reaper_num
        simple_input[13] = source_state[4] + source_state[2]
        simple_input[14] = source_state[5] + source_state[3]
        simple_input[15] = source_state[4]
        simple_input[16] = source_state[5]
        return simple_input

    def play(self, verbose=False):
        """Play one episode, recording transitions in ``local_buffer``.

        When the game ends and ``rl_training`` is set, the final reward is
        added to the last recorded transition and the local buffer is handed
        to ``global_buffer``. An episode that recorded no transition is not
        added.
        """
        state_last = None
        action_last = None
        v_preds_last = None

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        while True:
            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])

            if self.policy_flag and (not self.is_end):
                state_now = self.mapping_source_to_mini_by_rule(self.get_the_input())
                action, v_preds = self.net.policy.get_action(state_now, verbose=False)
                self.mini_step(action)

                if state_last is not None:
                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    reward = 0
                    # The value slot must describe state_last, so use the
                    # estimate made when state_last was the current state.
                    self.local_buffer.append(state_last, action_last, state_now,
                                             reward, v_preds_last, v_preds_next)

                state_last = state_now
                action_last = action
                v_preds_last = v_preds
                self.policy_flag = False

            if self.is_end:
                # Nothing to reward or train on if no transition was recorded.
                if self.rl_training and len(self.local_buffer.rewards) > 0:
                    self.local_buffer.rewards[-1] += 1 * self.result['reward']
                    self.global_buffer.add(self.local_buffer)
                break

    def set_flag(self):
        """Raise the strategy/policy flags when their wait period elapses."""
        if self.step % C.time_wait(self.strategy_wait_secs) == 1:
            self.strategy_flag = True

        if self.step % C.time_wait(self.policy_wait_secs) == 1:
            self.policy_flag = True

    def safe_action(self, action, unit_type, args):
        """Step the env with ``action`` if ``M.check_params`` accepts it.

        The step counter, result and flags are refreshed either way.
        """
        if M.check_params(self, action, unit_type, args, 1):
            self.obs = self.env.step([sc2_actions.FunctionCall(action, args)])[0]
            self.step += 1
            self.update_result()
            self.set_flag()

    def select(self, action, unit_type, args):
        """Like ``safe_action`` but also remembers the selected unit type."""
        # safe select
        if M.check_params(self, action, unit_type, args, 0):
            self.obs = self.env.step([sc2_actions.FunctionCall(action, args)])[0]
            self.on_select = unit_type
            self.update_result()
            self.step += 1
            self.set_flag()

    @property
    def result(self):
        return self._result

    def update_result(self):
        """Record the episode result once the game has ended."""
        if self.obs is None:
            return
        if self.obs.last() or self.env.state == environment.StepType.LAST:
            self.is_end = True
            outcome = 0
            o = self.obs.raw_observation
            player_id = o.observation.player_common.player_id
            for r in o.player_result:
                if r.player_id == player_id:
                    outcome = sc2_env._possible_results.get(r.result, 0)
            frames = o.observation.game_loop
            result = {}
            result['outcome'] = outcome
            result['reward'] = self.obs.reward
            result['frames'] = frames
            result['win'] = 0
            if result['reward'] == 1:
                result['win'] = 1
            self._result = result
            print('play end, total return', self.obs.reward)
            self.step = 0

    def one_hot_label(self, action_type_array, action_max_num):
        """Return a ``(rows, action_max_num)`` one-hot matrix.

        Each entry along the first axis of ``action_type_array`` must hold
        exactly one action index.
        """
        rows = action_type_array.shape[0]
        cols = action_max_num
        data = np.zeros((rows, cols))

        for i in range(rows):
            # .item() avoids the deprecated int() of a 1-element array.
            data[i, int(np.asarray(action_type_array[i]).item())] = 1
        return data

    def get_values(self, values):
        """Return 0 for a finished game with non-zero reward, else ``values``."""
        # check if the game is end
        if self.is_end and self.result['reward'] != 0:
            return 0
        else:
            return values
class MiniSourceAgent(base_agent.BaseAgent):
    """Agent for source game of starcraft.

    Combines rule-mapped scalar features with a latent code of the map image
    (from ``sec_net.vae``), passes them through ``sec_net``'s RNN and lets
    ``net.policy`` choose a ``ProtossAction``.

    NOTE(review): an earlier class with the same name is defined above in
    this file; this later definition is the one that takes effect.
    """

    def __init__(self, index=0, rl_training=False, restore_model=False,
                 global_buffer=None, net=None, sec_net=None,
                 strategy_agent=None, greedy_action=False,
                 extract_save_dir=None):
        super(MiniSourceAgent, self).__init__()
        self.net = net
        self.sec_net = sec_net
        self.index = index
        self.global_buffer = global_buffer
        self.restore_model = restore_model

        # Accepted by the signature; stored so they are not silently lost.
        self.strategy_agent = strategy_agent
        self.greedy_action = greedy_action

        # count num
        self.step = 0
        self.strategy_wait_secs = 2
        self.strategy_flag = False
        self.policy_wait_secs = 2
        self.policy_flag = True

        self.env = None
        self.obs = None

        # buffer
        self.local_buffer = Buffer()

        self.num_players = 2
        self.on_select = None
        self._result = None
        self._gases = None
        self.is_end = False
        self.rl_training = rl_training

        self.rnn_state = None
        self.zero_state = self.sec_net.rnn_init_state()
        self.extract_save_dir = extract_save_dir

        self.reset()

    def reset(self):
        """Clear per-episode state and rewind the RNN to its zero state."""
        super(MiniSourceAgent, self).reset()
        self.step = 0
        self.obs = None
        self._result = None
        self._gases = None
        self.is_end = False
        self.strategy_flag = False
        self.policy_flag = True
        self.local_buffer.reset()
        self.rnn_state = self.zero_state

    def set_env(self, env):
        """Attach the environment that the agent steps."""
        self.env = env

    def init_network(self):
        """Initialize the network and optionally restore a saved policy."""
        self.net.initialize()
        if self.restore_model:
            self.net.restore_policy()

    def reset_old_network(self):
        self.net.reset_old_network()

    def save_model(self):
        self.net.save_policy()

    def update_policy(self):
        self.net.Update_policy(self.global_buffer)

    # The former ``update_result(self, result_list)`` was removed: it was
    # shadowed by the parameterless ``update_result`` defined further down
    # and could never be called.

    def update_network(self, result_list):
        """Update the policy from the global buffer, then log the results."""
        self.net.Update_policy(self.global_buffer)
        self.net.Update_result(result_list)

    def update_summary(self, counter):
        return self.net.Update_summary(counter)

    def mini_step(self, action):
        """Execute one macro ``action`` (a ``ProtossAction`` value)."""
        if action == ProtossAction.Build_probe.value:
            M.mineral_worker(self)
        elif action == ProtossAction.Build_zealot.value:
            M.train_army(self, C._TRAIN_ZEALOT)
        elif action == ProtossAction.Build_Stalker.value:
            M.train_army(self, C._TRAIN_STALKER)
        elif action == ProtossAction.Build_pylon.value:
            no_unit_index = U.get_unit_mask_screen(self.obs, size=2)
            pos = U.get_pos(no_unit_index)
            M.build_by_idle_worker(self, C._BUILD_PYLON_S, pos)
        elif action == ProtossAction.Build_gateway.value:
            power_index = U.get_power_mask_screen(self.obs, size=5)
            pos = U.get_pos(power_index)
            M.build_by_idle_worker(self, C._BUILD_GATEWAY_S, pos)
        elif action == ProtossAction.Build_Assimilator.value:
            if self._gases is not None:
                gas_1 = self._gases[0]
                gas_2 = self._gases[1]
                # Build on the first geyser that has no assimilator yet.
                if gas_1 is not None and not U.is_assimilator_on_gas(self.obs, gas_1):
                    gas_1_pos = T.world_to_screen_pos(self.env.game_info, gas_1.pos, self.obs)
                    M.build_by_idle_worker(self, C._BUILD_ASSIMILATOR_S, gas_1_pos)
                elif gas_2 is not None and not U.is_assimilator_on_gas(self.obs, gas_2):
                    gas_2_pos = T.world_to_screen_pos(self.env.game_info, gas_2.pos, self.obs)
                    M.build_by_idle_worker(self, C._BUILD_ASSIMILATOR_S, gas_2_pos)
        elif action == ProtossAction.Build_CyberneticsCore.value:
            power_index = U.get_power_mask_screen(self.obs, size=3)
            pos = U.get_pos(power_index)
            M.build_by_idle_worker(self, C._BUILD_CYBER_S, pos)
        elif action == ProtossAction.Attack.value:
            M.attack_step(self)
        elif action == ProtossAction.Retreat.value:
            M.retreat_step(self)
        elif action == ProtossAction.Do_nothing.value:
            self.safe_action(C._NO_OP, 0, [])

    def get_the_input(self):
        """Return the concatenated input features for the current obs."""
        high_input, tech_cost, pop_num = U.get_input(self.obs)
        controller_input = np.concatenate([high_input, tech_cost, pop_num], axis=0)
        return controller_input

    def mapping_source_to_mini_by_rule(self, source_state):
        """Map a source state to the 20-element mini state by fixed indices."""
        simple_input = np.zeros([20], dtype=np.int16)
        simple_input[0] = 0  # self.time_seconds
        simple_input[1] = source_state[28]  # self.mineral_worker_nums
        simple_input[2] = source_state[30] + source_state[32]  # self.gas_worker_nums
        simple_input[3] = source_state[2]  # self.mineral
        simple_input[4] = source_state[3]  # self.gas
        simple_input[5] = source_state[6]  # self.food_cup
        simple_input[6] = source_state[7]  # self.food_used
        simple_input[7] = source_state[10]  # self.army_nums
        simple_input[8] = source_state[16]  # self.gateway_num
        simple_input[9] = source_state[14]  # self.pylon_num
        simple_input[10] = source_state[15]  # self.Assimilator_num
        simple_input[11] = source_state[17]  # self.CyberneticsCore_num
        simple_input[12] = source_state[12]  # self.zealot_num
        simple_input[13] = source_state[13]  # self.Stalker_num
        simple_input[14] = source_state[11]  # self.probe_num
        simple_input[15] = source_state[4] + source_state[2]  # self.collected_mineral
        simple_input[16] = source_state[4]  # self.spent_mineral
        simple_input[17] = source_state[5] + source_state[3]  # self.collected_gas
        simple_input[18] = source_state[5]  # self.spent_gas
        simple_input[19] = 1  # self.Nexus_num
        return simple_input

    def play(self, verbose=False):
        """Play one training episode (see ``play_train``)."""
        self.play_train(verbose=verbose)

    def _build_feature(self):
        """Return ``(non_image_feature, image_feature, feature)`` for the obs.

        ``feature`` is the scalar feature vector concatenated with a sampled
        latent code of the map image.
        """
        non_image_feature = self.mapping_source_to_mini_by_rule(self.get_the_input())
        image_feature = U.get_simple_map_data(self.obs)
        latent_image_feature, mu, logvar = self.encode_obs(image_feature)
        feature = np.concatenate([non_image_feature, latent_image_feature], axis=-1)
        return non_image_feature, image_feature, feature

    def sample(self, verbose=False, use_image=True):
        """Play one episode and save the recorded steps as a ``.npz`` file.

        The file is written to ``extract_save_dir`` under a random integer
        name; the same integer seeds NumPy and TensorFlow. The reward of the
        last recorded step is set to the game reward plus 2. Nothing is
        written if the game ended before any step was recorded.

        Raises:
            ValueError: if ``extract_save_dir`` was not provided.
        """
        if self.extract_save_dir is None:
            raise ValueError("extract_save_dir must be set to record samples")

        state_last = None

        random_generated_int = random.randint(0, 2**31 - 1)
        filename = self.extract_save_dir + "/" + str(random_generated_int) + ".npz"
        recording_obs = []
        recording_img = []
        recording_action = []
        recording_reward = []

        np.random.seed(random_generated_int)
        tf.set_random_seed(random_generated_int)

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        while True:
            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])

            if self.policy_flag and (not self.is_end):
                non_image_feature, image_feature, feature = self._build_feature()

                reward_last = 0
                state_now, action, v_preds = self.get_action(feature, reward_last)
                self.mini_step(action)

                if state_last is not None:
                    # NOTE(review): this value estimate is never recorded;
                    # the call is kept as in the original in case it has
                    # side effects -- confirm and remove.
                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    reward = 0

                    recording_obs.append(non_image_feature)
                    recording_img.append(image_feature)
                    recording_action.append(action)
                    recording_reward.append(reward)

                state_last = state_now
                self.policy_flag = False

            if self.is_end:
                if len(recording_reward) == 0:
                    # Game ended before anything was recorded: no file.
                    break

                # consider the win/loss, to 0(not end), 1(loss), 2(draw), 3(win)
                recording_reward[-1] = (1 * self.result['reward'] + 2)
                if recording_reward[-1] != 0:
                    print("result is:", recording_reward[-1])

                recording_obs = np.array(recording_obs, dtype=np.uint16)
                recording_action = np.array(recording_action, dtype=np.uint8)
                recording_reward = np.array(recording_reward, dtype=np.uint8)
                recording_img = np.array(recording_img, dtype=np.float16)

                np.savez_compressed(filename, obs=recording_obs, img=recording_img,
                                    action=recording_action, reward=recording_reward)
                break

    def play_train(self, continues_attack=False, verbose=False):
        """Play one episode, recording transitions in ``local_buffer``.

        When the game ends and ``rl_training`` is set, the final reward is
        added to the last recorded transition and the local buffer is handed
        to ``global_buffer``. An episode that recorded no transition is not
        added.
        """
        state_last = None
        action_last = None
        v_preds_last = None

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        while True:
            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])

            if self.policy_flag and (not self.is_end):
                _, _, feature = self._build_feature()

                reward_last = 0
                state_now, action, v_preds = self.get_action(feature, reward_last)
                self.mini_step(action)

                if state_last is not None:
                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    reward = 0
                    # The value slot must describe state_last, so use the
                    # estimate made when state_last was the current state.
                    self.local_buffer.append(state_last, action_last, state_now,
                                             reward, v_preds_last, v_preds_next)

                state_last = state_now
                action_last = action
                v_preds_last = v_preds
                self.policy_flag = False

            if self.is_end:
                # Nothing to reward or train on if no transition was recorded.
                if self.rl_training and len(self.local_buffer.rewards) > 0:
                    self.local_buffer.rewards[-1] += 1 * self.result['reward']
                    self.global_buffer.add(self.local_buffer)
                break

    def encode_obs(self, obs):
        """Encode a raw image observation; return ``(z, mu, logvar)``.

        ``z`` is sampled as ``mu + exp(logvar / 2) * noise`` with standard
        normal noise from ``np.random``.
        """
        # convert raw obs to z, mu, logvar
        result = np.copy(obs)
        result = result.reshape(1, 64, 64, 12)
        mu, logvar = self.sec_net.vae.encode_mu_logvar(result)
        mu = mu[0]
        logvar = logvar[0]
        s = logvar.shape
        z = mu + np.exp(logvar / 2.0) * np.random.randn(*s)
        return z, mu, logvar

    def get_action(self, feature, reward):
        """Choose an action for ``feature`` and advance the RNN state.

        Returns:
            ``(input_h, action, v_preds)`` where ``input_h`` is the RNN
            output that was fed to the policy.
        """
        input_h = self.sec_net.rnn_output(self.rnn_state, feature)
        action, v_preds = self.net.policy.get_action(input_h, verbose=False)
        self.rnn_state = self.sec_net.rnn_next_state(feature, action, reward,
                                                     self.rnn_state)
        return input_h, action, v_preds

    def set_flag(self):
        """Raise the strategy/policy flags when their wait period elapses."""
        if self.step % C.time_wait(self.strategy_wait_secs) == 1:
            self.strategy_flag = True

        if self.step % C.time_wait(self.policy_wait_secs) == 1:
            self.policy_flag = True

    def safe_action(self, action, unit_type, args):
        """Step the env with ``action`` if ``M.check_params`` accepts it."""
        if M.check_params(self, action, unit_type, args, 1):
            obs = self.env.step([sc2_actions.FunctionCall(action, args)])[0]
            self.obs = obs
            self.step += 1
            self.update_result()
            self.set_flag()

    def select(self, action, unit_type, args):
        """Like ``safe_action`` but also remembers the selected unit type."""
        # safe select
        if M.check_params(self, action, unit_type, args, 0):
            self.obs = self.env.step([sc2_actions.FunctionCall(action, args)])[0]
            self.on_select = unit_type
            self.update_result()
            self.step += 1
            self.set_flag()

    @property
    def result(self):
        return self._result

    def update_result(self):
        """Record the episode result once the game has ended."""
        if self.obs is None:
            return
        if self.obs.last() or self.env.state == environment.StepType.LAST:
            self.is_end = True
            outcome = 0
            o = self.obs.raw_observation
            player_id = o.observation.player_common.player_id
            for r in o.player_result:
                if r.player_id == player_id:
                    outcome = sc2_env._possible_results.get(r.result, 0)
            frames = o.observation.game_loop
            result = {}
            result['outcome'] = outcome
            result['reward'] = self.obs.reward
            result['frames'] = frames

            self._result = result
            print('play end, total return', self.obs.reward)
            self.step = 0

    def get_values(self, values):
        """Return 0 for a finished game with non-zero reward, else ``values``."""
        # check if the game is end
        if self.is_end and self.result['reward'] != 0:
            return 0
        else:
            return values
class MiniAgent:
    """Agent that plays an environment with a ``reset``/``step`` interface.

    Each step is recorded in ``local_buffer`` as
    ``(state, action, next_state, reward, v_pred, v_pred_next)``; at the end
    of the episode the local buffer is added to ``global_buffer``.
    """

    def __init__(self, agent_id=0, global_buffer=None, net=None, restore_model=False):
        self.agent_id = agent_id
        self.net = net
        self.global_buffer = global_buffer
        self.greedy_action = False
        self.local_buffer = Buffer()
        self.env = None
        self.restore_model = restore_model
        self.reset()

    def __str__(self):
        # __str__ must return a string; returning None makes str() raise.
        return "MiniAgent(agent_id=%s)" % (self.agent_id,)

    def set_env(self, env):
        """Attach the environment that the agent steps."""
        self.env = env

    def reset(self):
        """Clear per-episode state and the local buffer."""
        self.step = 0
        self.obs = None
        self.reward = 0
        self.done = False
        self.result = 0
        self.local_buffer.reset()

    def play(self, show_details=False):
        """Play one episode and hand the recorded steps to the global buffer.

        Args:
            show_details: print every transition and the final return.
        """
        self.obs = self.env.reset()

        while True:
            # Keep the state the action is chosen for, so the recorded
            # transition pairs it with that same action.
            state = self.obs

            # get the action
            if self.greedy_action:
                action_prob, v_preds = self.net.policy.get_action_probs(state, verbose=False)
                action = np.argmax(action_prob)
            else:
                action, v_preds = self.net.policy.get_action(state, verbose=False)

            # use the action to push the env step
            self.obs, self.reward, self.done, _ = self.env.step(action)

            # add info to buffer
            if show_details:
                print('state_last:', state, ', action_last:', action,
                      ', state_now:', self.obs)
            v_preds_next = self.net.policy.get_values(self.obs)
            v_preds_next = self.get_values(v_preds_next)
            self.local_buffer.append(state, action, self.obs, self.reward,
                                     v_preds, v_preds_next)

            if self.done:
                self.result = self.reward
                if show_details:
                    print('play end, total return', self.result)
                if len(self.local_buffer.rewards) > 0:
                    self.global_buffer.add(self.local_buffer)
                    print("add %d buffer!" % (len(self.local_buffer.rewards)))
                break

    def init_network(self):
        """Initialize the network and optionally restore a saved policy."""
        self.net.initialize()
        if self.restore_model:
            self.net.restore_policy()

    def update_network(self, result_list):
        """Update the policy from the global buffer, then log the results."""
        self.net.Update_policy(self.global_buffer)
        self.net.Update_result(result_list)

    def reset_old_network(self):
        self.net.reset_old_network()

    def save_model(self):
        self.net.save_policy()

    def update_summary(self, counter):
        return self.net.Update_summary(counter)

    def get_values(self, values):
        """Return 0 once the episode is done, else ``values``."""
        # check if the game is end
        if self.done:
            return 0
        else:
            return values