    # NOTE: these methods rely on the module-level imports of the surrounding
    # agent file (e.g. random, numpy as np, tensorflow as tf, pprint, and the
    # project's C/U helper modules and ProtossAction enum).
    def play_train_mini(self, verbose=False):
        is_attack = False
        state_last = None

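        # initial setup: issue a no-op, center the camera on the base,
        # and locate the starting vespene geysers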
        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)
        while True:

            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
            if self.policy_flag and (not self.is_end):

                state_now = self.mapping_source_to_mini_by_rule(self.get_the_input())
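                # greedy mode takes the argmax of the action probabilities;
                # otherwise the policy chooses the action (typically by sampling)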
                if self.greedy_action:
                    action_prob, v_preds = self.net.policy.get_action_probs(state_now, verbose=False)
                    action = np.argmax(action_prob)
                else:
                    action, v_preds = self.net.policy.get_action(state_now, verbose=False)

                # print(ProtossAction(action).name)
                self.mini_step(action)

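                # store the previous transition; the per-step reward is 0 and the
                # game outcome is added to the last reward at episode end
                # (note: v_preds here is the value of state_now; the play_right
                # variant below stores the previous state's value instead)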
                if state_last is not None:
                    if verbose:
                        print('state_last:', state_last, ', action_last:', action_last, ', state_now:', state_now)
                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    reward = 0
                    self.local_buffer.append(state_last, action_last, state_now, reward, v_preds, v_preds_next)

                # once Attack is chosen, keep attacking continuously,
                # consistent with the mind-game
                if action == ProtossAction.Attack.value:
                    is_attack = True
                if is_attack:
                    self.mini_step(ProtossAction.Attack.value)

                state_last = state_now
                action_last = action
                self.policy_flag = False

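            # episode finished: when training, fold the game result into the
            # last reward and push the local buffer into the global one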
            if self.is_end:
                if self.rl_training:
                    self.local_buffer.rewards[-1] += 1 * self.result['reward']  # self.result['win']
                    print(self.local_buffer.rewards)
                    self.global_buffer.add(self.local_buffer)
                    print("add %d buffer!" % (len(self.local_buffer.rewards)))
                break
    def play_right(self, verbose=False):
        # Note: this is the corrected ("right") version of the game-play loop
        prev_state = None
        prev_action = None
        prev_value = None

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        while True:

            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
            if self.policy_flag and (not self.is_end):
                # get the state
                state = self.mapping_source_to_mini_by_rule(
                    self.get_the_input_right(self.obs))

                # get the action and value according to the state
                action, value = self.net.policy.get_action(state,
                                                           verbose=verbose)

                # if this is not the first state, store the transition in the buffer
                if prev_state is not None:
                    # use the environment's step reward directly
                    reward = self.obs.reward
                    if verbose:
                        print(prev_state, prev_action, state, reward,
                              prev_value, value)
                    self.local_buffer.append(prev_state, prev_action, state,
                                             reward, prev_value, value)

                self.mini_step(action)
                # the env steps to the new state

                prev_state = state
                prev_action = action
                prev_value = value

                self.policy_flag = False

            if self.is_end:
                # get the last state and the final reward
                state = self.mapping_source_to_mini_by_rule(
                    self.get_the_input_right(self.obs))

                value = self.net.policy.get_values(state)
                # the value of the last state is defined somewhat differently
                value = self.get_values_right(value)

                # if this is not the first state, store the transition in the buffer
                if prev_state is not None:
                    reward = self.obs.reward
                    if verbose:
                        print(prev_state, prev_action, state, reward,
                              prev_value, value)
                    self.local_buffer.append(prev_state, prev_action, state,
                                             reward, prev_value, value)
                break

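        # after the episode, hand the local trajectory to the global buffer
        # used for RL training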
        if self.rl_training:
            if verbose:
                print(self.local_buffer.values)
                print(self.local_buffer.values_next)
            #print(self.local_buffer.rewards)
            self.global_buffer.add(self.local_buffer)
    def sample(self, verbose=False, use_image=False):
        is_attack = False
        state_last = None

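        # each rollout is written to a uniquely named .npz file under extract_save_dir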
        random_generated_int = random.randint(0, 2**31 - 1)
        filename = self.extract_save_dir + "/" + str(
            random_generated_int) + ".npz"
        recording_obs = []
        recording_img = []
        recording_action = []

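        # seed numpy and TensorFlow with the same integer so the rollout is
        # reproducible and tied to the saved file name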
        np.random.seed(random_generated_int)
        tf.set_random_seed(random_generated_int)

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)
        while True:

            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
            if self.policy_flag and (not self.is_end):

                state_now = self.mapping_source_to_mini_by_rule(
                    self.get_the_input())
                recording_obs.append(state_now)

                if use_image:
                    recording_img.append(U.get_simple_map_data(self.obs))

                action, v_preds = self.net.policy.get_action(state_now,
                                                             verbose=False)
                recording_action.append(action)

                self.mini_step(action)

                if state_last is not None:
                    if verbose:
                        print('state_last:', state_last, ', action_last:',
                              action_last, ', state_now:', state_now)
                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    reward = 0

                    self.local_buffer.append(state_last, action_last,
                                             state_now, reward, v_preds,
                                             v_preds_next)

                state_last = state_now
                action_last = action
                self.policy_flag = False

            if self.is_end:
                # note: obs is stored as uint16 and actions as uint8, so values
                # outside those ranges would overflow
                recording_obs = np.array(recording_obs, dtype=np.uint16)
                recording_action = np.array(recording_action, dtype=np.uint8)
                if not use_image:
                    np.savez_compressed(filename,
                                        obs=recording_obs,
                                        action=recording_action)
                else:
                    recording_img = np.array(recording_img, dtype=np.float16)
                    np.savez_compressed(filename,
                                        obs=recording_obs,
                                        img=recording_img,
                                        action=recording_action)
                break
    def play_right_add(self, verbose=False):
        # Note: this is the corrected ("right") version of the game-play loop,
        # which also feeds the additional (add) and map inputs to the policy
        prev_state = None
        prev_action = None
        prev_value = None
        prev_add_state = None
        prev_map_state = None
        show_image = False

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

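        # placeholder inputs used when the additional/map observation spaces
        # are disabled (ob_space_add == 0)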
        self.dummy_add_state = np.zeros(1)
        self.dummy_map_state = np.zeros([1, 1, 1])

        simulate_seconds = 0
        feature_dict = U.edge_state()
        previous_match = -1

        while True:

            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
            if self.policy_flag and (not self.is_end):
                # get the state
                state = self.mapping_source_to_mini_by_rule(
                    self.get_the_input_right(self.obs))

                if self.image_debug and verbose and self.step % C.time_wait(
                        self.image_wait_secs) == 0:
                    show_image = True
                else:
                    show_image = False

                if verbose:
                    print('show_image:', show_image)
                map_state = U.get_small_simple_map_data(
                    self.obs, show_image, show_image)

                if verbose:
                    print('map_state.shape:', map_state.shape)
                add_state = self.get_add_state(
                    self.get_the_input_right(self.obs))

                # get the action and value according to the state
                #print("add_state:", add_state)
                if self.ob_space_add == 0:
                    add_state = self.dummy_add_state
                    map_state = self.dummy_map_state
                action, action_probs, value = self.net.policy.get_act_action_probs(
                    state, add_state, map_state, verbose=verbose)

                # if this is not the first state, store the transition in the buffer
                if prev_state is not None:
                    # use the environment's step reward directly
                    reward = self.obs.reward
                    if verbose:
                        print(prev_state, prev_add_state, prev_action, state,
                              reward, prev_value, value)
                    self.local_buffer.append_more_more(
                        prev_state, prev_add_state, prev_map_state,
                        prev_action, state, reward, prev_value, value)

                self.mini_step(action)
                simulate_seconds += self.policy_wait_secs
                # the env steps to the new state

                prev_state = state
                prev_action = action
                prev_value = value
                prev_add_state = add_state
                prev_map_state = map_state

                self.policy_flag = False

            if self.is_end:
                # get the last state and the final reward
                state = self.mapping_source_to_mini_by_rule(
                    self.get_the_input_right(self.obs))
                map_state = U.get_small_simple_map_data(self.obs)
                add_state = self.get_add_state(
                    self.get_the_input_right(self.obs))

                if self.ob_space_add == 0:
                    add_state = self.dummy_add_state
                    map_state = self.dummy_map_state

                value = self.net.policy.get_values(state, add_state, map_state)
                # the value of the last state is defined somewhat differently
                value = self.get_values_right(value)

                # if this is not the first state, store the transition in the buffer
                if prev_state is not None:
                    reward = self.obs.reward
                    if verbose:
                        print(prev_state, prev_add_state, prev_action, state,
                              reward, prev_value, value)
                    self.local_buffer.append_more_more(
                        prev_state, prev_add_state, prev_map_state,
                        prev_action, state, reward, prev_value, value)
                break

        if self.rl_training:
            #print(self.local_buffer.values)
            #print(self.local_buffer.values_next)
            #print(self.local_buffer.rewards)
            self.global_buffer.add(self.local_buffer)
            print("add map bn:")
            print("add %d buffer!" % (len(self.local_buffer.rewards)))
    def play_train(self, continues_attack=False, verbose=False):
        is_attack = False
        state_last = None

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        while True:

            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
            if self.policy_flag and (not self.is_end):

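                # build the policy input: rule-based non-image features are
                # concatenated with the latent code of the map image from encode_obs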
                non_image_feature = self.mapping_source_to_mini_by_rule(
                    self.get_the_input())
                #print('non_image_feature.shape:', non_image_feature.shape)
                #print('non_image_feature:', non_image_feature)

                image_feature = U.get_simple_map_data(self.obs)
                #print('image_feature.shape:', image_feature.shape)
                #print('image_feature:', image_feature)

                latent_image_feature, mu, logvar = self.encode_obs(
                    image_feature)
                #print('latent_image_feature.shape:', latent_image_feature.shape)
                #print('latent_image_feature:', latent_image_feature)

                feature = np.concatenate(
                    [non_image_feature, latent_image_feature], axis=-1)
                #print('feature.shape:', feature.shape)
                #print('feature:', feature)

                #state_now = feature
                reward_last = 0
                state_now, action, v_preds = self.get_action(
                    feature, reward_last)

                # print(ProtossAction(action).name)
                self.mini_step(action)

                if state_last is not None:
                    if verbose:
                        print('state_last:', state_last, ', action_last:',
                              action_last, ', state_now:', state_now)
                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    reward = 0
                    self.local_buffer.append(state_last, action_last,
                                             state_now, reward, v_preds,
                                             v_preds_next)

                state_last = state_now
                action_last = action
                self.policy_flag = False

            if self.is_end:
                if self.rl_training:
                    self.local_buffer.rewards[-1] += 1 * self.result[
                        'reward']  # self.result['win']
                    #print(self.local_buffer.rewards)
                    self.global_buffer.add(self.local_buffer)
                    #print("add %d buffer!" % (len(self.local_buffer.rewards)))
                break
    def sample(self, verbose=False, use_image=True):
        is_attack = False
        state_last = None

        random_generated_int = random.randint(0, 2**31 - 1)
        filename = self.extract_save_dir + "/" + str(
            random_generated_int) + ".npz"

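        # buffers for the recorded trajectory: non-image features, map images,
        # actions and rewards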
        recording_obs = []
        recording_img = []
        recording_action = []
        recording_reward = []

        np.random.seed(random_generated_int)
        tf.set_random_seed(random_generated_int)

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)
        while True:

            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
            if self.policy_flag and (not self.is_end):

                non_image_feature = self.mapping_source_to_mini_by_rule(
                    self.get_the_input())
                #print('non_image_feature.shape:', non_image_feature.shape)
                #print('non_image_feature:', non_image_feature)

                image_feature = U.get_simple_map_data(self.obs)
                #print('image_feature.shape:', image_feature.shape)
                #print('image_feature:', image_feature)

                latent_image_feature, mu, logvar = self.encode_obs(
                    image_feature)
                #print('latent_image_feature.shape:', latent_image_feature.shape)
                #print('latent_image_feature:', latent_image_feature)

                feature = np.concatenate(
                    [non_image_feature, latent_image_feature], axis=-1)
                #print('feature.shape:', feature.shape)
                #print('feature:', feature)

                #state_now = feature
                reward_last = 0
                state_now, action, v_preds = self.get_action(
                    feature, reward_last)

                # print(ProtossAction(action).name)
                self.mini_step(action)

                if state_last is not None:
                    if verbose:
                        print('state_last:', state_last, ', action_last:',
                              action_last, ', state_now:', state_now)
                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    reward = 0

                    recording_obs.append(non_image_feature)
                    recording_img.append(image_feature)
                    recording_action.append(action)
                    recording_reward.append(reward)

                    #self.local_buffer.append(state_last, action_last, state_now, reward, v_preds, v_preds_next)

                state_last = state_now
                action_last = action
                self.policy_flag = False

            if self.is_end:
                # encode the final result into the last reward:
                # 0 = not end, 1 = loss, 2 = draw, 3 = win
                recording_reward[-1] = (1 * self.result['reward'] + 2)
                if recording_reward[-1] != 0:
                    print("result is:", recording_reward[-1])

                recording_obs = np.array(recording_obs, dtype=np.uint16)
                recording_action = np.array(recording_action, dtype=np.uint8)
                recording_reward = np.array(recording_reward, dtype=np.uint8)
                recording_img = np.array(recording_img, dtype=np.float16)

                np.savez_compressed(filename,
                                    obs=recording_obs,
                                    img=recording_img,
                                    action=recording_action,
                                    reward=recording_reward)
                break
    def play_right_add(self, verbose=False):
        # Note: this is the corrected ("right") version of the game-play loop,
        # which also feeds the additional (add) and map inputs to the policy
        prev_state = None
        prev_action = None
        prev_value = None
        prev_add_state = None
        prev_map_state = None
        show_image = False

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        self.dummy_add_state = np.zeros(1)
        self.dummy_map_state = np.zeros([1, 1, 1])

        simulate_seconds = 0
        feature_dict = U.edge_state()
        previous_match = -1

        while True:

            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
            if self.policy_flag and (not self.is_end):
                # get the state
                state = self.mapping_source_to_mini_by_rule(
                    self.get_the_input_right(self.obs))

                if self.image_debug and verbose and self.step % C.time_wait(
                        self.image_wait_secs) == 0:
                    show_image = True
                else:
                    show_image = False

                if verbose:
                    print('show_image:', show_image)
                map_state = U.get_small_simple_map_data(
                    self.obs, show_image, show_image)

                if verbose:
                    print('map_state.shape:', map_state.shape)
                add_state = self.get_add_state(
                    self.get_the_input_right(self.obs))

                # get the action and value according to the state
                #print("add_state:", add_state)
                if self.ob_space_add == 0:
                    add_state = self.dummy_add_state
                    map_state = self.dummy_map_state
                action, action_probs, value = self.net.policy.get_act_action_probs(
                    state, add_state, map_state, verbose=verbose)

                if self.probe_debug and verbose and self.step % C.time_wait(
                        self.prove_save_wait_secs) == 0:
                    save_prob = True
                else:
                    save_prob = False

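                # optionally plot the action probability distribution at
                # selected simulated game times (TG/RG bar charts)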
                if self.action_prob_debug and verbose:
                    print('self.step:', self.step)
                    print(self.prob_show_wait_seconds)
                    print('simulate_seconds:', simulate_seconds)

                    use_TG = True
                    if use_TG:
                        bar_type = 'TG'
                        max_y = 0.3
                        color = 'b'
                    else:
                        bar_type = 'RG'
                        max_y = 0.5
                        color = 'r'

                    bar_name = bar_type + '_' + str(simulate_seconds)
                    if simulate_seconds in self.prob_show_wait_seconds:
                        pprint.pprint(action_probs)
                        U.show_prob_dist(action_probs,
                                         show=True,
                                         color=color,
                                         max_y=max_y,
                                         action_num=self.action_num,
                                         save=False,
                                         name=bar_name,
                                         count=0)

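                # optionally plot (and save) the action distribution whenever
                # the state matches one of the predefined "edge" states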
                if self.edge_state_debug and verbose:
                    # color and max_y are normally set in the action_prob_debug
                    # block above; fall back to defaults if that block is off
                    if not self.action_prob_debug:
                        color, max_y = 'b', 0.3
                    match_list = U.calculate_state_mapping(state, feature_dict)
                    print('state:', state)
                    print('match_list:', match_list)

                    for i, match in enumerate(match_list):
                        if match and i != previous_match:
                            print('Match:', i + 1)
                            match_name = 'ES' + '_' + str(i + 1) + '_' + str(
                                simulate_seconds)
                            U.show_prob_dist(action_probs,
                                             show=True,
                                             color=color,
                                             max_y=max_y,
                                             action_num=self.action_num,
                                             save=True,
                                             name=match_name,
                                             count=0)
                            previous_match = i

                # if this is not the first state, store the transition in the buffer
                if prev_state is not None:
                    # use the environment's step reward directly
                    reward = self.obs.reward
                    if verbose:
                        print(prev_state, prev_add_state, prev_action, state,
                              reward, prev_value, value)
                    self.local_buffer.append_more_more(
                        prev_state, prev_add_state, prev_map_state,
                        prev_action, state, reward, prev_value, value)

                self.mini_step(action)
                simulate_seconds += self.policy_wait_secs
                # the env steps to the new state

                prev_state = state
                prev_action = action
                prev_value = value
                prev_add_state = add_state
                prev_map_state = map_state

                self.policy_flag = False

            if self.is_end:
                # get the last state and the final reward
                state = self.mapping_source_to_mini_by_rule(
                    self.get_the_input_right(self.obs))
                map_state = U.get_small_simple_map_data(self.obs)
                add_state = self.get_add_state(
                    self.get_the_input_right(self.obs))

                if self.ob_space_add == 0:
                    add_state = self.dummy_add_state
                    map_state = self.dummy_map_state

                value = self.net.policy.get_values(state, add_state, map_state)
                # the value of the last state is defined somewhat differently
                value = self.get_values_right(value)

                # if this is not the first state, store the transition in the buffer
                if prev_state is not None:
                    reward = self.obs.reward
                    if verbose:
                        print(prev_state, prev_add_state, prev_action, state,
                              reward, prev_value, value)
                    self.local_buffer.append_more_more(
                        prev_state, prev_add_state, prev_map_state,
                        prev_action, state, reward, prev_value, value)
                break

        if self.rl_training:
            #print(self.local_buffer.values)
            #print(self.local_buffer.values_next)
            #print(self.local_buffer.rewards)
            self.global_buffer.add(self.local_buffer)
            print("add map bn:")
            print("add %d buffer!" % (len(self.local_buffer.rewards)))