    def play_train(self, continues_attack=False, verbose=False):
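        """Run one self-play episode for RL training.

        At every macro step the agent builds a state from rule-mapped
        non-image features plus a latent encoding of the map, asks the
        policy for an action, executes it with mini_step, and appends the
        transition to the local buffer. At episode end the game result is
        added to the final reward and the local buffer is merged into the
        global buffer.
        """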
        is_attack = False
        state_last = None

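        # Initial setup: issue a no-op, move the camera to the base
        # position, and locate the starting gas geysers.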
        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)

        while True:

            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
            if self.policy_flag and (not self.is_end):

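                # Build the policy input: rule-mapped non-image features
                # concatenated with a latent encoding of the map features.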
                non_image_feature = self.mapping_source_to_mini_by_rule(
                    self.get_the_input())
                #print('non_image_feature.shape:', non_image_feature.shape)
                #print('non_image_feature:', non_image_feature)

                image_feature = U.get_simple_map_data(self.obs)
                #print('image_feature.shape:', image_feature.shape)
                #print('image_feature:', image_feature)

                latent_image_feature, mu, logvar = self.encode_obs(
                    image_feature)
                #print('latent_image_feature.shape:', latent_image_feature.shape)
                #print('latent_image_feature:', latent_image_feature)

                feature = np.concatenate(
                    [non_image_feature, latent_image_feature], axis=-1)
                #print('feature.shape:', feature.shape)
                #print('feature:', feature)

                #state_now = feature
                reward_last = 0
                state_now, action, v_preds = self.get_action(
                    feature, reward_last)

                # print(ProtossAction(action).name)
                self.mini_step(action)

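                # Store the previous transition; the per-step reward is 0,
                # and the terminal reward is added once the episode ends.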
                if state_last is not None:
                    if False:
                        print('state_last:', state_last, ', action_last:',
                              action_last, ', state_now:', state_now)
                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    reward = 0
                    self.local_buffer.append(state_last, action_last,
                                             state_now, reward, v_preds,
                                             v_preds_next)

                state_last = state_now
                action_last = action
                self.policy_flag = False

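            # At episode end (when RL training), add the game result to the
            # final reward and hand the local buffer to the global buffer.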
            if self.is_end:
                if self.rl_training:
                    self.local_buffer.rewards[-1] += 1 * self.result[
                        'reward']  # self.result['win']
                    #print(self.local_buffer.rewards)
                    self.global_buffer.add(self.local_buffer)
                    #print("add %d buffer!" % (len(self.local_buffer.rewards)))
                break
Example #2
    def sample(self, verbose=False, use_image=False):
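        """Record one episode of (state, action) pairs to a .npz file.

        States come from the rule-based non-image mapping; when use_image
        is True the raw map data is recorded as well. The file is named by
        the random seed used for this rollout.
        """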
        is_attack = False
        state_last = None

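        # One random integer serves both as the output filename and as the
        # numpy/TensorFlow seed for this rollout.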
        random_generated_int = random.randint(0, 2**31 - 1)
        filename = self.extract_save_dir + "/" + str(
            random_generated_int) + ".npz"
        recording_obs = []
        recording_img = []
        recording_action = []

        np.random.seed(random_generated_int)
        tf.set_random_seed(random_generated_int)

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)
        while True:

            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
            if self.policy_flag and (not self.is_end):

                state_now = self.mapping_source_to_mini_by_rule(
                    self.get_the_input())
                recording_obs.append(state_now)

                if use_image:
                    recording_img.append(U.get_simple_map_data(self.obs))

                action, v_preds = self.net.policy.get_action(state_now,
                                                             verbose=False)
                recording_action.append(action)

                self.mini_step(action)

                if state_last is not None:
                    if False:
                        print('state_last:', state_last, ', action_last:',
                              action_last, ', state_now:', state_now)
                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    reward = 0

                    self.local_buffer.append(state_last, action_last,
                                             state_now, reward, v_preds,
                                             v_preds_next)

                state_last = state_now
                action_last = action
                self.policy_flag = False

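            # Save the episode: observations as uint16, actions as uint8,
            # and map data (if recorded) as float16, in one compressed file.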
            if self.is_end:
                # note this will not consider the minerals larger than 256!
                recording_obs = np.array(recording_obs, dtype=np.uint16)
                recording_action = np.array(recording_action, dtype=np.uint8)
                if not use_image:
                    np.savez_compressed(filename,
                                        obs=recording_obs,
                                        action=recording_action)
                else:
                    recording_img = np.array(recording_img, dtype=np.float16)
                    np.savez_compressed(filename,
                                        obs=recording_obs,
                                        img=recording_img,
                                        action=recording_action)
                break
    def sample(self, verbose=False, use_image=True):
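        """Record one episode of features, map data, actions, and rewards.

        Actions come from the full policy pipeline (non-image features plus
        the latent map encoding); raw features are recorded at every step,
        the game result is written into the last reward, and everything is
        saved to a compressed .npz file named by the random seed.
        """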
        is_attack = False
        state_last = None

        random_generated_int = random.randint(0, 2**31 - 1)
        filename = self.extract_save_dir + "/" + str(
            random_generated_int) + ".npz"

        recording_obs = []
        recording_img = []
        recording_action = []
        recording_reward = []

        np.random.seed(random_generated_int)
        tf.set_random_seed(random_generated_int)

        self.safe_action(C._NO_OP, 0, [])
        self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
        self._gases = U.find_initial_gases(self.obs)
        while True:

            self.safe_action(C._MOVE_CAMERA, 0, [C.base_camera_pos])
            if self.policy_flag and (not self.is_end):

                non_image_feature = self.mapping_source_to_mini_by_rule(
                    self.get_the_input())
                #print('non_image_feature.shape:', non_image_feature.shape)
                #print('non_image_feature:', non_image_feature)

                image_feature = U.get_simple_map_data(self.obs)
                #print('image_feature.shape:', image_feature.shape)
                #print('image_feature:', image_feature)

                latent_image_feature, mu, logvar = self.encode_obs(
                    image_feature)
                #print('latent_image_feature.shape:', latent_image_feature.shape)
                #print('latent_image_feature:', latent_image_feature)

                feature = np.concatenate(
                    [non_image_feature, latent_image_feature], axis=-1)
                #print('feature.shape:', feature.shape)
                #print('feature:', feature)

                #state_now = feature
                reward_last = 0
                state_now, action, v_preds = self.get_action(
                    feature, reward_last)

                # print(ProtossAction(action).name)
                self.mini_step(action)

                if state_last is not None:
                    if False:
                        print('state_last:', state_last, ', action_last:',
                              action_last, ', state_now:', state_now)
                    v_preds_next = self.net.policy.get_values(state_now)
                    v_preds_next = self.get_values(v_preds_next)
                    reward = 0

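                    # Record this step's raw features and action; the reward
                    # stays 0 here and the game result is written into the
                    # last entry at episode end.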
                    recording_obs.append(non_image_feature)
                    recording_img.append(image_feature)
                    recording_action.append(action)
                    recording_reward.append(reward)

                    #self.local_buffer.append(state_last, action_last, state_now, reward, v_preds, v_preds_next)

                state_last = state_now
                action_last = action
                self.policy_flag = False

            if self.is_end:
                # Encode the game result into the final reward:
                # 0 (not ended), 1 (loss), 2 (draw), 3 (win).
                recording_reward[-1] = (1 * self.result['reward'] + 2)
                if recording_reward[-1] != 0:
                    print("result is:", recording_reward[-1])

                recording_obs = np.array(recording_obs, dtype=np.uint16)
                recording_action = np.array(recording_action, dtype=np.uint8)
                recording_reward = np.array(recording_reward, dtype=np.uint8)
                recording_img = np.array(recording_img, dtype=np.float16)

                np.savez_compressed(filename,
                                    obs=recording_obs,
                                    img=recording_img,
                                    action=recording_action,
                                    reward=recording_reward)
                break