Example #1
def get_avg_reward(train_rewards, teacher_rewards, args):
    # Process train rewards
    if len(train_rewards) == 0:
        avg_train_reward = args.reward_min
    else:
        if len(train_rewards) > args.window_size:
            avg_train_reward = np.average(train_rewards[-args.window_size:-1])
        else:
            avg_train_reward = np.average(train_rewards)
    avg_train_reward = np.clip(avg_train_reward,
                               a_min=args.reward_min,
                               a_max=args.reward_max)
    avg_train_reward = normalize(avg_train_reward,
                                 min_value=args.reward_min,
                                 max_value=args.reward_max)

    # Process teacher rewards
    if len(teacher_rewards) == 0:
        avg_teacher_reward = args.reward_min
    else:
        if len(teacher_rewards) > args.window_size:
            avg_teacher_reward = np.average(
                teacher_rewards[-args.window_size:-1])
        else:
            avg_teacher_reward = np.average(teacher_rewards)
    avg_teacher_reward = np.clip(avg_teacher_reward,
                                 a_min=args.reward_min,
                                 a_max=args.reward_max)
    avg_teacher_reward = normalize(avg_teacher_reward,
                                   min_value=args.reward_min,
                                   max_value=args.reward_max)

    return avg_train_reward, avg_teacher_reward
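Most examples on this page feed clipped values through a normalize(value, min_value, max_value) helper that is not shown here. A minimal sketch of what it presumably does, assuming plain min-max rescaling into [0, 1] (an assumption, not the repository's actual implementation):

def normalize(value, min_value, max_value):
    # Min-max rescaling into [0, 1]; assumes max_value > min_value.
    return (value - min_value) / (max_value - min_value)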
Example #2
def get_avg_reward(TRAIN_REWARD_N, EVAL_REWARD_N, args):
    if len(TRAIN_REWARD_N) == 0:
        avg_train_reward = AVG_CLIP_MIN  # NOTE: initialize to the minimum value
    else:
        if len(TRAIN_REWARD_N) > args.window_size:
            avg_train_reward = np.average(TRAIN_REWARD_N[-args.window_size:-1])
        else:
            avg_train_reward = np.average(TRAIN_REWARD_N)
    avg_train_reward = np.clip(avg_train_reward,
                               a_min=AVG_CLIP_MIN,
                               a_max=AVG_CLIP_MAX)
    avg_train_reward = normalize(avg_train_reward,
                                 min_value=AVG_CLIP_MIN,
                                 max_value=AVG_CLIP_MAX)

    if len(EVAL_REWARD_N) == 0:
        raise ValueError()
    else:
        if len(EVAL_REWARD_N) > args.window_size:
            avg_eval_reward = np.average(EVAL_REWARD_N[-args.window_size:-1])
        else:
            avg_eval_reward = np.average(EVAL_REWARD_N)
    avg_eval_reward = np.clip(avg_eval_reward,
                              a_min=AVG_CLIP_MIN,
                              a_max=AVG_CLIP_MAX)
    avg_eval_reward = normalize(avg_eval_reward,
                                min_value=AVG_CLIP_MIN,
                                max_value=AVG_CLIP_MAX)

    return avg_train_reward, avg_eval_reward
Example #3
def meta_input_process(student_n, student_obs_n, i_teacher, i_student, args):
    obs = {}
    obs["student_obs_n"] = student_obs_n  # Task observation

    # Sample random candidate actions
    obs["random_action_n"] = []
    for _ in range(15):
        random_action_n = [
            np.random.uniform(low=-1, high=1., size=2),
            np.random.uniform(low=-1, high=1., size=2)
        ]
        obs["random_action_n"].append(random_action_n)

    # Get Q-values using student critic
    obs["student_agent_q_values"] = []
    obs["teacher_agent_q_values"] = []
    for i_action in range(15):
        q_value_n = get_q_value_n(student_n, student_obs_n,
                                  obs["random_action_n"][i_action])
        q_value_n = np.clip(q_value_n,
                            a_min=Q_VALUE_CLIP_MIN,
                            a_max=Q_VALUE_CLIP_MAX)

        q_value_student = np.array([
            normalize(value=q_value_n[i_student],
                      min_value=Q_VALUE_CLIP_MIN,
                      max_value=Q_VALUE_CLIP_MAX)
        ])
        obs["student_agent_q_values"].append(q_value_student)

        q_value_teacher = np.array([
            normalize(value=q_value_n[i_teacher],
                      min_value=Q_VALUE_CLIP_MIN,
                      max_value=Q_VALUE_CLIP_MAX)
        ])
        obs["teacher_agent_q_values"].append(q_value_teacher)

    return concat_in_order_teacher(obs), concat_in_order_asker(obs)
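The Q-value handling above follows the same clip-then-normalize pattern as the reward averaging. A small self-contained illustration of that pattern (the clip bounds below are made-up placeholders, not the module's actual Q_VALUE_CLIP_* values):

import numpy as np

Q_CLIP_MIN, Q_CLIP_MAX = -20., 20.               # placeholder bounds
q_value_n = np.array([-35.2, 4.7])               # raw per-agent critic outputs
q_value_n = np.clip(q_value_n, Q_CLIP_MIN, Q_CLIP_MAX)
scaled = (q_value_n - Q_CLIP_MIN) / (Q_CLIP_MAX - Q_CLIP_MIN)
# scaled == array([0.    , 0.6175]); both values now lie in [0, 1]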
Example #4
def get_manager_obs(env_observations, ep_timesteps, args):
    if args.manager_done:
        manager_observations = []

        remaining_timesteps = normalize(value=(args.ep_max_timesteps -
                                               ep_timesteps),
                                        min_value=0.,
                                        max_value=float(args.ep_max_timesteps))
        remaining_timesteps = np.array([remaining_timesteps])

        for env_obs in env_observations:
            manager_obs = np.concatenate([env_obs, remaining_timesteps])
            manager_observations.append(manager_obs)
    else:
        manager_observations = env_observations

    return manager_observations
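For context, a short worked sketch of the remaining-time feature built above (the episode length and observation size are invented for illustration; the normalization is the same min-max rescaling assumed earlier):

import numpy as np

ep_max_timesteps = 100
ep_timesteps = 60
remaining = (ep_max_timesteps - ep_timesteps) / float(ep_max_timesteps)  # 0.4
env_obs = np.zeros(4)
manager_obs = np.concatenate([env_obs, np.array([remaining])])  # shape (5,)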
Example #5
def get_student_obs(env_obs_n, ep_timesteps, args):
    student_obs_n = []

    if args.student_done:
        remaining_timesteps = normalize(value=(args.ep_max_timesteps -
                                               ep_timesteps),
                                        min_value=0.,
                                        max_value=float(args.ep_max_timesteps))
        remaining_timesteps = np.array([remaining_timesteps])

        for env_obs in env_obs_n:
            student_obs = np.concatenate([env_obs, remaining_timesteps])
            student_obs_n.append(student_obs)

        return student_obs_n
    else:
        return env_obs_n
Example #6
def teacher_input_process(managers, teacher, manager_observations,
                          manager_actions, train_rewards, teacher_rewards,
                          i_teacher, i_student, session_advices, args):
    obs = {}
    obs["manager_observations"] = manager_observations
    obs["manager_actions"] = manager_actions
    obs["teacher_obs"] = manager_observations[i_teacher]
    obs["teacher_action"] = manager_actions[i_teacher]
    obs["student_obs"] = manager_observations[i_student]
    obs["student_action"] = manager_actions[i_student]
    obs["session_advices"] = session_advices

    # Get Q-values for manager_actions
    q_values = get_q_values(managers, manager_observations, manager_actions)
    q_values = np.clip(q_values, a_min=args.q_min, a_max=args.q_max)

    obs["q_with_teacher_critic"] = np.array([
        normalize(value=q_values[i_teacher],
                  min_value=args.q_min,
                  max_value=args.q_max)
    ])
    obs["q_with_student_critic"] = np.array([
        normalize(value=q_values[i_student],
                  min_value=args.q_min,
                  max_value=args.q_max)
    ])

    # Get Q-values for manager_actions_at
    obs["teacher_action_at"] = managers[i_teacher].select_deterministic_action(
        np.array(manager_observations[i_student]))

    q_values = get_q_values(managers, manager_observations,
                            [manager_actions[0], obs["teacher_action_at"]])
    q_values = np.clip(q_values, a_min=args.q_min, a_max=args.q_max)

    obs["q_at_with_teacher_critic"] = np.array([
        normalize(value=q_values[i_teacher],
                  min_value=args.q_min,
                  max_value=args.q_max)
    ])
    obs["q_at_with_student_critic"] = np.array([
        normalize(value=q_values[i_student],
                  min_value=args.q_min,
                  max_value=args.q_max)
    ])

    # Get avg reward
    avg_train_reward, avg_teacher_reward = get_avg_reward(
        train_rewards=train_rewards,
        teacher_rewards=teacher_rewards,
        args=args)
    obs["avg_train_reward"] = np.array([avg_train_reward])
    obs["avg_teacher_reward"] = np.array([avg_teacher_reward])

    # Get the teacher's remaining advising budget (normalized)
    obs["remain_time"] = np.array([
        normalize(value=(teacher.n_advice - session_advices),
                  min_value=0.,
                  max_value=float(teacher.n_advice))
    ])

    return concat_in_order(obs, args), obs
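The concat_in_order helper used above (and concat_in_order_teacher / concat_in_order_asker in Example #3) is not shown on this page. A plausible minimal sketch, assuming it simply flattens and concatenates selected entries of the obs dict in a fixed key order (the key list below is an illustrative subset, not the repository's actual ordering):

import numpy as np

def concat_in_order(obs, args):
    # Concatenate selected 1-D features into one flat meta-observation vector.
    keys = ["q_with_teacher_critic", "q_with_student_critic",
            "q_at_with_teacher_critic", "q_at_with_student_critic",
            "avg_train_reward", "avg_teacher_reward", "remain_time"]
    return np.concatenate([np.asarray(obs[key]).flatten() for key in keys])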
Example #7
    def normalize(self, state):
        return normalize(state, self.rms.mean, self.rms.std)
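Unlike the earlier examples, this normalize takes a mean and a standard deviation rather than min/max bounds, so it is presumably a standardization helper driven by running statistics (self.rms). A minimal sketch under that assumption (the eps guard is added here for safety and may not match the original):

import numpy as np

def normalize(state, mean, std, eps=1e-8):
    # Standardize with running mean/std; eps avoids division by zero.
    return (np.asarray(state) - mean) / (std + eps)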
Example #8
    def update_memory(self, teacher_reward, temp_managers, train_rewards,
                      teacher_rewards):
        """Update memory
        The next observation is updated by replacing student Q-values with its updated temporary policy.
        Average rewards and remaining timestep are also updated.
        The measured teacher_reward is also updated.
        """
        # 5 parallel lists: obs, new_obs, action, reward, done
        self.corrected_memory = [[] for _ in range(5)]

        i_student = 1
        for i_exp, exp in enumerate(self.tmp_memory):
            # Retrieve the stored observation dict for this experience
            obs_dict = exp[-1]

            # Update the Q-values measured using the updated student critic
            q_values = get_q_values(
                temp_managers, obs_dict["manager_observations"],
                [obs_dict["manager_actions"][0], obs_dict["student_action"]])
            q_values = np.clip(q_values,
                               a_min=self.args.q_min,
                               a_max=self.args.q_max)

            obs_dict["q_with_student_critic"] = np.array([
                normalize(value=q_values[i_student],
                          min_value=self.args.q_min,
                          max_value=self.args.q_max)
            ])

            q_values = get_q_values(temp_managers,
                                    obs_dict["manager_observations"], [
                                        obs_dict["manager_actions"][0],
                                        obs_dict["teacher_action_at"]
                                    ])
            q_values = np.clip(q_values,
                               a_min=self.args.q_min,
                               a_max=self.args.q_max)

            obs_dict["q_at_with_student_critic"] = np.array([
                normalize(value=q_values[i_student],
                          min_value=self.args.q_min,
                          max_value=self.args.q_max)
            ])

            # Update avg_reward
            # Note that avg_train_reward = R_{Phase I}
            # Note that avg_teacher_reward = R_{Phase II}
            avg_train_reward, avg_teacher_reward = get_avg_reward(
                train_rewards=train_rewards,
                teacher_rewards=teacher_rewards,
                args=self.args)
            obs_dict["avg_train_reward"] = np.array([avg_train_reward])
            obs_dict["avg_teacher_reward"] = np.array([avg_teacher_reward])

            # Update the teacher's remaining advising budget (normalized)
            obs_dict["remain_time"] = np.array([
                normalize(value=(self.n_advice -
                                 (obs_dict["session_advices"] + 1)),
                          min_value=0.,
                          max_value=float(self.n_advice))
            ])

            new_obs = concat_in_order(obs_dict, self.args)
            self.corrected_memory[0].append(exp[0])
            self.corrected_memory[1].append(new_obs)
            self.corrected_memory[2].append(exp[2])
            self.corrected_memory[3].append(teacher_reward)
            self.corrected_memory[4].append(exp[4])

        self.add_memory()
        self.clear_tmp_memory()
Example #9
def separate_label(data):
    X = normalize(data[:, :2].astype('float32'))
    Y = np.where(data[:, 2] == b'black', 0, 1)
    return X, Y
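Here normalize is called with only the feature matrix, which suggests an unbounded, data-driven scaler rather than the fixed-range variants above. One plausible reading, assuming per-column min-max scaling (an assumption; the original helper is not shown):

import numpy as np

def normalize(X):
    # Per-column min-max scaling of a 2-D float array into [0, 1].
    X_min = X.min(axis=0)
    X_max = X.max(axis=0)
    return (X - X_min) / (X_max - X_min + 1e-8)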