import numpy as np


def get_avg_reward(train_rewards, teacher_rewards, args):
    # Process train rewards
    if len(train_rewards) == 0:
        avg_train_reward = args.reward_min
    else:
        if len(train_rewards) > args.window_size:
            avg_train_reward = np.average(train_rewards[-args.window_size:-1])
        else:
            avg_train_reward = np.average(train_rewards)
    avg_train_reward = np.clip(avg_train_reward, a_min=args.reward_min, a_max=args.reward_max)
    avg_train_reward = normalize(avg_train_reward, min_value=args.reward_min, max_value=args.reward_max)

    # Process teacher rewards
    if len(teacher_rewards) == 0:
        avg_teacher_reward = args.reward_min
    else:
        if len(teacher_rewards) > args.window_size:
            avg_teacher_reward = np.average(teacher_rewards[-args.window_size:-1])
        else:
            avg_teacher_reward = np.average(teacher_rewards)
    avg_teacher_reward = np.clip(avg_teacher_reward, a_min=args.reward_min, a_max=args.reward_max)
    avg_teacher_reward = normalize(avg_teacher_reward, min_value=args.reward_min, max_value=args.reward_max)

    return avg_train_reward, avg_teacher_reward

def get_avg_reward(TRAIN_REWARD_N, EVAL_REWARD_N, args):
    if len(TRAIN_REWARD_N) == 0:
        avg_train_reward = AVG_CLIP_MIN  # NOTE: initialize to the minimum value
    else:
        if len(TRAIN_REWARD_N) > args.window_size:
            avg_train_reward = np.average(TRAIN_REWARD_N[-args.window_size:-1])
        else:
            avg_train_reward = np.average(TRAIN_REWARD_N)
    avg_train_reward = np.clip(avg_train_reward, a_min=AVG_CLIP_MIN, a_max=AVG_CLIP_MAX)
    avg_train_reward = normalize(avg_train_reward, min_value=AVG_CLIP_MIN, max_value=AVG_CLIP_MAX)

    if len(EVAL_REWARD_N) == 0:
        raise ValueError("EVAL_REWARD_N must not be empty")
    else:
        if len(EVAL_REWARD_N) > args.window_size:
            avg_eval_reward = np.average(EVAL_REWARD_N[-args.window_size:-1])
        else:
            avg_eval_reward = np.average(EVAL_REWARD_N)
    avg_eval_reward = np.clip(avg_eval_reward, a_min=AVG_CLIP_MIN, a_max=AVG_CLIP_MAX)
    avg_eval_reward = normalize(avg_eval_reward, min_value=AVG_CLIP_MIN, max_value=AVG_CLIP_MAX)

    return avg_train_reward, avg_eval_reward

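# The `normalize` helper used by the functions above is not defined in this
# excerpt. A minimal sketch is given below, assuming it performs plain min-max
# scaling of a scalar (or array) into [0, 1]; the name and keyword arguments
# mirror the calls above, but this is an assumption, not the original
# implementation.
def normalize(value, min_value, max_value):
    # Min-max scale `value` from [min_value, max_value] into [0, 1].
    return (value - min_value) / (max_value - min_value)
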
def meta_input_process(student_n, student_obs_n, i_teacher, i_student, args):
    obs = {}
    obs["student_obs_n"] = student_obs_n  # Task observation

    # Sample random actions
    obs["random_action_n"] = []
    for _ in range(15):
        random_action_n = [
            np.random.uniform(low=-1., high=1., size=2),
            np.random.uniform(low=-1., high=1., size=2)
        ]
        obs["random_action_n"].append(random_action_n)

    # Get Q-values using student critic
    obs["student_agent_q_values"] = []
    obs["teacher_agent_q_values"] = []
    for i_action in range(15):
        q_value_n = get_q_value_n(student_n, student_obs_n, obs["random_action_n"][i_action])
        q_value_n = np.clip(q_value_n, a_min=Q_VALUE_CLIP_MIN, a_max=Q_VALUE_CLIP_MAX)

        q_value_student = np.array([
            normalize(value=q_value_n[i_student], min_value=Q_VALUE_CLIP_MIN, max_value=Q_VALUE_CLIP_MAX)
        ])
        obs["student_agent_q_values"].append(q_value_student)

        q_value_teacher = np.array([
            normalize(value=q_value_n[i_teacher], min_value=Q_VALUE_CLIP_MIN, max_value=Q_VALUE_CLIP_MAX)
        ])
        obs["teacher_agent_q_values"].append(q_value_teacher)

    return concat_in_order_teacher(obs), concat_in_order_asker(obs)

def get_manager_obs(env_observations, ep_timesteps, args):
    if args.manager_done:
        manager_observations = []
        remaining_timesteps = normalize(
            value=(args.ep_max_timesteps - ep_timesteps),
            min_value=0.,
            max_value=float(args.ep_max_timesteps))
        remaining_timesteps = np.array([remaining_timesteps])

        for env_obs in env_observations:
            manager_obs = np.concatenate([env_obs, remaining_timesteps])
            manager_observations.append(manager_obs)
    else:
        manager_observations = env_observations

    return manager_observations

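# Illustrative sketch only: hypothetical values showing how the remaining-time
# feature is appended when args.manager_done is enabled. `SimpleNamespace`
# merely stands in for the parsed command-line args, and none of these values
# come from the original code. It relies on the min-max `normalize` sketched
# earlier.
def _demo_get_manager_obs():
    from types import SimpleNamespace

    args = SimpleNamespace(manager_done=True, ep_max_timesteps=100)
    env_observations = [np.zeros(4), np.ones(4)]
    manager_obs = get_manager_obs(env_observations, ep_timesteps=25, args=args)
    # The extra feature is (100 - 25) / 100 = 0.75, appended to each observation.
    assert manager_obs[0].shape == (4 + 1,)
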
def get_student_obs(env_obs_n, ep_timesteps, args):
    student_obs_n = []
    if args.student_done:
        remaining_timesteps = normalize(
            value=(args.ep_max_timesteps - ep_timesteps),
            min_value=0.,
            max_value=float(args.ep_max_timesteps))
        remaining_timesteps = np.array([remaining_timesteps])

        for env_obs in env_obs_n:
            student_obs = np.concatenate([env_obs, remaining_timesteps])
            student_obs_n.append(student_obs)
        return student_obs_n
    else:
        return env_obs_n

def teacher_input_process(managers, teacher, manager_observations, manager_actions,
                          train_rewards, teacher_rewards, i_teacher, i_student,
                          session_advices, args):
    obs = {}
    obs["manager_observations"] = manager_observations
    obs["manager_actions"] = manager_actions
    obs["teacher_obs"] = manager_observations[i_teacher]
    obs["teacher_action"] = manager_actions[i_teacher]
    obs["student_obs"] = manager_observations[i_student]
    obs["student_action"] = manager_actions[i_student]
    obs["session_advices"] = session_advices

    # Get Q-values for manager_actions
    q_values = get_q_values(managers, manager_observations, manager_actions)
    q_values = np.clip(q_values, a_min=args.q_min, a_max=args.q_max)
    obs["q_with_teacher_critic"] = np.array([
        normalize(value=q_values[i_teacher], min_value=args.q_min, max_value=args.q_max)
    ])
    obs["q_with_student_critic"] = np.array([
        normalize(value=q_values[i_student], min_value=args.q_min, max_value=args.q_max)
    ])

    # Get Q-values for manager_actions_at
    obs["teacher_action_at"] = managers[i_teacher].select_deterministic_action(
        np.array(manager_observations[i_student]))
    q_values = get_q_values(managers, manager_observations,
                            [manager_actions[0], obs["teacher_action_at"]])
    q_values = np.clip(q_values, a_min=args.q_min, a_max=args.q_max)
    obs["q_at_with_teacher_critic"] = np.array([
        normalize(value=q_values[i_teacher], min_value=args.q_min, max_value=args.q_max)
    ])
    obs["q_at_with_student_critic"] = np.array([
        normalize(value=q_values[i_student], min_value=args.q_min, max_value=args.q_max)
    ])

    # Get average rewards
    avg_train_reward, avg_teacher_reward = get_avg_reward(
        train_rewards=train_rewards, teacher_rewards=teacher_rewards, args=args)
    obs["avg_train_reward"] = np.array([avg_train_reward])
    obs["avg_teacher_reward"] = np.array([avg_teacher_reward])

    # Get teacher remaining timestep
    obs["remain_time"] = np.array([
        normalize(value=(teacher.n_advice - session_advices), min_value=0., max_value=float(teacher.n_advice))
    ])

    return concat_in_order(obs, args), obs

def normalize(self, state):
    # Standardize the state with the running mean/std statistics (self.rms).
    return normalize(state, self.rms.mean, self.rms.std)

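# The module-level `normalize` called by the method above takes
# (state, mean, std) rather than the (value, min_value, max_value) min-max
# variant sketched earlier, so it presumably lives in a different module.
# A minimal sketch under that assumption: standardization by running
# statistics, with an epsilon for numerical stability (the epsilon is an
# assumption, not taken from the original code).
def normalize(state, mean, std, eps=1e-8):
    # Whiten `state` using the provided running mean and standard deviation.
    return (state - mean) / (std + eps)
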
def update_memory(self, teacher_reward, temp_managers, train_rewards, teacher_rewards):
    """Update memory

    The next observation is updated by replacing the student Q-values with
    those measured by the updated temporary policy. The average rewards and
    the remaining timestep are also updated, along with the measured
    teacher_reward.
    """
    # 5 slots: obs, new_obs, action, reward, done
    self.corrected_memory = [[] for _ in range(5)]
    i_student = 1

    for i_exp, exp in enumerate(self.tmp_memory):
        # Update student_action
        obs_dict = exp[-1]

        # Update Q-values measured with the updated student critic
        q_values = get_q_values(
            temp_managers, obs_dict["manager_observations"],
            [obs_dict["manager_actions"][0], obs_dict["student_action"]])
        q_values = np.clip(q_values, a_min=self.args.q_min, a_max=self.args.q_max)
        obs_dict["q_with_student_critic"] = np.array([
            normalize(value=q_values[i_student], min_value=self.args.q_min, max_value=self.args.q_max)
        ])

        q_values = get_q_values(
            temp_managers, obs_dict["manager_observations"],
            [obs_dict["manager_actions"][0], obs_dict["teacher_action_at"]])
        q_values = np.clip(q_values, a_min=self.args.q_min, a_max=self.args.q_max)
        obs_dict["q_at_with_student_critic"] = np.array([
            normalize(value=q_values[i_student], min_value=self.args.q_min, max_value=self.args.q_max)
        ])

        # Update avg_reward
        # Note that avg_train_reward = R_{Phase I}
        # Note that avg_teacher_reward = R_{Phase II}
        avg_train_reward, avg_teacher_reward = get_avg_reward(
            train_rewards=train_rewards, teacher_rewards=teacher_rewards, args=self.args)
        obs_dict["avg_train_reward"] = np.array([avg_train_reward])
        obs_dict["avg_teacher_reward"] = np.array([avg_teacher_reward])

        # Update teacher remaining timestep
        obs_dict["remain_time"] = np.array([
            normalize(value=(self.n_advice - (obs_dict["session_advices"] + 1)),
                      min_value=0., max_value=float(self.n_advice))
        ])

        new_obs = concat_in_order(obs_dict, self.args)

        self.corrected_memory[0].append(exp[0])
        self.corrected_memory[1].append(new_obs)
        self.corrected_memory[2].append(exp[2])
        self.corrected_memory[3].append(teacher_reward)
        self.corrected_memory[4].append(exp[4])

    self.add_memory()
    self.clear_tmp_memory()

def separate_label(data):
    # Features: first two columns (normalized); labels: 0 for b'black', 1 otherwise.
    X = normalize(data[:, :2].astype('float32'))
    Y = np.where(data[:, 2] == b'black', 0, 1)
    return X, Y

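# Illustrative toy input only: an (N, 3) array whose first two columns are
# features and whose third column is a byte-string color label. The
# single-argument `normalize` used by separate_label is not shown in this
# excerpt and is assumed to be a feature scaler distinct from the
# three-argument min-max helper sketched earlier, so only the label logic is
# checked here. The data values below are made up.
def _demo_separate_label():
    data = np.array([[0.2, 1.5, b'black'],
                     [3.1, 0.4, b'white']], dtype=object)
    X, Y = separate_label(data)
    assert list(Y) == [0, 1]  # b'black' -> 0, anything else -> 1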