from datetime import datetime
from multiprocessing import Process, Queue, Value
import time

import numpy as np

# Project-local modules (names inferred from usage in this file).
from Config import Config
from Environment import Environment
from Experience import Experience


class ProcessAgent(Process):
    def __init__(self, id, prediction_q, training_q, episode_log_q, display=False):
        super(ProcessAgent, self).__init__()
        self.id = id
        self.prediction_q = prediction_q
        self.training_q = training_q
        self.episode_log_q = episode_log_q

        self.env = Environment(display=display)
        self.discount_factor = Config.DISCOUNT
        # one frame at a time
        self.wait_q = Queue(maxsize=1)
        self.exit_flag = Value('i', 0)

    @staticmethod
    def _accumulate_rewards(experiences, discount_factor, terminal_reward):
        # Discounted returns, computed backwards from the bootstrap value.
        reward_sum = terminal_reward
        for t in reversed(range(0, len(experiences) - 1)):
            r = np.clip(experiences[t].reward, Config.REWARD_MIN, Config.REWARD_MAX)
            reward_sum = discount_factor * reward_sum + r
            experiences[t].reward = reward_sum
        # The last experience only supplies the bootstrap value; drop it.
        return experiences[:-1]

    @staticmethod
    def convert_data(experiences):
        x_ = np.array([exp.state for exp in experiences])
        a_ = np.array([exp.action for exp in experiences])
        r_ = np.array([exp.reward for exp in experiences])
        return x_, r_, a_

    def predict(self, state):
        # put the state in the prediction q
        # print('agent%d put one prediction' % self.id)
        self.prediction_q.put((self.id, state))
        # wait for the prediction to come back
        p, v = self.wait_q.get()
        return p, v

    def select_action(self, prediction):
        return prediction

    def run_episode(self):
        self.env.reset()
        done = False
        experiences = []

        time_count = 0
        reward_sum = 0.0

        while not done:
            prediction, value = self.predict(self.env.current_state)
            action = self.select_action(prediction)
            reward, done = self.env.step(action)
            reward_sum += reward
            exp = Experience(self.env.previous_state, action, prediction, reward, done)
            experiences.append(exp)

            if done or time_count == Config.TIME_MAX:
                terminal_reward = 0 if done else value

                updated_exps = ProcessAgent._accumulate_rewards(
                    experiences, self.discount_factor, terminal_reward)
                x_, r_, a_ = self.convert_data(updated_exps)
                yield x_, r_, a_, reward_sum

                # reset the tmax count
                time_count = 0
                # keep the last experience for the next batch
                experiences = [experiences[-1]]
                reward_sum = 0.0

            time_count += 1

    def run(self):
        # randomly sleep up to 1 second. helps agents boot smoothly.
        time.sleep(np.random.rand())
        np.random.seed(np.int32(time.time() % 1 * 1000 + self.id * 10))

        print('start agent')
        while self.exit_flag.value == 0:
            total_reward = 0
            total_length = 0
            for x_, r_, a_, reward_sum in self.run_episode():
                total_reward += reward_sum
                if self.id == 0:
                    print('sum of reward is %f' % total_reward)
                total_length += len(r_) + 1  # +1 for last frame that we drop
                self.training_q.put((x_, r_, a_))
            self.episode_log_q.put((datetime.now(), total_reward, total_length))
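# ---------------------------------------------------------------------------
# Illustration only, not part of the original file: a minimal sketch of the
# other end of the prediction_q / wait_q handshake, assuming a GA3C-style
# predictor thread. The batching policy and the `model.predict_p_and_v` call
# are assumptions here; treat the exact names as placeholders.
def predictor_loop(model, agents, prediction_q, batch_size=32):
    while True:
        # Block for the first request, then greedily batch whatever is queued.
        ids, states = [], []
        agent_id, state = prediction_q.get()
        ids.append(agent_id)
        states.append(state)
        while len(ids) < batch_size and not prediction_q.empty():
            agent_id, state = prediction_q.get()
            ids.append(agent_id)
            states.append(state)

        # One forward pass for the whole batch of requests.
        p_batch, v_batch = model.predict_p_and_v(np.array(states))

        # Each agent blocks on its own wait_q until its answer arrives.
        for i, agent_id in enumerate(ids):
            agents[agent_id].wait_q.put((p_batch[i], v_batch[i]))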
        11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5
    ]
}

models = {
    name: NetworkVP('cpu:0', name, len(actions))
    for name in model_names
}
for model in models.values():
    model.load()

while not done:
    if env.current_state is None:
        env.step(0)  # NO-OP while we wait for the frame buffer to fill.
    else:
        if command_steps > 0:
            command_steps -= 1
            if command.isdigit():
                action = int(command)
            else:
                model = models[command]
                p = model.predict_p(
                    np.expand_dims(env.current_state, axis=0))[0]
                # action = np.argmax(p)
                action = np.random.choice(actions, p=p)
            _, done, _ = env.step(action)
        else:
            if command is None:
                print(
# Variant of ProcessAgent extended with a reward_modifier_q, so that rewards
# can be re-scored by an external reward model via the RL-Teacher "path" flow.
class ProcessAgent(Process):
    def __init__(self, id, prediction_q, training_q, episode_log_q, reward_modifier_q=None):
        super(ProcessAgent, self).__init__()
        self.id = id
        self.prediction_q = prediction_q
        self.training_q = training_q
        self.episode_log_q = episode_log_q
        self.reward_modifier_q = reward_modifier_q

        self.env = Environment()
        self.num_actions = self.env.get_num_actions()
        self.onehots = np.eye(self.num_actions)
        self.actions = np.arange(self.num_actions)

        self.discount_factor = Config.DISCOUNT
        # one frame at a time
        self.wait_q = Queue(maxsize=1)
        self.exit_flag = Value('i', 0)

    @staticmethod
    def _accumulate_rewards(experiences, discount_factor, terminal_reward):
        reward_sum = terminal_reward
        for t in reversed(range(0, len(experiences) - 1)):
            r = np.clip(experiences[t].reward, Config.REWARD_MIN, Config.REWARD_MAX)
            reward_sum = discount_factor * reward_sum + r
            experiences[t].reward = reward_sum
        return experiences[:-1]

    def convert_data(self, experiences):
        x_ = np.array([exp.state for exp in experiences])
        a_ = self.onehots[np.array([exp.action for exp in experiences],
                                   dtype=int)].astype(np.float32)
        r_ = np.array([exp.reward for exp in experiences])
        return x_, r_, a_

    def predict(self, state):
        # put the state in the prediction q
        self.prediction_q.put((self.id, state))
        # wait for the prediction to come back
        p, v = self.wait_q.get()
        return p, v

    def select_action(self, prediction):
        if Config.PLAY_MODE:
            action = np.argmax(prediction)
        else:
            action = np.random.choice(self.actions, p=prediction)
        return action

    def run_episode(self):
        self.env.reset()
        done = False
        experiences = []
        path = {
            "obs": [],
            "original_rewards": [],
            "actions": [],
            "human_obs": [],
        }

        time_count = 0
        while not done:
            # very first few frames
            if self.env.current_state is None:
                self.env.step(0)  # 0 == NOOP
                continue

            prediction, value = self.predict(self.env.current_state)
            action = self.select_action(prediction)
            reward, done, info = self.env.step(action)
            exp = Experience(self.env.previous_state, action, prediction,
                             reward, done, info["human_obs"])
            experiences.append(exp)

            if done or time_count == Config.TIME_MAX:
                terminal_reward = 0 if done else value

                ################################
                # START REWARD MODIFICATIONS  #
                ################################
                if self.reward_modifier_q:
                    # Translate the experiences into the "path" that RL-Teacher expects
                    if len(path["obs"]) > 0:
                        # Cut off the first item in the list because it's from an old episode
                        new_experiences = experiences[1:]
                    else:
                        new_experiences = experiences
                    path["obs"] += [e.state for e in new_experiences]
                    path["original_rewards"] += [e.reward for e in new_experiences]
                    path["actions"] += [e.action for e in new_experiences]
                    path["human_obs"] += [e.human_obs for e in new_experiences]

                    # TODO SPEED UP!! THIS IS SLOWING THINGS DOWN!
                    self.reward_modifier_q.put((self.id, done, path))
                    path["rewards"] = self.wait_q.get()

                    # Translate new rewards back into the experiences
                    for i in range(len(experiences)):
                        # Work backwards because the path is longer than the
                        # experience list, but their ends are synced
                        experiences[-(1 + i)].reward = path["rewards"][-(1 + i)]
                ################################
                # END REWARD MODIFICATIONS    #
                ################################

                reward_sum = sum([x.reward for x in experiences])
                updated_exps = ProcessAgent._accumulate_rewards(
                    experiences, self.discount_factor, terminal_reward)
                x_, r_, a_ = self.convert_data(updated_exps)
                yield x_, r_, a_, reward_sum

                # reset the tmax count
                time_count = 0
                # keep the last experience for the next batch
                experiences = [experiences[-1]]
                reward_sum = 0.0

            time_count += 1

    def run(self):
        # randomly sleep up to 1 second. helps agents boot smoothly.
        time.sleep(np.random.rand())
        np.random.seed(np.int32(time.time() % 1 * 1000 + self.id * 10))

        while self.exit_flag.value == 0:
            total_reward = 0
            total_length = 0
            for x_, r_, a_, reward_sum in self.run_episode():
                total_reward += reward_sum
                total_length += len(r_) + 1  # +1 for last frame that we drop
                self.training_q.put((x_, r_, a_))
            self.episode_log_q.put((datetime.now(), total_reward, total_length))
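# ---------------------------------------------------------------------------
# Illustration only, not part of the original file: one plausible shape, under
# assumptions, for the process that services reward_modifier_q. It receives
# (agent_id, done, path), re-scores the path with a learned reward model, and
# replies on that agent's wait_q (the queue the agent blocks on after the put
# above). `predictor.predict_reward(path)` and `predictor.path_callback(path)`
# stand in for the reward model's API and are assumptions here.
def reward_modifier_loop(predictor, agents, reward_modifier_q):
    while True:
        agent_id, done, path = reward_modifier_q.get()

        # Re-score the whole path with the learned reward function.
        path["rewards"] = predictor.predict_reward(path)

        # Hand the rewards back to the agent that submitted the path.
        agents[agent_id].wait_q.put(path["rewards"])

        if done and hasattr(predictor, "path_callback"):
            # Completed episodes can also be logged, e.g. to request
            # human comparisons for further reward-model training.
            predictor.path_callback(path)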