class DQNTester:
    def __init__(self, skip_frame, num_actions, load, rand_val=0.05):
        # constant exploration rate for evaluation runs
        rand_vals = (rand_val, rand_val, 2)
        self.action_handler = ActionHandler(rand_vals)

        # load a trained network for evaluation
        self.cnn = CNN((None, skip_frame, 84, 84), num_actions, 1)
        self.cnn.load(load)
        self.q_vals = list()

        self.skip_frame = skip_frame
        self.exp_handler = DataSet(84, 84, np.random.RandomState(), phi_length=skip_frame)
        self.state_tm1 = np.zeros((84, 84), dtype=np.uint8)

    def get_game_action(self):
        q_vals = self.cnn.get_output(
            self.exp_handler.phi(self.state_tm1).reshape(1, self.skip_frame, 84, 84))[0]
        self.q_vals.append(q_vals)
        return self.action_handler.action_vect_to_game_action(q_vals)

    def frames_processed(self, frames, action_performed, reward):
        game_action = self.action_handler.game_action_to_action_ind(action_performed)
        self.exp_handler.add_sample(self.state_tm1, game_action, reward, False)
        self.state_tm1 = frames[-1]

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)
class AsyncTargetLearner(AsyncClient):
    def __init__(self, num_actions, initial_cnn_values, cnn_partial, pipe, skip_frame=4, phi_length=4,
                 async_update_step=5, target_update_frames=40000):
        super().__init__(pipe)

        # initialize action handler, ending E-greedy is either 0.1, 0.01, 0.5 with probability 0.4, 0.3, 0.3
        end_rand = np.random.choice([0.1, 0.01, 0.5], p=[0.4, 0.3, 0.3])
        rand_vals = (1, end_rand, 4000000)  # anneal over four million frames
        self.action_handler = ActionHandler(rand_vals)

        # initialize cnn
        self.cnn = cnn_partial()
        self.cnn.set_parameters(initial_cnn_values)
        self.cnn.set_target_parameters(initial_cnn_values)

        self.frame_buffer = np.zeros((1, phi_length, 84, 84), dtype=np.float32)
        self.skip_frame = skip_frame
        self.phi_length = phi_length
        self.loss_list = list()

        self.async_update_step = async_update_step
        self.target_update_frames = target_update_frames
        self.target_update_count = 0

    def add_state_to_buffer(self, state):
        self.frame_buffer[0, 0:self.phi_length - 1] = self.frame_buffer[0, 1:self.phi_length]
        self.frame_buffer[0, self.phi_length - 1] = state

    def frame_buffer_with(self, state):
        empty_buffer = np.zeros((1, self.phi_length, 84, 84), dtype=np.float32)
        empty_buffer[0, 0:self.phi_length - 1] = self.frame_buffer[0, 1:self.phi_length]
        empty_buffer[0, self.phi_length - 1] = state
        return empty_buffer

    def check_update_target(self, total_frames_count):
        if total_frames_count >= self.target_update_count * self.target_update_frames:
            self.target_update_count += 1
            return True
        return False

    def get_action(self, frame_buffer):
        return self.cnn.get_output(frame_buffer)[0]

    def get_game_action(self, frame_buffer):
        # checks to see if we are doing random, if so returns random game action
        rand, action = self.action_handler.get_random()
        if not rand:
            action = self.get_action(frame_buffer)
            return self.action_handler.action_vect_to_game_action(action, random=False)
        return action

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)
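# A minimal, self-contained sketch of the target-network sync cadence implemented by
# check_update_target above. The loop below is hypothetical (no emulator, no network);
# it only shows that a sync fires immediately and then roughly every `target_update_frames`
# frames, regardless of how many frames each step consumes.

def _target_sync_demo(target_update_frames=40000, frames_per_step=4, total_steps=30000):
    target_update_count = 0
    sync_frames = []
    total_frames = 0
    for _ in range(total_steps):
        total_frames += frames_per_step
        # same check as AsyncTargetLearner.check_update_target
        if total_frames >= target_update_count * target_update_frames:
            target_update_count += 1
            sync_frames.append(total_frames)
    return sync_frames

if __name__ == '__main__':
    print(_target_sync_demo()[:4])  # -> [4, 40000, 80000, 120000]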
class DQNLearner(learner):
    def __init__(self, skip_frame, num_actions, load=None, random_state=np.random.RandomState()):
        super().__init__()

        rand_vals = (1, 0.1, 1000000)  # starting at 1 anneal eGreedy policy to 0.1 over 1,000,000 actions
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)

        self.minimum_replay_size = 100
        self.exp_handler = DataSet(84, 84, random_state, max_steps=1000000, phi_length=skip_frame)
        self.cnn = CNN((None, skip_frame, 84, 84), num_actions)

        self.skip_frame = skip_frame
        self.discount = .95
        self.costList = list()
        self.state_tm1 = None

        if load is not None:
            self.cnn.load(load)

    def frames_processed(self, frames, action_performed, reward):
        game_action = self.action_handler.game_action_to_action_ind(action_performed)
        if self.state_tm1 is not None:
            self.exp_handler.add_sample(self.state_tm1, game_action, reward, False)

        # generate minibatch data
        if self.exp_handler.size > self.minimum_replay_size:
            states, actions, rewards, state_tp1s, terminal = self.exp_handler.random_batch(32)
            cost = self.cnn.train(states, actions, rewards, state_tp1s, terminal)
            self.costList.append(cost)
            self.action_handler.anneal()

        self.state_tm1 = frames[-1]

    def get_action(self, processed_screens):
        return self.cnn.get_output(processed_screens)[0]

    def game_over(self):
        self.exp_handler.add_terminal()  # adds a terminal

    def get_game_action(self):
        return self.action_handler.action_vect_to_game_action(
            self.get_action(self.exp_handler.phi(self.state_tm1).reshape(1, self.skip_frame, 84, 84)))

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)

    def save(self, file):
        self.cnn.save(file)

    def get_cost_list(self):
        return self.costList
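# A hedged sketch of how DQNLearner might be driven from an emulator loop. The `emulator`
# object and its methods (get_legal_actions, get_preprocessed_frames, act, game_over) are
# hypothetical stand-ins for illustration only, not part of this repository.

def run_episode(learner, emulator, max_steps=10000):
    learner.set_legal_actions(emulator.get_legal_actions())
    total_reward = 0

    # prime state_tm1 with an initial observation so get_game_action has a valid phi
    learner.frames_processed(emulator.get_preprocessed_frames(), emulator.get_legal_actions()[0], 0)

    for _ in range(max_steps):
        action = learner.get_game_action()
        reward = emulator.act(action)                 # repeats the action for skip_frame frames
        frames = emulator.get_preprocessed_frames()   # list of 84x84 uint8 screens
        learner.frames_processed(frames, action, reward)
        total_reward += reward
        if emulator.game_over():
            learner.game_over()
            break
    return total_reward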
class evolvedLearner():
    def __init__(self, output_fn):
        rand_vals = (0.05, 0.05, 2)  # constant 5% exploration
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)
        self.last_img = None
        self.output_fn = output_fn

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)

    def frames_processed(self, frames, action_performed, reward):
        self.last_img = frames[-1] / 255  # keep the latest frame, normalized to [0, 1]

    def get_game_action(self):
        return self.action_handler.action_vect_to_game_action(self.output_fn(self.last_img))
class AsyncA3CLearner(AsyncClient):
    def __init__(self, num_actions, initial_cnn_values, cnn_partial, pipe, skip_frame=4, phi_length=4,
                 async_update_step=5):
        super().__init__(pipe)

        # A3C doesn't have an E-greedy exploration policy so we set the random values to 0
        self.action_handler = ActionHandler((0, 0, 2))

        # initialize cnn
        self.cnn = cnn_partial()
        self.cnn.set_parameters(initial_cnn_values)

        self.frame_buffer = np.zeros((1, phi_length, 84, 84), dtype=np.float32)
        self.skip_frame = skip_frame
        self.phi_length = phi_length
        self.loss_list = list()

        self.async_update_step = async_update_step

    def add_state_to_buffer(self, state):
        self.frame_buffer[0, 0:self.phi_length - 1] = self.frame_buffer[0, 1:self.phi_length]
        self.frame_buffer[0, self.phi_length - 1] = state

    def frame_buffer_with(self, state):
        empty_buffer = np.zeros((1, self.phi_length, 84, 84), dtype=np.float32)
        empty_buffer[0, 0:self.phi_length - 1] = self.frame_buffer[0, 1:self.phi_length]
        empty_buffer[0, self.phi_length - 1] = state
        return empty_buffer

    def get_action(self, frame_buffer):
        return self.cnn.get_policy_output(frame_buffer)[0]

    def get_game_action(self, frame_buffer):
        action = self.get_action(frame_buffer)
        return self.action_handler.action_vect_to_game_action(action, random=False)

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)
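# A standalone illustration (numpy only) of the rolling frame buffer used by
# add_state_to_buffer/frame_buffer_with above: the oldest of the phi_length frames is
# dropped and the newest appended, yielding the (1, phi_length, 84, 84) network input.
# The synthetic frames here are just constant arrays.

import numpy as np

phi_length = 4
frame_buffer = np.zeros((1, phi_length, 84, 84), dtype=np.float32)

for t in range(6):
    new_frame = np.full((84, 84), t, dtype=np.float32)  # stand-in for a preprocessed screen
    frame_buffer[0, 0:phi_length - 1] = frame_buffer[0, 1:phi_length]
    frame_buffer[0, phi_length - 1] = new_frame

# after 6 frames the buffer holds frames 2, 3, 4, 5 (oldest to newest)
print(frame_buffer[0, :, 0, 0])  # -> [2. 3. 4. 5.]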
class PrioritizedExperienceLearner(learner):
    def __init__(self, skip_frame, num_actions, load=None):
        super().__init__()

        rand_vals = (1, 0.1, 10000 / skip_frame)  # starting at 1 anneal eGreedy policy to 0.1 over 10,000/skip_frame steps
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)

        self.exp_handler = PrioritizedExperienceHandler(1000000 / skip_frame)
        self.train_handler = TrainHandler(32, num_actions)
        self.cnn = CNN((None, skip_frame, 86, 80), num_actions, 0.1)

        self.discount = 0.99

        if load is not None:
            self.cnn.load(load)

    def frames_processed(self, frames, action_performed, reward):
        self.exp_handler.add_experience(frames, self.action_handler.game_action_to_action_ind(action_performed), reward)
        self.train_handler.train_prioritized(self.exp_handler, 0.99, self.cnn)
        self.action_handler.anneal()

    def plot_tree(self):
        self.exp_handler.tree.plot()

    def get_action(self, game_input):
        return self.cnn.get_output(game_input)[0]

    def game_over(self):
        self.exp_handler.trim()  # trim experience replay of learner
        self.exp_handler.add_terminal()  # adds a terminal

    def get_game_action(self, game_input):
        return self.action_handler.action_vect_to_game_action(self.get_action(game_input))

    def set_legal_actions(self, legal_actions):
        self.action_handler.set_legal_actions(legal_actions)

    def save(self, file):
        self.cnn.save(file)

    def get_cost_list(self):
        return self.train_handler.costList
def test_action_vect_to_game_action(action_handler: ActionHandler):
    game_action = action_handler.action_vect_to_game_action([0, 0, 1, 0], random=False)
    assert isinstance(game_action, np.integer), "expected int got {}".format(type(game_action))
    assert game_action == 4
class NoveltyLearner():
    def __init__(self, skip_frame, num_actions):
        rand_vals = (1, 0.1, 1000000)  # starting at 1 anneal eGreedy policy to 0.1 over 1,000,000 actions
        self.action_handler = ActionHandler(ActionPolicy.eGreedy, rand_vals)

        self.minimum_replay_size = 100
        self.exp_handler = DataSet(84, 84, np.random.RandomState(), max_steps=1000000, phi_length=skip_frame)
        self.cnn = CNN((None, skip_frame, 84, 84), num_actions)

        self.skip_frame = skip_frame
        self.costList = list()
        self.state_tm1 = None

        # novelty setup
        self.frame_table = dict()
        self.new_novel_states = 0

    def frames_processed(self, frames, action_performed, reward):
        # novelty reward
        for frame in frames:
            frame[frame > 0] = 1
            frame_hash = hash(frame.data.tobytes())

            # if already in table
            if frame_hash in self.frame_table:
                novelty_reward = 0
                self.frame_table[frame_hash] += 1
            # new state
            else:
                novelty_reward = 1
                self.frame_table[frame_hash] = 1
                self.new_novel_states += 1

        # if no reward from the game reward from novelty
        if reward == 0:
            reward = novelty_reward

        game_action = self.action_handler.game_action_to_action_ind(action_performed)
        if self.state_tm1 is not None:
            self.exp_handler.add_sample(self.state_tm1, game_action, reward, False)

        # generate minibatch data
        if self.exp_handler.size > self.minimum_replay_size:
            states, actions, rewards, state_tp1s, terminal = self.exp_handler.random_batch(32)
            cost = self.cnn.train(states, actions, rewards, state_tp1s, terminal)
            self.costList.append(cost)
            self.action_handler.anneal()

        self.state_tm1 = frames[-1]

    def set_legal_actions(self, legal_actions):
        self.num_actions = len(legal_actions)
        self.action_handler.set_legal_actions(legal_actions)

    def get_action(self, processed_screens):
        return self.cnn.get_output(processed_screens)[0]

    def get_game_action(self):
        return self.action_handler.action_vect_to_game_action(
            self.get_action(self.exp_handler.phi(self.state_tm1).reshape(1, self.skip_frame, 84, 84)))

    def game_over(self):
        self.exp_handler.add_terminal()  # adds a terminal
        # print('novel states', self.new_novel_states, 'total states', len(self.frame_table))
        self.new_novel_states = 0

    def get_cost_list(self):
        return self.costList

    def save(self, file):
        self.cnn.save(file)
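# A standalone sketch of the novelty reward used in NoveltyLearner.frames_processed:
# frames are binarized, hashed, and a reward of 1 is granted the first time a given hash
# is seen. Uses numpy only; the frames below are synthetic, and the helper name
# `novelty_reward` is introduced here purely for illustration.

import numpy as np

def novelty_reward(frame, frame_table):
    frame = frame.copy()
    frame[frame > 0] = 1                      # binarize the screen
    frame_hash = hash(frame.data.tobytes())
    if frame_hash in frame_table:             # already-visited state
        frame_table[frame_hash] += 1
        return 0
    frame_table[frame_hash] = 1               # new state
    return 1

table = dict()
a = np.zeros((84, 84), dtype=np.uint8)
b = a.copy()
b[0, 0] = 255
print(novelty_reward(a, table), novelty_reward(b, table), novelty_reward(a, table))  # -> 1 1 0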