def test_memory(self):
    memory = ActionReplayMemory(1000000, 4)
    # index = 0
    # while(index < 100000):
    #     axr = np.random.randint(0,100,(84,84,4))
    #     memory.append(axr,4,5)
    #     sys.stdout.write('\r{}/{}'.format(index,1000000))
    #     sys.stdout.flush()
    #     index += 1
    print(memory.size())
def test_detail(self):
    # Cross-check the new replay memory against the old implementation
    # by sampling both with identical indexes.
    memory = ActionReplayMemory(250, 4)  # test memory
    memory_old = ActionReplayMemoryOld(250, 4)
    index = 0
    h_prep = HistoryPreprocessor(4)
    np_prep = NumpyPreprocessor()
    preprocessors = PreprocessorSequence([h_prep, np_prep])
    for x in range(0, 1000):
        axr = np.random.randint(0, 100, (84, 84))
        prep_state = preprocessors.process_state_for_memory(axr)
        memory.append(prep_state, 4, 5)
        memory_old.append(prep_state, 4, 5)
    for t in range(0, 10):
        batch_size = 32
        indexes = np.random.randint(0, memory._filled_size, size=batch_size).tolist()
        curr_arr, next_arr, reward_arr, action_arr, terminal_arr = memory.sample(
            batch_size, indexes)
        curr_arr2, next_arr2, reward_arr2, action_arr2, terminal_arr2 = memory_old.sample(
            batch_size, indexes)
        for i, terminal in enumerate(terminal_arr):
            empty_arr = np.zeros((84, 84))
            for d in range(0, 4):
                # no sampled frame should be all-zero padding, and both
                # memories should return identical frames
                self.assertTrue(not np.all(curr_arr[i][:, :, d] == empty_arr))
                self.assertTrue(
                    np.all(curr_arr[i][:, :, d] == curr_arr2[i][:, :, d]))
            if indexes[i] >= 4:
                # the history channels should match the frames stored just
                # before the sampled index
                self.assertTrue(
                    np.all(curr_arr[i][:, :, 1] == memory.survey(indexes[i] - 1)))
                self.assertTrue(
                    np.all(curr_arr[i][:, :, 2] == memory.survey(indexes[i] - 2)))
                self.assertTrue(
                    np.all(curr_arr[i][:, :, 3] == memory.survey(indexes[i] - 3)))
                self.assertTrue(
                    np.all(curr_arr[i][:, :, 0] == curr_arr2[i][:, :, 0]))
def test_seq(self):
    # Sampled states should never contain all-zero (padding) frames.
    memory = ActionReplayMemory(250, 4)  # test memory
    index = 0
    for x in range(0, 1000):
        axr = np.random.randint(0, 100, (84, 84, 4))
        memory.append(axr, 4, 5)
    for _ in range(0, 10):
        curr_arr, next_arr, reward_arr, action_arr, terminal_arr = memory.sample(10)
        for i, terminal in enumerate(terminal_arr):
            empty_arr = np.zeros((84, 84))
            for d in range(0, 4):
                self.assertTrue(not np.all(curr_arr[i][:, :, d] == empty_arr))
def test_memory(self):
    # Append transitions with an episode end every 50 steps, then check
    # that sampled batches are internally consistent.
    memory = ActionReplayMemory(250, 4)  # test memory
    index = 0
    while index < 1000:
        axr = np.random.randint(0, 100, (84, 84, 4))
        memory.append(axr, 4, 5)
        if (index + 1) % 50 == 0:
            axr = np.random.randint(0, 100, (84, 84, 4))
            memory.end_episode(axr, True)
            index += 1  # the terminal frame appended by end_episode takes its own slot
        index += 1
    for _ in range(0, 10):  # some sampling tests
        curr_arr, next_arr, reward_arr, action_arr, terminal_arr = memory.sample(10)
        for i, terminal in enumerate(terminal_arr):
            # each current frame should reappear one slot deeper in the next state,
            # and every sampled reward/action should equal the constants appended above
            self.assertTrue(
                np.all(curr_arr[i][:, :, 0] == next_arr[i][:, :, 1]))
            self.assertTrue(np.sum(reward_arr - 5) == 0)
            self.assertTrue(np.sum(action_arr - 4) == 0)
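# The skeleton below is not part of the original code. It is a minimal sketch of the
# replay-memory interface the tests above appear to rely on, reconstructed purely from
# how they call it; the docstrings, comments, and raise statements are assumptions
# added for illustration only.
class ActionReplayMemoryInterface(object):
    """Assumed interface exercised by the tests above."""

    def __init__(self, max_size, history_length):
        # max_size: buffer capacity; history_length: frames stacked per state
        raise NotImplementedError

    def append(self, state, action, reward):
        # store one (state, action, reward) step
        raise NotImplementedError

    def end_episode(self, final_state, is_terminal):
        # store the terminal frame and mark the episode boundary
        raise NotImplementedError

    def sample(self, batch_size, indexes=None):
        # return (curr_states, next_states, rewards, actions, terminals);
        # if indexes is given, sample those positions instead of uniformly
        raise NotImplementedError

    def survey(self, index):
        # return the single raw frame stored at `index` (used by test_detail)
        raise NotImplementedError

    def size(self):
        # number of samples currently stored
        raise NotImplementedError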
def main(): #env = gym.make("Enduro-v0") #env = gym.make("SpaceInvaders-v0") #env = gym.make("Breakout-v0") model_name = "result-q4" if (len(sys.argv) >= 2): model_name = sys.argv[1] if (len(sys.argv) >= 3): env = gym.make(sys.argv[2]) else: #env = gym.make("Enduro-v0") env = gym.make("SpaceInvaders-v0") #env = gym.make("Breakout-v0") #no skip frames env.frameskip = 1 input_shape = (84, 84) batch_size = 32 num_actions = env.action_space.n memory_size = 1000000 memory_burn_in_num = 50000 start_epsilon = 1 end_epsilon = 0.01 decay_steps = 1000000 target_update_freq = 10000 train_freq = 4 #How often you train the network history_size = 4 history_prep = HistoryPreprocessor(history_size) atari_prep = AtariPreprocessor(input_shape, 0, 999) numpy_prep = NumpyPreprocessor() preprocessors = PreprocessorSequence( [atari_prep, history_prep, numpy_prep]) #from left to right policy = LinearDecayGreedyEpsilonPolicy(start_epsilon, end_epsilon, decay_steps) linear_model = create_model(history_size, input_shape, num_actions, model_name) linear_model.summary() optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0) loss_func = huber_loss #linear_model.compile(optimizer, loss_func) random_policy = UniformRandomPolicy(num_actions) #memory = ActionReplayMemory(1000000,4) memory = ActionReplayMemory(memory_size, 4) memory_burn_in(env, memory, preprocessors, memory_burn_in_num, random_policy) #print(reward_arr) #print(curr_state_arr) agent = DoubleQNAgent(linear_model, preprocessors, memory, policy, 0.99, target_update_freq, None, train_freq, batch_size) agent.compile(optimizer, loss_func) agent.save_models() agent.fit(env, 1000000, 100000)