def run_loaded_agent(self):
    input_width = 3
    input_height = 4
    n_actions = 2
    max_iter = 100
    discount = 0.9
    learn_rate = .005
    batch_size = 4
    rng = np.random

    filename = "agent_max_iter-{}-width-{}-height-{}-discount-{}-lr-{}-batch-{}.npz".format(
        max_iter, input_width, input_height, discount, learn_rate, batch_size)
    agent_obj = DeepQLearner(input_width, input_height, n_actions,
                             discount, learn_rate, batch_size, rng)
    try:
        agent_obj.load(filename)
    except IOError:
        print("Failed to load file. Aborting.")
        return
    self.test_agent(agent_obj, input_height, input_width)
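# The save/load pair used by run_loaded_agent is assumed to serialize the
# network weights to a single .npz archive. A minimal sketch of what the two
# methods might do, assuming the Q-network is built with Lasagne and exposed
# as `l_out` (hypothetical names; the actual DeepQLearner internals are
# defined elsewhere):
#
#   def save(self, filename):
#       params = lasagne.layers.get_all_param_values(self.l_out)
#       np.savez(filename, *params)  # stored in order as arr_0, arr_1, ...
#
#   def load(self, filename):
#       data = np.load(filename)
#       params = [data['arr_%d' % i] for i in range(len(data.files))]
#       lasagne.layers.set_all_param_values(self.l_out, params)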
    sequence = [state, action, reward, state_prime, terminal]
    for entry in range(len(D)):
        D[entry][step] = sequence[entry]
    if terminal == 0:
        state = state_prime
    elif terminal == 1:
        state = s1
print('done')

# build the reinforcement-learning agent
print('Building RL agent ... ')
agent = DeepQLearner(input_width, input_height, n_actions,
                     discount, learn_rate, batch_size, rng)
print('done')

# begin training
print('Training RL agent ... ')
state = s1  # initialize first state
running_loss = []
for i in range(max_iter):
    action = agent.choose_action(state, epsilon)  # choose an action using epsilon-greedy policy
    state_prime, reward, terminal = world.act(state, action)  # get the new state, reward and terminal value from world
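# The fragment above relies on a simulated `world.act(state, action)` that
# returns (state_prime, reward, terminal). A hypothetical sketch of such a
# world, assuming the rover occupies column 1 of the 4x3 grid and the goal
# is the pink row at index 0 (these semantics are assumptions, not the
# project's actual simulator):
def _sketch_world_act(state, action):
    grid = state[0][0]                          # 4x3 one-hot occupancy grid
    row = int(np.argmax(grid[:, 1]))            # rover's current row
    row = max(row - 1, 0) if action == 0 else min(row + 1, grid.shape[0] - 1)
    state_prime = np.zeros_like(state)
    state_prime[0, 0, row, 1] = 1               # move the rover one row
    terminal = 1 if row == 0 else 0             # assumed goal: top (pink) row
    reward = 1.0 if terminal == 1 else 0.0
    return state_prime, reward, terminal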
def run_episodes(self):
    #print('module name:', __name__)
    #print('process id:', os.getpid())

    # universal learning parameters
    input_width = 3
    input_height = 4
    n_actions = 2
    discount = 0.9
    learn_rate = .005
    batch_size = 4
    rng = np.random
    replay_size = 16
    max_iter = 175
    epsilon = 0.2  #TODO: Make this settable from GUI

    beginning_state = np.array([[[[0, 0, 0],     # pink
                                  [0, 0, 0],     # orange
                                  [0, 1, 0],     # blue
                                  [0, 0, 0]]]])  # green

    print('Starting in 5 seconds... prepare rover opposite to pink flag.')
    sleep(5)

    # initialize replay memory D <s, a, r, s', t> to replay size with random policy
    print('Initializing replay memory ... ')
    replay_memory = (
        np.zeros((replay_size, 1, input_height, input_width), dtype='int32'),
        np.zeros((replay_size, 1), dtype='int32'),
        np.zeros((replay_size, 1), dtype=theano.config.floatX),
        np.zeros((replay_size, 1, input_height, input_width), dtype=theano.config.floatX),
        np.zeros((replay_size, 1), dtype='int32')
    )
    s1_middle_thirds = beginning_state[0][0][[0, 1, 2, 3], [1, 1, 1, 1]]
    terminal = 0

    #TODO: STEP 1: Fill replay memory with transitions from a random policy
    for step in range(replay_size):
        print(step)
        mp_lock.acquire()
        state = self.last_state.get_last_state()
        mp_lock.release()

        action = np.random.randint(2)
        self.world.act(action)
        sleep(0.2)

        mp_lock.acquire()
        state_prime = self.last_state.get_last_state()
        show_cv_frame(self.last_state.get_last_image(), "state_prime")
        mp_lock.release()

        # get the reward and terminal value of the new state
        reward, terminal = self.calculate_reward_and_terminal(state_prime)
        self.print_color_states(state_prime)
        print('Led to reward of: {}'.format(reward))

        sequence = [state, action, reward, state_prime, terminal]
        for entry in range(len(replay_memory)):
            replay_memory[entry][step] = sequence[entry]

        if terminal == 1:
            print("Terminal reached, reset rover to opposite pink flag. Starting again in 5 seconds...")
            print("Resetting back to s1:")
            self.reset_rover_to_start(s1_middle_thirds)
    print('done')

    # build the reinforcement-learning agent
    print('Building RL agent ... ')
    agent = DeepQLearner(input_width, input_height, n_actions,
                         discount, learn_rate, batch_size, rng)

    print('Training RL agent ... Reset rover to opposite pink flag.')
    self.reset_rover_to_start(s1_middle_thirds)
    print('Starting in 5 seconds...')
    sleep(5)
    running_loss = []

    #TODO: STEP 2: Optimize network
    for i in range(max_iter):
        mp_lock.acquire()
        state = self.last_state.get_last_state()
        mp_lock.release()

        action = agent.choose_action(state, epsilon)  # choose an action using epsilon-greedy policy

        # get the new state, reward and terminal value from world
        self.world.act(action)
        sleep(0.2)
        mp_lock.acquire()
        state_prime = self.last_state.get_last_state()
        show_cv_frame(self.last_state.get_last_image(), "state_prime")
        mp_lock.release()

        self.print_color_states(state_prime)
        reward, terminal = self.calculate_reward_and_terminal(state_prime)
        sequence = [state, action, reward, state_prime, terminal]  # concatenate into a sequence
        print("Found state: ")
        print(state_prime)
        print('Led to reward of: {}'.format(reward))

        # slide the replay window: np.delete/np.append return copies, so
        # shift each array in place and overwrite the last slot instead
        for entry in range(len(replay_memory)):
            replay_memory[entry][...] = np.roll(replay_memory[entry], -1, axis=0)  # drop the oldest entry
            replay_memory[entry][-1] = sequence[entry]  # append the new sequence at the end

        # sample random mini-batch indices from the whole replay memory
        batch_index = np.random.permutation(replay_size)[:batch_size]
        loss = agent.train(replay_memory[0][batch_index],
                           replay_memory[1][batch_index],
                           replay_memory[2][batch_index],
                           replay_memory[3][batch_index],
                           replay_memory[4][batch_index])
        running_loss.append(loss)
        #if i % 100 == 0:
        print("Loss at iter %i: %f" % (i, loss))

        state = state_prime
        if terminal == 1:
            print("Terminal reached, reset rover to opposite pink flag. Starting again in 5 seconds...")
            print("Resetting back to s1:")
            self.reset_rover_to_start(s1_middle_thirds)
    print('... done training')

    # test to see if it has learned the best route
    print("Testing whether optimal path is learned ... set rover to start.\n")
    self.reset_rover_to_start(s1_middle_thirds)
    filename = "agent_max_iter-{}-width-{}-height-{}-discount-{}-lr-{}-batch-{}.npz".format(
        max_iter, input_width, input_height, discount, learn_rate, batch_size)
    agent.save(filename)

    #TODO: STEP 3: Test
    self.test_agent(agent, input_height, input_width)
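# test_agent is defined elsewhere; a minimal sketch of the greedy evaluation
# it is assumed to perform (hypothetical): run the policy with epsilon = 0,
# i.e. always pick the action with the highest predicted Q-value, until a
# terminal state or a step cap is hit.
#
#   def test_agent(self, agent, input_height, input_width, max_steps=20):
#       for _ in range(max_steps):
#           mp_lock.acquire()
#           state = self.last_state.get_last_state()
#           mp_lock.release()
#           action = agent.choose_action(state, 0)  # greedy, no exploration
#           self.world.act(action)
#           sleep(0.2)
#           reward, terminal = self.calculate_reward_and_terminal(
#               self.last_state.get_last_state())
#           if terminal == 1:
#               print("Goal reached during greedy test run.")
#               break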
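# calculate_reward_and_terminal is also defined elsewhere; a hypothetical
# sketch of the reward scheme it is assumed to implement, given that the
# 4x3 state encodes one flag colour per row (pink, orange, blue, green)
# and each run starts opposite the pink flag:
def _sketch_reward_and_terminal(state_prime):
    grid = state_prime[0][0]   # rows 0..3: pink, orange, blue, green
    if grid[0].any():          # pink flag row occupied: goal reached
        return 1.0, 1          # goal reward, terminal = 1
    return -0.05, 0            # small step penalty, terminal = 0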