Example 1
    def run_episodes(self):
        #print('module name:', __name__)
        #print('process id:', os.getpid())

        # universal learning parameters
        input_width = 3
        input_height = 4
        n_actions = 2
        discount = 0.9
        learn_rate = .005
        batch_size = 4
        rng = np.random
        replay_size = 16
        max_iter = 175
        epsilon = 0.2
        #TODO: Make this settable from GUI
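        # starting state grid: one row per flag colour (see the trailing
        # comments below); the single 1 marks the blue flag in the middle column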
        beginning_state = np.array([[[[0, 0, 0],    #pink
                                      [0, 0, 0],    #orange
                                      [0, 1, 0],    #blue
                                      [0, 0, 0]]]]) #green

        print('Starting in 5 seconds... prepare rover opposite to pink flag.')
        sleep(5)
        
        # initialize replay memory D <s, a, r, s', t> to replay size with random policy
        print('Initializing replay memory ... ')
        replay_memory = (
            np.zeros((replay_size, 1, input_height, input_width), dtype='int32'),
            np.zeros((replay_size, 1), dtype='int32'),
            np.zeros((replay_size, 1), dtype=theano.config.floatX),
            np.zeros((replay_size, 1, input_height, input_width), dtype=theano.config.floatX),
            np.zeros((replay_size, 1), dtype='int32')
        )

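        # middle column of the starting grid (one value per flag colour), passed
        # to reset_rover_to_start() as the reference start configuration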
        s1_middle_thirds = beginning_state[0][0][[0, 1, 2, 3], [1, 1, 1, 1]]
        terminal = 0

        #TODO: STEP 1: Fill replay memory using a random policy
        for step in range(replay_size):
            print(step)

            mp_lock.acquire()
            state = self.last_state.get_last_state()
            mp_lock.release()

            action = np.random.randint(n_actions)  # random action while filling the replay memory

            self.world.act(action)
            sleep(0.2)

            mp_lock.acquire()
            state_prime = self.last_state.get_last_state()
            show_cv_frame(self.last_state.get_last_image(), "state_prime")
            mp_lock.release()

            # get the reward and terminal value of new state
            reward, terminal = self.calculate_reward_and_terminal(state_prime)

            self.print_color_states(state_prime)

            print('Led to reward of: {}'.format(reward))
            sequence = [state, action, reward, state_prime, terminal]

            for entry in range(len(replay_memory)):
                replay_memory[entry][step] = sequence[entry]

            if terminal == 1:
                print("Terminal reached, reset rover to opposite red flag. Starting again in 5 seconds...")
                print("Resetting back to s1:")
                self.reset_rover_to_start(s1_middle_thirds)

        print('done')

        # build the reinforcement-learning agent
        print('Building RL agent ... ')
        agent = DeepQLearner(input_width, input_height, n_actions, discount, learn_rate, batch_size, rng)

        print('Training RL agent ... Reset rover to opposite pink flag.')
        self.reset_rover_to_start(s1_middle_thirds)
        print('Starting in 5 seconds...')
        sleep(5)

        running_loss = []

        #TODO: STEP 2: Optimize network
        for i in range(max_iter):
            mp_lock.acquire()
            state = self.last_state.get_last_state()
            mp_lock.release()

            action = agent.choose_action(state, epsilon)  # choose an action using epsilon-greedy policy

            # get the new state, reward and terminal value from world
            self.world.act(action)
            sleep(0.2)

            mp_lock.acquire()
            state_prime = self.last_state.get_last_state()
            show_cv_frame(self.last_state.get_last_image(), "state_prime")
            mp_lock.release()

            self.print_color_states(state_prime)

            reward, terminal = self.calculate_reward_and_terminal(state_prime)

            sequence = [state, action, reward, state_prime, terminal]  # concatenate into a sequence
            print "Found state: "
            print state_prime
            print ('Lead to reward of: {}').format(reward)

            # slide the replay memory window in place (np.delete and np.append
            # return copies, so the arrays are updated directly instead)
            for entry in range(len(replay_memory)):
                replay_memory[entry][:-1] = replay_memory[entry][1:]   # drop the oldest transition
                replay_memory[entry][-1] = sequence[entry]             # store the new one in the freed last slot

            batch_index = np.random.permutation(replay_size)[:batch_size]  # sample a random mini-batch from the full replay memory

            loss = agent.train(replay_memory[0][batch_index], replay_memory[1][batch_index],
                               replay_memory[2][batch_index], replay_memory[3][batch_index],
                               replay_memory[4][batch_index])

            running_loss.append(loss)

            #if i % 100 == 0:
            print("Loss at iter %i: %f" % (i, loss))

            state = state_prime
            if terminal == 1:
                print("Terminal reached, reset rover to opposite red flag. Starting again in 5 seconds...")
                print("Resetting back to s1:")
                self.reset_rover_to_start(s1_middle_thirds)

        print('... done training')

        # test to see if it has learned best route
        print("Testing whether optimal path is learned ... set rover to start.\n")
        self.reset_rover_to_start(s1_middle_thirds)

        filename = "agent_max_iter-{}-width-{}-height-{}-discount-{}-lr-{}-batch-{}.npz".format(max_iter,
                                                                                                input_width,
                                                                                                input_height,
                                                                                                discount,
                                                                                                learn_rate,
                                                                                                batch_size)
        agent.save(filename)

        #TODO: STEP 3: Test
        self.test_agent(agent, input_height, input_width)
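
Both loops in this example maintain the replay memory as a fixed-size sliding window over the most recent transitions. The helper below is only a sketch of that update (push_transition is a hypothetical name, and it assumes the same tuple-of-arrays layout used for replay_memory above, not any library API):

import numpy as np

def push_transition(replay_memory, sequence):
    # replay_memory: tuple of arrays (states, actions, rewards, next states,
    #                terminals), each with the buffer length on axis 0
    # sequence:      [state, action, reward, state_prime, terminal]
    for entry in range(len(replay_memory)):
        replay_memory[entry][:-1] = replay_memory[entry][1:]  # drop the oldest transition
        replay_memory[entry][-1] = sequence[entry]            # store the newest in the freed slot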
Example 2
agent = DeepQLearner(
    input_width,
    input_height,
    n_actions,
    discount,
    learn_rate,
    batch_size,
    rng
)

print('done')

# begin training
print('Training RL agent ... ')
state = s1  # initialize first state
running_loss = []
for i in range(max_iter):

    action = agent.choose_action(state, epsilon)  # choose an action using epsilon-greedy policy
    state_prime, reward, terminal = world.act(state, action)  # get the new state, reward and terminal value from world
    sequence = [state, action, reward, state_prime, terminal]  # concatenate into a sequence

    # slide the replay memory window in place (np.delete and np.append return
    # copies, so the arrays in D are updated directly instead)
    for entry in range(len(D)):
        D[entry][:-1] = D[entry][1:]    # drop the oldest transition
        D[entry][-1] = sequence[entry]  # store the new one in the freed last slot

    batch_index = np.random.permutation(len(D[0]))[:batch_size]  # sample a random mini-batch from the full replay memory

    loss = agent.train(D[0][batch_index], D[1][batch_index], D[2][batch_index], D[3][batch_index], D[4][batch_index])
    running_loss.append(loss)

    if i % 100 == 0:
        print("Loss at iter %i: %f" % (i, loss))
Example 3
print('done')

# build the reinforcement-learning agent
print('Building RL agent ... ')
agent = DeepQLearner(input_width, input_height, n_actions, discount,
                     learn_rate, batch_size, rng)

print('done')

# begin training
print('Training RL agent ... ')
state = s1  # initialize first state
running_loss = []
for i in range(max_iter):

    action = agent.choose_action(state, epsilon)  # choose an action using epsilon-greedy policy
    state_prime, reward, terminal = world.act(state, action)  # get the new state, reward and terminal value from world
    sequence = [state, action, reward, state_prime, terminal]  # concatenate into a sequence

    # slide the replay memory window in place (np.delete and np.append return
    # copies, so the arrays in D are updated directly instead)
    for entry in range(len(D)):
        D[entry][:-1] = D[entry][1:]    # drop the oldest transition
        D[entry][-1] = sequence[entry]  # store the new one in the freed last slot

    batch_index = np.random.permutation(len(D[0]))[:batch_size]  # sample a random mini-batch from the full replay memory
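
The DeepQLearner class itself is not shown in these excerpts. The stub below is only a hypothetical stand-in that mirrors the constructor and the choose_action / train / save calls made above, returning placeholder values, so the surrounding loops can be dry-run without the real Theano network:

import numpy as np

class DeepQLearnerStub(object):
    """Hypothetical placeholder mirroring the calls used in the examples."""

    def __init__(self, input_width, input_height, n_actions, discount,
                 learn_rate, batch_size, rng):
        self.n_actions = n_actions
        self.rng = rng

    def choose_action(self, state, epsilon):
        # placeholder: the real agent is epsilon-greedy over its Q-network
        return self.rng.randint(self.n_actions)

    def train(self, states, actions, rewards, next_states, terminals):
        # placeholder loss so the logging in the training loop still runs
        return 0.0

    def save(self, filename):
        # the real class saves network weights; here only a marker is written
        np.savez(filename, n_actions=self.n_actions)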