Code example #1
    def __init__(self, config):

        super(ForagingEnv, self).__init__()

        # params

        self.config = config

        self.max_food = 7
        self.food_reward = 10
        self.sight_reward = 0.1

        # init
        self.done = False
        self.total_success = 0
        self.current_step = 0
        self.duration = 0
        self.start_time = None
        self.exp_manager = None

        # Define action and sensors space
        self.action_space = spaces.Box(low=0,
                                       high=1,
                                       shape=(4, ),
                                       dtype=np.float32)
        # why high and low?
        self.observation_space = spaces.Box(low=0,
                                            high=1,
                                            shape=(5, ),
                                            dtype=np.float32)

        self.action_selection = ActionSelection(self.config)

        self.robot = False
        while not self.robot:
            if self.config.sim_hard == 'sim':
                self.robot = robobo.SimulationRobobo().connect(
                    address=self.config.robot_ip, port=self.config.robot_port)
            else:
                self.robot = robobo.HardwareRobobo(camera=True).connect(
                    address=self.config.robot_ip_hard)

            time.sleep(1)
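
The constructor above only declares the Gym action/observation spaces and opens the robot connection. Below is a minimal sketch of what those Box definitions imply, assuming only the standard gym and numpy packages; it is not part of the original ForagingEnv code.

import numpy as np
from gym import spaces

# Sketch only: mirrors the spaces declared in ForagingEnv.__init__ above.
action_space = spaces.Box(low=0, high=1, shape=(4,), dtype=np.float32)       # 4 continuous action values in [0, 1]
observation_space = spaces.Box(low=0, high=1, shape=(5,), dtype=np.float32)  # 5 observation values in [0, 1]

action = action_space.sample()
print(action, action_space.contains(action))

obs = np.clip(np.random.randn(5), 0, 1).astype(np.float32)
print(obs, observation_space.contains(obs))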
Code example #2
def main():

    global crashed

    EP_COUNT = 100
    STEP_COUNT = 50
    STEP_SIZE_MS = 500
    CRASH_SENSOR_BOUNDARY = -0.43  # negative for simulation, positive for RW. Not used right now
    CRASH_POSITION_BOUNDARY = 0.02

    A = [(20, 20), (0, 20), (20, 0)]  # unused candidates: (10, 5), (5, 10), (-25, -25)
    epsilon = 0.1
    epsilon_decaying = 0.5
    gamma = 0.1
    recency_factor = 0.01  # higher parameter setting -> forget older information faster
    # proximity_factor = 1  # how heavily to penalize proximity to obstacle

    ACTION_NAMES = ['forward', 'sharp left', 'sharp right']  # unused: 'left', 'right', 'backward'
    SENS_NAMES = ["IR" + str(i + 1) for i in range(8)]

    # Initialize the robot -> SET THESE PARAMETERS!
    hardware = False
    # nn_from_file = True if input('Would you like to use a pre-trained neural network? (y/n') == 'y' else False
    nn_from_file = True
    learning = False  # Disable this if you want to run a pre-trained network
    if nn_from_file is True:
        print('loading network, disabling learning...')
        learning = False

    if hardware:
        rob = robobo.HardwareRobobo(camera=True).connect(address="192.168.1.7")
    else:
        rob = robobo.SimulationRobobo().connect(address='172.20.10.3', port=19997)
        rob.stop_world()
        time.sleep(0.1)

    if not learning:
        epsilon = 0
        epsilon_decaying = 0

    # Initialize the data structure and neural network
    eps = []
    hidden_layers = [16, 12]
    model = init_nn(input_dims=len(SENS_NAMES), output_dims=len(A),
                    hidden_layers=hidden_layers, from_file=nn_from_file)

    for episode in range(EP_COUNT):

        data = EpisodeData(ACTION_NAMES, sens_names=SENS_NAMES)

        signal.signal(signal.SIGINT, terminate_program)
        start_simulation(rob)

        ### INITIALIZATION ###
        print('\n--- episode {} ---'.format(episode + 1))

        S = np.log(np.array(rob.read_irs()))
        S[np.isinf(S)] = 0
        last_position = np.array([0,0,0])

        ########## Q-LEARNING LOOP ##########

        crashed = False

        for i in range(STEP_COUNT):
            start_time = time.time()

            print('\n--- step {} ---'.format(i+1))

            # pos = rob.position()

            ### ACTION SELECTION & EXECUTION ###
            Q_s = model.predict(np.expand_dims(S, 0))[0]

            # request wheel speed parameters for max action
            action, wheels = e_greedy_action(Q_s, A, epsilon)# epsilon_decaying ** (1 + (0.1 * episode)))

            # move the robot
            rob.move(wheels['left'], wheels['right'], STEP_SIZE_MS)

            if learning:
                time.sleep(STEP_SIZE_MS/1000)

            ### OBSERVING NEXT STATE ###
            S_prime = np.log(np.array(rob.read_irs()))
            S_prime[np.isinf(S_prime)] = 0

            print("ROB IRs: {}".format(S_prime / 10))
            current_position = np.array(rob.position())
            print("robobo is at {}".format(current_position))

            # observe the reward
            s_trans = wheels['left'] + wheels['right']  # translational speed of the robot
            s_rot = abs(wheels['left'] - wheels['right'])  # rotational speed of the robot

            if hardware:
                crashed = False
                raise NotImplementedError("Haven't implemented this, I suggest using a threshold for the sensor (see "
                                          "code below this statement)")
            else:
                dist = np.linalg.norm(last_position - current_position)
                crashed = min(S_prime[3:] / 10) < CRASH_SENSOR_BOUNDARY or dist < CRASH_POSITION_BOUNDARY

            if not crashed:
                # reward = 1 + min(S) * proximity_factor  # - (wheels == A[1])
                # see Eiben et al. for this formula
                reward = s_trans * (1 - 0.9 * (s_rot / 20)) * (1 - (min(S_prime[3:]) / -0.65))
            else:
                reward = -400

            # Retrieve Q values from neural network
            Q_prime = model.predict(np.expand_dims(S_prime, 0))[0]

            ### LEARNING ###

            Q_target = reward + (gamma * np.max(Q_prime))  # bootstrap on the best next-state Q-value

            Q_targets = np.copy(Q_s)
            Q_targets[action] = Q_target

            ### SAVE DATA ###

            # pos = np.array([1,2,3])
            data.update(i, S, Q_s, Q_targets, reward)  # pos removed

            ### TERMINATION CONDITION ###

            # if S == S_prime and not S.sum() == 0:  # np.isinf(S).any() is False:
            #     print('Termination condition reached')
            #     break

            S = np.copy(S_prime)
            last_position = current_position

            print("crashed: ", crashed)
            print("chosen action:", ACTION_NAMES[action])
            print('reward: ', reward)
            print("Q_s (NN output): ", Q_s)
            print("Updated Q-values: " + str(Q_targets))

            elapsed_time = time.time() - start_time

            if crashed:
                break

        # terminate the episode data and store it
        data.terminate()
        eps.append(data)

        if learning:

            print("\n----- Learning ----\n")
            X = pd.concat([ep_data.sens for ep_data in eps])
            y = pd.concat([ep_data.Q_targets for ep_data in eps])

            # # calculate sample weights
            # ep_lengths = [len(ep_data.sens) for ep_data in eps[::-1]]
            # sample_weights = []
            #
            # for i, ep_length in enumerate(ep_lengths):
            #     sample_weights = sample_weights + ([(1 - recency_factor) ** i] * ep_length)

            # perform learning over the episode
            model.fit(X, y, epochs=100) #sample_weight=np.array(sample_weights))

        # # perform an evaluation of the episode (probably not necessary till later)
        # model.evaluate(data)

        # time.sleep(0.1)

        # pause the simulation and read the collected food
        # rob.pause_simulation()

        rob.sleep(1)

        if crashed or episode == EP_COUNT - 1:  # range(EP_COUNT) ends at EP_COUNT - 1
            save_nn(model)
            print('Robot crashed, resetting simulation...')
            data.terminate(crashed)
            # could implement something here to save the experience if resetting the simulation!

        if crashed:
            rob.stop_world()

    rob.stop_world()
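
main() above calls an e_greedy_action helper that is not shown in this snippet; judging from the call site it returns an action index together with a wheel-speed dict. The following is a hypothetical sketch consistent with that usage, not the project's actual implementation.

import random
import numpy as np

def e_greedy_action(Q_s, A, epsilon):
    """Hypothetical: pick an action index epsilon-greedily and map it to wheel speeds from A."""
    if random.random() < epsilon:
        action = random.randrange(len(A))   # explore
    else:
        action = int(np.argmax(Q_s))        # exploit the highest predicted Q-value
    left, right = A[action]
    return action, {'left': left, 'right': right}

# example call with the three actions A = [(20, 20), (0, 20), (20, 0)] used above
print(e_greedy_action(np.array([0.2, 0.7, 0.1]), [(20, 20), (0, 20), (20, 0)], epsilon=0.1))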
Code example #3
def main():
    signal.signal(signal.SIGINT, terminate_program)

    rob = robobo.SimulationRobobo().connect(address='172.17.0.1', port=19997) if use_simulation \
        else robobo.HardwareRobobo(camera=True).connect(address="10.15.3.48")

    rob.set_phone_tilt(45 if use_simulation else 109, 100)

    state_table = {}
    q_table_file = './src/state_table.json'
    if os.path.exists(q_table_file):
        with open(q_table_file) as g:
            state_table = json.load(g)

    def get_sensor_info(direction):
        a = np.log(np.array(rob.read_irs())) / 10
        all_sensor_info = np.array([0 if x == inf else 1 + (-x / 2) - 0.2 for x in a]) if use_simulation \
            else np.array(np.log(rob.read_irs())) / 10
        all_sensor_info[all_sensor_info == inf] = 0
        all_sensor_info[all_sensor_info == -inf] = 0
        # [0, 1, 2, 3, 4, 5, 6, 7]
        if direction == 'front':
            return all_sensor_info[5]
        elif direction == 'back':
            return all_sensor_info[1]
        elif direction == 'front_left':
            return all_sensor_info[6]
        elif direction == 'front_left_left':
            return all_sensor_info[7]
        elif direction == 'front_right':
            return all_sensor_info[4]
        elif direction == 'front_right_right':
            return all_sensor_info[3]
        elif direction == 'back_left':
            return all_sensor_info[0]
        elif direction == 'back_right':
            return all_sensor_info[2]
        elif direction == 'all':
            print(all_sensor_info[3:])
            return all_sensor_info
        elif direction == 'front_3':
            return [all_sensor_info[3], all_sensor_info[5], all_sensor_info[7]]
        else:
            raise Exception('Invalid direction')

    # safe, almost safe, not safe. combine with previous state of safe almost safe and not safe.
    # safe to almost safe is good, almost safe to safe is okay, safe to safe is neutral
    # s to a to r to s'.
    # Small steps for going left or right (left or right are only rotating and straight is going forward).
    # controller is the q values: the boundary for every sensor.

    def move_left():
        rob.move(-speed, speed, dist)

    def move_right():
        rob.move(speed, -speed, dist)

    def go_straight():
        rob.move(speed, speed, dist)

    def move_back():
        rob.move(-speed, -speed, dist)

    boundary_sensor = [0.6, 0.8] if not use_simulation else [0.5, 0.95]
    boundaries_color = [0.1, 0.7] if not use_simulation else [0.05, 0.85]

    # A static collision-avoidance policy
    def static_policy(color_info):
        max_c = np.max(color_info)
        if max_c == color_info[0]:
            return 1
        elif max_c == color_info[1]:
            return 0
        elif max_c == color_info[2]:
            return 2
        return 0

    def epsilon_policy(s, epsilon):
        s = str(s)
        # epsilon greedy
        """
        ACTIONS ARE DEFINED AS FOLLOWS:
          NUM: ACTION
          ------------
          0: STRAIGHT
          1: LEFT
          2: RIGHT
          ------------
        """
        e = 0 if run_test else epsilon
        if e > random.random():
            return random.choice([0, 1, 2])
        else:
            return np.argmax(state_table[s])

    def take_action(action):
        if action == 1:
            move_left()
        elif action == 2:
            move_right()
        elif action == 0:
            go_straight()
        # elif action == 'back':
        #     move_back()

    def get_color_info():
        image = rob.get_image_front()

        # Mask function
        def get_red_pixels(img):
            hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
            lower_range = np.array([0, 50, 20])
            upper_range = np.array([5, 255, 255])
            mask = cv2.inRange(hsv, lower_range, upper_range)
            # print(get_green_pixels(image))
            cv2.imwrite('a.png', mask)
            # fraction of pixels that fall inside the red mask
            return np.count_nonzero(mask == 255) / mask.size
            # count = 0
            # pix_count = 0
            # b = 64
            # for i in range(len(img)):
            #     for j in range(len(img[i])):
            #         pixel = img[i][j]
            #         pix_count += 1
            #         if (pixel[0] > b or pixel[2] > b) and pixel[1] < b * 2 \
            #                 or (pixel[0] > b*2 and pixel[1] > b*2 and pixel[2] > b*2):
            #             # img[i][j] = [0, 0, 0]
            #             count += 1
            # return 1 - (count / pix_count)

        left, middle_l, middle_r, right = np.hsplit(image, 4)
        middle = np.concatenate((middle_l, middle_r), axis=1)
        return get_red_pixels(left), get_red_pixels(middle), get_red_pixels(
            right)

    def get_reward(previous_state, new_state, previous_sensor, new_sensor,
                   prev_action, action, prev_val, new_val):

        # Max no. of green in img, before and after
        # 0: No green pixels in img;    1: All img consists of green pixels

        prev_left, prev_mid, prev_right = prev_val  # get_color_info() returns (left, middle, right)
        sum_prev_val = sum(prev_val)
        new_left, new_mid, new_right = new_val
        sum_new_val = sum(new_val)
        max_new_sensor = np.max(new_sensor)
        max_prev_sensor = np.max(previous_sensor)
        max_c_prev = np.max(previous_state[3:])
        max_c_new = np.max(new_state[3:])

        # Encourages going towards prey
        if max_c_prev == 0 and max_c_new == 1:
            return 10 if action == 0 else 2  # This was originally 0 when left/right

        # Massive payoff if we get super close to prey
        if max_c_prev == 1 and max_c_new == 2:
            return 30
        if max_c_prev == 2 and max_c_new == 2 and max_prev_sensor == 1:
            return 20

        # Nothing happens if prey gets away
        if max_c_prev == 2 and max_c_new == 1:
            return -2  # This was originally 0

        # Give good reward if we see more red than before
        if sum_prev_val < sum_new_val:
            # return 10 if action == 0 else 0
            return 8 if action == 0 else 1

        # If sensors detect enemy, then give good payoff.
        # If sensors detect wall, give bad payoff to steer clear
        if max_new_sensor > max_prev_sensor:
            return 30 if max_c_new >= 1 else -5

        # Give good payoff to encourage exploring (going straight)
        # Minor bad payoff for turning around, but not bad enough to discourage it
        # return 1 if action == 0 else -1
        return 0 if action == 0 else -2

    # Returns list of values with discretized sensor values and color values.
    def make_discrete(values_s, boundary_s, values_c, boundaries_c):
        discrete_list_s = []
        discrete_list_c = []

        for x in values_s:
            if boundary_s[0] > x:
                discrete_list_s.append(0)
            elif boundary_s[1] > x > boundary_s[0]:
                discrete_list_s.append(1)
            else:
                discrete_list_s.append(2)
        for y in values_c:
            if y < boundaries_c[0]:
                discrete_list_c.append(0)
            elif boundaries_c[0] < y < boundaries_c[1]:
                discrete_list_c.append(1)
            else:
                discrete_list_c.append(2)
        print('real c_values: ', values_c)
        return discrete_list_s + discrete_list_c

    """
  REINFORCEMENT LEARNING PROCESS
  INPUT:  alpha    : learning rate
          gamma    : discount factor
          epsilon  : epsilon value for e-greedy
          episodes : no. of episodes
          act_lim  : no. of actions robot takes before ending episode
          qL       : True if you use Q-Learning
  """
    stat_fitness = list()
    stat_rewards = [0]

    def normalize(reward, old_min, old_max, new_min=-1, new_max=1):
        return ((reward - old_min) /
                (old_max - old_min)) * (new_max - new_min) + new_min

    # def run_static(lim, no_blocks=0):
    #     for i in range(lim):
    #         if use_simulation:
    #             rob.play_simulation()
    #
    #         a, b, c = get_color_info()
    #         current_color_info = a, b, c
    #         current_sensor_info = get_sensor_info('front_3')
    #
    #         current_state = make_discrete(get_sensor_info('front_3'), boundary_sensor, current_color_info,
    #                                       boundaries_color)
    #
    #         if str(current_state) not in state_table.keys():
    #             state_table[str(current_state)] = [0 for _ in range(3)]
    #
    #         a, b, c = get_color_info()
    #         new_color_info = a, b, c
    #         # print(a, b, c, new_color_info)
    #
    #         action = static_policy(new_color_info)
    #
    #         take_action(action)
    #
    #         new_state = make_discrete(get_sensor_info('front_3'), boundary_sensor, new_color_info,
    #                                   boundaries_color)
    #         # TODO: make sure that current color info gets initialized the first time.
    #         r = get_reward(current_state, new_state, action, current_color_info, new_color_info, no_blocks)
    #         if r == 20:
    #             no_blocks += 1
    #
    #         norm_r = normalize(r, -30, 20)
    #
    #         if i != 0:
    #             stat_fitness.append(stat_fitness[-1] + (no_blocks / i))
    #         else:
    #             stat_fitness.append(float(0))
    #         print(fitness)
    #         if stat_rewards:
    #             stat_rewards.append(stat_rewards[-1] + norm_r)
    #         else:
    #             rewards.append(norm_r)
    #
    #         current_state = new_state
    #         current_color_info = new_color_info

    def rl(alpha,
           gamma,
           epsilon,
           episodes,
           act_lim,
           param_tuples,
           qL=False,
           no_blocks=0):

        fitness = list()
        rewards = [0]

        for i in range(episodes):
            print('Episode ' + str(i))
            terminate = False

            if use_simulation:
                rob.play_simulation()
                prey_robot = robobo.SimulationRoboboPrey().connect(
                    address='172.17.0.1', port=19989)
            else:
                prey_robot = robobo.HardwareRobobo(camera=True).connect(
                    address="10.15.3.48")  # Connect to different ip?
            prey_controller = prey.Prey(robot=prey_robot, level=2)
            prey_controller.start()  # not sure yet if this is needed for both simulation and hardware

            current_color_space = get_color_info()
            current_sensor_info = get_sensor_info('front_3')
            current_state = make_discrete(current_sensor_info, boundary_sensor,
                                          current_color_space,
                                          boundaries_color)

            if str(current_state) not in state_table.keys():
                state_table[str(current_state)] = [0 for _ in range(3)]

            action = epsilon_policy(current_state, epsilon)
            # current_collected_food = rob.collected_food() if use_simulation else 0
            # initialise state if it doesn't exist, else retrieve the current q-value
            x = 0
            while not terminate:

                take_action(action)
                # new_collected_food = rob.collected_food() if use_simulation else 0

                # Whole img extracted to get reward value
                # left, mid, right extracted to save state space accordingly

                new_color_space = get_color_info()
                new_sensor_info = get_sensor_info('front_3')
                new_state = make_discrete(new_sensor_info, boundary_sensor,
                                          new_color_space, boundaries_color)

                if str(new_state) not in state_table.keys():
                    state_table[str(new_state)] = [0 for _ in range(3)]

                new_action = epsilon_policy(new_state, epsilon)

                # Retrieve the max action if we use Q-Learning
                max_action = np.argmax(
                    state_table[str(new_state)]) if qL else new_action

                # Get reward
                r = get_reward(current_state, new_state, current_sensor_info,
                               new_sensor_info, action, new_action,
                               current_color_space, new_color_space)
                print("State and obtained Reward: ", new_state, r)

                # norm_r = normalize(r, -30, 20)
                #
                # if i != 0:
                #     fitness.append(no_blocks / i)
                # else:
                #     fitness.append(float(0))
                # # print(fitness)
                # if rewards:
                #     rewards.append(rewards[-1] + norm_r)
                # else:
                #     rewards.append(norm_r)

                # Update rule
                if not run_test:
                    # print('update')
                    state_table[str(current_state)][action] += \
                        alpha * (r + (gamma *
                                      np.array(
                                          state_table[str(new_state)][max_action]))
                                 - np.array(state_table[str(current_state)][action]))

                # Stop episode if we get very close to an obstacle
                if (max(new_state[:3]) == 2 and max(new_state[3:]) != 2
                        and use_simulation) or x == act_lim - 1:
                    state_table[str(new_state)][new_action] = -10
                    terminate = True
                    print("done")
                    if not run_test:
                        print('writing json')
                        with open(q_table_file, 'w') as json_file:
                            json.dump(state_table, json_file)

                    if use_simulation:
                        print("stopping the simulation")
                        prey_controller.stop()
                        prey_controller.join()
                        prey_robot.disconnect()
                        rob.stop_world()
                        while not rob.is_sim_stopped():
                            print("waiting for the simulation to stop")
                        time.sleep(2)

                # update current state and action
                current_state = new_state
                current_sensor_info = new_sensor_info
                action = new_action
                current_color_space = new_color_space

                # increment action limit counter
                x += 1

        return fitness, rewards

    experiments = 2

    epsilons = [0.01, 0.08, 0.22]
    gammas = [0.9]
    param_tuples = [(epsilon, gamma) for epsilon in epsilons
                    for gamma in gammas]
    _, _ = rl(0.9, 0, 0, 10, 200, [()],
              qL=True)  # alpha, gamma, epsilon, episodes, act_lim, param_tuples
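
The rl() function above switches between Q-Learning and SARSA through the qL flag: max_action is either the greedy action in the new state or the epsilon-greedy action actually taken next. Here is a small, self-contained sketch of that tabular update, assuming the same state_table layout (str(state) mapped to three Q-values); it is illustrative only.

import numpy as np

def td_update(state_table, s, a, r, s_new, a_new, alpha, gamma, qL=True):
    # Q-Learning bootstraps on the greedy next action, SARSA on the action actually chosen
    target_action = int(np.argmax(state_table[s_new])) if qL else a_new
    td_target = r + gamma * state_table[s_new][target_action]
    state_table[s][a] += alpha * (td_target - state_table[s][a])

table = {'[0, 0, 0, 0, 0, 0]': [0.0, 0.0, 0.0], '[1, 0, 0, 0, 0, 0]': [0.0, 0.0, 0.0]}
td_update(table, '[0, 0, 0, 0, 0, 0]', 0, 8, '[1, 0, 0, 0, 0, 0]', 1, alpha=0.9, gamma=0.9)
print(table['[0, 0, 0, 0, 0, 0]'])  # [7.2, 0.0, 0.0]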
Code example #4
    def rl(alpha,
           gamma,
           epsilon,
           episodes,
           act_lim,
           param_tuples,
           qL=False,
           no_blocks=0):

        fitness = list()
        rewards = [0]

        for i in range(episodes):
            print('Episode ' + str(i))
            terminate = False

            if use_simulation:
                rob.play_simulation()
                prey_robot = robobo.SimulationRoboboPrey().connect(
                    address='172.17.0.1', port=19989)
            else:
                prey_robot = robobo.HardwareRobobo(camera=True).connect(
                    address="10.15.3.48")  # Connect to different ip?
            prey_controller = prey.Prey(robot=prey_robot, level=2)
            prey_controller.start()  # not sure yet if this is needed for both simulation and hardware

            current_color_space = get_color_info()
            current_sensor_info = get_sensor_info('front_3')
            current_state = make_discrete(current_sensor_info, boundary_sensor,
                                          current_color_space,
                                          boundaries_color)

            if str(current_state) not in state_table.keys():
                state_table[str(current_state)] = [0 for _ in range(3)]

            action = epsilon_policy(current_state, epsilon)
            # current_collected_food = rob.collected_food() if use_simulation else 0
            # initialise state if it doesn't exist, else retrieve the current q-value
            x = 0
            while not terminate:

                take_action(action)
                # new_collected_food = rob.collected_food() if use_simulation else 0

                # Whole img extracted to get reward value
                # left, mid, right extracted to save state space accordingly

                new_color_space = get_color_info()
                new_sensor_info = get_sensor_info('front_3')
                new_state = make_discrete(new_sensor_info, boundary_sensor,
                                          new_color_space, boundaries_color)

                if str(new_state) not in state_table.keys():
                    state_table[str(new_state)] = [0 for _ in range(3)]

                new_action = epsilon_policy(new_state, epsilon)

                # Retrieve the max action if we use Q-Learning
                max_action = np.argmax(
                    state_table[str(new_state)]) if qL else new_action

                # Get reward
                r = get_reward(current_state, new_state, current_sensor_info,
                               new_sensor_info, action, new_action,
                               current_color_space, new_color_space)
                print("State and obtained Reward: ", new_state, r)

                # norm_r = normalize(r, -30, 20)
                #
                # if i != 0:
                #     fitness.append(no_blocks / i)
                # else:
                #     fitness.append(float(0))
                # # print(fitness)
                # if rewards:
                #     rewards.append(rewards[-1] + norm_r)
                # else:
                #     rewards.append(norm_r)

                # Update rule
                if not run_test:
                    # print('update')
                    state_table[str(current_state)][action] += \
                        alpha * (r + (gamma *
                                      np.array(
                                          state_table[str(new_state)][max_action]))
                                 - np.array(state_table[str(current_state)][action]))

                # Stop episode if we get very close to an obstacle
                if (max(new_state[:3]) == 2 and max(new_state[3:]) != 2
                        and use_simulation) or x == act_lim - 1:
                    state_table[str(new_state)][new_action] = -10
                    terminate = True
                    print("done")
                    if not run_test:
                        print('writing json')
                        with open(q_table_file, 'w') as json_file:
                            json.dump(state_table, json_file)

                    if use_simulation:
                        print("stopping the simulation")
                        prey_controller.stop()
                        prey_controller.join()
                        prey_robot.disconnect()
                        rob.stop_world()
                        while not rob.is_sim_stopped():
                            print("waiting for the simulation to stop")
                        time.sleep(2)

                # update current state and action
                current_state = new_state
                current_sensor_info = new_sensor_info
                action = new_action
                current_color_space = new_color_space

                # increment action limit counter
                x += 1

        return fitness, rewards
Code example #5
def main():
    signal.signal(signal.SIGINT, terminate_program)

    rob = robobo.SimulationRobobo("#0").connect(address='10.15.3.49', port=19997) if use_simulation \
        else robobo.HardwareRobobo(camera=True).connect(address="192.168.43.187")

    def get_sensor_info(direction):
        a = np.log(np.array(rob.read_irs())) / 10
        all_sensor_info = np.array([0 if x == inf else 1 + (-x / 2) - 0.2 for x in a]) if use_simulation \
            else np.array(np.log(rob.read_irs())) / 10
        all_sensor_info[all_sensor_info == inf] = 0
        all_sensor_info[all_sensor_info == -inf] = 0
        # [0, 1, 2, 3, 4, 5, 6, 7]
        if direction == 'front':
            return all_sensor_info[5]
        elif direction == 'back':
            return all_sensor_info[1]
        elif direction == 'front_left':
            return np.max(all_sensor_info[[6, 7]])
        elif direction == 'front_right':
            return np.max(all_sensor_info[[3, 4]])
        elif direction == 'back_left':
            return all_sensor_info[0]
        elif direction == 'back_right':
            return all_sensor_info[2]
        elif direction == 'all':
            return all_sensor_info
        else:
            raise Exception('Invalid direction')

    # safe, almost safe, not safe. combine with previous state of safe almost safe and not safe.
    # safe to almost safe is good, almost safe to safe is okay, safe to safe is neutral
    # s to a to r to s'.
    # Small steps for going left or right (left or right are only rotating and straight is going forward).
    # controller is the q values: the boundary for every sensor.

    def move_left():
        rob.move(-speed, speed, dist)

    def move_right():
        rob.move(speed, -speed, dist)

    def go_straight():
        rob.move(speed, speed, dist)

    def move_back():
        rob.move(-speed, -speed, dist)

    boundary = [0.5, 0.8] if not use_simulation else [0.75, 0.95]

    # A static collision-avoidance policy
    def static_policy(s):
        if get_sensor_info('front_left') >= s \
                and get_sensor_info('front_left') > get_sensor_info('front_right'):
            take_action('right')

        elif get_sensor_info('front_right') >= s \
                and get_sensor_info('front_right') > get_sensor_info('front_left'):
            take_action('left')
        else:
            take_action('straight')

    state_table = {}
    if os.path.exists('./src/state_table.json'):
        with open('./src/state_table.json') as f:
            state_table = json.load(f)

    def epsilon_policy(s, epsilon):
        s = str(s)
        # epsilon greedy
        """
        ACTIONS ARE DEFINED AS FOLLOWS:
          NUM: ACTION
          ------------
          0: STRAIGHT
          1: LEFT
          2: RIGHT
          ------------
        """
        e = 0 if run_test else epsilon
        if e > random.random():
            return random.choice([0, 1, 2])
        else:
            return np.argmax(state_table[s])

    def take_action(action):
        if action == 1:
            move_left()
        elif action == 2:
            move_right()
        elif action == 0:
            go_straight()
        # elif action == 'back':
        #     move_back()

    def get_reward(current, new, action):
        if current == 0 and new == 0:
            return 0 if action == 0 else -1
        elif current == 0 and new == 1:
            return 1
        elif current == 0 and new == 2:
            return -10
        elif current == 1 and new == 0:
            return 1
        elif current == 1 and new == 1:
            return 1 if action == 0 else 0
        elif current == 1 and new == 2:
            return -10
        return 0
        # TODO give negative reward for repetitions

    def make_discrete(values, boundaries):
        discrete_list = []
        for x in values:
            if x > boundaries[1]:
                discrete_list.append(2)
            elif x > boundaries[0]:
                discrete_list.append(1)
            else:  # includes values exactly on a boundary
                discrete_list.append(0)
        return discrete_list

    """
    REINFORCEMENT LEARNING PROCESS
    INPUT:  alpha    : learning rate
            gamma    : discount factor
            epsilon  : epsilon value for e-greedy
            episodes : no. of episodes
            act_lim  : no. of actions robot takes before ending episode
            qL       : True if you use Q-Learning
    """

    # assumed initialisation: rl() appends to rewards/fitness, and both are read
    # again after the call below, so they must exist in this scope
    rewards = [0]
    fitness = list()

    def rl(alpha, gamma, epsilon, episodes, act_lim, qL=False):
        for i in range(episodes):
            print('Episode ' + str(i))
            terminate = False
            if use_simulation:
                rob.play_simulation()

            current_state = make_discrete(get_sensor_info('all')[3:], boundary)

            if str(current_state) not in state_table.keys():
                state_table[str(current_state)] = [0 for _ in range(3)]

            action = epsilon_policy(current_state, epsilon)
            # initialise state if it doesn't exist, else retrieve the current q-value
            x = 0
            while not terminate:
                take_action(action)
                new_state = make_discrete(get_sensor_info('all')[3:], boundary)

                if str(new_state) not in state_table.keys():
                    state_table[str(new_state)] = [0 for _ in range(3)]

                new_action = epsilon_policy(new_state, epsilon)

                # Retrieve the max action if we use Q-Learning
                max_action = np.argmax(
                    state_table[str(new_state)]) if qL else new_action

                # Get reward
                r = get_reward(np.max(current_state), np.max(new_state),
                               action)

                normalized_r = ((r + 10) / 12) * 2 - 1  # rescale reward from [-10, 2] to [-1, 1]
                fitness.append(normalized_r *
                               np.max(get_sensor_info("all")[3:]))
                # print(fitness)
                if rewards:
                    rewards.append(rewards[-1] + normalized_r)
                else:
                    rewards.append(normalized_r)

                # Update rule
                print("r: ", r)

                if not run_test:
                    print('update')
                    state_table[str(current_state)][action] += \
                        alpha * (r + (gamma *
                                      np.array(
                                          state_table[str(new_state)][max_action]))
                                 - np.array(state_table[str(current_state)][action]))

                # Stop episode if we get very close to an obstacle
                if max(new_state) == 2 or x == act_lim - 1:
                    state_table[str(new_state)][new_action] = -10
                    terminate = True
                    print("done")
                    if not run_test:
                        print('writing json')
                        with open('./src/state_table.json', 'w') as json_file:
                            json.dump(state_table, json_file)

                    if use_simulation:
                        print("stopping the simulation")
                        rob.stop_world()
                        while not rob.is_sim_stopped():
                            print("waiting for the simulation to stop")
                        time.sleep(2)

                # update current state and action
                current_state = new_state
                action = new_action

                # increment action limit counter
                x += 1

    # alpha, gamma, epsilon, episodes, actions per episode
    rl(0.9, 0.9, 0.05, 30, 500, qL=True)

    pprint(state_table)

    if not run_test:
        all_rewards = []
        all_fits = []
        if os.path.exists('./src/rewards.csv'):
            with open('./src/rewards.csv', 'rb') as f:  # pickle needs binary mode
                all_rewards = pickle.load(f)

        if os.path.exists('./src/fitness.csv'):
            with open('./src/fitness.csv', 'rb') as f:
                all_fits = pickle.load(f)

        all_rewards += rewards
        all_fits += fitness

        print(all_rewards)
        print(all_fits)

        with open('./src/rewards.csv', 'wb') as f:  # pickle needs binary mode
            pickle.dump(all_rewards, f)

        with open('./src/fitness.csv', 'wb') as f:
            pickle.dump(all_fits, f)

        plt.figure('Rewards')
        plt.plot(all_rewards)
        plt.savefig("./src/plot_reward.png")
        plt.show()

        plt.figure('Fitness Values')
        plt.plot(all_fits)
        plt.savefig("./src/plot_fitness.png")
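
The inline expression normalized_r = ((r + 10) / 12) * 2 - 1 used above is a min-max rescaling of the reward from [-10, 2] to [-1, 1]; code example #3 wraps the same idea in a normalize() helper. The short sketch below just checks that the two forms agree.

def normalize(reward, old_min, old_max, new_min=-1, new_max=1):
    # same formula as normalize() in code example #3
    return ((reward - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min

assert normalize(-10, -10, 2) == -1
assert normalize(2, -10, 2) == 1
assert abs(normalize(1, -10, 2) - (((1 + 10) / 12) * 2 - 1)) < 1e-12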
Code example #6
    def rl(alpha, gamma, epsilon, episodes, act_lim, exp, qL=False):

        fitness = list()
        rewards = [0]

        for i in range(episodes):
            print('Episode ' + str(i))
            terminate = False
            prey_robot = []
            if use_simulation:
                rob.play_simulation()
                prey_robot = robobo.SimulationRoboboPrey().connect(
                    address=virtual_ip, port=19989)
            if not use_simulation:
                prey_robot = robobo.HardwareRobobo().connect(
                    address=robot_prey)
            prey_controller = prey.Prey(
                robot=prey_robot, level=2) if use_simulation else prey.Prey(
                    robot=prey_robot, level=4)
            prey_controller.start()
            current_color_space = get_color_info()
            current_sensor_info = get_sensor_info('front_3')
            current_state = make_discrete(current_sensor_info, boundary_sensor,
                                          current_color_space,
                                          boundaries_color)

            if str(current_state) not in state_table.keys():
                state_table[str(current_state)] = [0 for _ in range(3)]

            action = epsilon_policy(current_state, epsilon)
            x = 0
            while not terminate:

                take_action(action)
                # new_collected_food = rob.collected_food() if use_simulation else 0

                # Whole img extracted to get reward value
                # left, mid, right extracted to save state space accordingly

                new_color_space = get_color_info()
                new_sensor_info = get_sensor_info('front_3')
                new_state = make_discrete(new_sensor_info, boundary_sensor,
                                          new_color_space, boundaries_color)

                if str(new_state) not in state_table.keys():
                    state_table[str(new_state)] = [0 for _ in range(3)]

                new_action = epsilon_policy(new_state, epsilon)

                # Retrieve the max action if we use Q-Learning
                max_action = np.argmax(
                    state_table[str(new_state)]) if qL else new_action

                # Get reward
                r = get_reward(current_state, new_state, current_sensor_info,
                               new_sensor_info, action, new_action,
                               current_color_space, new_color_space)
                print("State and obtained Reward: ", new_state, r)

                norm_r = normalize(r, MIN_REWARD, MAX_REWARD)
                norm_steps = normalize(x, MIN_TIMESTEPS + 0.01, MAX_TIMESTEPS)
                fitness.append(norm_r / norm_steps)

                # Update rule
                if not run_test:
                    # print('update')
                    state_table[str(current_state)][action] += \
                        alpha * (r + (gamma *
                                      np.array(
                                          state_table[str(new_state)][max_action]))
                                 - np.array(state_table[str(current_state)][action]))

                # Stop episode if we get very close to an obstacle
                if (max(new_state[:3]) == 2 and max(new_state[3:]) != 2
                        and use_simulation) or x == act_lim - 1:
                    # state_table[str(new_state)][new_action] = -10
                    terminate = True
                    print("done")
                    if not run_test:
                        print('writing json')
                        with open(
                                './src/task3_results/state_table_e' +
                                str(epsilon) + '_exp_' + str(exp) + '.json',
                                'w') as json_file:  # json.dump writes text, not bytes
                            json.dump(state_table, json_file)

                    if use_simulation:
                        print("stopping the simulation")
                        prey_controller.stop()
                        prey_controller.join()
                        prey_robot.disconnect()
                        rob.stop_world()
                        while not rob.is_sim_stopped():
                            print("waiting for the simulation to stop")
                        time.sleep(2)

                # update current state and action
                current_state = new_state
                current_sensor_info = new_sensor_info
                action = new_action
                current_color_space = new_color_space

                # increment action limit counter
                x += 1

        return fitness, rewards
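
This last rl() variant depends on normalize() and on MIN_REWARD, MAX_REWARD, MIN_TIMESTEPS and MAX_TIMESTEPS, none of which appear in the snippet. The sketch below gives plausible definitions; the concrete values are assumptions only and the original project may use different ones.

# Assumed definitions for names used by rl() above but not shown in the snippet.
MIN_REWARD, MAX_REWARD = -5, 30        # assumption: matches the range of get_reward() in code example #3
MIN_TIMESTEPS, MAX_TIMESTEPS = 0, 200  # assumption: 200 matches the act_lim used elsewhere

def normalize(value, old_min, old_max, new_min=-1, new_max=1):
    return ((value - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min

# per-step fitness exactly as computed in rl(): normalised reward over normalised step count
r, x = 10, 5
norm_r = normalize(r, MIN_REWARD, MAX_REWARD)
norm_steps = normalize(x, MIN_TIMESTEPS + 0.01, MAX_TIMESTEPS)
print(norm_r / norm_steps)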