def __init__(self, config):
    super(ForagingEnv, self).__init__()

    # params
    self.config = config
    self.max_food = 7
    self.food_reward = 10
    self.sight_reward = 0.1

    # init
    self.done = False
    self.total_success = 0
    self.current_step = 0
    self.duration = 0
    self.start_time = None
    self.exp_manager = None

    # Define the action and observation spaces
    self.action_space = spaces.Box(low=0, high=1, shape=(4,), dtype=np.float32)
    self.observation_space = spaces.Box(low=0, high=1, shape=(5,), dtype=np.float32)

    self.action_selection = ActionSelection(self.config)

    # keep trying to connect to the robot (simulation or hardware) until it succeeds
    self.robot = False
    while not self.robot:
        if self.config.sim_hard == 'sim':
            self.robot = robobo.SimulationRobobo().connect(
                address=self.config.robot_ip, port=self.config.robot_port)
        else:
            self.robot = robobo.HardwareRobobo(camera=True).connect(
                address=self.config.robot_ip_hard)
        time.sleep(1)
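# A minimal usage sketch for this environment, assuming ForagingEnv follows the standard
# gym.Env interface (reset()/step() returning observation, reward, done, info). Those
# methods are not shown in this excerpt, so the sketch is kept as a comment and is
# illustrative only; `config` stands for the project's configuration object.
#
#   env = ForagingEnv(config)
#   obs = env.reset()
#   done = False
#   while not done:
#       action = env.action_space.sample()          # 4 continuous values in [0, 1]
#       obs, reward, done, info = env.step(action)  # obs: 5 continuous values in [0, 1]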
def main():
    global crashed

    EP_COUNT = 100
    STEP_COUNT = 50
    STEP_SIZE_MS = 500
    CRASH_SENSOR_BOUNDARY = -0.43  # negative for simulation, positive for the real robot
    CRASH_POSITION_BOUNDARY = 0.02
    A = [(20, 20), (0, 20), (20, 0)]  # wheel-speed pairs; other candidates: (10, 5), (5, 10), (-25, -25)
    epsilon = 0.1
    epsilon_decaying = 0.5
    gamma = 0.1
    recency_factor = 0.01  # higher setting -> forget older information faster
    # proximity_factor = 1  # how heavily to penalize proximity to an obstacle

    ACTION_NAMES = ['forward', 'sharp left', 'sharp right']  # also considered: 'left', 'right', 'backward'
    SENS_NAMES = ["IR" + str(i + 1) for i in range(8)]

    # Initialize the robot -> SET THESE PARAMETERS!
    hardware = False
    # nn_from_file = True if input('Would you like to use a pre-trained neural network? (y/n)') == 'y' else False
    nn_from_file = True
    learning = False  # Disable this if you want to run a pre-trained network

    if nn_from_file:
        print('loading network, disabling learning...')
        learning = False

    if hardware:
        rob = robobo.HardwareRobobo(camera=True).connect(address="192.168.1.7")
    else:
        rob = robobo.SimulationRobobo().connect(address='172.20.10.3', port=19997)
        rob.stop_world()
        time.sleep(0.1)

    if not learning:
        epsilon = 0
        epsilon_decaying = 0

    # Initialize the data structure and neural network
    eps = []
    hidden_layers = [16, 12]
    model = init_nn(input_dims=len(SENS_NAMES), output_dims=len(A),
                    hidden_layers=hidden_layers, from_file=nn_from_file)

    for episode in range(EP_COUNT):
        data = EpisodeData(ACTION_NAMES, sens_names=SENS_NAMES)
        signal.signal(signal.SIGINT, terminate_program)
        start_simulation(rob)

        ### INITIALIZATION ###
        print('\n--- episode {} ---'.format(episode + 1))
        S = np.log(np.array(rob.read_irs()))
        S[np.isinf(S)] = 0
        last_position = np.array([0, 0, 0])

        ########## Q-LEARNING LOOP ##########
        crashed = False
        for i in range(STEP_COUNT):
            start_time = time.time()
            print('\n--- step {} ---'.format(i + 1))
            # pos = rob.position()

            ### ACTION SELECTION & EXECUTION ###
            Q_s = model.predict(np.expand_dims(S, 0))[0]

            # request wheel speed parameters for the selected action
            action, wheels = e_greedy_action(Q_s, A, epsilon)  # or: epsilon_decaying ** (1 + (0.1 * episode))

            # move the robot
            rob.move(wheels['left'], wheels['right'], STEP_SIZE_MS)
            if learning:
                time.sleep(STEP_SIZE_MS / 1000)

            ### OBSERVING NEXT STATE ###
            S_prime = np.log(np.array(rob.read_irs()))
            S_prime[np.isinf(S_prime)] = 0
            print("ROB IRs: {}".format(S_prime / 10))
            current_position = np.array(rob.position())
            print("robobo is at {}".format(current_position))

            # observe the reward
            s_trans = wheels['left'] + wheels['right']     # translational speed of the robot
            s_rot = abs(wheels['left'] - wheels['right'])  # rotational speed of the robot

            if hardware:
                crashed = False
                raise NotImplementedError("Haven't implemented this, I suggest using a threshold "
                                          "for the sensor (see code below this statement)")
            else:
                dist = np.linalg.norm(last_position - current_position)
                crashed = min(S_prime[3:] / 10) < CRASH_SENSOR_BOUNDARY or dist < CRASH_POSITION_BOUNDARY

            if not crashed:
                # reward = 1 + min(S) * proximity_factor  # - (wheels == A[1])
                # see Eiben et al. for this formula
                reward = s_trans * (1 - 0.9 * (s_rot / 20)) * (1 - (min(S_prime[3:]) / -0.65))
            else:
                reward = -400

            # Retrieve Q values from the neural network
            Q_prime = model.predict(np.expand_dims(S_prime, 0))[0]

            ### LEARNING ###
            # use the value of the best next action (np.max), not its index (np.argmax)
            Q_target = reward + (gamma * np.max(Q_prime))
            Q_targets = np.copy(Q_s)
            Q_targets[action] = Q_target

            ### SAVE DATA ###
            data.update(i, S, Q_s, Q_targets, reward)  # pos removed

            ### TERMINATION CONDITION ###
            # if S == S_prime and not S.sum() == 0:  # np.isinf(S).any() is False:
            #     print('Termination condition reached')
            #     break

            S = np.copy(S_prime)
            last_position = current_position

            print("crashed: ", crashed)
            print("chosen action:", ACTION_NAMES[action])
            print('reward: ', reward)
            print("Q_s (NN output): ", Q_s)
            print("Updated Q-values: " + str(Q_targets))

            elapsed_time = time.time() - start_time
            if crashed:
                break

        # terminate the episode data and store it
        data.terminate()
        eps.append(data)

        if learning:
            print("\n----- Learning ----\n")
            X = pd.concat([ep_data.sens for ep_data in eps])
            y = pd.concat([ep_data.Q_targets for ep_data in eps])

            # # calculate sample weights
            # ep_lengths = [len(ep_data.sens) for ep_data in eps[::-1]]
            # sample_weights = []
            # for i, ep_length in enumerate(ep_lengths):
            #     sample_weights = sample_weights + ([(1 - recency_factor) ** i] * ep_length)

            # perform learning over the episode
            model.fit(X, y, epochs=100)  # sample_weight=np.array(sample_weights)

            # # perform an evaluation of the episode (probably not necessary till later)
            # model.evaluate(data)

        # time.sleep(0.1)
        # pause the simulation and read the collected food
        # rob.pause_simulation()
        rob.sleep(1)

        if crashed or episode == EP_COUNT - 1:  # last episode is EP_COUNT - 1, since episode is zero-based
            save_nn(model)
            print('Robot crashed, resetting simulation...')
            data.terminate(crashed)
            # could implement something here to save the experience if resetting the simulation!
            if crashed:
                rob.stop_world()

    rob.stop_world()
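# For reference, the obstacle-avoidance reward used in the loop above, written as a
# standalone helper. This is a sketch: the constants 20 and -0.65 are copied from the
# loop, and the names `eiben_reward` / `s_prime` are ours, not part of the original code.
def eiben_reward(wheels, s_prime):
    """Reward = translation * (1 - 0.9 * normalised rotation) * (1 - normalised proximity)."""
    s_trans = wheels['left'] + wheels['right']     # translational speed of the robot
    s_rot = abs(wheels['left'] - wheels['right'])  # rotational speed (at most 20 for the actions in A)
    proximity = min(s_prime[3:]) / -0.65           # closest front IR reading, log-scaled as in the loop
    return s_trans * (1 - 0.9 * (s_rot / 20)) * (1 - proximity)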
def main():
    signal.signal(signal.SIGINT, terminate_program)

    rob = robobo.SimulationRobobo().connect(address='172.17.0.1', port=19997) if use_simulation \
        else robobo.HardwareRobobo(camera=True).connect(address="10.15.3.48")

    rob.set_phone_tilt(45, 100) if use_simulation else rob.set_phone_tilt(109, 100)

    state_table = {}
    q_table_file = './src/state_table.json'
    if os.path.exists(q_table_file):
        with open(q_table_file) as g:
            state_table = json.load(g)

    def get_sensor_info(direction):
        a = np.log(np.array(rob.read_irs())) / 10
        all_sensor_info = np.array([0 if x == inf else 1 + (-x / 2) - 0.2 for x in a]) if use_simulation \
            else np.array(np.log(rob.read_irs())) / 10
        all_sensor_info[all_sensor_info == inf] = 0
        all_sensor_info[all_sensor_info == -inf] = 0

        # sensor indices: [0, 1, 2, 3, 4, 5, 6, 7]
        if direction == 'front':
            return all_sensor_info[5]
        elif direction == 'back':
            return all_sensor_info[1]
        elif direction == 'front_left':
            return all_sensor_info[6]
        elif direction == 'front_left_left':
            return all_sensor_info[7]
        elif direction == 'front_right':
            return all_sensor_info[4]
        elif direction == 'front_right_right':
            return all_sensor_info[3]
        elif direction == 'back_left':
            return all_sensor_info[0]
        elif direction == 'back_right':
            return all_sensor_info[2]
        elif direction == 'all':
            print(all_sensor_info[3:])
            return all_sensor_info
        elif direction == 'front_3':
            return [all_sensor_info[3], all_sensor_info[5], all_sensor_info[7]]
        else:
            raise Exception('Invalid direction')

    # States are discretized as safe, almost safe, or not safe, and combined with the previous
    # state (s -> a -> r -> s'): safe to almost safe is good, almost safe to safe is okay,
    # safe to safe is neutral.
    # Left and right actions only rotate in small steps; straight moves forward.
    # The controller is the Q-values: the boundary for every sensor.
    def move_left():
        rob.move(-speed, speed, dist)

    def move_right():
        rob.move(speed, -speed, dist)

    def go_straight():
        rob.move(speed, speed, dist)

    def move_back():
        rob.move(-speed, -speed, dist)

    boundary_sensor = [0.6, 0.8] if not use_simulation else [0.5, 0.95]
    boundaries_color = [0.1, 0.7] if not use_simulation else [0.05, 0.85]

    # A static collision-avoidance policy
    def static_policy(color_info):
        max_c = np.max(color_info)
        if max_c == color_info[0]:
            return 1
        elif max_c == color_info[1]:
            return 0
        elif max_c == color_info[2]:
            return 2
        return 0

    def epsilon_policy(s, epsilon):
        """
        Epsilon-greedy action selection.
        ACTIONS ARE DEFINED AS FOLLOWS:
        NUM: ACTION
        ------------
          0: STRAIGHT
          1: LEFT
          2: RIGHT
        ------------
        """
        s = str(s)
        e = 0 if run_test else epsilon
        if e > random.random():
            return random.choice([0, 1, 2])
        else:
            return np.argmax(state_table[s])

    def take_action(action):
        if action == 1:
            move_left()
        elif action == 2:
            move_right()
        elif action == 0:
            go_straight()
        # elif action == 'back':
        #     move_back()

    def get_color_info():
        image = rob.get_image_front()

        # Mask function: fraction of the image's pixels that fall in the red HSV range
        def get_red_pixels(img):
            hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
            lower_range = np.array([0, 50, 20])
            upper_range = np.array([5, 255, 255])
            mask = cv2.inRange(hsv, lower_range, upper_range)
            cv2.imwrite('a.png', mask)
            a = b = 0
            for i in mask:
                for j in i:
                    b += 1
                    if j == 255:
                        a += 1
            return a / b
            # count = 0
            # pix_count = 0
            # b = 64
            # for i in range(len(img)):
            #     for j in range(len(img[i])):
            #         pixel = img[i][j]
            #         pix_count += 1
            #         if (pixel[0] > b or pixel[2] > b) and pixel[1] < b * 2 \
            #                 or (pixel[0] > b * 2 and pixel[1] > b * 2 and pixel[2] > b * 2):
            #             # img[i][j] = [0, 0, 0]
            #             count += 1
            # return 1 - (count / pix_count)

        left, middle_l, middle_r, right = np.hsplit(image, 4)
        middle = np.concatenate((middle_l, middle_r), axis=1)
        return get_red_pixels(left), get_red_pixels(middle), get_red_pixels(right)

    def get_reward(previous_state, new_state, previous_sensor, new_sensor,
                   prev_action, action, prev_val, new_val):
        # Fraction of red in the image before and after the action:
        # 0 means no red pixels, 1 means the whole image is red.
        prev_right, prev_mid, prev_left = prev_val
        sum_prev_val = sum(prev_val)
        new_right, new_mid, new_left = new_val
        sum_new_val = sum(new_val)
        max_new_sensor = np.max(new_sensor)
        max_prev_sensor = np.max(previous_sensor)
        max_c_prev = np.max(previous_state[3:])
        max_c_new = np.max(new_state[3:])

        # Encourage going towards the prey
        if max_c_prev == 0 and max_c_new == 1:
            return 10 if action == 0 else 2  # this was originally 0 when turning left/right

        # Massive payoff if we get super close to the prey
        if max_c_prev == 1 and max_c_new == 2:
            return 30
        if max_c_prev == 2 == max_c_new and max_prev_sensor == 1:
            return 20

        # Small penalty if the prey gets away
        if max_c_prev == 2 and max_c_new == 1:
            return -2  # this was originally 0

        # Give a good reward if we see more red than before
        if sum_prev_val < sum_new_val:
            # return 10 if action == 0 else 0
            return 8 if action == 0 else 1

        # If the sensors detect the prey, give a good payoff.
        # If they detect a wall, give a bad payoff to steer clear.
        if max_new_sensor > max_prev_sensor:
            return 30 if max_c_new >= 1 else -5

        # Give a small payoff to encourage exploring (going straight) and a minor
        # penalty for turning around, but not bad enough to discourage it.
        # return 1 if action == 0 else -1
        return 0 if action == 0 else -2

    # Returns a list with the discretized sensor values and color values.
    def make_discrete(values_s, boundary_s, values_c, boundaries_c):
        discrete_list_s = []
        discrete_list_c = []

        for x in values_s:
            if boundary_s[0] > x:
                discrete_list_s.append(0)
            elif boundary_s[1] > x > boundary_s[0]:
                discrete_list_s.append(1)
            else:
                discrete_list_s.append(2)

        for y in values_c:
            if y < boundaries_c[0]:
                discrete_list_c.append(0)
            elif boundaries_c[0] < y < boundaries_c[1]:
                discrete_list_c.append(1)
            else:
                discrete_list_c.append(2)

        print('real c_values: ', values_c)
        return discrete_list_s + discrete_list_c

    """
    REINFORCEMENT LEARNING PROCESS
    INPUT:  alpha    : learning rate
            gamma    : discount factor
            epsilon  : epsilon value for e-greedy
            episodes : no. of episodes
            act_lim  : no. of actions robot takes before ending episode
            qL       : True if you use Q-Learning
    """
    stat_fitness = list()
    stat_rewards = [0]

    def normalize(reward, old_min, old_max, new_min=-1, new_max=1):
        return ((reward - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min

    # def run_static(lim, no_blocks=0):
    #     for i in range(lim):
    #         if use_simulation:
    #             rob.play_simulation()
    #
    #         a, b, c = get_color_info()
    #         current_color_info = a, b, c
    #         current_sensor_info = get_sensor_info('front_3')
    #
    #         current_state = make_discrete(get_sensor_info('front_3'), boundary_sensor,
    #                                       current_color_info, boundaries_color)
    #
    #         if str(current_state) not in state_table.keys():
    #             state_table[str(current_state)] = [0 for _ in range(3)]
    #
    #         a, b, c = get_color_info()
    #         new_color_info = a, b, c
    #         # print(a, b, c, new_color_info)
    #
    #         action = static_policy(new_color_info)
    #         take_action(action)
    #
    #         new_state = make_discrete(get_sensor_info('front_3'), boundary_sensor,
    #                                   new_color_info, boundaries_color)
    #         # TODO: make sure that current color info gets initialized the first time.
    #         r = get_reward(current_state, new_state, action, current_color_info, new_color_info, no_blocks)
    #         if r == 20:
    #             no_blocks += 1
    #
    #         norm_r = normalize(r, -30, 20)
    #
    #         if i != 0:
    #             stat_fitness.append(stat_fitness[-1] + (no_blocks / i))
    #         else:
    #             stat_fitness.append(float(0))
    #         # print(fitness)
    #         if stat_rewards:
    #             stat_rewards.append(stat_rewards[-1] + norm_r)
    #         else:
    #             rewards.append(norm_r)
    #
    #         current_state = new_state
    #         current_color_info = new_color_info

    def rl(alpha, gamma, epsilon, episodes, act_lim, param_tuples, qL=False, no_blocks=0):
        fitness = list()
        rewards = [0]

        for i in range(episodes):
            print('Episode ' + str(i))
            terminate = False

            if use_simulation:
                rob.play_simulation()
                prey_robot = robobo.SimulationRoboboPrey().connect(
                    address='172.17.0.1', port=19989)
            else:
                prey_robot = robobo.HardwareRobobo(camera=True).connect(
                    address="10.15.3.48")  # Connect to different ip?
            prey_controller = prey.Prey(robot=prey_robot, level=2)
            prey_controller.start()  # Not sure yet if this is indeed needed for both simu and hardware

            current_color_space = get_color_info()
            current_sensor_info = get_sensor_info('front_3')
            current_state = make_discrete(current_sensor_info, boundary_sensor,
                                          current_color_space, boundaries_color)

            if str(current_state) not in state_table.keys():
                state_table[str(current_state)] = [0 for _ in range(3)]

            action = epsilon_policy(current_state, epsilon)
            # current_collected_food = rob.collected_food() if use_simulation else 0

            # initialise state if it doesn't exist, else retrieve the current q-value
            x = 0
            while not terminate:
                take_action(action)
                # new_collected_food = rob.collected_food() if use_simulation else 0

                # Whole img extracted to get reward value
                # left, mid, right extracted to save state space accordingly
                new_color_space = get_color_info()
                new_sensor_info = get_sensor_info('front_3')
                new_state = make_discrete(new_sensor_info, boundary_sensor,
                                          new_color_space, boundaries_color)

                if str(new_state) not in state_table.keys():
                    state_table[str(new_state)] = [0 for _ in range(3)]

                new_action = epsilon_policy(new_state, epsilon)

                # Retrieve the max action if we use Q-Learning
                max_action = np.argmax(state_table[str(new_state)]) if qL else new_action

                # Get reward
                r = get_reward(current_state, new_state, current_sensor_info,
                               new_sensor_info, action, new_action,
                               current_color_space, new_color_space)
                print("State and obtained Reward: ", new_state, r)

                # norm_r = normalize(r, -30, 20)
                #
                # if i != 0:
                #     fitness.append(no_blocks / i)
                # else:
                #     fitness.append(float(0))
                # # print(fitness)
                # if rewards:
                #     rewards.append(rewards[-1] + norm_r)
                # else:
                #     rewards.append(norm_r)

                # Update rule
                if not run_test:
                    # print('update')
                    state_table[str(current_state)][action] += \
                        alpha * (r + (gamma * np.array(state_table[str(new_state)][max_action]))
                                 - np.array(state_table[str(current_state)][action]))

                # Stop episode if we get very close to an obstacle
                if (max(new_state[:3]) == 2 and max(new_state[3:]) != 2 and use_simulation) \
                        or x == act_lim - 1:
                    state_table[str(new_state)][new_action] = -10
                    terminate = True
                    print("done")

                    if not run_test:
                        print('writing json')
                        with open(q_table_file, 'w') as json_file:
                            json.dump(state_table, json_file)

                    if use_simulation:
                        print("stopping the simulation")
                        prey_controller.stop()
                        prey_controller.join()
                        prey_robot.disconnect()
                        rob.stop_world()
                        while not rob.is_sim_stopped():
                            print("waiting for the simulation to stop")
                            time.sleep(2)

                # update current state and action
                current_state = new_state
                current_sensor_info = new_sensor_info
                action = new_action
                current_color_space = new_color_space

                # increment action limit counter
                x += 1

        return fitness, rewards

    experiments = 2
    epsilons = [0.01, 0.08, 0.22]
    gammas = [0.9]
    param_tuples = [(epsilon, gamma) for epsilon in epsilons for gamma in gammas]

    # alpha, gamma, epsilon, episodes, actions per episode
    _, _ = rl(0.9, 0, 0, 10, 200, [()], qL=True)
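# The update inside rl() above is the standard tabular Q-learning rule:
# Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)).
# A small standalone sketch of the same rule; the function name and the plain-list
# arguments are ours (the real code indexes state_table by str(state)).
def q_update(q_row_s, q_row_s_prime, action, reward, alpha, gamma):
    """Return the updated Q-value for (state, action) given the next state's Q-values."""
    td_target = reward + gamma * max(q_row_s_prime)
    td_error = td_target - q_row_s[action]
    return q_row_s[action] + alpha * td_error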
def main():
    signal.signal(signal.SIGINT, terminate_program)

    rob = robobo.SimulationRobobo("#0").connect(address='10.15.3.49', port=19997) if use_simulation \
        else robobo.HardwareRobobo(camera=True).connect(address="192.168.43.187")

    def get_sensor_info(direction):
        a = np.log(np.array(rob.read_irs())) / 10
        all_sensor_info = np.array([0 if x == inf else 1 + (-x / 2) - 0.2 for x in a]) if use_simulation \
            else np.array(np.log(rob.read_irs())) / 10
        all_sensor_info[all_sensor_info == inf] = 0
        all_sensor_info[all_sensor_info == -inf] = 0

        # sensor indices: [0, 1, 2, 3, 4, 5, 6, 7]
        if direction == 'front':
            return all_sensor_info[5]
        elif direction == 'back':
            return all_sensor_info[1]
        elif direction == 'front_left':
            return np.max(all_sensor_info[[6, 7]])
        elif direction == 'front_right':
            return np.max(all_sensor_info[[3, 4]])
        elif direction == 'back_left':
            return all_sensor_info[0]
        elif direction == 'back_right':
            return all_sensor_info[2]
        elif direction == 'all':
            return all_sensor_info
        else:
            raise Exception('Invalid direction')

    # States are discretized as safe, almost safe, or not safe, and combined with the previous
    # state (s -> a -> r -> s'): safe to almost safe is good, almost safe to safe is okay,
    # safe to safe is neutral.
    # Left and right actions only rotate in small steps; straight moves forward.
    # The controller is the Q-values: the boundary for every sensor.

    def move_left():
        rob.move(-speed, speed, dist)

    def move_right():
        rob.move(speed, -speed, dist)

    def go_straight():
        rob.move(speed, speed, dist)

    def move_back():
        rob.move(-speed, -speed, dist)

    boundary = [0.5, 0.8] if not use_simulation else [0.75, 0.95]

    # A static collision-avoidance policy
    def static_policy(s):
        if get_sensor_info('front_left') >= s \
                and get_sensor_info('front_left') > get_sensor_info('front_right'):
            take_action('right')
        elif get_sensor_info('front_right') >= s \
                and get_sensor_info('front_right') > get_sensor_info('front_left'):
            take_action('left')
        else:
            take_action('straight')

    state_table = {}
    if os.path.exists('./src/state_table.json'):
        with open('./src/state_table.json') as f:
            state_table = json.load(f)

    def epsilon_policy(s, epsilon):
        """
        Epsilon-greedy action selection.
        ACTIONS ARE DEFINED AS FOLLOWS:
        NUM: ACTION
        ------------
          0: STRAIGHT
          1: LEFT
          2: RIGHT
        ------------
        """
        s = str(s)
        e = 0 if run_test else epsilon
        if e > random.random():
            return random.choice([0, 1, 2])
        else:
            return np.argmax(state_table[s])

    def take_action(action):
        if action == 1:
            move_left()
        elif action == 2:
            move_right()
        elif action == 0:
            go_straight()
        # elif action == 'back':
        #     move_back()

    def get_reward(current, new, action):
        if current == 0 and new == 0:
            return 0 if action == 0 else -1
        elif current == 0 and new == 1:
            return 1
        elif current == 0 and new == 2:
            return -10
        elif current == 1 and new == 0:
            return 1
        elif current == 1 and new == 1:
            return 1 if action == 0 else 0
        elif current == 1 and new == 2:
            return -10
        return 0
        # TODO give a negative reward for repetitions

    def make_discrete(values, boundaries):
        discrete_list = []
        for x in values:
            if x > boundaries[1]:
                discrete_list.append(2)
            elif boundaries[1] > x > boundaries[0]:
                discrete_list.append(1)
            elif boundaries[0] > x:
                discrete_list.append(0)
        return discrete_list

    """
    REINFORCEMENT LEARNING PROCESS
    INPUT:  alpha    : learning rate
            gamma    : discount factor
            epsilon  : epsilon value for e-greedy
            episodes : no. of episodes
            act_lim  : no. of actions robot takes before ending episode
            qL       : True if you use Q-Learning
    """
    def rl(alpha, gamma, epsilon, episodes, act_lim, qL=False):
        for i in range(episodes):
            print('Episode ' + str(i))
            terminate = False

            if use_simulation:
                rob.play_simulation()

            current_state = make_discrete(get_sensor_info('all')[3:], boundary)

            if str(current_state) not in state_table.keys():
                state_table[str(current_state)] = [0 for _ in range(3)]

            action = epsilon_policy(current_state, epsilon)

            # initialise state if it doesn't exist, else retrieve the current q-value
            x = 0
            while not terminate:
                take_action(action)
                new_state = make_discrete(get_sensor_info('all')[3:], boundary)

                if str(new_state) not in state_table.keys():
                    state_table[str(new_state)] = [0 for _ in range(3)]

                new_action = epsilon_policy(new_state, epsilon)

                # Retrieve the max action if we use Q-Learning
                max_action = np.argmax(state_table[str(new_state)]) if qL else new_action

                # Get reward
                r = get_reward(np.max(current_state), np.max(new_state), action)

                normalized_r = ((r - -10) / (2 - -10)) * (1 - -1) + -1
                fitness.append(normalized_r * np.max(get_sensor_info("all")[3:]))
                # print(fitness)
                if rewards:
                    rewards.append(rewards[-1] + normalized_r)
                else:
                    rewards.append(normalized_r)

                # Update rule
                print("r: ", r)
                if not run_test:
                    print('update')
                    state_table[str(current_state)][action] += \
                        alpha * (r + (gamma * np.array(state_table[str(new_state)][max_action]))
                                 - np.array(state_table[str(current_state)][action]))

                # Stop episode if we get very close to an obstacle
                if max(new_state) == 2 or x == act_lim - 1:
                    state_table[str(new_state)][new_action] = -10
                    terminate = True
                    print("done")

                    if not run_test:
                        print('writing json')
                        with open('./src/state_table.json', 'w') as json_file:
                            json.dump(state_table, json_file)

                    if use_simulation:
                        print("stopping the simulation")
                        rob.stop_world()
                        while not rob.is_sim_stopped():
                            print("waiting for the simulation to stop")
                            time.sleep(2)

                # update current state and action
                current_state = new_state
                action = new_action

                # increment action limit counter
                x += 1

    # alpha, gamma, epsilon, episodes, actions per episode
    rl(0.9, 0.9, 0.05, 30, 500, qL=True)
    pprint(state_table)

    if not run_test:
        all_rewards = []
        all_fits = []

        # pickle needs binary mode for reading and writing
        if os.path.exists('./src/rewards.csv'):
            with open('./src/rewards.csv', 'rb') as f:
                all_rewards = pickle.load(f)
        if os.path.exists('./src/fitness.csv'):
            with open('./src/fitness.csv', 'rb') as f:
                all_fits = pickle.load(f)

        all_rewards += rewards
        all_fits += fitness
        print(all_rewards)
        print(all_fits)

        with open('./src/rewards.csv', 'wb') as f:
            pickle.dump(all_rewards, f)
        with open('./src/fitness.csv', 'wb') as f:
            pickle.dump(all_fits, f)

        plt.figure('Rewards')
        plt.plot(all_rewards)
        plt.savefig("./src/plot_reward.png")
        plt.show()

        plt.figure('Fitness Values')
        plt.plot(all_fits)
        plt.savefig("./src/plot_fitness.png")
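# The inline expression ((r - -10) / (2 - -10)) * (1 - -1) + -1 above is a min-max
# rescaling of the reward from [-10, 2] to [-1, 1]. The same mapping, written as a
# generic helper (equivalent to the normalize() function defined in the predator-prey
# file earlier in this section; the name `min_max_rescale` is ours):
def min_max_rescale(value, old_min, old_max, new_min=-1, new_max=1):
    """Linearly map `value` from [old_min, old_max] to [new_min, new_max]."""
    return ((value - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min

# e.g. min_max_rescale(-10, -10, 2) == -1.0 and min_max_rescale(2, -10, 2) == 1.0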
def rl(alpha, gamma, epsilon, episodes, act_lim, exp, qL=False):
    fitness = list()
    rewards = [0]

    for i in range(episodes):
        print('Episode ' + str(i))
        terminate = False

        prey_robot = []
        if use_simulation:
            rob.play_simulation()
            prey_robot = robobo.SimulationRoboboPrey().connect(
                address=virtual_ip, port=19989)
        if not use_simulation:
            prey_robot = robobo.HardwareRobobo().connect(address=robot_prey)

        prey_controller = prey.Prey(robot=prey_robot, level=2) if use_simulation \
            else prey.Prey(robot=prey_robot, level=4)
        prey_controller.start()

        current_color_space = get_color_info()
        current_sensor_info = get_sensor_info('front_3')
        current_state = make_discrete(current_sensor_info, boundary_sensor,
                                      current_color_space, boundaries_color)

        if str(current_state) not in state_table.keys():
            state_table[str(current_state)] = [0 for _ in range(3)]

        action = epsilon_policy(current_state, epsilon)

        x = 0
        while not terminate:
            take_action(action)
            # new_collected_food = rob.collected_food() if use_simulation else 0

            # Whole img extracted to get reward value
            # left, mid, right extracted to save state space accordingly
            new_color_space = get_color_info()
            new_sensor_info = get_sensor_info('front_3')
            new_state = make_discrete(new_sensor_info, boundary_sensor,
                                      new_color_space, boundaries_color)

            if str(new_state) not in state_table.keys():
                state_table[str(new_state)] = [0 for _ in range(3)]

            new_action = epsilon_policy(new_state, epsilon)

            # Retrieve the max action if we use Q-Learning
            max_action = np.argmax(state_table[str(new_state)]) if qL else new_action

            # Get reward
            r = get_reward(current_state, new_state, current_sensor_info,
                           new_sensor_info, action, new_action,
                           current_color_space, new_color_space)
            print("State and obtained Reward: ", new_state, r)

            norm_r = normalize(r, MIN_REWARD, MAX_REWARD)
            norm_steps = normalize(x, MIN_TIMESTEPS + 0.01, MAX_TIMESTEPS)
            fitness.append(norm_r / norm_steps)

            # Update rule
            if not run_test:
                # print('update')
                state_table[str(current_state)][action] += \
                    alpha * (r + (gamma * np.array(state_table[str(new_state)][max_action]))
                             - np.array(state_table[str(current_state)][action]))

            # Stop episode if we get very close to an obstacle
            if (max(new_state[:3]) == 2 and max(new_state[3:]) != 2 and use_simulation) \
                    or x == act_lim - 1:
                # state_table[str(new_state)][new_action] = -10
                terminate = True
                print("done")

                if not run_test:
                    print('writing json')
                    # json.dump needs a text-mode file, so open with 'w' rather than 'wb'
                    with open('./src/task3_results/state_table_e' + str(epsilon) +
                              '_exp_' + str(exp) + '.json', 'w') as json_file:
                        json.dump(state_table, json_file)

                if use_simulation:
                    print("stopping the simulation")
                    prey_controller.stop()
                    prey_controller.join()
                    prey_robot.disconnect()
                    rob.stop_world()
                    while not rob.is_sim_stopped():
                        print("waiting for the simulation to stop")
                        time.sleep(2)

            # update current state and action
            current_state = new_state
            current_sensor_info = new_sensor_info
            action = new_action
            current_color_space = new_color_space

            # increment action limit counter
            x += 1

    return fitness, rewards
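# The per-step fitness recorded above is the normalised reward divided by the normalised
# step count. A sketch of that metric as a standalone function; MIN_REWARD, MAX_REWARD,
# MIN_TIMESTEPS and MAX_TIMESTEPS are not defined in this excerpt, so they are passed in
# explicitly here, and the +0.01 offset simply mirrors the call in the loop above.
def step_fitness(reward, step, min_reward, max_reward, min_timesteps, max_timesteps):
    """Ratio of normalised reward to normalised elapsed steps for one action."""
    def rescale(v, old_min, old_max, new_min=-1, new_max=1):
        return ((v - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min

    norm_r = rescale(reward, min_reward, max_reward)
    norm_steps = rescale(step, min_timesteps + 0.01, max_timesteps)
    return norm_r / norm_steps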