def t3(episodes):
    """Q-learning training loop for task 3.

    Builds a Table_turtlebot from the initial world state (stored in the
    module-level ``table_turtlebot``), then runs ``episodes`` episodes of
    an epsilon-greedy policy: epsilon starts at 0.9 and decays by 1% per
    episode while above 0.05. Each episode runs at most 20000 steps,
    applying a Q-update after every step; the world is reset at the end
    of each episode.

    :param episodes: number of training episodes to run.
    """
    # NOTE(review): removed unused locals from the original (`done`,
    # `highest_reward`, `start_time`) and the unused `global obj_dict`.
    global table_turtlebot

    epsilon = 0.9
    epsilon_discount = 0.99
    # Per-episode step count at which the terminal state was reached.
    last_time_steps = np.ndarray(0)

    # Build the Q-table wrapper from the initial world state.
    state_init = problem.get_current_state()
    table_turtlebot = Table_turtlebot(state_init)

    for x in range(episodes):
        cumulated_reward = 0
        state_init = problem.get_current_state()

        # Decay the exploration rate once per episode, with a floor.
        if epsilon > 0.05:
            epsilon *= epsilon_discount

        for i in range(20000):
            # Pick an action for the current state (epsilon-greedy).
            action = chooseAction_taskl(state_init, epsilon)
            # Execute the action and observe the transition.
            _observation, next_state, reward = action_to_execute_func(action)
            cumulated_reward += reward
            update(state_init, action, reward, next_state)

            if problem.is_terminal_state() == 0:
                state_init = next_state
            else:
                # Terminal state reached after i + 1 steps.
                last_time_steps = np.append(last_time_steps, [int(i + 1)])
                break

        print("Episode ", x + 1, "Reward :", cumulated_reward)
        problem.reset_world()
def t2(total_episodes):
    """Q-learning training loop for task 2.

    Runs ``total_episodes`` episodes of an epsilon-greedy policy:
    epsilon starts at 0.9 and decays by 1% per episode while above
    0.05. Each episode runs at most 70000 steps, applying a Q-update
    after every step; the world is reset at the end of each episode.

    :param total_episodes: number of training episodes to run.
    """
    # NOTE(review): removed unused locals from the original (`done`,
    # `highest_reward`, `start_time`, `term`) and a redundant second
    # call to problem.is_terminal_state() per step.
    epsilon = 0.9
    epsilon_discount = 0.99
    # Per-episode step count at which the terminal state was reached.
    last_time_steps = np.ndarray(0)

    for x in range(total_episodes):
        cumulated_reward = 0
        state_init = problem.get_current_state()

        # Decay the exploration rate once per episode, with a floor.
        if epsilon > 0.05:
            epsilon *= epsilon_discount

        for i in range(70000):
            # Pick an action for the current state (epsilon-greedy).
            action = chooseAction(state_init, epsilon)
            # Execute the action and observe the transition.
            _observation, next_state, reward = action_to_execute_func(action)
            cumulated_reward += reward
            update(state_init, action, reward, next_state)

            if problem.is_terminal_state() == 0:
                state_init = next_state
            else:
                # Terminal state reached after i + 1 steps.
                last_time_steps = np.append(last_time_steps, [int(i + 1)])
                break

        print("Episode ", x + 1, "Reward :", cumulated_reward)
        problem.reset_world()
def q_learning_task3(num_episodes): action_list = problem.get_all_actions() epsilon = 0.99 alpha = 0.3 gamma = 0.9 action_list = problem.get_all_actions() number_of_actions = len(action_list) #print action_list q_table = [[0 for j in range(number_of_actions)] for i in range(1)] zero_list = [0 for i in range(number_of_actions)] outerloopcnt = 0 book_loc_data = [] trolley_loc_data = [] new_action_list = [] #parsing json file with open('books.json') as book_parse: data = json.load(book_parse) loc_data = data["books"] #print len(loc_data) for i in range(0, len(loc_data)): book_name = "book_" + str(i + 1) book_loc_data.append(loc_data[book_name]["load_loc"]) #print book_loc_data bin_data = data["bins"] #print len(bin_data) for i in range(0, len(bin_data)): trolley_name = "trolly_" + str(i + 1) trolley_loc_data.append(bin_data[trolley_name]["load_loc"]) #print trolley_loc_data for i in range(num_episodes): current_location = problem.get_current_state() cumulative_reward = 0 state_list = [] state_list.append(current_location) count = 0 if (outerloopcnt % 10 == 0): epsilon -= 0.1 #Using Epsilon greedy approach # Run the episode till it reaches the terminal state while (problem.is_terminal_state() == 0): str_out = "" state_index = state_list.index(current_location) index = current_location.find('turtlebot3_burger') bot_loc = [] if (index != -1): bot_location = current_location[index:] loc_list = bot_location.split(',') x_loc = loc_list[1] y_loc = loc_list[2] bot_loc.append(float(x_loc)) bot_loc.append(float(y_loc)) rand = random.random() if (rand < epsilon): #selecting the action randomly new_action_list = copy.deepcopy(action_list) if (bot_loc not in book_loc_data[0]): i = 0 while i < len(new_action_list): if "careful_pick" in new_action_list[i]: new_action_list.remove(new_action_list[i]) i = i + 1 i = 0 while i < len(new_action_list): if "normal_pick" in new_action_list[i]: new_action_list.remove(new_action_list[i]) i = i + 1 if (bot_loc in trolley_loc_data[0]): i = 0 
while i < len(new_action_list): if "normal_place" in new_action_list[i]: new_action_list.remove(new_action_list[i]) i = i + 1 i = 0 while i < len(new_action_list): if "careful_place" in new_action_list[i]: new_action_list.remove(new_action_list[i]) i = i + 1 action = random.choice(new_action_list) action_index = action_list.index(action) else: #Taking the maximum valued action from the q_table for that state max_value = np.amax(q_table[state_index]) action_index = q_table[state_index].index(max_value) action = action_list[action_index] #print bot_loc #Executing the action corresponding to the action selected if ('normal_moveF' in action): success, next_state, reward = problem.execute_normal_moveF() elif ('normal_TurnCW' in action): success, next_state, reward = problem.execute_normal_TurnCW() elif ('normal_TurnCCW' in action): success, next_state, reward = problem.execute_normal_TurnCCW() elif ('careful_moveF' in action): success, next_state, reward = problem.execute_careful_moveF() elif ('careful_TurnCW' in action): success, next_state, reward = problem.execute_careful_TurnCW() elif ('careful_TurnCCW' in action): success, next_state, reward = problem.execute_careful_TurnCCW() elif ('normal_place' in action): action = action.split(' ') book_name = action[1] bin_name = action[2][:-1] success, next_state, reward = problem.execute_normal_place( book_name, bin_name) elif ('careful_place' in action): action = action.split(' ') book_name = action[1] bin_name = action[2][:-1] success, next_state, reward = problem.execute_careful_place( book_name, bin_name) elif ('normal_pick' in action): action = action.split(' ') book_name = action[1][:-1] success, next_state, reward = problem.execute_normal_pick( book_name) elif ('careful_pick' in action): action = action.split(' ') book_name = action[1][:-1] success, next_state, reward = problem.execute_careful_pick( book_name) else: print "specified action is not in action list" if next_state in state_list: next_state_index = 
state_list.index(next_state) else: state_list.append(next_state) next_state_index = state_list.index(next_state) q_table.append(zero_list) old_value = q_table[state_index][action_index] next_max = np.amax(q_table[next_state_index]) new_value = (float(1 - alpha) * old_value) + (alpha * ((reward) + (gamma * next_max))) #update the q_table q_table[state_index][action_index] = new_value str_out += '"(' + state_list[state_index] + ',' + action_list[ action_index] + ',' + state_list[next_state_index] + ',' + str( reward) + ')' str_out += ':' + str(q_table[state_index][action_index]) + '"' pub.publish(str_out) #calculating the reward if (count == 0): cumulative_reward += reward else: cumulative_reward += (gamma**count) * reward count += 1 current_location = next_state #print current_location print(outerloopcnt, count, cumulative_reward) outerloopcnt += 1 problem.reset_world()
def q_learning_task2(num_episodes): action_list = problem.get_all_actions() epsilon = 0.99 alpha = 0.3 gamma = 0.9 action_list = problem.get_all_actions() number_of_actions = len(action_list) q_table = [[0 for j in range(number_of_actions)] for i in range(1)] zero_list = [0 for i in range(number_of_actions)] outerloopcnt = 0 for i in range(num_episodes): current_location = problem.get_current_state() cumulative_reward = 0 state_list = [] state_list.append(current_location) count = 0 if (outerloopcnt % 10 == 0): epsilon -= 0.1 #Using Epsilon greedy approach # Run the episode till it reaches the terminal state while (problem.is_terminal_state() == 0): str_out = "" state_index = state_list.index(current_location) rand = random.random() if (rand < epsilon): #selecting the action randomly action = random.choice(action_list) action_index = action_list.index(action) else: #Taking the maximum valued action from the q_table for that state max_value = np.amax(q_table[state_index]) action_index = q_table[state_index].index(max_value) action = action_list[action_index] #Executing the action corresponding to the action selected if ('normal_moveF' in action): success, next_state, reward = problem.execute_normal_moveF() elif ('normal_TurnCW' in action): success, next_state, reward = problem.execute_normal_TurnCW() elif ('normal_TurnCCW' in action): success, next_state, reward = problem.execute_normal_TurnCCW() elif ('careful_moveF' in action): success, next_state, reward = problem.execute_careful_moveF() elif ('careful_TurnCW' in action): success, next_state, reward = problem.execute_careful_TurnCW() elif ('careful_TurnCCW' in action): success, next_state, reward = problem.execute_careful_TurnCCW() elif ('normal_place' in action): action = action.split(' ') book_name = action[1] bin_name = action[2][:-1] success, next_state, reward = problem.execute_normal_place( book_name, bin_name) elif ('careful_place' in action): action = action.split(' ') book_name = action[1] bin_name = 
action[2][:-1] success, next_state, reward = problem.execute_careful_place( book_name, bin_name) elif ('normal_pick' in action): action = action.split(' ') book_name = action[1][:-1] success, next_state, reward = problem.execute_normal_pick( book_name) elif ('careful_pick' in action): action = action.split(' ') book_name = action[1][:-1] success, next_state, reward = problem.execute_careful_pick( book_name) else: print "specified action is not in action list" if next_state in state_list: next_state_index = state_list.index(next_state) else: state_list.append(next_state) next_state_index = state_list.index(next_state) q_table.append(zero_list) old_value = q_table[state_index][action_index] next_max = np.amax(q_table[next_state_index]) new_value = (float(1 - alpha) * old_value) + (alpha * ((reward) + (gamma * next_max))) #update the q_table q_table[state_index][action_index] = new_value str_out += '"(' + state_list[state_index] + ',' + action_list[ action_index] + ',' + state_list[next_state_index] + ',' + str( reward) + ')' str_out += ':' + str(q_table[state_index][action_index]) + '"' pub.publish(str_out) #calculating the reward if (count == 0): cumulative_reward += reward else: cumulative_reward += (gamma**count) * reward count += 1 current_location = next_state #print current_location print(outerloopcnt, count, cumulative_reward) outerloopcnt += 1 problem.reset_world()
def execute_track_t1(self):
    """Replay the recorded trajectory in ``self.track``, applying a
    Q-learning update for each step, then publish the Q-table.

    Each entry of ``self.track`` is assumed to map a state string to an
    ``(action, reward)`` pair -- TODO confirm against the code that
    builds ``self.track``. Q-table publishing and the world reset live
    in the ``finally`` block, so they run even if a step raises.
    """
    q = QTable()
    q.table = OrderedDict()
    print("Executing trajectory.")
    try:
        for curr_state, action_reward in self.track.items():
            action, reward = action_reward
            # Strip "(", ")" and "," so the action string matches the
            # keys used by self.index and take_step.
            action = action.replace('(', '').replace(')', '').replace(',', '')
            reward = float(reward)
            # Lazily create a zero row for first-seen states.
            if curr_state not in q.table:
                q.table[curr_state] = [0] * self.total_actions
            # Q value for this state and action Q(s,a)
            q.old_value = float(q.table[curr_state][self.index[action]])
            # Execute selected action
            status, new_state, _ = take_step(action)
            # print status, new_state, reward
            q.count_actions += 1
            if reward > 0:
                print(action, reward)
            # Get max Q from next state.
            # NOTE(review): q.next_max is only refreshed when new_state
            # already has a row; otherwise the value left over from the
            # previous iteration (or the QTable default) is reused in
            # the update below -- verify this is intended.
            if new_state in q.table:
                q.next_max = float(max(q.table[new_state]))
            new_value = (1 - q.alpha) * q.old_value + q.alpha * (
                reward + q.gamma * q.next_max)
            q.table[curr_state][self.index[action]] = new_value
            # Accumulate reward, discounted by 0.9 per step after the
            # first.
            if q.count_actions == 1:
                q.reward += reward
            else:
                q.r_gamma *= 0.9
                q.reward += (q.r_gamma * reward)
        print("Reached terminal state")
    finally:
        # Convert each row from a position-indexed list to an
        # {action_name: value} dict before serializing.
        message = ''
        for k, v in q.table.items():
            action_value = {}
            for i, item in enumerate(v):
                action_value[self.actions[i]] = item
            q.table[k] = action_value
        # Serialize as "state : {action: value}-state : ..." (trailing
        # '-' trimmed below).
        for k, v in q.table.items():
            message += k + ' : '
            message += str(v)
            message += '-'
        message = message[:-1]
        print 'publish message'
        self.publisher.publish(message)
        problem.reset_world()
        print('World Reset')
        # Reset the accumulators for the next run.
        q.reward = 0
        q.r_gamma = 1
def train_t3(self, max_episodes):
    """Train the task-3 Q-table for ``2 * max_episodes`` epochs.

    Even epochs mix epsilon-greedy exploration with exploitation; odd
    epochs follow the learned policy only. Every transition is
    published on ``self.publisher`` as ``"(s,a,s',r) : Q"``, and the
    world is reset after each epoch (in ``finally``, so even on
    errors).

    :param max_episodes: half the number of epochs to run.
    """
    q = QTable()
    # Cache book/bin load locations from the initial state; presumably
    # a module-level side effect consumed by is_bin_location /
    # is_book_location below -- verify.
    get_load_locations(self.init_state)
    curr_state = self.init_state
    curr_state = sort_state(curr_state)
    print("Searching the purpose of life.")
    for epoch in range(0, 2 * max_episodes):
        try:
            if epoch % 2 == 0:
                print("Executing actions along with Exlporation")
            else:
                print("Executing actions with Learned Policy only")
            while not problem.is_terminal_state():
                # Lazily create a zero row for first-seen states.
                if curr_state not in q.table:
                    q.table[curr_state] = [0] * self.total_actions
                # Epsilon-greedy, but exploration only on even epochs.
                # NOTE(review): assumes self.actions has at least 10
                # entries (indices 0..9) -- confirm.
                if random.uniform(0, 1) < q.epsilon and epoch % 2 == 0:
                    # print("Exploring..")
                    action = self.actions[random.randint(0, 9)]
                else:
                    m = max(q.table[curr_state])
                    indices = [
                        j for j, k in enumerate(q.table[curr_state])
                        if k == m
                    ]
                    # This keeps bot from always picking the 1st best
                    # action and randomly chooses from all best choices
                    idx = random.choice(indices)
                    action = self.actions[idx]
                # Replace infeasible pick/place choices (wrong location)
                # with a random action from the pruned set.
                if ('place' in action and not is_bin_location(curr_state)) \
                        or ('pick' in action and not is_book_location(curr_state)):
                    # Choose best action from pruned actions
                    action = random.choice(pruned_actions)
                # Q value for this state and action Q(s,a)
                q.old_value = float(
                    q.table[curr_state][self.index[action]])
                # Execute selected action
                status, new_state, reward = take_step(action)
                if reward > 0:
                    print(action, reward, status)
                # NOTE(review): the return value of sort_state is
                # discarded here, unlike the call above that rebinds
                # curr_state; if sort_state returns a normalized copy
                # this line has no effect -- confirm.
                sort_state(new_state)
                # Get max Q from next state.
                # NOTE(review): q.next_max keeps its previous value (or
                # the QTable default) when new_state has no row yet --
                # verify this is intended.
                if new_state in q.table:
                    q.next_max = float(max(q.table[new_state]))
                new_value = (1 - q.alpha) * q.old_value + q.alpha * (
                    reward + q.gamma * q.next_max)
                q.table[curr_state][self.index[action]] = new_value
                # NOTE(review): q.count_actions is never incremented in
                # this method, so this branch depends on the QTable
                # default / updates made elsewhere -- confirm.
                if q.count_actions == 1:
                    q.reward += reward
                else:
                    q.r_gamma *= 0.9
                    q.reward += (q.r_gamma * reward)
                message = "(" + curr_state + "," + action + "," + new_state + "," + str(
                    reward) + ")" + " : " + str(new_value)
                self.publisher.publish(message)
                # print message
                curr_state = new_state
            print("Total Reward: " + str(q.reward))
            print("Reached terminal state")
        finally:
            problem.reset_world()
            print('World Reset')
            # Reset the accumulators before the next epoch.
            q.reward = 0
            q.r_gamma = 1