def test_q_learning_real(self, q_old, starting_nu=0.1):
    """Roll out a learned Q-table and report how many steps it takes to reach a goal-level reward."""
    nu = starting_nu
    block_world = BlockWorld(self.states_x, self.states_y, self.blocks_count,
                             self.stack_count, record=False)
    cnt = 0
    while cnt < self.iteration_count:
        cnt += 1
        block_world.pre_render()
        curr_state = block_world.get_state_as_tuple_pramodith2()
        if self.debug and curr_state in q_old:
            print("Current State: %s" % str(curr_state), q_old[curr_state])
        action, block_id = self.get_next_action(curr_state, q_old, nu)
        if self.debug:
            print("Action: ", action, block_id)
        next_state = block_world.get_next_state_based_on_state_tuple(curr_state, (action, block_id))
        new_reward = block_world.get_reward_for_state(next_state, curr_state)
        new_reward += block_world.get_reward_for_state_action_pramodith(curr_state, next_state)
        print("Reward:", new_reward)
        if new_reward >= 100:
            print("Converged in %d" % cnt)
            return cnt
        print("q:", q_old.get(curr_state, None))
        block_world.update_state_from_tuple_pramodith(next_state)
        block_world.render()
        # time.sleep(0.1)
    return cnt
def random_exploration(self):
    gamma = 0.1
    q = defaultdict(lambda: 0)
    episode_count = 2
    prev_action = None
    for ep in range(episode_count):
        block_world = BlockWorld(self.states_x, self.states_y, self.blocks_count, 1, record=True)
        print("Goal: ", [COLORS_STR[i] for stack in block_world.goal_config for i in stack])
        while block_world.get_reward() != 0:
            block_world.pre_render()
            state = block_world.get_state_as_tuple()
            action, block_id = self.get_random_action_from_prev_action(prev_action)
            next_state = block_world.get_next_state_based_on_state_tuple(state, (action, block_id))
            q_val = gamma * max([q[next_state, b]
                                 for b in self.get_allowed_actions_from_prev_action((action, block_id))])
            q[(block_world.get_state_as_tuple(), action)] = \
                block_world.get_reward_for_state(state, block_world.goal_config.tolist()) + q_val
            block_world.update_state_from_tuple(next_state)
            prev_action = action, block_id
            block_world.render()
def q_learning_random(self):
    episode_count = 100
    success_count = 0
    # Assumption: q and nu are not defined anywhere in this method, so start from an
    # empty Q-table and act fully at random (nu = 1.0) to make the rollout runnable.
    q = defaultdict(lambda: defaultdict(lambda: 0))
    nu = 1.0
    for ep in range(episode_count):
        block_world = BlockWorld(self.states_x, self.states_y, self.blocks_count, 1, record=False)
        print("Goal: %s" % [COLORS_STR[i] for stack in block_world.goal_config for i in stack])
        for iteration in range(5000):
            block_world.pre_render()
            curr_state = block_world.get_state_as_tuple()
            action, block_id = self.get_next_action(curr_state, q, nu)
            next_state = block_world.get_next_state_based_on_state_tuple(curr_state, (action, block_id))
            is_goal_next = block_world.get_reward_for_state(next_state, block_world.goal_config) == 0
            if self.debug:
                print("Current State: ", curr_state, is_goal_next)
            block_world.update_state_from_tuple(next_state)
            block_world.render()
            if is_goal_next:
                print("Goal State Reached!!! in %d iterations" % iteration)
                success_count += 1
                break
            if iteration % 100 == 1:
                print(ep, iteration)
            if self.debug:
                print(iteration)
    print("success_count: ", success_count)
def random_exploration2(self):
    episode_count = 2
    prev_action = None
    for ep in range(episode_count):
        block_world = BlockWorld(self.states_x, self.states_y, self.blocks_count, 1, record=True)
        print("Goal: ", [COLORS_STR[i] for stack in block_world.goal_config for i in stack])
        while block_world.get_reward() != 0:
            block_world.pre_render()
            action, block_id = self.get_random_action_from_prev_action(prev_action)
            print("Action chosen :", action, block_id)
            if action != Action.DROP and action != Action.PICK:
                block_world.move_block_by_action(action, block_id)
            prev_action = action, block_id
            block_world.render()
def q_learning_real(self, starting_nu=0.0, use_old=True, record=False, demo_id=1, goal_config=None):
    """Interactive Q-learning: the agent acts from the current Q-table unless the user pauses
    (SPACE) and overrides it with keyboard/mouse corrections, which can optionally be recorded."""
    alpha = 1
    gamma = 0.1
    action = None
    block_id = None
    picked = False
    paused = False
    user_choice = True
    user_motion_pick = False
    rendered_pick = True
    record_actions = {}
    if use_old:
        if demo_id == 1:
            q_old = Demonstrations.load_obj("q_table/q_3_blocks_all_goals")
        else:
            q_old = Demonstrations.load_obj("q_table/q_demo_" + str(demo_id - 1))
    else:
        q_old = {}
    nu = starting_nu
    block_world = BlockWorld(self.states_x, self.states_y, self.blocks_count,
                             self.stack_count, record=False, goal_config=goal_config)
    if record:
        record_actions["starting_state"] = [(block_world.block_dict[i].rect.centerx,
                                             block_world.block_dict[i].rect.centery)
                                            for i in range(self.blocks_count)]
        record_actions["goal_config"] = [block_world.goal_config]
        record_actions["actions"] = []
    if self.debug:
        print("Goal: ", [[COLORS_STR[i] for i in stack if i >= 0] for stack in block_world.goal_config])
    cnt = 0
    q = q_old.copy()
    while cnt < self.iteration_count:
        cnt += 1
        block_world.pre_render()
        curr_state = block_world.get_state_as_tuple_pramodith2()
        if curr_state not in q:
            q[curr_state] = {}
        if self.debug:
            print("Current State: ", curr_state)
        user_choice = True
        action = None
        user_motion_pick = False
        # Poll pygame events so the expert can pause and inject corrective actions.
        while user_choice or paused:
            time.sleep(0.5)
            for event in pygame.event.get():
                if event.type == KEYDOWN:
                    if event.key == K_SPACE:
                        if not paused:
                            paused = True
                        else:
                            if block_id is not None:
                                block_world.block_dict[block_id].surf.fill(COLORS[block_id])
                            user_motion_pick = False
                            paused = False
                            user_choice = False
                        if rendered_pick and paused:
                            print('Waiting for user correction.')
                            print(block_id)
                    if event.key == K_UP:
                        user_choice = True
                        picked = False
                        user_motion_pick = True
                        action = Action.MOVE_UP
                    elif event.key == K_DOWN:
                        user_choice = True
                        picked = False
                        user_motion_pick = True
                        action = Action.MOVE_DOWN
                    elif event.key == K_LEFT:
                        user_choice = True
                        user_motion_pick = True
                        picked = False
                        action = Action.MOVE_LEFT
                    elif event.key == K_RIGHT:
                        user_choice = True
                        user_motion_pick = True
                        picked = False
                        action = Action.MOVE_RIGHT
                elif event.type == pygame.MOUSEBUTTONDOWN:
                    if paused:
                        pos = pygame.mouse.get_pos()
                        for block in block_world.block_dict.values():
                            if block.rect.collidepoint(pos):
                                if block_id is not None:
                                    block_world.block_dict[block_id].surf.fill(COLORS[block_id])
                                action = Action.PICK
                                block_id = block.id
                                user_choice = True
                                picked = True
                                block_world.block_dict[block_id].surf.fill(CLICKED_COLOR[block_id])
                                rendered_pick = False
                                break
            if not user_motion_pick and not paused:
                user_choice = False
            if not paused or (not rendered_pick or user_motion_pick):
                break
        if not user_choice:
            # No user override: act from the current Q-table.
            action, block_id = self.get_next_action(curr_state, q, nu)
            if record:
                record_actions["actions"].append(('algorithm', action, block_id))
        else:
            if action == Action.PICK:
                rendered_pick = True
            print("Skipping model's choice to listen to the expert")
            if record and action:
                record_actions["actions"].append(('user', action, block_id))
        if self.debug:
            print("Action: ", action, block_id)
        next_state = block_world.get_next_state_based_on_state_tuple(curr_state, (action, block_id))
        new_reward = block_world.get_reward_for_state(next_state, curr_state)
        new_reward += block_world.get_reward_for_state_action_pramodith(curr_state, next_state)
        if (new_reward > 1 or new_reward < -1) and self.debug:
            print("next_state: ", next_state)
            print("new_reward: ", new_reward)
        if (action, block_id) in q[curr_state]:
            q_sa = q[curr_state][(action, block_id)]
        else:
            q_sa = 0
            q[curr_state][(action, block_id)] = 0
        if next_state in q and len(q[next_state]) > 0:
            max_q_dash_s_dash_a_dash = max([q[next_state][a_dash] for a_dash in q[next_state]])
        else:
            max_q_dash_s_dash_a_dash = 0
        if self.debug:
            print("max_q:", max_q_dash_s_dash_a_dash)
        if new_reward > 70:
            # Goal-level reward: take the terminal update and stop this demonstration.
            q[curr_state][(action, block_id)] = ((1 - alpha) * q_sa) + (alpha * new_reward)
            break
        else:
            q[curr_state][(action, block_id)] += alpha * (new_reward + gamma * max_q_dash_s_dash_a_dash - q_sa)
        if self.debug:
            print("q:", q[curr_state][(action, block_id)])
        block_world.update_state_from_tuple_pramodith(next_state)
        block_world.render()
        time.sleep(0.1)
    pygame.display.quit()
    Demonstrations.save_obj(q, "q_table/q_demo_" + str(demo_id))
    Demonstrations.save_obj(record_actions, "state_action_recording/demo_" + str(demo_id))
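# For reference, the recording written above (when record=True) has the shape:
#   record_actions["starting_state"] -> [(centerx, centery), ...], one entry per block
#   record_actions["goal_config"]    -> [goal_config]
#   record_actions["actions"]        -> [("algorithm" | "user", action, block_id), ...]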
def q_learning(self, q=None, starting_nu=1.0, decay_nu=True, decay_rate=0.9995):
    """Tabular Q-learning over full-state tuples, with an exploration rate nu that decays per iteration."""
    gamma = 0.1
    alpha = 1
    episode_count = 100
    if not q:
        q = defaultdict(lambda: defaultdict(lambda: 0))
    success_count = 0
    for ep in range(episode_count):
        block_world = BlockWorld(self.states_x, self.states_y, self.blocks_count, 1, record=False)
        nu = starting_nu
        print("Goal: ", [COLORS_STR[i] for stack in block_world.goal_config for i in stack])
        for iteration in range(self.iteration_count):
            block_world.pre_render()
            curr_state = block_world.get_state_as_tuple()
            if self.debug:
                print("Current State: ", curr_state)
            action, block_id = self.get_next_action(curr_state, q, nu)
            if self.debug:
                print("Action: ", action, block_id)
            next_state = block_world.get_next_state_based_on_state_tuple(curr_state, (action, block_id))
            new_reward = block_world.get_reward_for_state(next_state, block_world.goal_config)
            if self.debug:
                print("next_state: ", next_state)
                print("new_reward: ", new_reward)
            q_i = q[curr_state][(action, block_id)]
            if len(q[next_state]) > 0:
                max_q = max([q[next_state][a_dash] for a_dash in q[next_state]])
            else:
                max_q = 0
            if self.debug:
                print("max_q:", max_q)
            q[curr_state][(action, block_id)] = ((1 - alpha) * q_i) + (alpha * (new_reward + gamma * max_q))
            if self.debug:
                print("q:", q[curr_state][(action, block_id)])
            block_world.update_state_from_tuple(next_state)
            block_world.render()
            if new_reward == 1:
                print("Goal State Reached!!! in %d iterations" % iteration)
                success_count += 1
                break
            if decay_nu and iteration > 50:
                nu = decay_rate * nu
            if iteration % 100 == 1:
                print("EP[%d]It[%d]: Q[%d], nu:[%f]" % (ep, iteration, len(q), nu))
    print("success_count: ", success_count)
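# The trainers in this file all apply the same tabular update,
#     Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a')),
# with alpha = 1 and gamma = 0.1. A minimal, self-contained sketch of that rule follows;
# it is illustrative only (the helper name is made up and nothing in this file calls it).
def _q_update_sketch(q_sa, reward, max_q_next, alpha=1.0, gamma=0.1):
    """Return the updated Q(s, a) for a single (s, a, r, s') transition."""
    return (1 - alpha) * q_sa + alpha * (reward + gamma * max_q_next)
# Example: with Q(s, a) = 0, r = 1 and max_a' Q(s', a') = 2, the new value is 1 + 0.1 * 2 = 1.2.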
def q_learning_real(self, starting_nu=0.0, use_old=True):
    """Train (or refine) a tabular Q oracle on the pramodith state encoding and save it when done."""
    alpha = 1
    gamma = 0.1
    actions_queue = deque(maxlen=5)
    state_distance_queue = deque(maxlen=6)
    converged = False
    if use_old:
        q_old = RLTrainer.load_obj(r"Q\q_oracle")
    else:
        q_old = {}
    nu = starting_nu
    block_world = BlockWorld(self.states_x, self.states_y, self.blocks_count,
                             self.stack_count, record=False)
    if self.debug:
        print("Goal: ", [[COLORS_STR[i] for i in stack if i >= 0] for stack in block_world.goal_config])
    state_s = State([[block_world.block_dict[i].rect.centerx, block_world.block_dict[i].rect.centery]
                     for i in range(self.blocks_count)],
                    selected_index=block_world.selected_block_id,
                    goal_config=block_world.goal_config,
                    screen_dims=(block_world.screen_width, block_world.screen_height))
    block_world.goal_loc = state_s.goal_positions
    remaining_prob = 1 - nu
    do_next = 0
    cnt = 0
    q = q_old.copy()
    while cnt < self.iteration_count:
        cnt += 1
        # while not converged:
        block_world.pre_render(True)
        curr_state = block_world.get_state_as_tuple_pramodith2()
        if curr_state not in q:
            q[curr_state] = {}
        # print("Current State: ", curr_state)
        '''
        if np.random.rand() < 0.1:
            state_s = State([[block_world.block_dict[i].rect.centerx, block_world.block_dict[i].rect.centery]
                             for i in range(self.blocks_count)],
                            selected_index=block_world.selected_block_id,
                            goal_config=block_world.goal_config,
                            screen_dims=(block_world.screen_width, block_world.screen_height))
            if state_s.goal_reached():
                print("REACHED...")
                break
            # block_world.goal_loc = state_s.goal_positions
            action, block_id = self.get_next_action_supervised_t(state_t=None, state_s=state_s, q=None, nu=0)
        else:
            if action == Action.PICK or action == Action.DROP:
                actions_queue.append(0)
            else:
                actions_queue.append(1)
            # action, block_id = self.get_next_action(curr_state, q, nu)
            state_distance_queue.append(curr_state[0])
            if len(state_distance_queue) == 6:
                if (len(set(list(state_distance_queue)[0::2])) == 1
                        and len(set(list(state_distance_queue)[1::2])) == 1) or do_next > 0:
                    state_s = State([[block_world.block_dict[i].rect.centerx, block_world.block_dict[i].rect.centery]
                                     for i in range(self.blocks_count)],
                                    selected_index=block_world.selected_block_id,
                                    goal_config=block_world.goal_config,
                                    screen_dims=(block_world.screen_width, block_world.screen_height))
                    action, block_id = self.get_next_action_supervised_t(state_t=None, state_s=state_s, q=None, nu=0)
        '''
        action, block_id = self.get_next_action(curr_state, q, nu)
        if self.debug:
            print("Action: ", action, block_id)
        next_state = block_world.get_next_state_based_on_state_tuple(curr_state, (action, block_id))
        new_reward = block_world.get_reward_for_state(next_state, curr_state)
        new_reward += block_world.get_reward_for_state_action_pramodith(curr_state, next_state)
        if (new_reward > 1 or new_reward < -1) and self.debug:
            print("next_state: ", next_state)
            print("new_reward: ", new_reward)
        # ever_seen_goal = ever_seen_goal or new_reward == 1
        if (action, block_id) in q[curr_state]:
            q_sa = q[curr_state][(action, block_id)]
        else:
            q_sa = 0
            q[curr_state][(action, block_id)] = 0
        if next_state in q and len(q[next_state]) > 0:
            max_q_dash_s_dash_a_dash = max([q[next_state][a_dash] for a_dash in q[next_state]])
        else:
            max_q_dash_s_dash_a_dash = 0
        if self.debug:
            print("max_q:", max_q_dash_s_dash_a_dash)
        if new_reward > 70:
            # Goal-level reward: take the terminal update and stop training, so the learned
            # table still reaches the save below.
            q[curr_state][(action, block_id)] = ((1 - alpha) * q_sa) + (alpha * new_reward)
            break
        else:
            q[curr_state][(action, block_id)] += alpha * (new_reward + gamma * max_q_dash_s_dash_a_dash - q_sa)
        if self.debug:
            print("q:", q[curr_state][(action, block_id)])
        block_world.update_state_from_tuple_pramodith(next_state)
        if cnt > 4000 and cnt % 250 == 0 and nu > 0.05:
            alpha -= 0.1
            # print(cnt)
            # nu -= 0.1
            # nu *= 0.9995
        block_world.render()
        # converged = ever_seen_goal and q == q_old
        # q_old = q
        # time.sleep(0.1)
    pygame.display.quit()
    # self.test_q_learning_real(q)
    RLTrainer.save_obj(q, r"Q\q_oracle")
def test_q_learning_real(self, q_old, starting_nu=0.1):
    """Roll out a learned Q-table, saving one frame per step to sample_videos/ until a goal-level reward is hit."""
    nu = starting_nu
    prev_reward = 0
    block_world = BlockWorld(self.states_x, self.states_y, self.blocks_count,
                             self.stack_count, record=False)
    ever_seen_goal = False
    cnt = 0
    '''
    state_s = State(
        [[block_world.block_dict[i].rect.centerx, block_world.block_dict[i].rect.centery]
         for i in range(self.blocks_count)],
        block_world.selected_block_id, block_world.goal_config,
        screen_dims=(block_world.screen_width, block_world.screen_height))
    block_world.goal_loc = state_s.goal_positions
    '''
    while cnt < self.iteration_count:
        cnt += 1
        # q = q_old.copy()
        block_world.pre_render()
        curr_state = block_world.get_state_as_tuple_pramodith2()
        if self.debug and curr_state in q_old:
            print("Current State: %s" % str(curr_state), q_old[curr_state])
        # if np.random.rand() < 0.01:
        '''
        state_s = State([[block_world.block_dict[i].rect.centerx, block_world.block_dict[i].rect.centery]
                         for i in range(self.blocks_count)],
                        selected_index=block_world.selected_block_id,
                        goal_config=block_world.goal_config,
                        screen_dims=(block_world.screen_width, block_world.screen_height))
        action, block_id = self.get_next_action_supervised_t(state_t=None, state_s=state_s, q=None, nu=0)
        '''
        action, block_id = self.get_next_action(curr_state, q_old, nu)
        if self.debug:
            print("Action: ", action, block_id)
        next_state = block_world.get_next_state_based_on_state_tuple(curr_state, (action, block_id))
        new_reward = block_world.get_reward_for_state(next_state, curr_state)
        new_reward += block_world.get_reward_for_state_action_pramodith(curr_state, next_state)
        # print("Reward")
        # print(new_reward)
        if new_reward > 70:
            prev_reward = 71
            print("Converged in %d" % cnt)
            # return cnt
        # if self.debug:
        #     print("q:", q_old.get(str(curr_state), None))
        block_world.update_state_from_tuple_pramodith(next_state)
        block_world.render()
        pygame.image.save(block_world.screen, "sample_videos/3_block_" + str(cnt) + ".jpg")
        if prev_reward > 70:
            return cnt
        time.sleep(0.1)
    return cnt
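# A minimal usage sketch (assumptions: the trainer construction below is hypothetical; these
# methods only require states_x, states_y, blocks_count, stack_count, iteration_count and
# debug to be set on the instance):
#
#     trainer = RLTrainer(...)                                   # build the trainer however the repo does
#     trainer.q_learning_real(starting_nu=0.0, use_old=False)    # learn and save the Q oracle
#     q = RLTrainer.load_obj(r"Q\q_oracle")
#     trainer.test_q_learning_real(q)                            # rollout; writes frames to sample_videos/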