def set_action_values(self, action_values: Dict[FarmState, List[float]]):
    grid_dim_x, grid_dim_y = self.env.grid_shape

    for grid_arrows_row in self.grid_arrows:
        for grid_arrow in grid_arrows_row:
            self.board.delete(grid_arrow)

    self.grid_arrows: List[List[List]] = []
    for pos_i in range(grid_dim_x):
        grid_arrows_row: List = []
        for pos_j in range(grid_dim_y):
            state: FarmState = FarmState((pos_i, pos_j), self.goal_idx, self.plant_idxs, self.rocks_idxs)
            if self.env.is_terminal(state):
                continue

            for action, action_value in enumerate(action_values[state]):
                color = _get_color(action_value, self.val_min, self.val_max)
                # color = hsl_interp((action_value - self.val_min) / (self.val_max - self.val_min))

                grid_arrow = self._create_arrow(action, pos_i, pos_j, color)
                grid_arrows_row.append(grid_arrow)

        self.grid_arrows.append(grid_arrows_row)
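# _get_color is used above (and in the other *_color update methods) but is not shown in this
# section. The sketch below is a minimal stand-in, assuming it maps a value linearly onto a
# red-to-green Tk hex color between val_min and val_max; the real helper may use a different
# color map (e.g., the commented-out hsl_interp).
def _get_color_sketch(val: float, val_min: float, val_max: float) -> str:
    frac = (val - val_min) / max(val_max - val_min, 1e-9)  # normalize to [0, 1]
    frac = min(max(frac, 0.0), 1.0)                        # clamp values outside the range
    red = int(255 * (1.0 - frac))                          # low values -> red
    green = int(255 * frac)                                # high values -> green
    return "#%02x%02x%02x" % (red, green, 0)               # Tk accepts #rrggbb color strings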
def _update_policy(self, policy: Dict[FarmState, List[float]]):
    grid_dim_x, grid_dim_y = self.env.grid_shape

    for grid_arrows_row in self.grid_arrows:
        for grid_arrow in grid_arrows_row:
            self.board.delete(grid_arrow)

    self.grid_arrows: List[List[List]] = []
    for pos_i in range(grid_dim_x):
        grid_arrows_row: List = []
        for pos_j in range(grid_dim_y):
            state: FarmState = FarmState((pos_i, pos_j), self.goal_idx, self.plant_idxs, self.rocks_idxs)
            if self.env.is_terminal(state):
                continue

            for action, policy_prob in enumerate(policy[state]):
                if policy_prob == 0.0:
                    continue
                # darker shades correspond to higher-probability actions (gray0 is black, gray100 is white)
                color: str = "gray%i" % (100 - 100 * policy_prob)

                grid_arrow = self._create_arrow(action, pos_i, pos_j, color)
                grid_arrows_row.append(grid_arrow)

        self.grid_arrows.append(grid_arrows_row)
def _update_action_vals_color_dqn(self, dqn, device):
    dqn.eval()
    cell_score_max: float = self.val_max
    cell_score_min: float = self.val_min

    grid_dim_x, grid_dim_y = self.env.grid_shape
    for pos_i in range(grid_dim_x):
        for pos_j in range(grid_dim_y):
            state: FarmState = FarmState((pos_i, pos_j), self.goal_idx, self.plant_idxs, self.rocks_idxs)
            if self.env.is_terminal(state):
                continue

            state_tens = torch.tensor(self.env.state_to_nnet_input(state), device=device)
            action_vals_state = dqn(state_tens.float()).cpu().data.numpy()[0, :]
            for action in range(self.num_actions):
                action_val: float = action_vals_state[action]
                color = _get_color(action_val, cell_score_min, cell_score_max)

                self.board.itemconfigure(self.action_val_arrows[pos_i][pos_j][action], fill=color)
def greedy_policy_vis(self, num_steps: int):
    def _update():
        self.window.update()

    curr_state = FarmState(self.start_idx, self.goal_idx, self.plant_idxs, self.rocks_idxs)

    self.board.delete(self.agent_img)
    self.agent_img = self._place_imgs(self.board, self.robot_pic, [curr_state.agent_idx])[0]
    _update()
    time.sleep(self.wait)

    print("Step: ", end='', flush=True)
    for itr in range(num_steps):
        print("%i..." % itr, end='', flush=True)
        if self.env.is_terminal(curr_state):
            break

        action: int = int(np.argmax(self.action_vals[curr_state]))
        curr_state, _ = self.env.sample_transition(curr_state, action)

        self.board.delete(self.agent_img)
        self.agent_img = self._place_imgs(self.board, self.robot_pic, [curr_state.agent_idx])[0]
        _update()
        time.sleep(self.wait)

    print("")
def q_learning(self, epsilon: float, learning_rate: float, wait_step: float):
    state: FarmState = FarmState(self.start_idx, self.goal_idx, self.plant_idxs, self.rocks_idxs)

    def _update():
        self._update_action_vals_color()
        self.window.update()

    episode_num: int = 0
    print("Q-learning, episode %i" % episode_num)
    while episode_num < 1000:
        if self.env.is_terminal(state):
            # episode finished: periodically visualize the greedy policy, then reset to the start state
            episode_num = episode_num + 1
            if episode_num % 100 == 0:
                print("Visualizing greedy policy")
                _update()
                self.greedy_policy_vis(40)

            state = FarmState(self.start_idx, self.goal_idx, self.plant_idxs, self.rocks_idxs)
            print("Q-learning, episode %i" % episode_num)

        # one epsilon-greedy step plus a tabular Q-value update
        state, self.action_vals = q_learning_step(self.env, state, self.action_vals, epsilon,
                                                  learning_rate, self.discount)

        if wait_step > 0.0:
            self.board.delete(self.agent_img)
            self.agent_img = self._place_imgs(self.board, self.robot_pic, [state.agent_idx])[0]
            _update()
            time.sleep(wait_step)

    _update()
    print("DONE")
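# q_learning_step is imported from elsewhere in the assignment and not shown in this section.
# As a point of reference, below is a minimal sketch of a tabular Q-learning step, assuming the
# same call signature used above; the real implementation may differ (e.g., in tie-breaking or
# exploration details).
def q_learning_step_sketch(env, state, action_vals, epsilon, learning_rate, discount):
    # epsilon-greedy action selection
    if np.random.rand() < epsilon:
        action = np.random.randint(len(action_vals[state]))
    else:
        action = int(np.argmax(action_vals[state]))

    # take the action and observe the next state and reward
    state_next, reward = env.sample_transition(state, action)

    # TD target: reward plus discounted value of the greedy next action (zero at terminal states)
    target = reward
    if not env.is_terminal(state_next):
        target = reward + discount * max(action_vals[state_next])

    # move Q(s, a) toward the target
    action_vals[state][action] += learning_rate * (target - action_vals[state][action])

    return state_next, action_vals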
def _init_action_vals_color(self):
    grid_dim_x, grid_dim_y = self.env.grid_shape

    self.action_val_arrows = []
    for pos_i in range(grid_dim_x):
        grid_arrows_row: List = []
        for pos_j in range(grid_dim_y):
            state_action_val_arrows: List = []
            state = FarmState((pos_i, pos_j), self.goal_idx, self.plant_idxs, self.rocks_idxs)
            if not self.env.is_terminal(state):
                for action in range(self.num_actions):
                    grid_arrow = self._create_arrow(action, pos_i, pos_j, "white")
                    state_action_val_arrows.append(grid_arrow)

            grid_arrows_row.append(state_action_val_arrows)

        self.action_val_arrows.append(grid_arrows_row)
def _update_action_vals_color(self):
    cell_score_max: float = self.val_max
    if self.val_min is None:
        cell_score_min: float = min(self.state_vals.values()) - 1E-9
    else:
        cell_score_min: float = self.val_min

    grid_dim_x, grid_dim_y = self.env.grid_shape
    for pos_i in range(grid_dim_x):
        for pos_j in range(grid_dim_y):
            state: FarmState = FarmState((pos_i, pos_j), self.goal_idx, self.plant_idxs, self.rocks_idxs)
            if self.env.is_terminal(state):
                continue

            for action in range(self.num_actions):
                action_val: float = self.action_vals[state][action]
                color = _get_color(action_val, cell_score_min, cell_score_max)

                self.board.itemconfigure(self.action_val_arrows[pos_i][pos_j][action], fill=color)
def get_environment(env_name: str):
    env_name = env_name.lower()
    farm_regex = re.search(r"aifarm(_(\S+))?", env_name)

    env: Environment
    if farm_regex is not None:
        from environments.farm_grid_world import FarmGridWorld, FarmState
        from visualizer.farm_visualizer import InteractiveFarm

        grid = np.loadtxt("maps/map1.txt")
        grid = np.transpose(grid)

        assert np.sum(grid == 1) == 1, "Only one agent allowed"
        assert np.sum(grid == 2) == 1, "Only one goal allowed"

        env: FarmGridWorld = FarmGridWorld(grid.shape, float(farm_regex.group(2)), grid)
        viz = InteractiveFarm(env, grid)

        # get states
        states: List[FarmState] = []
        for pos_i in range(grid.shape[0]):
            for pos_j in range(grid.shape[1]):
                state: FarmState = FarmState((pos_i, pos_j), viz.goal_idx, viz.plant_idxs, viz.rocks_idxs)
                states.append(state)
    elif env_name == "puzzle8":
        from environments.n_puzzle import NPuzzle
        env = NPuzzle(3)
        states = pickle.load(open("data/puzzle8.pkl", "rb"))['states']
        viz = None
    else:
        raise ValueError('No known environment %s' % env_name)

    return env, viz, states
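# Example usage of get_environment (illustrative only, assuming it is run from the repository
# root so maps/map1.txt and data/puzzle8.pkl exist). The name "aifarm_<x>" passes <x> to
# FarmGridWorld as a float whose exact meaning is defined by FarmGridWorld, while "puzzle8"
# loads the 8-puzzle together with its precomputed states:
#     env, viz, states = get_environment("aifarm_0.0")
#     print("Loaded %i states" % len(states))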
def greedy_policy_vis_dqn(self, num_steps: int, dqn: nn.Module, device):
    def _update():
        self.window.update()

    curr_state = FarmState(self.start_idx, self.goal_idx, self.plant_idxs, self.rocks_idxs)

    self.board.delete(self.agent_img)
    self.agent_img = self._place_imgs(self.board, self.robot_pic, [curr_state.agent_idx])[0]
    _update()
    time.sleep(self.wait)

    print("Step: ", end='', flush=True)
    for itr in range(num_steps):
        print("%i..." % itr, end='', flush=True)
        if self.env.is_terminal(curr_state):
            break

        state_tens = torch.tensor(self.env.state_to_nnet_input(curr_state), device=device)
        action_vals_state = dqn(state_tens.float()).cpu().data.numpy()[0, :]
        action: int = int(np.argmax(action_vals_state))
        curr_state, _ = self.env.sample_transition(curr_state, action)

        self.board.delete(self.agent_img)
        self.agent_img = self._place_imgs(self.board, self.robot_pic, [curr_state.agent_idx])[0]
        _update()
        time.sleep(self.wait)

    print("")
def astar(self, weight: float):
    # get nnet
    torch.set_num_threads(1)
    device: torch.device = torch.device("cpu")
    nnet = self.env.get_state_value_nnet()
    state_dict = torch.load("saved_models/supervised_small/model_state_dict.pt")
    nnet.load_state_dict(state_dict)
    nnet.eval()

    # get heuristic function
    def heuristic_fn(states):
        # return np.zeros(len(states))
        nnet_inputs_np_l = [self.env.state_to_nnet_input(state_i) for state_i in states]
        nnet_input_np = np.concatenate(nnet_inputs_np_l, axis=0)
        nnet_input = torch.tensor(nnet_input_np, device=device)
        state_vals: np.ndarray = nnet(nnet_input.float()).cpu().data.numpy()[:, 0]

        # state values are negated to obtain a cost-to-go heuristic
        return -state_vals

    state: FarmState = FarmState(self.start_idx, self.goal_idx, self.plant_idxs, self.rocks_idxs)
    astar = AStar(state, self.env, heuristic_fn, weight)

    # text items for displaying g, h, and f values per grid cell
    grid_dim_x, grid_dim_y = self.env.grid_shape
    grid_text_astar: List[List[List]] = []
    for pos_i in range(grid_dim_x):
        grid_text_rows: List = []
        for pos_j in range(grid_dim_y):
            txt_i = (pos_i + 0.5) * self.width
            txt_j = pos_j * self.width + self.text_offset
            txt1 = self.board.create_text(txt_i, txt_j, text="", fill="black")
            txt2 = self.board.create_text(txt_i, txt_j + 20, text="", fill="black")
            txt3 = self.board.create_text(txt_i, txt_j + 40, text="", fill="black")
            grid_text_rows.append([txt1, txt2, txt3])

        grid_text_astar.append(grid_text_rows)

    def _update():
        # closed nodes are shown in red, open nodes in grey along with their g, h, and f values
        for node in astar.instance.closed_dict.keys():
            pos_i_up, pos_j_up = node.state.agent_idx
            self.board.itemconfigure(self.grid_squares[pos_i_up][pos_j_up], fill="red")

        for node in astar.instance.open_set:
            pos_i_up, pos_j_up = node.state.agent_idx
            self.board.itemconfigure(self.grid_squares[pos_i_up][pos_j_up], fill="grey")
            self.board.itemconfigure(grid_text_astar[pos_i_up][pos_j_up][0], text='g=%.1f' % node.path_cost)
            self.board.itemconfigure(grid_text_astar[pos_i_up][pos_j_up][1], text='h=%.1f' % node.heuristic)
            self.board.itemconfigure(grid_text_astar[pos_i_up][pos_j_up][2], text='f=%.1f' % node.cost)

        self.window.update()

    while not astar.is_solved():
        if self.wait > 0:
            _update()
            time.sleep(self.wait)

        astar.step(heuristic_fn)

    if self.wait > 0:
        _update()
        time.sleep(self.wait)

    # animate the solution path
    actions = astar.get_soln_actions()
    for action in actions:
        state = self.env.sample_transition(state, action)[0]

        self.board.delete(self.agent_img)
        self.agent_img = self._place_imgs(self.board, self.robot_pic, [state.agent_idx])[0]
        self.window.update()
        time.sleep(0.1)
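# The weight argument above controls weighted A*. Assuming the usual weighted A* convention
# (the AStar class is defined elsewhere and may differ), each node's priority would be
#     f_cost = node.path_cost + weight * node.heuristic
# With weight = 1 this reduces to standard A*; larger weights make the search greedier toward
# the goal at the cost of solution optimality.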
def policy_gradient(self, learning_rate: float, wait_step: float):
    torch.set_num_threads(1)
    device: torch.device = torch.device("cpu")
    nnet = self.env.get_policy_nnet()
    optimizer: Optimizer = optim.Adam(nnet.parameters(), lr=learning_rate)

    def _update():
        # visualize the current policy probabilities as arrows
        nnet.eval()
        policy: Dict[FarmState, List[float]] = {}
        for state_up in self.states:
            nnet_input_np_state_up = self.env.state_to_nnet_input(state_up)
            nnet_input_up = torch.tensor(nnet_input_np_state_up, device=device)
            policy[state_up] = list(nnet(nnet_input_up.float()).cpu().data.numpy()[0, :])

        self._update_policy(policy)
        self.window.update()

    episode_num: int = 0
    max_steps: int = 100
    while episode_num < 1000:
        state = FarmState(self.start_idx, self.goal_idx, self.plant_idxs, self.rocks_idxs)
        print("Policy gradient, episode %i" % episode_num)
        episode_num = episode_num + 1
        if episode_num % 10 == 0:
            print("Visualizing policy")
            _update()
            time.sleep(self.wait)

        # roll out one episode with the current policy
        actions: List[int] = []
        rewards: List[float] = []
        nnet_inputs_np_l = []
        nnet.eval()
        for episode_step in range(max_steps):
            nnet_input_np_state = self.env.state_to_nnet_input(state)
            nnet_input = torch.tensor(nnet_input_np_state, device=device)
            action_probs = nnet(nnet_input.float()).cpu().data.numpy()[0, :]
            nnet_inputs_np_l.append(nnet_input_np_state)

            action = np.random.choice(4, p=action_probs)
            state, reward = self.env.sample_transition(state, action)

            actions.append(action)
            rewards.append(reward)

            if self.env.is_terminal(state):
                break

        nnet_inputs_np = np.concatenate(nnet_inputs_np_l, axis=0)

        # REINFORCE update: weight the log-probabilities of the taken actions by the reward-to-go
        nnet.train()
        optimizer.zero_grad()

        nnet_inputs = torch.tensor(nnet_inputs_np, device=device)
        nnet_outputs = nnet(nnet_inputs.float())

        actions_t = torch.tensor(np.array(actions), device=device).long().unsqueeze(1)
        log_prob = torch.log(nnet_outputs)
        log_prob_actions = log_prob.gather(1, actions_t)[:, 0]

        returns = np.cumsum(np.array(rewards)[::-1])[::-1]
        returns_t = torch.tensor(returns.astype(np.float32), device=device)

        loss = torch.mean(-returns_t * log_prob_actions)
        loss.backward()
        optimizer.step()

        if wait_step > 0.0:
            self.board.delete(self.agent_img)
            self.agent_img = self._place_imgs(self.board, self.robot_pic, [state.agent_idx])[0]
            _update()
            time.sleep(wait_step)

    _update()
    print("DONE")
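# The returns computed in policy_gradient above are undiscounted rewards-to-go: reversing the
# reward list, taking a cumulative sum, and reversing back gives, for each step t, the total
# reward collected from t to the end of the episode. A small worked example, using nothing
# beyond numpy:
#     rewards = [-1.0, -1.0, 10.0]
#     np.cumsum(np.array(rewards)[::-1])[::-1]  ->  array([ 8.,  9., 10.])
# so the loss term for each step weights log pi(a_t | s_t) by the return from that step onward.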
def deep_q_learning(self, epsilon: float, learning_rate: float, batch_size: int, wait_step: float):
    state: FarmState = FarmState(self.start_idx, self.goal_idx, self.plant_idxs, self.rocks_idxs)

    torch.set_num_threads(1)
    device: torch.device = torch.device("cpu")
    dqn: nn.Module = get_dqn()
    optimizer: Optimizer = optim.Adam(dqn.parameters(), lr=learning_rate)

    dqn_target: nn.Module = get_dqn()
    dqn_target.eval()

    replay_buffer: List = []

    def _update():
        self._update_action_vals_color_dqn(dqn, device)
        self.window.update()

    _update()

    episode_num: int = 0
    step_num: int = 0
    update_num: int = 100  # sync the target network every update_num steps
    total_steps: int = 0
    print("Deep Q-learning, episode %i" % episode_num)
    while episode_num < 1000:
        dqn.eval()
        if self.env.is_terminal(state) or (step_num >= 50):
            # episode finished (or step limit reached): periodically visualize, then reset
            episode_num = episode_num + 1
            if episode_num % 100 == 0:
                print("Visualizing greedy policy")
                _update()
                self.greedy_policy_vis_dqn(40, dqn, device)

            # state = np.random.choice(self.states)
            state = FarmState(self.start_idx, self.goal_idx, self.plant_idxs, self.rocks_idxs)
            step_num = 0
            print("Deep Q-learning, episode %i" % episode_num)

        state, dqn, replay_buffer = deep_q_learning_step(self.env, state, dqn, dqn_target, epsilon,
                                                         self.discount, batch_size, optimizer, device,
                                                         replay_buffer)

        if total_steps % update_num == 0:
            dqn_target.load_state_dict(dqn.state_dict())
            dqn_target.eval()

        # cap the replay buffer size
        if len(replay_buffer) > 10000:
            replay_buffer.pop(0)

        if wait_step > 0.0:
            self.board.delete(self.agent_img)
            self.agent_img = self._place_imgs(self.board, self.robot_pic, [state.agent_idx])[0]
            _update()
            time.sleep(wait_step)

        step_num += 1
        total_steps += 1

    _update()
    print("DONE")
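# get_dqn is imported from elsewhere and not shown in this section. A minimal sketch is given
# below, assuming the network takes the flattened input produced by state_to_nnet_input and
# outputs one Q-value per action; the real architecture, layer sizes, and input width may differ.
def get_dqn_sketch(input_dim: int, num_actions: int = 4) -> nn.Module:
    return nn.Sequential(
        nn.Linear(input_dim, 64),    # input_dim must match the width of env.state_to_nnet_input
        nn.ReLU(),
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, num_actions),  # one Q-value per action (up, down, left, right)
    )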
def __init__(self, env: FarmGridWorld, grid: np.ndarray, discount: float, val_type: str,
             show_policy: bool = True, wait: float = 0.1, val_min: Optional[float] = None):
    # 0: up, 1: down, 2: left, 3: right
    super().__init__()

    # initialize environment
    self.wait: float = wait
    self.val_type: str = val_type.upper()
    self.show_policy: bool = show_policy
    self.val_min: Optional[float] = val_min
    self.val_max: float = 0
    self.env: FarmGridWorld = env
    self.discount: float = discount
    self.num_actions: int = 4

    self.agent_idx: Tuple[int, int] = mask_to_idxs(grid, 1)[0]
    self.start_idx = self.agent_idx
    self.goal_idx: Tuple[int, int] = mask_to_idxs(grid, 2)[0]
    self.plant_idxs: List[Tuple[int, int]] = mask_to_idxs(grid, 3)
    self.rocks_idxs: List[Tuple[int, int]] = mask_to_idxs(grid, 4)

    # enumerate states
    self.states: List[FarmState] = []
    for pos_i in range(grid.shape[0]):
        for pos_j in range(grid.shape[1]):
            state: FarmState = FarmState((pos_i, pos_j), self.goal_idx, self.plant_idxs, self.rocks_idxs)
            self.states.append(state)

    # enumerate value funcs
    self.state_vals: Dict[FarmState, float] = dict()
    self.action_vals: Dict[FarmState, List[float]] = dict()
    self.state_visits: Dict[FarmState, int] = dict()
    for state in self.states:
        self.state_visits[state] = 0
        self.state_vals[state] = 0.0
        if self.env.is_terminal(state):
            self.action_vals[state] = [0] * self.num_actions
        else:
            self.action_vals[state] = [0] * self.num_actions

    # initialize board
    self.window = tkinter.Tk()
    self.window.wm_title("CSE 790 Farm")

    self.width: int = 70
    self.width_half: int = int(self.width / 2)
    self.text_offset: int = 17

    # load pictures
    path = os.getcwd() + "/images/"
    self.goal_pic = ImageTk.PhotoImage(file=path + 'goal.png')
    self.plant_pic = ImageTk.PhotoImage(file=path + 'plant.png')
    self.robot_pic = ImageTk.PhotoImage(file=path + 'robot.png')
    self.rocks_pic = ImageTk.PhotoImage(file=path + 'rocks.png')

    grid_dim_x, grid_dim_y = env.grid_shape

    self.board: Canvas = Canvas(self.window, width=grid_dim_y * self.width + 2,
                                height=grid_dim_x * self.width + 2)

    # create initial grid squares
    self.grid_squares: List[List] = []
    for pos_i in range(grid_dim_x):
        grid_squares_row: List = []
        for pos_j in range(grid_dim_y):
            square = self.board.create_rectangle(pos_i * self.width + 4, pos_j * self.width + 4,
                                                 (pos_i + 1) * self.width + 4, (pos_j + 1) * self.width + 4,
                                                 fill="white", width=1)
            grid_squares_row.append(square)

        self.grid_squares.append(grid_squares_row)

    # create figures
    self._place_imgs(self.board, self.goal_pic, [self.goal_idx])
    self._place_imgs(self.board, self.plant_pic, self.plant_idxs)
    self._place_imgs(self.board, self.rocks_pic, self.rocks_idxs)
    self.agent_img = self._place_imgs(self.board, self.robot_pic, [self.agent_idx])[0]

    # create grid arrows
    self.grid_arrows: List[List[List]] = []

    if self.val_type == "STATE":
        # create initial grid values
        self.grid_text: List[List] = []
        for pos_i in range(grid_dim_x):
            grid_text_rows: List = []
            for pos_j in range(grid_dim_y):
                val = self.board.create_text(pos_i * self.width + self.width_half,
                                             pos_j * self.width + self.width_half, text="", fill="black")
                grid_text_rows.append(val)

            self.grid_text.append(grid_text_rows)

    self.board.pack(side=LEFT)

    do_buttons: bool = False
    if do_buttons:
        # make control buttons
        panel = Frame(self.window)
        panel.pack(side=RIGHT)

        Label(text="Buttons\n", font="Verdana 12 bold").pack()
        value_itr_frame = Frame(self.window)
        value_itr_frame.pack()

        b1 = Button(text="Save Figure")
        b1.bind("<Button-1>", self.save_board)
        b1.pack()

        vi_button = Button(text="Value Iteration")
        vi_button.bind("<Button-1>", self.value_iteration)
        vi_button.pack()

    if self.val_type == "ACTION":
        self._init_action_vals_color()

    # self.update()
    # self.monte_carlo_policy_evaluation()
    # self.td_policy_evaluation(5)
    # self.td_lambda_policy_evaluation(0.5)
    # self.policy_evaluation()
    # self.q_learning()

    self.window.update()
def __init__(self, env: FarmGridWorld, grid: np.ndarray):
    # 0: up, 1: down, 2: left, 3: right
    super().__init__()

    # initialize environment
    self.val_max: float = 0
    self.env: FarmGridWorld = env
    self.num_actions: int = 4

    self.agent_idx: Tuple[int, int] = mask_to_idxs(grid, 1)[0]
    self.start_idx = self.agent_idx
    self.goal_idx: Tuple[int, int] = mask_to_idxs(grid, 2)[0]
    self.plant_idxs: List[Tuple[int, int]] = mask_to_idxs(grid, 3)
    self.rocks_idxs: List[Tuple[int, int]] = mask_to_idxs(grid, 4)

    # enumerate states
    self.states: List[FarmState] = []
    for pos_i in range(grid.shape[0]):
        for pos_j in range(grid.shape[1]):
            state: FarmState = FarmState((pos_i, pos_j), self.goal_idx, self.plant_idxs, self.rocks_idxs)
            self.states.append(state)

    # enumerate value funcs
    self.state_vals: Dict[FarmState, float] = dict()
    self.action_vals: Dict[FarmState, List[float]] = dict()
    self.state_visits: Dict[FarmState, int] = dict()
    for state in self.states:
        self.state_visits[state] = 0
        self.state_vals[state] = 0.0
        if self.env.is_terminal(state):
            self.action_vals[state] = [0] * self.num_actions
        else:
            self.action_vals[state] = [0] * self.num_actions

    # initialize board
    self.window = tkinter.Tk()
    self.window.wm_title("AI Farm")

    self.width: int = 70
    self.width_half: int = int(self.width / 2)
    self.text_offset: int = 17

    # load pictures
    path = os.getcwd() + "/images/"
    self.goal_pic = ImageTk.PhotoImage(file=path + 'goal.png')
    self.plant_pic = ImageTk.PhotoImage(file=path + 'plant.png')
    self.robot_pic = ImageTk.PhotoImage(file=path + 'robot.png')
    self.rocks_pic = ImageTk.PhotoImage(file=path + 'rocks.png')

    grid_dim_x, grid_dim_y = env.grid_shape

    self.board: Canvas = Canvas(self.window, width=grid_dim_y * self.width + 2,
                                height=grid_dim_x * self.width + 2)

    # create initial grid squares
    self.grid_squares: List[List] = []
    for pos_i in range(grid_dim_x):
        grid_squares_row: List = []
        for pos_j in range(grid_dim_y):
            square = self.board.create_rectangle(pos_i * self.width + 4, pos_j * self.width + 4,
                                                 (pos_i + 1) * self.width + 4, (pos_j + 1) * self.width + 4,
                                                 fill="white", width=1)
            grid_squares_row.append(square)

        self.grid_squares.append(grid_squares_row)

    # create figures
    self._place_imgs(self.board, self.goal_pic, [self.goal_idx])
    self._place_imgs(self.board, self.plant_pic, self.plant_idxs)
    self._place_imgs(self.board, self.rocks_pic, self.rocks_idxs)
    self.agent_img = self._place_imgs(self.board, self.robot_pic, [self.agent_idx])[0]

    # create grid arrows
    self.grid_arrows: List[List[List]] = []

    self.board.pack(side=LEFT)

    self.window.update()