# Shared imports assumed by the method excerpts in this listing.  Batch,
# Ant44Env0 and target_2_target_vec are project-level utilities that are not
# reproduced here.
import copy
import math
import xml.etree.ElementTree as ET
from random import randint

import numpy as np
import torch


def roll_out_in_env(self, start, goal, ultimate_goal, horizon, mode='test'):
    s_u_goal = self.grid.lls2hls(ultimate_goal)
    roll_out = Batch()
    break_var = False
    s = start
    d = False
    s_list = []
    a_list = []
    r_list = []
    for step_i in range(horizon):
        self.episode_steps += 1
        s_bar = self.grid.lls2hls(s)
        # GET THE TARGET VECTOR
        target_vec = (goal[0] - s[0], goal[1] - s[1])
        s_save = copy.deepcopy(
            np.array(
                list(s[2:]) +
                list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                list(target_vec)))
        s_pos_save = copy.deepcopy(np.array(s[:2]))
        s_list.append(np.concatenate((s_pos_save, s_save)))
        s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
        a = self.policy.select_action(s)
        s_new, r, d, _, info = self.grid.step(action=10 * a)
        s_bar_cand = self.grid.lls2hls(s_new)
        d = (s_bar_cand[0] == goal[0]) and (s_bar_cand[1] == goal[1])
        if d:
            info = False
            r = 1.0
        success_var = ((s_bar_cand[0] == s_u_goal[0])
                       and (s_bar_cand[1] == s_u_goal[1]))
        if success_var:
            info = False
            break_var = True
        break_var = break_var or (not info) or (step_i + 1 == horizon) or (
            self.episode_steps == self.config.time_horizon)
        a_list.append(a)
        r_list.append(r)
        roll_out.append(
            Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                  [s_new.astype(np.float32)], [0 if break_var else 1],
                  [info], [1.0]))
        s = s_new
        if break_var:
            break
    return roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand

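# The low-level-to-high-level mapping `lls2hls` used throughout this listing is
# not shown.  A minimal sketch of what it is assumed to do, namely map a
# continuous (x, y) position to the index of the discrete grid cell containing
# it; the cell size `cell_len` and the origin offset are assumptions:
def lls2hls_sketch(pos, cell_len=1.0, origin=(0.0, 0.0)):
    """Discretise a continuous position into a high-level grid cell."""
    gx = int((pos[0] - origin[0]) // cell_len)
    gy = int((pos[1] - origin[1]) // cell_len)
    return gx, gy
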
def simulate_env(self, mode):
    batch = Batch()
    num_roll_outs = 0
    num_steps = 0
    total_success = 0
    if mode == 'train':
        while num_steps < self.iter_size:
            roll_out, steps, states, actions, rewards, success, start_pos, goal_pos = \
                self.roll_out_in_env(horizon=self.max_iter, mode='train')
            if self.total_steps > self.start_time_steps:
                for _ in range(40):
                    self.manager_policy.train(self.replay_buffer, self.batch_size)
            batch.append(roll_out)
            num_roll_outs += 1
            num_steps += steps
            total_success += success
        return batch, total_success / num_roll_outs, num_steps / num_roll_outs
    else:
        _, steps, states, actions, rewards, success, start_pos, goal_pos = \
            self.roll_out_in_env(horizon=self.max_iter, mode='test')
        return success

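# `Batch` is constructed empty, filled with Batch([a], [s], [r], [s_new],
# [mask], [info], [1.0]) blocks, and queried via append() and length().  Its
# implementation is not part of this listing; the sketch below is only one
# container that would satisfy that interface (the field layout is an
# assumption):
class BatchSketch:
    def __init__(self, *columns):
        # columns: actions, states, rewards, next_states, masks, infos, weights
        if columns:
            self.columns = [list(c) for c in columns]
        else:
            self.columns = [[] for _ in range(7)]

    def append(self, other):
        """Concatenate another batch column-wise."""
        for own, new in zip(self.columns, other.columns):
            own.extend(new)

    def length(self):
        """Number of stored transitions."""
        return len(self.columns[0])
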
def roll_out_in_env(self, start, horizon):
    roll_out = Batch()
    s = start
    s_list = []
    a_list = []
    r_list = []
    success_var = False
    oob = False
    break_var = False
    for step_i in range(horizon):
        target_vec = s['desired_goal'] - s['observation']
        s_save = copy.deepcopy(
            np.array(
                list(target_vec[:2]) + list(s['observation'][2:4]) +
                list(target_vec[2:4])))
        s_list.append(copy.deepcopy(s['observation']))
        s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
        a = self.policy.select_action(s_tensor)
        s_new, r, d, info = self.grid.env.step(a)
        success_var = info["is_success"]
        info = not info["is_success"]
        r = 0.0
        if success_var:
            r = 1.0
            info = False
            break_var = True
        break_var = break_var or (not info) or (step_i + 1 == horizon)
        ib = self.grid.check_in_bounds(s_new['observation'])
        if not ib:
            oob = True
        a_list.append(a)
        r_list.append(r)
        roll_out.append(
            Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                  [s_new['observation'].astype(np.float32)],
                  [0 if (break_var or oob) else 1], [info], [1.0]))
        s = s_new
        if break_var or oob:
            break
    s_list.append(copy.deepcopy(s['observation']))
    return roll_out, s_list, a_list, r_list, success_var

def roll_out_in_env(self, start, goal, ultimate_goal, horizon):
    roll_out = Batch()
    s_u_goal = self.grid.lls2hls(ultimate_goal)
    s = self.grid.reset(start, goal)
    s_list = []
    a_list = []
    r_list = []
    d = False
    for step_i in range(horizon):
        s_bar = self.grid.lls2hls(s)
        target_vec = goal - s[:2]
        s_save = copy.deepcopy(
            np.array(
                list(s[2:]) +
                list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                list(target_vec)))
        s_pos_save = copy.deepcopy(np.array(s[:2]))
        s_list.append(np.concatenate((s_pos_save, s_save)))
        s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
        a = self.policy.select_action(s)
        s_new, r, d, d_wp, info = self.grid.step(action=10 * a)
        s_bar_cand = self.grid.lls2hls(s_new)
        break_var = False
        success_var = ((s_bar_cand[0] == s_u_goal[0])
                       and (s_bar_cand[1] == s_u_goal[1]))
        if success_var:
            info = False
            break_var = True
        a_list.append(a)
        r_list.append(r)
        roll_out.append(
            Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                  [s_new.astype(np.float32)],
                  [0 if ((not info) or (step_i + 1 == horizon) or break_var) else 1],
                  [info], [1.0]))
        s = s_new
        if (not info) or break_var:
            break
    return roll_out, s_list, a_list, r_list, d, s_new

def simulate_env(self, mode):
    batch = Batch()
    num_roll_outs = 0
    num_steps = 0
    total_success = 0
    total_wp_success = 0
    j = 0.
    jwp = 0.
    if mode == 'train':
        while num_steps < self.batch_size:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            roll_out, states, actions, rewards, success, l_state = self.roll_out_in_env(
                start=start_pos, goal=goal_pos, ultimate_goal=goal_pos,
                horizon=self.max_iter)
            st_bar = self.grid.lls2hls(l_state)
            success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
            jwp = 1.
            num_roll_outs += 1
            num_steps += roll_out.length()
            batch.append(roll_out)
            total_success += success
            j += 1
        return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs
    else:
        self.grid.reset_env_terrain()
        start_pos = self.grid.sample_random_start_terrain(number=1)[0]
        goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
        s_goal = self.grid.lls2hls(goal_pos)
        roll_out, states, actions, rewards, success, l_state = self.roll_out_in_env(
            start=start_pos, goal=goal_pos, ultimate_goal=goal_pos,
            horizon=self.max_iter)
        st_bar = self.grid.lls2hls(l_state)
        success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
        num_roll_outs += 1
        num_steps += roll_out.length()
        batch.append(roll_out)
        total_success += success
        j += 1
        return success

def simulate_env(self, mode):
    batch = Batch()
    num_roll_outs = 0
    num_steps = 0
    total_success = 0
    total_wp_success = 0
    j = 0.
    jwp = 0.
    if mode == 'train':
        while num_steps < self.batch_size:
            """ INITIALIZE THE ENVIRONMENT """
            s_init = self.grid.reset()
            roll_out, _, _, _, success = self.roll_out_in_env(
                start=s_init, horizon=self.max_iter)
            jwp = 1.
            num_roll_outs += 1
            num_steps += roll_out.length()
            batch.append(roll_out)
            total_success += success
            j += 1
        return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs
    else:
        """ INITIALIZE THE ENVIRONMENT """
        s_init = self.grid.reset()
        roll_out, state_list, action_list, reward_list, success = self.roll_out_in_env(
            start=s_init, horizon=self.max_iter)
        num_roll_outs += 1
        num_steps += roll_out.length()
        batch.append(roll_out)
        total_success += success
        j += 1
        return success

def simulate_env(self, mode):
    batch = Batch()
    num_roll_outs = 0
    num_steps = 0
    total_success = 0
    total_wp_success = 0
    j = 0.
    jwp = 0.
    if mode == 'train':
        while num_steps < self.config.policy_batch_size:
            """ INITIALIZE THE ENVIRONMENT """
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos, goal_pos)
            self.episode_steps = 0
            """ V MAP """
            if self.config.optimistic_model:
                self.vi.update_p_table_optimistic(
                    occupancy_map=self.grid.occupancy_map, walls=True)
            else:
                self.vi.update_p_table(occupancy_map=self.grid.terrain_map,
                                       walls=True)
            v, pi = self.vi.run_vi(grid=self.grid, goal=(s_goal[0], s_goal[1]))
            """ START THE EPISODE """
            horizon_left = self.config.time_horizon
            st = start_pos
            success = False
            s_bar = self.grid.lls2hls(st)
            hl_s_list = []
            hl_a_list = []
            hl_r_list = []
            hl_d_list = []
            hl_s_list.append(s_bar)
            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR
                self.dqn_steps += 1
                self.eps = 0.01 + 0.99 * math.exp(-1. * self.dqn_steps / 10000)
                s_bar = self.grid.lls2hls(st)
                if torch.rand(1)[0] > self.eps:
                    a_bar = int(pi[s_bar[0], s_bar[1]])
                else:
                    a_bar = randint(0, 7)
                self.vi.set_target(s_bar, a_bar)
                curr_goal = self.vi.get_target()
                roll_out, _, _, _, wp_success, l_state, s_bar_p = self.roll_out_in_env(
                    start=s_init, goal=curr_goal, horizon=self.time_scale,
                    ultimate_goal=goal_pos, mode='train')
                hl_s_list.append(s_bar_p)
                hl_a_list.append(a_bar)
                st = l_state[:2]
                s_init = l_state
                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()
                total_wp_success += wp_success
                jwp += 1
                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
                if success:
                    hl_r_list.append(0)
                    hl_d_list.append(True)
                else:
                    hl_r_list.append(-1)
                    hl_d_list.append(False)
                batch.append(roll_out)
            total_success += success
            j += 1
            if not self.config.optimistic_model:
                x_temp, y_temp, w_temp = self.vi.generate_dataset_flat(
                    self.grid.terrain_map, hl_s_list, hl_a_list)
                for bi in range(x_temp.shape[0]):
                    self.buffer.add(x_temp[bi], y_temp[bi], w_temp[bi])
                self.vi.train_net(buffer=self.buffer, bs=128,
                                  opt_iterations=40, rw=True)
        return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs
    else:
        self.grid.reset_env_terrain()
        start_pos = self.grid.sample_random_start_terrain(number=1)[0]
        goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
        s_goal = self.grid.lls2hls(goal_pos)
        s_init = self.grid.reset(start_pos, goal_pos)
        self.episode_steps = 0
        """ V MAP """
        if self.config.optimistic_model:
            self.vi.update_p_table_optimistic(
                occupancy_map=self.grid.occupancy_map, walls=True)
        else:
            self.vi.update_p_table(occupancy_map=self.grid.terrain_map,
                                   walls=True)
        v, pi = self.vi.run_vi(grid=self.grid, goal=(s_goal[0], s_goal[1]))
        horizon_left = self.config.time_horizon
        st = start_pos
        success = False
        s_bar = self.grid.lls2hls(st)
        while (horizon_left > 0) and not success:
            # GET THE TARGET VECTOR
            a_bar = int(pi[s_bar[0], s_bar[1]])
            self.vi.set_target(s_bar, a_bar)
            curr_goal = self.vi.get_target()
            roll_out, states, actions, rewards, wp_success, l_state, _ = self.roll_out_in_env(
                start=s_init, goal=curr_goal, horizon=self.time_scale,
                ultimate_goal=goal_pos, mode='test')
            st = l_state[:2]
            s_bar = self.grid.lls2hls(st)
            s_init = l_state
            num_roll_outs += 1
            num_steps += roll_out.length()
            horizon_left -= roll_out.length()
            total_wp_success += wp_success
            jwp += 1
            st_bar = self.grid.lls2hls(l_state)
            success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
        return success

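# The high-level exploration schedule above is eps = 0.01 + 0.99 * exp(-t / 10000),
# i.e. it starts at ~1.0 and decays towards a 0.01 floor.  A small standalone
# restatement of that schedule (the probe step counts in the comment are
# arbitrary examples):
def eps_schedule(t, floor=0.01, span=0.99, decay=10000.0):
    import math
    return floor + span * math.exp(-t / decay)

# eps_schedule(0)     -> 1.000
# eps_schedule(1000)  -> ~0.906
# eps_schedule(10000) -> ~0.374
# eps_schedule(50000) -> ~0.017
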
def simulate_env(self, mode):
    batch = Batch()
    num_roll_outs = 0
    num_steps = 0
    total_success = 0
    total_wp_success = 0
    j = 0.
    jwp = 0.
    # 9 high-level options: stay in place plus the 8 neighbouring cells, in the
    # same order as the original hand-written option tables.
    offsets = [(0, 0), (0, 1), (1, 1), (1, 0), (1, -1),
               (0, -1), (-1, -1), (-1, 0), (-1, 1)]
    if mode == 'train':
        while num_steps < self.config.policy_batch_size:
            """ INITIALIZE THE ENVIRONMENT """
            (structure, objects, obstacle_list, occupancy_map, default_starts,
             state_cache, occupancy_map_padded, occupancy_map_un_padded,
             occupancy_map_original) = [
                 copy.deepcopy(x) for x in self.grid.reset_env_random()]
            self.grid = Ant44Env0(
                mj_ant_path=self.config.mj_ant_path, re_init=True,
                structure=structure, objects=objects,
                obstacle_list=obstacle_list, occupancy_map=occupancy_map,
                default_starts=default_starts, state_cache=state_cache,
                occupancy_map_padded=occupancy_map_padded,
                occupancy_map_un_padded=occupancy_map_un_padded,
                occupancy_map_original=occupancy_map_original)
            start_pos = self.grid.sample_random_pos(number=1)[0]
            goal_pos = self.grid.sample_random_pos(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            self.episode_steps = 0
            s_init = self.grid.reset(start_pos)
            """ IMAGE INPUT """
            image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size))
            image[0, 0, :, :] = self.grid.occupancy_map_un_padded
            image[0, 1, :, :] = -1 * np.ones((self.grid.x_size, self.grid.y_size))
            image[0, 1, s_goal[0], s_goal[1]] = 0
            image = torch.from_numpy(image).float().cuda()
            with torch.no_grad():
                v = self.mvprop_optimizer.mvprop(image)
                v = v.cpu().detach()
            """ START THE EPISODE """
            horizon_left = self.config.time_horizon
            st = start_pos
            success = False
            s_bar = self.grid.lls2hls(st)
            hl_s_list = []
            hl_a_list = []
            hl_r_list = []
            hl_d_list = []
            hl_s_list.append(s_bar)
            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR
                self.dqn_steps += 1
                if self.config.mvprop_decay_type == 'exp':
                    self.eps = 0.01 + 0.99 * math.exp(
                        -1. * self.dqn_steps / self.config.mvprop_decay)
                if torch.rand(1)[0] > self.eps:
                    with torch.no_grad():
                        options_x = [self.bound_x(s_bar[0] + dx) for dx, dy in offsets]
                        options_y = [self.bound_y(s_bar[1] + dy) for dx, dy in offsets]
                        v_options = v[0, 0, options_x, options_y]
                        option = np.argmax(v_options)
                else:
                    option = randint(0, 8)
                dx, dy = offsets[option]
                target = (max(0, min(s_bar[0] + dx, self.grid.x_size - 1)),
                          max(0, min(s_bar[1] + dy, self.grid.y_size - 1)))
                roll_out, _, _, _, wp_success, l_state, s_bar_p = self.roll_out_in_env(
                    start=s_init, goal=target, horizon=self.time_scale,
                    ultimate_goal=goal_pos)
                s_init = l_state
                hl_s_list.append(s_bar_p)
                hl_a_list.append(option)
                st = l_state[:2]
                s_bar = self.grid.lls2hls(st)
                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()
                total_wp_success += wp_success
                jwp += 1
                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
                if success:
                    hl_r_list.append(0)
                    hl_d_list.append(True)
                else:
                    hl_r_list.append(-1)
                    hl_d_list.append(False)
                batch.append(roll_out)
            total_success += success
            j += 1
            ### ADD TRANSITIONS TO BUFFER
            for ep_idx in range(len(hl_a_list)):
                self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx],
                                hl_s_list[ep_idx + 1], hl_r_list[ep_idx],
                                hl_d_list[ep_idx], image)
                if self.config.mvprop_her:
                    ### GET THE HINDSIGHT GOAL TRANSITION
                    image_her = np.zeros((1, 2, self.grid.x_size, self.grid.y_size))
                    image_her[0, 0, :, :] = self.grid.occupancy_map_un_padded
                    image_her[0, 1, :, :] = -1 * np.ones(
                        (self.grid.x_size, self.grid.y_size))
                    image_her[0, 1, hl_s_list[-1][0], hl_s_list[-1][1]] = 0
                    image_her = torch.from_numpy(image_her).float().cuda()
                    if (hl_s_list[ep_idx + 1][0] == hl_s_list[-1][0]) and (
                            hl_s_list[ep_idx + 1][1] == hl_s_list[-1][1]):
                        hgt_reward = 0
                        hgt_done = True
                    else:
                        hgt_reward = -1
                        hgt_done = False
                    self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx],
                                    hl_s_list[ep_idx + 1], hgt_reward,
                                    hgt_done, image_her)
            ### OPTIMIZE NETWORK PARAMETERS
            for _ in range(40):
                self.mvprop_optimizer.train(self.config.time_horizon / self.time_scale)
            # TARGET NET UPDATE
            if self.dqn_steps % self.config.mvprop_target_update_frequency == 0:
                tau = 0.05
                for param, target_param in zip(
                        self.mvprop_optimizer.mvprop.parameters(),
                        self.mvprop_optimizer.target_mvprop.parameters()):
                    target_param.data.copy_(tau * param.data +
                                            (1 - tau) * target_param.data)
            ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
            tree = ET.parse(ant_path)
            tree.write(self.grid.mj_ant_path + 'ant.xml')
        return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs
    else:
        """ INITIALIZE THE ENVIRONMENT """
        (structure, objects, obstacle_list, occupancy_map, default_starts,
         state_cache, occupancy_map_padded, occupancy_map_un_padded,
         occupancy_map_original) = [
             copy.deepcopy(x) for x in self.grid.reset_env_random()]
        self.grid = Ant44Env0(
            mj_ant_path=self.config.mj_ant_path, re_init=True,
            structure=structure, objects=objects,
            obstacle_list=obstacle_list, occupancy_map=occupancy_map,
            default_starts=default_starts, state_cache=state_cache,
            occupancy_map_padded=occupancy_map_padded,
            occupancy_map_un_padded=occupancy_map_un_padded,
            occupancy_map_original=occupancy_map_original)
        start_pos = self.grid.sample_random_pos(number=1)[0]
        goal_pos = self.grid.sample_random_pos(number=1)[0]
        s_goal = self.grid.lls2hls(goal_pos)
        self.episode_steps = 0
        s_init = self.grid.reset(start_pos)
        image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size))
        image[0, 0, :, :] = self.grid.occupancy_map_un_padded
        image[0, 1, :, :] = -1 * np.ones((self.grid.x_size, self.grid.y_size))
        image[0, 1, s_goal[0], s_goal[1]] = 0
        image = torch.from_numpy(image).float().cuda()
        with torch.no_grad():
            v = self.mvprop_optimizer.target_mvprop(image)
            v = v.cpu().detach()
        horizon_left = self.config.time_horizon
        st = start_pos
        success = False
        s_bar = self.grid.lls2hls(st)
        while (horizon_left > 0) and not success:
            # GET THE TARGET VECTOR
            with torch.no_grad():
                options_x = [self.bound_x(s_bar[0] + dx) for dx, dy in offsets]
                options_y = [self.bound_y(s_bar[1] + dy) for dx, dy in offsets]
                v_options = v[0, 0, options_x, options_y]
                option = np.argmax(v_options)
            dx, dy = offsets[option]
            target = (max(0, min(s_bar[0] + dx, self.grid.x_size - 1)),
                      max(0, min(s_bar[1] + dy, self.grid.y_size - 1)))
            roll_out, states, actions, rewards, wp_success, l_state, _ = self.roll_out_in_env(
                start=s_init, goal=target, horizon=self.time_scale,
                ultimate_goal=goal_pos)
            s_init = l_state
            st = l_state[:2]
            s_bar = self.grid.lls2hls(st)
            num_roll_outs += 1
            num_steps += roll_out.length()
            horizon_left -= roll_out.length()
            total_wp_success += wp_success
            jwp += 1
            st_bar = self.grid.lls2hls(l_state)
            success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
        ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
        tree = ET.parse(ant_path)
        tree.write(self.grid.mj_ant_path + 'ant.xml')
        return success

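# The target-network update above is a Polyak (soft) update with tau = 0.05.
# The same rule written as a standalone helper; the network arguments are
# placeholders for any pair of torch modules with matching parameters:
def soft_update(online_net, target_net, tau=0.05):
    """Move each target parameter a fraction tau towards the online parameter."""
    for param, target_param in zip(online_net.parameters(), target_net.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
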
def roll_out_in_env(self, start, goal, horizon, mode='train'):
    roll_out = Batch()
    done = False
    s = self.grid.reset(start, goal)
    s_list = []
    a_list = []
    r_list = []
    state_seq = []
    action_seq = []
    s_bar = self.grid.lls2hls(s)
    s_manager = copy.deepcopy(
        np.array(
            list(s) + list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
            list(goal)))
    r_manager = 0.
    if mode == 'train':
        if self.total_steps < self.start_time_steps:
            sub_goal = (4 * np.random.random((2, ))) - 2
        else:
            sub_goal = (self.manager_policy.select_action(s_manager) +
                        np.random.normal(0, self.max_action * self.expl_noise,
                                         size=self.action_dim)).\
                clip(-self.max_action, self.max_action)
    else:
        sub_goal = self.manager_policy.select_action(s_manager)
    self.manager.update(s=copy.deepcopy(s), sg=sub_goal)
    s_goal = self.grid.lls2hls(goal)
    for step_i in range(horizon):
        if mode == 'train':
            self.total_steps += 1
        s_bar = self.grid.lls2hls(s)
        s_save = copy.deepcopy(
            np.array(
                list(s) + list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                list(self.manager.target(s))))
        s_list.append(s_save)
        s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
        a = self.policy.select_action(s_tensor)
        state_seq.append(s_save)
        action_seq.append(a)
        s_new, _, _, _, _ = self.grid.step(action=10 * a)
        r = self.manager.reward(s_new)
        a_list.append(a)
        r_list.append(r)
        s_new_bar = self.grid.lls2hls(s_new)
        done = ((s_new_bar[0] == s_goal[0]) and (s_new_bar[1] == s_goal[1]))
        r_manager += -1 * float(not done)
        manager_update = (step_i + 1) % self.manager_time_scale == 0
        roll_out.append(
            Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                  [s_new.astype(np.float32)],
                  [0 if ((step_i + 1 == horizon) or done or manager_update) else 1],
                  [not done], [1.0]))
        s = s_new
        if manager_update or done:
            self.total_steps += 1
            s_new_manager = copy.deepcopy(
                np.array(
                    list(s) +
                    list(self.grid.state_cache[s_new_bar[0], s_new_bar[1]]) +
                    list(goal)))
            if mode == 'train':
                self.replay_buffer.add(s_manager, self.manager.sg,
                                       s_new_manager, r_manager, done,
                                       np.array(state_seq),
                                       np.array(action_seq))
                if self.her_var:
                    s_manager_her = np.concatenate((s_manager[:12], s[:2]))
                    s_new_manager_her = np.concatenate(
                        (s_new_manager[:12], s[:2]))
                    r_manager_her = -len(state_seq) + 1
                    self.replay_buffer.add(s_manager_her, self.manager.sg,
                                           s_new_manager_her, r_manager_her,
                                           True, np.array(state_seq),
                                           np.array(action_seq))
            state_seq = []
            action_seq = []
            s_manager = s_new_manager
            r_manager = 0.
            if mode == 'train':
                if self.total_steps < self.start_time_steps:
                    sub_goal = (4 * np.random.random((2, ))) - 2
                else:
                    sub_goal = (self.manager_policy.select_action(s_manager) +
                                np.random.normal(0, self.max_action * self.expl_noise,
                                                 size=self.action_dim)).\
                        clip(-self.max_action, self.max_action)
            else:
                sub_goal = self.manager_policy.select_action(s_manager)
            self.manager.update(s=copy.deepcopy(s), sg=sub_goal)
        if done:
            break
    s_save = copy.deepcopy(
        np.array(
            list(s) + list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
            list(self.manager.target(s))))
    s_list.append(s_save)
    return roll_out, step_i + 1, s_list, a_list, r_list, done

def roll_out_in_env(self, horizon, mode='train'):
    roll_out = Batch()
    oob = False
    s = self.grid.reset()
    s_list = []
    a_list = []
    r_list = []
    state_seq = []
    action_seq = []
    ultimate_start_bar = self.grid.lls2hls(s['observation'])
    goal_bar = self.grid.lls2hls(s['desired_goal'])
    s_manager = copy.deepcopy(
        np.concatenate((s['observation'], s['desired_goal'])))
    r_manager = 0.
    if mode == 'train':
        if self.total_steps < self.start_time_steps:
            sub_goal = np.concatenate(
                (0.08 * np.random.random((2, )) - 0.04,
                 2 * math.pi * np.random.random((1, )) - math.pi))
        else:
            sub_goal = (self.manager_policy.select_action(s_manager) +
                        np.random.normal(0, self.max_action * self.expl_noise,
                                         size=self.action_dim)).\
                clip(-self.max_action, self.max_action)
    else:
        sub_goal = self.manager_policy.select_action(s_manager)
    self.manager.update(s=copy.deepcopy(s['observation']), sg=sub_goal)
    for step_i in range(horizon):
        if mode == 'train':
            self.total_steps += 1
        manager_target = self.manager.target(s['observation'])
        s_save = copy.deepcopy(
            np.array(
                list(manager_target[:2]) + list(s['observation'][2:4]) +
                list(manager_target[2:4])))
        s_list.append(s_save)
        s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
        a = self.policy.select_action(s_tensor)
        state_seq.append(
            copy.deepcopy(
                np.array(
                    list(manager_target[:2]) + list(s['observation'][2:4]) +
                    self.manager.target_ang(s['observation']))))
        action_seq.append(a)
        s_new, r, d, info = self.grid.env.step(a)
        info = info["is_success"]
        r = self.manager.reward(s_new['observation'])
        a_list.append(a)
        r_list.append(r)
        r_manager += float(info)
        manager_update = (step_i + 1) % self.manager_time_scale == 0
        ib = self.grid.check_in_bounds(s_new['observation'])
        if not ib:
            oob = True
        roll_out.append(
            Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                  [s_new['observation'].astype(np.float32)],
                  [0 if ((step_i + 1 == horizon) or info or oob or manager_update) else 1],
                  [not info], [1.0]))
        s = s_new
        if manager_update or info:
            self.total_steps += 1
            s_new_manager = copy.deepcopy(
                np.concatenate((s['observation'], s['desired_goal'])))
            if mode == 'train':
                self.replay_buffer.add(s_manager, self.manager.a,
                                       s_new_manager, r_manager, info,
                                       np.array(state_seq),
                                       np.array(action_seq))
                s_manager_her = np.concatenate(
                    (s_manager[:6], s['observation']))
                s_new_manager_her = np.concatenate(
                    (s_new_manager[:6], s['observation']))
                self.replay_buffer.add(s_manager_her, self.manager.a,
                                       s_new_manager_her, 1.0, True,
                                       np.array(state_seq),
                                       np.array(action_seq))
            state_seq = []
            action_seq = []
            s_manager = s_new_manager
            r_manager = 0.
            if mode == 'train':
                if self.total_steps < self.start_time_steps:
                    sub_goal = np.concatenate(
                        (0.08 * np.random.random((2, )) - 0.04,
                         2 * math.pi * np.random.random((1, )) - math.pi))
                else:
                    sub_goal = (self.manager_policy.select_action(s_manager) +
                                np.random.normal(0, self.max_action * self.expl_noise,
                                                 size=self.action_dim)).\
                        clip(-self.max_action, self.max_action)
            else:
                sub_goal = self.manager_policy.select_action(s_manager)
            self.manager.update(s=copy.deepcopy(s['observation']), sg=sub_goal)
        if info or oob:
            break
    manager_target = self.manager.target(s['observation'])
    s_save = copy.deepcopy(
        np.array(
            list(manager_target[:2]) + list(s['observation'][2:4]) +
            list(manager_target[2:4])))
    s_list.append(s_save)
    return roll_out, step_i + 1, s_list, a_list, r_list, info, ultimate_start_bar, goal_bar

def simulate_env(self, mode):
    batch = Batch()
    num_roll_outs = 0
    num_steps = 0
    total_success = 0
    total_wp_success = 0
    j = 0.
    jwp = 0.
    # 72 high-level options = 8 target orientations x 9 target cells (stay plus
    # the 8 neighbours), flattened orientation-major; the offset table below
    # reproduces the original hand-written option lists.
    offsets = [(0, 0), (0, 1), (1, 1), (1, 0), (1, -1),
               (0, -1), (-1, -1), (-1, 0), (-1, 1)]
    if mode == 'train':
        while num_steps < self.batch_size:
            """ INITIALIZE THE ENVIRONMENT """
            s_init = self.grid.reset()
            s_start = self.grid.lls2hls(s_init['observation'])
            s_goal = self.grid.lls2hls(s_init['desired_goal'])
            self.episode_steps = 0
            """ IMAGE INPUT """
            image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size, 8))
            image[0, 0, :, :, :] = np.ones((self.grid.x_size, self.grid.y_size, 8))
            image[0, 1, :, :, :] = -1 * np.ones((self.grid.x_size, self.grid.y_size, 8))
            image[0, 1, s_goal[0], s_goal[1], s_goal[2]] = 0
            image = torch.from_numpy(image).float().cuda()
            with torch.no_grad():
                v = self.mvprop_optimizer.mvprop(image)
                v = v.cpu().detach()
            """ START THE EPISODE """
            horizon_left = self.max_iter
            success = False
            s_bar = self.grid.lls2hls(s_init['observation'])
            hl_s_list = []
            hl_a_list = []
            hl_r_list = []
            hl_d_list = []
            hl_s_list.append(s_bar)
            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR
                self.dqn_steps += 1
                self.eps = 0.01 + 0.99 * math.exp(-1. * self.dqn_steps / 10000)
                s_bar = self.grid.lls2hls(s_init['observation'])
                hl_s_list.append(s_bar)
                if torch.rand(1)[0] > self.eps:
                    with torch.no_grad():
                        options_x = [self.bound_x(s_bar[0] + dx)
                                     for _ in range(8) for dx, dy in offsets]
                        options_y = [self.bound_y(s_bar[1] + dy)
                                     for _ in range(8) for dx, dy in offsets]
                        options_z = [z for z in range(8) for _ in range(9)]
                        v_options = v[0, 0, options_x, options_y, options_z]
                        option = np.argmax(v_options)
                else:
                    option = randint(0, 71)
                option_o = np.floor(option / 9)
                option_p = option % 9
                dx, dy = offsets[option_p]
                target_p = (max(0, min(s_bar[0] + dx, self.grid.x_size - 1)),
                            max(0, min(s_bar[1] + dy, self.grid.y_size - 1)))
                target = (target_p[0], target_p[1], int(option_o))
                roll_out, _, _, _, wp_success, success, l_state, s_bar_p, oob = \
                    self.roll_out_in_env(horizon=self.time_scale,
                                         start=s_init, target=target)
                s_init = l_state
                hl_s_list.append(s_bar_p)
                hl_a_list.append(option)
                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()
                total_wp_success += wp_success
                jwp += 1
                st_bar = self.grid.lls2hls(l_state['observation'])
                success_tile = ((st_bar[0] == s_goal[0]) and
                                (st_bar[1] == s_goal[1]) and
                                (st_bar[2] == s_goal[2]))
                if success_tile:
                    hl_r_list.append(0)
                    hl_d_list.append(True)
                else:
                    hl_r_list.append(-1)
                    hl_d_list.append(False)
                batch.append(roll_out)
                if oob:
                    break
            total_success += success
            j += 1
            ### ADD TRANSITIONS TO BUFFER
            for ep_idx in range(len(hl_a_list)):
                self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx],
                                hl_s_list[ep_idx + 1], hl_r_list[ep_idx],
                                hl_d_list[ep_idx], image)
                if True:
                    ### GET THE HINDSIGHT GOAL TRANSITION
                    image_her = np.zeros((1, 2, self.grid.x_size, self.grid.y_size, 8))
                    image_her[0, 0, :, :, :] = np.ones(
                        (self.grid.x_size, self.grid.y_size, 8))
                    image_her[0, 1, :, :, :] = -1 * np.ones(
                        (self.grid.x_size, self.grid.y_size, 8))
                    image_her[0, 1, hl_s_list[-1][0], hl_s_list[-1][1],
                              hl_s_list[-1][2]] = 0
                    image_her = torch.from_numpy(image_her).float().cuda()
                    if (hl_s_list[ep_idx + 1][0] == hl_s_list[-1][0]) and \
                            (hl_s_list[ep_idx + 1][1] == hl_s_list[-1][1]) and \
                            (hl_s_list[ep_idx + 1][2] == hl_s_list[-1][2]):
                        hgt_reward = 0
                        hgt_done = True
                    else:
                        hgt_reward = -1
                        hgt_done = False
                    self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx],
                                    hl_s_list[ep_idx + 1], hgt_reward,
                                    hgt_done, image_her)
            ### OPTIMIZE NETWORK PARAMETERS
            for _ in range(40):
                self.mvprop_optimizer.train(self.max_iter / self.time_scale)
            # TARGET NET UPDATE
            if self.dqn_steps % 1 == 0:
                tau = 0.05
                for param, target_param in zip(
                        self.mvprop_optimizer.mvprop.parameters(),
                        self.mvprop_optimizer.target_mvprop.parameters()):
                    target_param.data.copy_(tau * param.data +
                                            (1 - tau) * target_param.data)
        return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs
    else:
        """ INITIALIZE THE ENVIRONMENT """
        s_init = self.grid.reset()
        s_start = self.grid.lls2hls(s_init['observation'])
        s_goal = self.grid.lls2hls(s_init['desired_goal'])
        self.episode_steps = 0
        """ IMAGE INPUT """
        image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size, 8))
        image[0, 0, :, :, :] = np.ones((self.grid.x_size, self.grid.y_size, 8))
        image[0, 1, :, :, :] = -1 * np.ones((self.grid.x_size, self.grid.y_size, 8))
        image[0, 1, s_goal[0], s_goal[1], s_goal[2]] = 0
        image = torch.from_numpy(image).float().cuda()
        with torch.no_grad():
            v = self.mvprop_optimizer.target_mvprop(image)
            v = v.cpu().detach()
        """ START THE EPISODE """
        horizon_left = self.max_iter
        success = False
        s_bar = self.grid.lls2hls(s_init['observation'])
        hl_s_list = []
        hl_a_list = []
        hl_r_list = []
        hl_d_list = []
        hl_s_list.append(s_bar)
        while (horizon_left > 0) and not success:
            # GET THE TARGET VECTOR
            s_bar = self.grid.lls2hls(s_init['observation'])
            hl_s_list.append(s_bar)
            with torch.no_grad():
                options_x = [self.bound_x(s_bar[0] + dx)
                             for _ in range(8) for dx, dy in offsets]
                options_y = [self.bound_y(s_bar[1] + dy)
                             for _ in range(8) for dx, dy in offsets]
                options_z = [z for z in range(8) for _ in range(9)]
                v_options = v[0, 0, options_x, options_y, options_z]
                option = np.argmax(v_options)
            option_o = np.floor(option / 9)
            option_p = option % 9
            dx, dy = offsets[option_p]
            target_p = (max(0, min(s_bar[0] + dx, self.grid.x_size - 1)),
                        max(0, min(s_bar[1] + dy, self.grid.y_size - 1)))
            target = (target_p[0], target_p[1], int(option_o))
            roll_out, _, _, _, wp_success, success, l_state, s_bar_p, oob = \
                self.roll_out_in_env(horizon=self.time_scale,
                                     start=s_init, target=target)
            s_init = l_state
            hl_s_list.append(s_bar_p)
            hl_a_list.append(option)
            num_roll_outs += 1
            num_steps += roll_out.length()
            horizon_left -= roll_out.length()
            total_wp_success += wp_success
            jwp += 1
            st_bar = self.grid.lls2hls(l_state['observation'])
            success_tile = ((st_bar[0] == s_goal[0]) and
                            (st_bar[1] == s_goal[1]) and
                            (st_bar[2] == s_goal[2]))
            if success_tile:
                hl_r_list.append(0)
                hl_d_list.append(True)
            else:
                hl_r_list.append(-1)
                hl_d_list.append(False)
            if oob:
                break
        j = 1.
        return success

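# The 72 MVPROP options above factor as option = 9 * orientation + position,
# with position indexing the (stay + 8-neighbour) offset table.  A small
# self-contained restatement of that decoding (offsets copied from above):
def decode_option(option):
    offsets = [(0, 0), (0, 1), (1, 1), (1, 0), (1, -1),
               (0, -1), (-1, -1), (-1, 0), (-1, 1)]
    orientation, position = divmod(option, 9)
    return offsets[position], orientation

# decode_option(0)  -> ((0, 0), 0)    stay in place, orientation 0
# decode_option(13) -> ((1, -1), 1)   move to the (+1, -1) neighbour, orientation 1
# decode_option(71) -> ((-1, 1), 7)   move to the (-1, +1) neighbour, orientation 7
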
def roll_out_in_env(self, start, target, horizon):
    roll_out = Batch()
    s = start
    s_list = []
    a_list = []
    r_list = []
    d = False
    oob = False
    break_var = False
    target_vec = target_2_target_vec(target, s['observation'])
    goal = self.grid.lls2hls(s['observation'] +
                             np.array(list(target_vec[:2]) + [0, 0] +
                                      list(target_vec[2:4])))
    for step_i in range(horizon):
        self.episode_steps += 1
        target_vec = target_2_target_vec(target, s['observation'])
        s_save = copy.deepcopy(
            np.array(
                list(target_vec[:2]) + list(s['observation'][2:4]) +
                list(target_vec[2:4])))
        s_list.append(copy.deepcopy(s['observation']))
        s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
        a = self.policy.select_action(s_tensor)
        s_new, r, d, info = self.grid.env.step(a)
        s_new_bar = self.grid.lls2hls(s_new['observation'])
        success_var = info["is_success"]
        info = not info["is_success"]
        d = ((s_new_bar[0] == goal[0]) and (s_new_bar[1] == goal[1])
             and (s_new_bar[2] == goal[2]))
        r = 0.0
        if d:
            r = 1.0
            info = False
        if success_var:
            info = False
            break_var = True
        break_var = break_var or (not info) or (step_i + 1 == horizon) or (
            self.episode_steps == self.max_iter)
        ib = self.grid.check_in_bounds(s_new['observation'])
        if not ib:
            oob = True
        a_list.append(a)
        r_list.append(r)
        roll_out.append(
            Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                  [s_new['observation'].astype(np.float32)],
                  [0 if (break_var or oob) else 1], [info], [1.0]))
        s = s_new
        if break_var or oob:
            break
    s_list.append(copy.deepcopy(s['observation']))
    return roll_out, s_list, a_list, r_list, d, success_var, s_new, s_new_bar, oob

def simulate_env(self, mode):
    batch = Batch()
    num_roll_outs = 0
    num_steps = 0
    total_success = 0
    total_wp_success = 0
    j = 0.
    jwp = 0.
    if mode == 'train':
        while num_steps < self.batch_size:
            """ INITIALIZE THE ENVIRONMENT """
            s_init = self.grid.reset()
            s_start = self.grid.lls2hls(s_init['observation'])
            s_goal = self.grid.lls2hls(s_init['desired_goal'])
            self.episode_steps = 0
            """ V MAP """
            if self.optimistic_model:
                self.vi.update_p_table_optimistic(
                    occupancy_map=self.grid.occupancy_map, walls=False)
            else:
                self.vi.update_p_table(occupancy_map=self.grid.occupancy_map,
                                       walls=True)
            v, pi = self.vi.run_vi(grid=self.grid,
                                   goal=(s_goal[0], s_goal[1], s_goal[2]))
            """ START THE EPISODE """
            horizon_left = self.max_iter
            success = False
            hl_s_list = []
            hl_s_new_list = []
            hl_a_list = []
            hl_r_list = []
            hl_d_list = []
            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR
                self.dqn_steps += 1
                self.eps = 0.01 + 0.99 * math.exp(-1. * self.dqn_steps / 10000)
                s_bar = self.grid.lls2hls(s_init['observation'])
                hl_s_list.append(s_bar)
                if torch.rand(1)[0] > self.eps:
                    a_bar = pi[s_bar[0], s_bar[1], s_bar[2]]
                else:
                    a_bar = (randint(0, 7), randint(0, 7))
                self.vi.set_target(s_bar, a_bar)
                roll_out, _, _, _, wp_success, success, l_state, s_bar_p, oob = \
                    self.roll_out_in_env(horizon=self.time_scale, start=s_init)
                hl_s_new_list.append(s_bar_p)
                hl_a_list.append(a_bar)
                s_init = l_state
                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()
                total_wp_success += wp_success
                jwp += 1
                if success:
                    hl_r_list.append(0)
                    hl_d_list.append(True)
                else:
                    hl_r_list.append(-1)
                    hl_d_list.append(False)
                batch.append(roll_out)
                if oob:
                    break
            total_success += success
            j += 1
            if not self.optimistic_model:
                x_temp, y_temp = self.vi.generate_dataset_flat(
                    hl_s_list, hl_a_list, hl_s_new_list)
                for bi in range(x_temp.shape[0]):
                    self.buffer.add(x_temp[bi], y_temp[bi])
                self.vi.train_net(buffer=self.buffer, bs=128, opt_iterations=40)
        return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs
    else:
        """ INITIALIZE THE ENVIRONMENT """
        s_init = self.grid.reset()
        s_start = self.grid.lls2hls(s_init['observation'])
        s_goal = self.grid.lls2hls(s_init['desired_goal'])
        self.episode_steps = 0
        """ V MAP """
        if self.optimistic_model:
            self.vi.update_p_table_optimistic(
                occupancy_map=self.grid.occupancy_map, walls=False)
        else:
            self.vi.update_p_table(occupancy_map=self.grid.occupancy_map,
                                   walls=True)
        v, pi = self.vi.run_vi(grid=self.grid,
                               goal=(s_goal[0], s_goal[1], s_goal[2]))
        """ START THE EPISODE """
        horizon_left = self.max_iter
        success = False
        while (horizon_left > 0) and not success:
            # GET THE TARGET VECTOR
            s_bar = self.grid.lls2hls(s_init['observation'])
            a_bar = pi[s_bar[0], s_bar[1], s_bar[2]]
            self.vi.set_target(s_bar, a_bar)
            roll_out, states, actions, rewards, wp_success, success, l_state, s_bar_p, oob = \
                self.roll_out_in_env(horizon=self.time_scale, start=s_init)
            s_init = l_state
            num_roll_outs += 1
            num_steps += roll_out.length()
            horizon_left -= roll_out.length()
            total_wp_success += wp_success
            jwp += 1
            if oob:
                break
        return success

def simulate_env(self, mode):
    batch = Batch()
    num_roll_outs = 0
    num_steps = 0
    total_success = 0
    total_wp_success = 0
    j = 0.
    jwp = 0.
    if mode == 'train':
        while num_steps < self.config.policy_batch_size:
            """ INITIALIZE THE ENVIRONMENT """
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos, goal_pos)
            self.episode_steps = 0
            path_segment_len_list = self.grid.goal_management.reset(
                start_pos, goal_pos, self.grid)
            self.grid.old_distance = self.grid.goal_management.path_segment_len_list[0]
            """ START THE EPISODE """
            horizon_left = self.config.time_horizon
            success = False
            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR
                curr_goal = self.grid.goal_management.way_points[
                    self.grid.goal_management.way_point_current]
                roll_out, _, _, _, wp_success, l_state, s_bar_p = self.roll_out_in_env(
                    start=s_init, goal=curr_goal, horizon=horizon_left,
                    ultimate_goal=goal_pos, mode='train')
                s_init = l_state
                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()
                total_wp_success += wp_success
                jwp += 1
                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
                batch.append(roll_out)
            total_success += success
            j += 1
        return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs
    else:
        self.grid.reset_env_terrain()
        start_pos = self.grid.sample_random_start_terrain(number=1)[0]
        goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
        s_goal = self.grid.lls2hls(goal_pos)
        s_init = self.grid.reset(start_pos, goal_pos)
        self.episode_steps = 0
        path_segment_len_list = self.grid.goal_management.reset(
            start_pos, goal_pos, self.grid)
        self.grid.old_distance = self.grid.goal_management.path_segment_len_list[0]
        """ START THE EPISODE """
        horizon_left = self.config.time_horizon
        success = False
        while (horizon_left > 0) and not success:
            # GET THE TARGET VECTOR
            curr_goal = self.grid.goal_management.way_points[
                self.grid.goal_management.way_point_current]
            roll_out, states, actions, rewards, wp_success, l_state, _ = self.roll_out_in_env(
                start=s_init, goal=curr_goal, horizon=horizon_left,
                ultimate_goal=goal_pos, mode='test')
            s_init = l_state
            num_roll_outs += 1
            num_steps += roll_out.length()
            horizon_left -= roll_out.length()
            total_wp_success += wp_success
            jwp += 1
            st_bar = self.grid.lls2hls(l_state)
            success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
        return success

def simulate_env(self, mode):
    batch = Batch()
    num_roll_outs = 0
    num_steps = 0
    total_success = 0
    total_wp_success = 0
    j = 0.
    jwp = 0.
    if mode == 'train':
        while num_steps < self.batch_size:
            """ INITIALIZE THE ENVIRONMENT """
            (structure, objects, obstacle_list, occupancy_map, default_starts,
             state_cache, occupancy_map_padded, occupancy_map_un_padded,
             occupancy_map_original) = [
                 copy.deepcopy(x) for x in self.grid.reset_env_random()]
            self.grid = Ant44Env0(
                mj_ant_path=self.mj_ant_path, re_init=True,
                structure=structure, objects=objects,
                obstacle_list=obstacle_list, occupancy_map=occupancy_map,
                default_starts=default_starts, state_cache=state_cache,
                occupancy_map_padded=occupancy_map_padded,
                occupancy_map_un_padded=occupancy_map_un_padded,
                occupancy_map_original=occupancy_map_original, waypoint=True)
            start_pos = self.grid.sample_random_pos(number=1)[0]
            goal_pos = self.grid.sample_random_pos(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos)
            horizon_left = self.max_iter
            success = False
            path_segment_len_list = self.grid.goal_management.reset(
                start_pos, goal_pos, self.grid)
            self.grid.old_distance = self.grid.goal_management.path_segment_len_list[0]
            while (horizon_left > 0) and not success:
                curr_goal = self.grid.goal_management.way_points[
                    self.grid.goal_management.way_point_current]
                roll_out, states, actions, rewards, wp_success, l_state = self.roll_out_in_env(
                    start=s_init, goal=curr_goal, ultimate_goal=goal_pos,
                    horizon=horizon_left)
                s_init = l_state
                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()
                total_wp_success += wp_success
                jwp += 1
                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
                batch.append(roll_out)
            total_success += success
            j += 1
            ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
            tree = ET.parse(ant_path)
            tree.write(self.grid.mj_ant_path + 'ant.xml')
        return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs
    else:
        """ INITIALIZE THE ENVIRONMENT """
        (structure, objects, obstacle_list, occupancy_map, default_starts,
         state_cache, occupancy_map_padded, occupancy_map_un_padded,
         occupancy_map_original) = [
             copy.deepcopy(x) for x in self.grid.reset_env_random()]
        self.grid = Ant44Env0(
            mj_ant_path=self.mj_ant_path, re_init=True,
            structure=structure, objects=objects,
            obstacle_list=obstacle_list, occupancy_map=occupancy_map,
            default_starts=default_starts, state_cache=state_cache,
            occupancy_map_padded=occupancy_map_padded,
            occupancy_map_un_padded=occupancy_map_un_padded,
            occupancy_map_original=occupancy_map_original, waypoint=True)
        start_pos = self.grid.sample_random_pos(number=1)[0]
        goal_pos = self.grid.sample_random_pos(number=1)[0]
        s_goal = self.grid.lls2hls(goal_pos)
        s_init = self.grid.reset(start_pos)
        horizon_left = self.max_iter
        success = False
        path_segment_len_list = self.grid.goal_management.reset(
            start_pos, goal_pos, self.grid)
        self.grid.old_distance = self.grid.goal_management.path_segment_len_list[0]
        while (horizon_left > 0) and not success:
            curr_goal = self.grid.goal_management.way_points[
                self.grid.goal_management.way_point_current]
            roll_out, states, actions, rewards, wp_success, l_state = self.roll_out_in_env(
                start=s_init, goal=curr_goal, ultimate_goal=goal_pos,
                horizon=horizon_left)
            s_init = l_state
            num_roll_outs += 1
            num_steps += roll_out.length()
            horizon_left -= roll_out.length()
            total_wp_success += wp_success
            jwp += 1
            st_bar = self.grid.lls2hls(l_state)
            success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
            batch.append(roll_out)
        total_success += success
        j += 1
        ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
        tree = ET.parse(ant_path)
        tree.write(self.grid.mj_ant_path + 'ant.xml')
        return success

def roll_out_in_env(self, start, goal, ultimate_goal, horizon):
    roll_out = Batch()
    s_u_goal = self.grid.lls2hls(ultimate_goal)
    s = start
    s_list = []
    a_list = []
    r_list = []
    d = False
    for step_i in range(horizon):
        s_bar = self.grid.lls2hls(s)
        target_vec = self.grid.goal_management.get_target_vec(s[:2])
        s_save = copy.deepcopy(
            np.array(
                list(s[2:]) +
                list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                list(target_vec)))
        s_pos_save = copy.deepcopy(np.array(s[:2]))
        s_list.append(np.concatenate((s_pos_save, s_save)))
        s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
        a = self.policy.select_action(s)
        s_new, r, d, info = self.grid.step(a, goal)
        info = info['no_goal_reached']
        dist_wp = np.sqrt(
            (s_new[0] - self.grid.goal_management.way_points[
                self.grid.goal_management.way_point_current][0])**2 +
            (s_new[1] - self.grid.goal_management.way_points[
                self.grid.goal_management.way_point_current][1])**2)
        d_wp = dist_wp < 0.5
        new_distance = dist_wp
        d = np.sqrt((s_new[0] - goal[0])**2 + (s_new[1] - goal[1])**2) < 0.5
        if d:
            info = False
        else:
            info = True
        r = d_wp  # + self.grid.old_distance - 0.99 * new_distance
        self.grid.old_distance = new_distance
        s_bar_cand = self.grid.lls2hls(s_new)
        break_var = self.grid.goal_management.update_way_point(
            self.grid, (s_new[0], s_new[1]), d)
        success_var = ((s_bar_cand[0] == s_u_goal[0])
                       and (s_bar_cand[1] == s_u_goal[1]))
        if success_var:
            info = False
            break_var = True
        a_list.append(a)
        r_list.append(r)
        roll_out.append(
            Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                  [s_new.astype(np.float32)],
                  [0 if ((not info) or (step_i + 1 == horizon) or break_var) else 1],
                  [info], [1.0]))
        s = s_new
        if (not info) or break_var:
            break
    return roll_out, s_list, a_list, r_list, d, s_new

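# The waypoint manager (`goal_management`) used above is not included in this
# listing.  Consistent with the 0.5-unit distance threshold in the rollout, a
# minimal sketch of an advancing rule is given below; the class name and the
# method signature are assumptions, only `way_points` / `way_point_current`
# mirror the attributes referenced above:
class WaypointManagerSketch:
    def __init__(self, way_points, switch_radius=0.5):
        self.way_points = way_points
        self.way_point_current = 0
        self.switch_radius = switch_radius

    def update_way_point(self, pos):
        """Advance to the next waypoint once the agent is within switch_radius.
        Returns True when the final waypoint has been reached."""
        wx, wy = self.way_points[self.way_point_current]
        close = ((pos[0] - wx) ** 2 + (pos[1] - wy) ** 2) ** 0.5 < self.switch_radius
        at_last = self.way_point_current == len(self.way_points) - 1
        if close and not at_last:
            self.way_point_current += 1
            return False
        return close and at_last
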
def simulate_env(self, mode):
    batch = Batch()
    num_roll_outs = 0
    num_steps = 0
    total_success = 0
    total_wp_success = 0
    j = 0.
    jwp = 0.
    if mode == 'train':
        while num_steps < self.config.policy_batch_size:
            """ INITIALIZE THE ENVIRONMENT """
            (structure, objects, obstacle_list, occupancy_map, default_starts,
             state_cache, occupancy_map_padded, occupancy_map_un_padded,
             occupancy_map_original) = [
                 copy.deepcopy(x) for x in self.grid.reset_env_random()]
            self.grid = Ant44Env0(
                mj_ant_path=self.config.mj_ant_path, re_init=True,
                structure=structure, objects=objects,
                obstacle_list=obstacle_list, occupancy_map=occupancy_map,
                default_starts=default_starts, state_cache=state_cache,
                occupancy_map_padded=occupancy_map_padded,
                occupancy_map_un_padded=occupancy_map_un_padded,
                occupancy_map_original=occupancy_map_original)
            start_pos = self.grid.sample_random_pos(number=1)[0]
            goal_pos = self.grid.sample_random_pos(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            self.episode_steps = 0
            s_init = self.grid.reset(start_pos)
            """ V MAP """
            if self.config.optimistic_model:
                self.vi.update_p_table_optimistic(
                    occupancy_map=self.grid.occupancy_map, walls=True)
            else:
                self.vi.update_p_table(occupancy_map=self.grid.occupancy_map,
                                       walls=True)
            v, pi = self.vi.run_vi(grid=self.grid, goal=(s_goal[0], s_goal[1]))
            """ START THE EPISODE """
            horizon_left = self.config.time_horizon
            st = start_pos
            success = False
            s_bar = self.grid.lls2hls(st)
            hl_s_list = []
            hl_a_list = []
            hl_r_list = []
            hl_d_list = []
            hl_s_list.append(s_bar)
            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR
                self.dqn_steps += 1
                self.eps = 0.01 + 0.99 * math.exp(-1. * self.dqn_steps / 10000)
                s_bar = self.grid.lls2hls(st)
                if torch.rand(1)[0] > self.eps:
                    a_bar = int(pi[s_bar[0], s_bar[1]])
                else:
                    a_bar = randint(0, 7)
                self.vi.set_target(s_bar, a_bar)
                curr_goal = self.vi.get_target()
                roll_out, _, _, _, wp_success, l_state, s_bar_p = self.roll_out_in_env(
                    start=s_init, goal=curr_goal, horizon=self.time_scale,
                    ultimate_goal=goal_pos)
                hl_s_list.append(s_bar_p)
                hl_a_list.append(a_bar)
                st = l_state[:2]
                s_init = l_state
                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()
                total_wp_success += wp_success
                jwp += 1
                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
                if success:
                    hl_r_list.append(0)
                    hl_d_list.append(True)
                else:
                    hl_r_list.append(-1)
                    hl_d_list.append(False)
                batch.append(roll_out)
            total_success += success
            j += 1
            if not self.config.optimistic_model:
                x_temp, y_temp, w_temp = self.vi.generate_dataset_flat(
                    self.grid.occupancy_map, hl_s_list, hl_a_list)
                for bi in range(x_temp.shape[0]):
                    self.buffer.add(x_temp[bi], y_temp[bi], w_temp[bi])
                self.vi.train_net(buffer=self.buffer, bs=128,
                                  opt_iterations=40, rw=True)
            ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
            tree = ET.parse(ant_path)
            tree.write(self.grid.mj_ant_path + 'ant.xml')
        return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs
    else:
        """ INITIALIZE THE ENVIRONMENT """
        (structure, objects, obstacle_list, occupancy_map, default_starts,
         state_cache, occupancy_map_padded, occupancy_map_un_padded,
         occupancy_map_original) = [
             copy.deepcopy(x) for x in self.grid.reset_env_random()]
        self.grid = Ant44Env0(
            mj_ant_path=self.config.mj_ant_path, re_init=True,
            structure=structure, objects=objects,
            obstacle_list=obstacle_list, occupancy_map=occupancy_map,
            default_starts=default_starts, state_cache=state_cache,
            occupancy_map_padded=occupancy_map_padded,
            occupancy_map_un_padded=occupancy_map_un_padded,
            occupancy_map_original=occupancy_map_original)
        start_pos = self.grid.sample_random_pos(number=1)[0]
        goal_pos = self.grid.sample_random_pos(number=1)[0]
        s_goal = self.grid.lls2hls(goal_pos)
        self.episode_steps = 0
        s_init = self.grid.reset(start_pos)
        """ V MAP """
        if self.config.optimistic_model:
            self.vi.update_p_table_optimistic(
                occupancy_map=self.grid.occupancy_map, walls=True)
        else:
            self.vi.update_p_table(occupancy_map=self.grid.occupancy_map,
                                   walls=True)
        v, pi = self.vi.run_vi(grid=self.grid, goal=(s_goal[0], s_goal[1]))
        horizon_left = self.config.time_horizon
        st = start_pos
        success = False
        s_bar = self.grid.lls2hls(st)
        while (horizon_left > 0) and not success:
            # GET THE TARGET VECTOR
            a_bar = int(pi[s_bar[0], s_bar[1]])
            self.vi.set_target(s_bar, a_bar)
            curr_goal = self.vi.get_target()
            roll_out, states, actions, rewards, wp_success, l_state, _ = self.roll_out_in_env(
                start=s_init, goal=curr_goal, horizon=self.time_scale,
                ultimate_goal=goal_pos)
            st = l_state[:2]
            s_bar = self.grid.lls2hls(st)
            s_init = l_state
            num_steps += roll_out.length()
            horizon_left -= roll_out.length()
            st_bar = self.grid.lls2hls(l_state)
            success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))
        ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
        tree = ET.parse(ant_path)
        tree.write(self.grid.mj_ant_path + 'ant.xml')
        return success