class Controller(object):
    """Run DQN-controlled agents in a Harvest or Cleanup environment.

    One ``DQNAgent`` (each with its own ``ConvFC`` network) is created per
    agent. ``rollout`` steps the environment and optionally feeds the
    transitions to the agents; ``render_rollout`` turns a rollout into a
    video.
    """

    def __init__(self, env_name='harvest', num_agents=1):
        """Create the environment and one policy per agent.

        Args:
            env_name: Either 'harvest' or 'cleanup'.
            num_agents: Number of agents to place in the environment.

        Raises:
            ValueError: If ``env_name`` is not a supported environment.
                (Previously an error was printed and a half-initialised
                object without ``env`` or policies was returned.)
        """
        self.env_name = env_name
        if env_name == 'harvest':
            print('Initializing Harvest environment')
            self.env = HarvestEnv(ascii_map=HARVEST_MAP_CPR,
                                  num_agents=num_agents,
                                  render=True)
        elif env_name == 'cleanup':
            print('Initializing Cleanup environment')
            self.env = CleanupEnv(num_agents=num_agents, render=True)
        else:
            raise ValueError(
                'Error! Not a valid environment type: {!r}'.format(env_name))

        self.num_agents = num_agents
        self.agent_policies = []
        self.agents = list(self.env.agents.values())
        self.action_dim = self.agents[0].action_space.n
        for _ in range(num_agents):
            # TODO right now only using 1 frame, update later to look back x
            # (e.g. 4) frames. Later RNN/LSTM
            neural_net = ConvFC(
                # harvest specific input is 15x15x3 (HARVEST_VIEW_SIZE = 7)
                conv_in_channels=3,
                conv_out_channels=3,
                input_size=15,
                hidden_size=64,
                output_size=self.action_dim)
            self.agent_policies.append(
                DQNAgent(0, self.action_dim - 1, neural_net))

        self.env.reset()

    def process_experiences(self,
                            id,
                            i,
                            obs,
                            action_dict,
                            rew,
                            next_obs,
                            dones,
                            train_agents=False):
        """Store one transition for agent ``i`` and optionally train it.

        Args:
            id: Unused; kept so existing callers keep working.
            i: Index of the agent (its key is ``"agent-<i>"``).
            obs: Per-agent observations before the step.
            action_dict: Per-agent actions that were taken.
            rew: Per-agent rewards returned by the step.
            next_obs: Per-agent observations after the step.
            dones: Per-agent done flags returned by the step.
            train_agents: If True, run a Q-learning update after pushing
                the experience.
        """
        agent_i = "agent-{}".format(i)
        policy = self.agent_policies[i]
        # Only element 0 of each observation is passed to the network; the
        # original note says the reward info is deliberately left out.
        policy.push_experience(
            reshape_obs_for_convfc(obs[agent_i][0]),
            action_dict[agent_i],
            rew[agent_i],
            reshape_obs_for_convfc(next_obs[agent_i][0]),
            dones[agent_i])
        if train_agents:
            policy.q_learn_update()

    @staticmethod
    def _scripted_action(time_step, single_obs):
        """Return the hand-written single-agent action for ``time_step``.

        Only used when the ``hard_coded`` switch in ``rollout`` is enabled.
        The direction labels below are taken from the original author's
        notes, which also warn that left/right may be swapped in the
        environment -- TODO confirm against the environment's action table.
        """
        action_cycle = 40
        prep_time = 4 + 2

        if time_step < prep_time - 2:
            # Orientation phase: inspect two cells next to the centre of the
            # view. NOTE(review): 540 is presumably the channel sum of a
            # specific tile colour -- confirm.
            if (single_obs[6][7].sum() == 540
                    and single_obs[7][8].sum() == 540):
                return 4
            return 6  # just keep turning otherwise
        if time_step == prep_time - 2:
            return 1  # first "left" movement, start the cycle
        if time_step == prep_time - 1:
            return 2  # "up"

        # Steady-state cycle; assumes "up" orientation.
        phase = (time_step - prep_time) % action_cycle
        if phase < 16:
            return 1  # left
        if phase < 20:
            return 2
        if phase < 36:
            return 0  # right
        return 3  # down

    def rollout(self,
                horizon,
                train_every=100,
                save_path=None,
                train_agents=True,
                print_act=False):
        """Roll out several timesteps of an episode of the environment.

        The environment is reset first. At every step each agent's policy
        picks an action from element 0 of its observation.

        Args:
            horizon: The number of timesteps to roll out.
            train_every: When training, a Q-learning update is triggered on
                every ``train_every``-th step; experiences are stored on
                every step.
            save_path: If provided, each frame is rendered to
                ``save_path + 'frame<NNNNNN>.png'``.
            train_agents: If True, transitions are pushed to the agents and
                they are periodically updated. If False, nothing is stored
                or trained.
            print_act: Forwarded to each policy's ``act``.

        Returns:
            Tuple ``(rewards, observations, full_obs)``: a numpy array with
            each agent's summed reward, the list of post-step observation
            dicts, and the list of uint8 RGB frames of the full map.
        """
        rewards = np.zeros(self.num_agents)
        observations = []
        shape = self.env.world_map.shape
        full_obs = [
            np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
            for _ in range(horizon)
        ]

        obs = self.env.reset()

        # Developer switch: set to True to replay the scripted single-agent
        # policy instead of querying the learned policies.
        hard_coded = False

        for time_step in range(horizon):
            if hard_coded:
                actions = [
                    self._scripted_action(time_step, obs["agent-0"][0])
                ]
            else:
                # Training and evaluation currently use the same ``act``
                # call. TODO: evaluation could use eps=0 (or similar) later.
                actions = [
                    self.agent_policies[i].act(
                        reshape_obs_for_convfc(obs["agent-{}".format(i)][0]),
                        print_act=print_act)
                    for i in range(self.num_agents)
                ]

            action_dict = {
                "agent-{}".format(i): actions[i]
                for i in range(self.num_agents)
            }

            next_obs, rew, dones, _ = self.env.step(action_dict)

            if train_agents and not hard_coded:
                # Experiences are stored every step; the update itself only
                # runs on every ``train_every``-th step.
                update_now = (time_step + 1) % train_every == 0
                for i in range(self.num_agents):
                    self.process_experiences(0, i, obs, action_dict, rew,
                                             next_obs, dones,
                                             train_agents=update_now)

            obs = next_obs

            sys.stdout.flush()

            if save_path is not None:
                self.env.render(filename=save_path + 'frame' +
                                str(time_step).zfill(6) + '.png')

            rgb_arr = self.env.map_to_colors()
            full_obs[time_step] = rgb_arr.astype(np.uint8)
            observations.append(obs)
            for i in range(self.num_agents):
                rewards[i] += rew["agent-{}".format(i)]

        return rewards, observations, full_obs

    def render_rollout(self, horizon=50, path=None, fps=8):
        """Render a rollout into a video.

        The rollout is run without training and the video is built from the
        in-memory RGB frames.

        Args:
            horizon: The number of timesteps to roll out.
            path: Directory where the video will be saved. Defaults to a
                'videos' directory next to this file.
            fps: Integer frames per second.

        Returns:
            The per-agent summed rewards of the rendered rollout.
        """
        if path is None:
            path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                'videos')
            print(path)
            if not os.path.exists(path):
                os.makedirs(path)
        video_name = self.env_name + '_trajectory'

        rewards, _, full_obs = self.rollout(horizon=horizon,
                                            train_agents=False,
                                            print_act=False)
        utility_funcs.make_video_from_rgb_imgs(full_obs,
                                               path,
                                               fps=fps,
                                               video_name=video_name)
        return rewards
class Controller(object):
    """Drive a Harvest or Cleanup environment with uniformly random actions."""

    def __init__(self, env_name='cleanup', num_agents=5):
        """Create and reset the environment.

        Args:
            env_name: Either 'harvest' or 'cleanup'.
            num_agents: Number of agents to create. Defaults to 5, the value
                that used to be hard-coded.

        Raises:
            ValueError: If ``env_name`` is not a supported environment.
                (Previously an error was printed and an object without
                ``env`` was returned.)
        """
        self.env_name = env_name
        if env_name == 'harvest':
            print('Initializing Harvest environment')
            self.env = HarvestEnv(num_agents=num_agents, render=True)
        elif env_name == 'cleanup':
            print('Initializing Cleanup environment')
            self.env = CleanupEnv(num_agents=num_agents, render=True)
        else:
            raise ValueError(
                'Error! Not a valid environment type: {!r}'.format(env_name))

        self.env.reset()

        # TODO: initialize agents here

    def rollout(self, horizon=50, save_path=None):
        """Roll out several timesteps of an episode of the environment.

        Every agent takes a uniformly random action at each step. The
        environment is not reset here.

        Args:
            horizon: The number of timesteps to roll out.
            save_path: If provided, each frame is rendered to
                ``save_path + 'frame<NNNNNN>.png'``.

        Returns:
            Tuple ``(rewards, observations, full_obs)``: the per-step rewards
            and observations of 'agent-0', and the list of uint8 RGB frames
            of the full map.
        """
        rewards = []
        observations = []
        shape = self.env.world_map.shape
        full_obs = [
            np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
            for _ in range(horizon)
        ]

        for i in range(horizon):
            agents = list(self.env.agents.values())
            action_dim = agents[0].action_space.n
            # One random action per agent actually present in the
            # environment, instead of a fixed five.
            rand_action = np.random.randint(action_dim, size=len(agents))
            obs, rew, dones, info = self.env.step({
                'agent-{}'.format(k): rand_action[k]
                for k in range(len(agents))
            })

            sys.stdout.flush()

            if save_path is not None:
                self.env.render(filename=save_path + 'frame' +
                                str(i).zfill(6) + '.png')

            rgb_arr = self.env.map_to_colors()
            full_obs[i] = rgb_arr.astype(np.uint8)
            observations.append(obs['agent-0'])
            rewards.append(rew['agent-0'])

        return rewards, observations, full_obs

    def render_rollout(self,
                       horizon=50,
                       path=None,
                       render_type='pretty',
                       fps=8):
        """Render a rollout into a video.

        Args:
            horizon: The number of timesteps to roll out.
            path: Directory where the video will be saved. Defaults to a
                'videos' directory next to this file.
            render_type: 'pretty' renders every frame to disk through the
                environment and builds the video from those images; any
                other value builds it from the in-memory RGB frames.
            fps: Integer frames per second.
        """
        if path is None:
            path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                                'videos')
            print(path)
            if not os.path.exists(path):
                os.makedirs(path)
        video_name = self.env_name + '_trajectory'

        if render_type == 'pretty':
            image_path = os.path.join(path, 'frames/')
            if not os.path.exists(image_path):
                os.makedirs(image_path)

            try:
                rewards, observations, full_obs = self.rollout(
                    horizon=horizon, save_path=image_path)
                utility_funcs.make_video_from_image_dir(
                    path, image_path, fps=fps, video_name=video_name)
            finally:
                # Clean up the temporary frames even if the rollout or the
                # video encoding fails.
                shutil.rmtree(image_path)
        else:
            rewards, observations, full_obs = self.rollout(horizon=horizon)
            utility_funcs.make_video_from_rgb_imgs(full_obs,
                                                   path,
                                                   fps=fps,
                                                   video_name=video_name)
class Env(object):
    """List-based wrapper around ``CleanupEnv`` with a reduced action space.

    Callers pass and receive plain lists ordered by agent index, while the
    wrapped environment works with dicts keyed by ``'agent-<idx>'``.
    """

    # Index of the "clean" action in the wrapped environment's action space;
    # every ``map_to_orig`` table below maps its last entry to this value.
    _ORIG_CLEAN_ACTION = 8

    def __init__(self, config_env):
        """Build the wrapped environment from ``config_env``.

        Args:
            config_env: Configuration object. The attributes read here are
                obs_height, obs_width, max_steps, cleaning_penalty,
                disable_left_right_action, disable_rotation_action,
                obs_cleaned_1hot, n_agents, map_name, shuffle_spawn,
                global_ref_point, view_size, random_orientation,
                cleanup_params and beam_width.

        Raises:
            ValueError: If ``config_env.map_name`` is not a known map.
                (Previously this surfaced as an unbound-variable error.)
        """
        self.name = 'ssd'
        self.config = config_env
        self.dim_obs = [self.config.obs_height, self.config.obs_width, 3]
        self.max_steps = self.config.max_steps

        self.cleaning_penalty = self.config.cleaning_penalty
        # Original space (not necessarily in this order, see
        # the original ssd files):
        # no-op, up, down, left, right, turn-ccw, turn-cw, penalty, clean
        if (self.config.disable_left_right_action and
                self.config.disable_rotation_action):
            self.l_action = 4
            self.cleaning_action_idx = 3
            # up, down, no-op, clean
            self.map_to_orig = {0: 2, 1: 3, 2: 4, 3: 8}
        elif self.config.disable_left_right_action:
            self.l_action = 6
            self.cleaning_action_idx = 5
            # up, down, no-op, rotate cw, rotate ccw, clean
            self.map_to_orig = {0: 2, 1: 3, 2: 4, 3: 5, 4: 6, 5: 8}
        elif self.config.disable_rotation_action:
            self.l_action = 6
            self.cleaning_action_idx = 5
            # left, right, up, down, no-op, clean
            self.map_to_orig = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 8}
        else:
            # full action space except penalty beam
            self.l_action = 8
            self.cleaning_action_idx = 7
            # Don't allow penalty beam
            self.map_to_orig = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4,
                                5: 5, 6: 6, 7: 8}

        self.obs_cleaned_1hot = self.config.obs_cleaned_1hot

        self.n_agents = self.config.n_agents

        map_name = self.config.map_name
        if map_name == 'cleanup_small_sym':
            ascii_map = maps.CLEANUP_SMALL_SYM
        elif map_name == 'cleanup_10x10_sym':
            ascii_map = maps.CLEANUP_10x10_SYM
        else:
            raise ValueError('Unknown map_name: {!r}'.format(map_name))

        self.env = CleanupEnv(
            ascii_map=ascii_map,
            num_agents=self.n_agents,
            render=False,
            shuffle_spawn=self.config.shuffle_spawn,
            global_ref_point=self.config.global_ref_point,
            view_size=self.config.view_size,
            random_orientation=self.config.random_orientation,
            cleanup_params=self.config.cleanup_params,
            beam_width=self.config.beam_width)

        # length of action input to learned reward function
        if self.config.obs_cleaned_1hot:
            self.l_action_for_r = 2
        else:
            self.l_action_for_r = self.l_action

        self.steps = 0

    def process_obs(self, obs_dict):
        """Return the observations in ``obs_dict`` as a list scaled by 1/256.

        The list follows the dict's iteration order.
        NOTE(review): callers treat position ``k`` as agent ``k``, which
        presumably relies on the environment inserting agents in index
        order -- confirm.
        """
        return [obs / 256.0 for obs in obs_dict.values()]

    def reset(self):
        """Resets the environment.

        Returns:
            List of agent observations
        """
        obs = self.env.reset()
        self.steps = 0
        return self.process_obs(obs)

    def step(self, actions):
        """Takes a step in env.

        Args:
            actions: list of integers in this wrapper's reduced action
                space, one per agent

        Returns:
            List of observations, list of rewards, done, info
        """
        actions = [self.map_to_orig[a] for a in actions]
        actions_dict = {
            'agent-%d' % idx: actions[idx] for idx in range(self.n_agents)
        }

        # all objects returned by env.step are dicts
        obs_next, rewards, dones, info = self.env.step(actions_dict)
        self.steps += 1

        obs_next = self.process_obs(obs_next)
        rewards = list(rewards.values())
        if self.cleaning_penalty > 0:
            for idx in range(self.n_agents):
                if actions[idx] == self._ORIG_CLEAN_ACTION:
                    rewards[idx] -= self.cleaning_penalty

        # The wrapped environment apparently hardcodes done to False, so the
        # episode is also ended by the step budget. ``>=`` keeps ``done``
        # set if the caller steps past ``max_steps`` without resetting.
        done = dones['__all__'] or self.steps >= self.max_steps

        return obs_next, rewards, done, info

    def render(self):
        """Render the wrapped environment."""
        self.env.render()