def sample_state(self): """ produces a observation with one of each object :return obs: a sample observation :return agent_position: position of the agent within the observation """ objects = [self.one_hot(i - 1) for i in range(1, 9)] grid = objects + [[ 0 for _ in range( self.observation_vector_space.spaces['observation'].shape[2]) ] for _ in range(self.num_rows * self.num_cols - len(objects))] random.shuffle(grid) state = np.asarray(grid, dtype=int).reshape( self.observation_vector_space.spaces['observation'].shape) while np.argmax(state[self.agent_start[0]][self.agent_start[1]]) in [ 3, 4, 5, 6 ]: # don't start agent on rock, tree, house or bread np.random.shuffle(state) agent_encoding = self.one_hot(8) state[self.agent_start[0]][self.agent_start[1]] += agent_encoding agent_position = coord(self.agent_start[0], self.agent_start[1], self.num_rows - 1, self.num_cols - 1) return state, agent_position
def sample_state(self): """ produces a observation with one of each object :return obs: a sample observation :return agent_position: position of the agent within the observation """ objects = [_ for _ in range(1, 10)] objects = [self.one_hot(i - 1) for i in objects] grid = objects + [[ 0 for _ in range( self.observation_space.spaces['observation'].shape[2]) ] for _ in range(self.num_rows * self.num_cols - len(objects))] random.shuffle(grid) state = np.asarray(grid, dtype=int).reshape( self.observation_space.spaces['observation'].shape) agent_position = coord(int(np.where(np.argmax(state, axis=2) == 8)[0]), int(np.where(np.argmax(state, axis=2) == 8)[1]), self.num_rows - 1, self.num_cols - 1) return state, agent_position
def __init__(self, size=(10, 10), fixed_init_state=None, fixed_goal=None,
             tasks_to_ignore=None, store_gif=False, render_flipping=False,
             max_steps=300, task_list=TASK_LIST, pos_rewards=False):
    """
    change the following parameters to create a custom environment

    :param size: size of the grid world
    :param fixed_init_state: a fixed initial observation to reset to
    :param fixed_goal: a fixed list of tasks for the agent to achieve
    :param tasks_to_ignore: a list of tasks to ignore when calculating reward
    :param store_gif: whether or not to store every episode as a gif in a /renders/ subdirectory
    :param render_flipping: set to true if only specific episodes need to be rendered
    :param max_steps: max number of steps the agent can take
    :param task_list: list of possible tasks
    :param pos_rewards: whether to use positive rewards when calculating the reward
    """
    self.metadata = {'render.modes': ['human', 'None']}

    self.num_rows, self.num_cols = size
    self.max_steps = max_steps

    # copy the task list so removing ignored tasks does not mutate the shared default
    self.task_list = list(task_list)
    if tasks_to_ignore:
        for task in tasks_to_ignore:
            self.task_list.remove(task)
    self.pos_rewards = pos_rewards

    self.observation_space = spaces.Dict({
        'observation': spaces.Box(low=0, high=1,
                                  shape=(self.num_rows, self.num_cols,
                                         len(OBJECTS) + 1 + len(PICKUPABLE)),
                                  dtype=int),
        'desired_goal': spaces.Box(low=0, high=1,
                                   shape=(1, len(self.task_list)),
                                   dtype=int),
        'achieved_goal': spaces.Box(low=0, high=1,
                                    shape=(1, len(self.task_list)),
                                    dtype=int),
    })
    # TODO: wrapper that flattens to regular env, wrapper that changes desired goal to dict of rewards, reward wrapper

    self.fixed_goal = fixed_goal
    if self.fixed_goal:
        self.desired_goal = np.zeros(shape=(1, len(self.task_list)), dtype=int)
        # iterate over a copy so that invalid goals can be removed safely
        for goal in list(self.fixed_goal):
            if goal not in self.task_list:
                self.fixed_goal.remove(goal)
                continue
            self.desired_goal[0][self.task_list.index(goal)] = 1
    else:
        self.desired_goal = np.random.randint(2, size=(1, len(self.task_list)))
    self.achieved_goal = self.observation_space.spaces['achieved_goal'].low

    self.fixed_init_state = fixed_init_state
    if self.fixed_init_state is not None:
        self.obs = copy.deepcopy(self.fixed_init_state)
        self.agent_pos = coord(
            int(np.where(np.argmax(self.obs, axis=2) == 8)[0]),
            int(np.where(np.argmax(self.obs, axis=2) == 8)[1]),
            self.num_rows - 1, self.num_cols - 1)
    else:
        self.obs, self.agent_pos = self.sample_state()

    self.observation = {
        'observation': self.obs,
        'desired_goal': self.desired_goal,
        'achieved_goal': self.achieved_goal
    }
    self.init_observation = copy.deepcopy(self.observation)

    self.ACTIONS = [
        coord(-1, 0, name='up'),
        coord(0, 1, name='right'),
        coord(1, 0, name='down'),
        coord(0, -1, name='left'),
        'pickup', 'drop'
    ]
    self.action_space = spaces.Discrete(len(self.ACTIONS))

    self.reward = self.calculate_rewards()

    self.store_gif = store_gif
    self.render_flipping = render_flipping
    self.env_id = None
    self.fig, self.ax, self.ims = None, None, None
    self.ep_no = 0
    self.step_num = 0
    if self.store_gif:
        self.allow_gif_storage()
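# Usage sketch (illustrative only): a minimal example of constructing and
# resetting the environment. The class name CraftingWorldEnv and the task names
# 'EatBread' / 'ChopTree' are assumptions for illustration; substitute the
# actual class name and entries of TASK_LIST from this repository.
#
#   env = CraftingWorldEnv(size=(10, 10),
#                          fixed_goal=['EatBread', 'ChopTree'],
#                          store_gif=False)
#   env.reset()                          # resamples the grid and goal vectors
#   obs = env.observation                # dict: 'observation', 'desired_goal',
#                                        #       'achieved_goal'
#   action = env.action_space.sample()   # 0-3 move, 4 pickup, 5 drop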
def reset(self, render_next=False):
    """
    reset the environment
    """
    # save episode as gif
    if self.store_gif is True and self.step_num != 0:
        # print('debug_final', len(self.ims))
        anim = animation.ArtistAnimation(self.fig,
                                         self.ims,
                                         interval=100000,
                                         blit=False,
                                         repeat_delay=1000)
        anim.save('renders/env{}/episode_{}_({}).gif'.format(
            self.env_id, self.ep_no, self.step_num),
                  writer=animation.PillowWriter(),
                  dpi=100)

    if self.render_flipping is True:
        self.store_gif = render_next

    if self.fixed_goal:
        self.desired_goal = np.zeros(shape=(1, len(self.task_list)), dtype=int)
        for goal in self.fixed_goal:
            self.desired_goal[0][self.task_list.index(goal)] = 1
    else:
        self.desired_goal = np.random.randint(2, size=(1, len(self.task_list)))
    self.achieved_goal = self.observation_space.spaces['achieved_goal'].low

    if self.fixed_init_state is not None:
        self.obs = copy.deepcopy(self.fixed_init_state)
        self.agent_pos = coord(
            int(np.where(np.argmax(self.obs, axis=2) == 8)[0]),
            int(np.where(np.argmax(self.obs, axis=2) == 8)[1]),
            self.num_rows - 1, self.num_cols - 1)
    else:
        self.obs, self.agent_pos = self.sample_state()

    self.observation = {
        'observation': self.obs,
        'desired_goal': self.desired_goal,
        'achieved_goal': self.achieved_goal
    }
    self.init_observation = copy.deepcopy(self.observation)

    self.reward = self.calculate_rewards()

    if self.step_num != 0:  # don't increment episode number if resetting after init
        self.ep_no += 1

    self.step_num = 0

    # reset gif
    plt.close('all')
    if self.store_gif:
        # if self.fig is None:
        #     self.fig, self.ax = plt.subplots(1)
        # else:
        #     plt.clf()
        self.fig, self.ax = plt.subplots(1)
        self.ims = []
        self.__render_gif()
def imagine_obs(self):
    """
    imagine the observation that would result from completing the tasks
    flagged in the desired goal vector, without actually stepping the agent
    """
    # group the initial object locations by object type
    init_objects = {
        obj: self.get_objects(code, self.init_observation_vector['observation'])
        for code, obj in enumerate(OBJECTS)
    }
    agent_pos = self.agent_pos
    final_objects = copy.deepcopy(init_objects)

    tasks = {
        self.task_list[idx]: value
        for idx, value in enumerate(self.desired_goal_vector[0])
    }

    # first pass: tasks that transform or consume objects
    for key, value in tasks.items():
        if value == 1:
            if key == 'MakeBread':
                final_objects = self.__convert_item(final_objects, 'wheat', 'bread')
            if key == 'EatBread':
                final_objects = self.__convert_item(final_objects, 'bread')
            if key == 'ChopTree':
                final_objects = self.__convert_item(final_objects, 'tree', 'sticks')
            if key == 'ChopRock':
                final_objects = self.__convert_item(final_objects, 'rock')

    occupied_spaces = []
    for i in final_objects.values():
        occupied_spaces += i

    # second pass: tasks that move an object to a random unoccupied cell
    moving_tasks = {'MoveAxe': 'axe', 'MoveHammer': 'hammer', 'MoveSticks': 'sticks'}
    for key, value in moving_tasks.items():
        if key in tasks:
            if tasks[key] == 1:
                current_location = random.choice(final_objects[value])
                while True:
                    new_location = [
                        random.randint(0, self.num_rows - 1),
                        random.randint(0, self.num_cols - 1)
                    ]
                    if new_location not in occupied_spaces:
                        final_objects[value].remove(current_location)
                        occupied_spaces.remove(current_location)
                        final_objects[value].append(new_location)
                        occupied_spaces.append(new_location)
                        break

    # third pass: tasks that depend on the results of the earlier passes
    for key, value in tasks.items():
        if value == 1:
            if key == 'BuildHouse':
                final_objects = self.__convert_item(final_objects, 'sticks', 'house')
            if key == 'GoToHouse':
                new_agent_pos = random.choice(final_objects['house'])
                agent_pos = coord(new_agent_pos[0], new_agent_pos[1],
                                  self.num_rows - 1, self.num_cols - 1)

    # self.__object_list_to_state(final_objects, agent_pos)
    # return final_objects, agent_pos
    return self.__object_list_to_state(final_objects, agent_pos)
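# Illustrative note: imagine_obs reads self.desired_goal_vector and
# self.init_observation_vector, which are not initialised in this section.
# A sketch of driving it directly, assuming 'EatBread' and 'GoToHouse' appear
# in env.task_list and that a house exists on the grid (or 'BuildHouse' is
# also flagged), since 'GoToHouse' samples from final_objects['house']:
#
#   goal = np.zeros((1, len(env.task_list)), dtype=int)
#   goal[0][env.task_list.index('EatBread')] = 1
#   goal[0][env.task_list.index('GoToHouse')] = 1
#   env.desired_goal_vector = goal
#   imagined_state = env.imagine_obs()   # grid with the bread removed and the
#                                        # agent position moved to a house cell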