def reward_function(self, goal_info, done, **kwargs):
    # shaped reward: 0.8 * dist(object, goal) + 0.2 * dist(tcp, object)
    if self.success_criterion(goal_info):
        reward = 1.
    elif done:
        reward = -1.
    else:
        goal_achieved_object = unwind_dict_values(
            goal_info["achieved"]["object_position"])
        goal_achieved_tcp = unwind_dict_values(
            goal_info["achieved"]["tcp_position"])
        goal_desired_object = unwind_dict_values(
            goal_info["desired"]["object_position"])
        goal_desired_tcp = unwind_dict_values(
            goal_info["desired"]["tcp_position"])

        # how close the object is to its goal position, in (-1, 0]
        reward_object = np.exp(
            -5 * np.linalg.norm(goal_desired_object - goal_achieved_object)) - 1

        # how close the tcp is to its desired position (at the object), in (-1, 0]
        reward_tcp = np.exp(
            -5 * np.linalg.norm(goal_achieved_tcp - goal_desired_tcp)) - 1

        # the weighted sum stays in (-1, 0], so the total reward is in [-1, 1]
        reward = .8 * reward_object + .2 * reward_tcp

    return reward
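# The shaping term np.exp(-5 * d) - 1 maps a distance of zero to a reward
# of 0 and saturates towards -1 as the distance grows, which keeps each
# component bounded. A quick sanity check of the curve:

import numpy as np

for d in (0.0, 0.05, 0.2, 1.0):
    print(f"d = {d:4.2f} -> {np.exp(-5 * d) - 1:+.3f}")
# d = 0.00 -> +0.000
# d = 0.05 -> -0.221
# d = 0.20 -> -0.632
# d = 1.00 -> -0.993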
def success_criterion(goal):
    # the goal covers both object and tcp, but success only depends on
    # the object reaching its desired position
    goal_achieved = unwind_dict_values(goal["achieved"]["object_position"])
    goal_desired = unwind_dict_values(goal["desired"]["object_position"])

    goal_distance = np.linalg.norm(goal_achieved - goal_desired)

    # 5 cm error margin
    return goal_distance < 0.05
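# All of these snippets flatten nested goal/observation dictionaries with
# unwind_dict_values before computing distances. Its definition is not shown
# here; a minimal sketch of the assumed behavior (a hypothetical
# reconstruction, not the project's verbatim helper):

import numpy as np

def unwind_dict_values(element):
    # recursively flatten a (possibly nested) dict of arrays/scalars
    # into a single 1-D numpy array
    if not isinstance(element, dict):
        return np.asarray(element).flatten()

    values = [unwind_dict_values(value) for value in element.values()]

    if values:
        return np.concatenate(values)

    return np.array([])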
def reward_function(self, goal_info, done, **kwargs):
    if self.success_criterion(goal_info):
        reward = 1.
    elif done:
        reward = -1.
    else:
        goal_achieved = unwind_dict_values(goal_info["achieved"])
        goal_desired = unwind_dict_values(goal_info["desired"])

        # exponentially shaped distance reward in (-1, 0]
        reward = np.exp(
            -5 * np.linalg.norm(goal_achieved - goal_desired)) - 1

    return reward
import os
import os.path as osp

# pick the most recent checkpoint (folder names are assumed to sort by date)
models_folder = osp.join(models_folder, max(os.listdir(models_folder)))
print(models_folder)

agent.load(models_folder)

while True:
    state, goal_info = env.reset()
    done = False

    actions = []

    while not done:
        action = agent.predict([unwind_dict_values(state)],
                               deterministic=True)[0]
        # evaluate position control only: zero out the remaining action dims
        action[3:] = 0
        actions.append(action.copy())

        state, goal_info, done = env.step(action)
        done |= env.success_criterion(goal_info)
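# To turn this rollout loop into a quantitative evaluation, the per-episode
# env.success_criterion outcome can be aggregated the same way
# results_episodes is collected in process_responses below; a minimal
# sketch (the episode count is chosen arbitrarily):

successes = []

for _ in range(100):
    state, goal_info = env.reset()
    done = False

    while not done:
        action = agent.predict([unwind_dict_values(state)],
                               deterministic=True)[0]
        state, goal_info, done = env.step(action)
        done |= env.success_criterion(goal_info)

    successes.append(env.success_criterion(goal_info))

print(f"success rate: {np.mean(successes):.2%}")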
def success_criterion(goal_info):
    goal_achieved = unwind_dict_values(goal_info["achieved"])
    goal_desired = unwind_dict_values(goal_info["desired"])

    goal_distance = np.linalg.norm(goal_achieved - goal_desired)

    return goal_distance < 0.05
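# A toy check of the criterion, using a hypothetical goal layout with a
# single position entry (the 3 cm offset is below the 5 cm threshold):

goal_info = {
    "achieved": {"position": np.array([0.0, 0.0, 0.00])},
    "desired": {"position": np.array([0.0, 0.0, 0.03])},
}

assert success_criterion(goal_info)  # 3 cm < 5 cm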
def process_responses(self, env_responses, mode: str):
    requests = []
    results_episodes = []

    for env_id, response in env_responses:
        func, data = response

        if func == "reset":
            if isinstance(data, AssertionError):
                # the reset failed, retry with a new initial state
                requests.append(
                    (env_id, "reset",
                     self.get_initial_state(mode != "test", env_id)))
            else:
                # start a fresh trajectory with the initial (state, goal_info)
                self.trajectories.pop(env_id, None)
                self.trajectories[env_id].append(data)
        elif func == "step":
            state, goal_info, done = data
            self.trajectories[env_id].append((state, goal_info))

            done |= self.success_criterion(goal_info)

            if done:
                trajectory = self.trajectories.pop(env_id)
                rewards = self.agent.add_experience_trajectory(trajectory)

                self.writer.add_scalar(f'{mode} reward episode', sum(rewards),
                                       sum(self.steps.values()) + 1)

                results_episodes.append(self.success_criterion(goal_info))

                requests.append(
                    (env_id, "reset",
                     self.get_initial_state(mode != "test", env_id)))

            self.steps[env_id] += 1
        else:
            raise NotImplementedError(
                f"Undefined behavior for {env_id} | {response}")

    # trajectories alternate (state, goal_info) and action entries, so an
    # odd length means the environment is waiting for the next action
    required_predictions = [
        env_id for env_id in self.trajectories.keys()
        if len(self.trajectories[env_id]) % 2
    ]

    if required_predictions:
        if mode == "random":
            predictions = [
                self.agent.action_space.sample()
                for _ in range(len(required_predictions))
            ]
        else:
            states = []

            for env_id in required_predictions:
                state, _ = self.trajectories[env_id][-1]
                states.append(unwind_dict_values(state))

            states = np.stack(states)

            predictions = self.agent.predict(states,
                                             deterministic=mode == "test")

        for env_id, prediction in zip(required_predictions, predictions):
            self.trajectories[env_id].append(prediction)
            requests.append((env_id, "step", prediction))

    return requests, results_episodes
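# process_responses only builds the next batch of requests; something
# upstream has to shuttle them to the environment workers. A hypothetical
# driver loop, assuming one duplex multiprocessing pipe per environment
# (the pipes and training attributes are illustrative, not from the source):

# seed every environment with an initial reset request
requests = [(env_id, "reset", self.get_initial_state(True, env_id))
            for env_id in self.pipes]  # self.pipes: env_id -> Connection

while self.training:
    # dispatch the pending requests to the workers
    for env_id, func, data in requests:
        self.pipes[env_id].send((func, data))

    # collect whatever responses are ready; each worker answers with a
    # (func, data) pair, matching the unpacking in process_responses
    env_responses = [(env_id, pipe.recv())
                     for env_id, pipe in self.pipes.items()
                     if pipe.poll()]

    requests, results_episodes = self.process_responses(env_responses,
                                                        mode="train")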