Code example #1
File: pick_place.py Project: tmdt-buw/karolos
    def reward_function(self, goal_info, done, **kwargs):
        # 0.2* dist(tcp, obj), 0.8*dist(obj,goal)
        if self.success_criterion(goal_info):
            reward = 1.
        elif done:
            reward = -1.
        else:

            goal_achieved_object = unwind_dict_values(
                goal_info["achieved"]['object_position'])
            goal_achieved_tcp = unwind_dict_values(
                goal_info["achieved"]['tcp_position'])
            goal_desired_object = unwind_dict_values(
                goal_info["desired"]['object_position'])
            goal_desired_tcp = unwind_dict_values(
                goal_info["desired"]['tcp_position'])

            # 0.8 * dist(obj, goal_obj), how close obj is to obj_goal
            reward_object = np.exp(
                -5 *
                np.linalg.norm(goal_desired_object - goal_achieved_object)) - 1

            # 0.2 * dist(obj, tcp), how close tcp is to obj
            reward_tcp = np.exp(
                -5 * np.linalg.norm(goal_achieved_tcp - goal_desired_tcp)) - 1

            # scale s.t. reward in [-1,1]
            reward = .8 * reward_object + .2 * reward_tcp
        return reward
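For reference, a minimal numeric sketch (not from the project) of how the two shaping terms combine: each exp(-5*d) - 1 lies in (-1, 0], so the 0.8/0.2 weighted sum also stays within [-1, 0] on non-terminal steps.

import numpy as np

def shaping(d_obj, d_tcp):
    # weighted sum of two exponential shaping terms, each in (-1, 0]
    return 0.8 * (np.exp(-5 * d_obj) - 1) + 0.2 * (np.exp(-5 * d_tcp) - 1)

print(shaping(0.0, 0.0))   # 0.0   (both distances zero)
print(shaping(0.2, 0.1))   # ~ -0.58
print(shaping(1.0, 1.0))   # ~ -0.99 (approaches -1 for large distances)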
Code example #2
File: pick_place.py Project: tmdt-buw/karolos
    def success_criterion(goal):
        # goal -> object + tcp
        goal_achieved = unwind_dict_values(goal["achieved"]["object_position"])
        # desired -> both should be at same desired position
        goal_desired = unwind_dict_values(goal["desired"]["object_position"])

        goal_distance = np.linalg.norm(goal_achieved - goal_desired)
        # more error margin
        return goal_distance < 0.05
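unwind_dict_values is a project helper that is not shown on this page; judging from how it is used in these examples, it flattens the values of a (possibly nested) dict into a single 1-D array. A rough sketch of that assumed behaviour:

import numpy as np

def unwind_dict_values_sketch(data):
    # assumed behaviour: recursively flatten all dict values into one 1-D array
    if isinstance(data, dict):
        return np.concatenate([unwind_dict_values_sketch(v) for v in data.values()])
    return np.asarray(data, dtype=float).ravel()

goal = {"achieved": {"object_position": [0.1, 0.2, 0.3]}}
print(unwind_dict_values_sketch(goal))  # [0.1 0.2 0.3]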
Code example #3
    def reward_function(self, goal_info, done, **kwargs):
        if self.success_criterion(goal_info):
            reward = 1.
        elif done:
            reward = -1.
        else:
            goal_achieved = unwind_dict_values(goal_info["achieved"])
            goal_desired = unwind_dict_values(goal_info["desired"])

            reward = np.exp(
                -5 * np.linalg.norm(goal_achieved - goal_desired)) - 1

        return reward
Code example #4
# load the most recently saved checkpoint folder (relies on lexicographic order of the names)
models_folder = osp.join(models_folder, max(os.listdir(models_folder)))
print(models_folder)

agent.load(models_folder)

while True:
    state, goal_info = env.reset()
    done = False

    actions = []

    while not done:
        # print(state)

        state = unwind_dict_values(state)

        action = agent.predict([state], deterministic=True)[0]

        # keep only the first three action components, zero out the rest
        action[3:] = 0

        actions.append(action.copy())

        next_state, goal_info, done = env.step(action)

        done |= env.success_criterion(goal_info)

        # if done:

        print(action)
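Note that max(os.listdir(models_folder)) selects a checkpoint by lexicographic order of the directory names. A small illustration (hypothetical names, not from the project) of when that actually matches the newest checkpoint:

# zero-padded names sort chronologically, so max() picks the newest
padded = ["000100", "000500", "001000"]
assert max(padded) == "001000"

# unpadded numeric names do not: "500" > "1000" lexicographically
unpadded = ["100", "500", "1000"]
assert max(unpadded) == "500"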
Code example #5
    def success_criterion(goal_info):
        goal_achieved = unwind_dict_values(goal_info["achieved"])
        goal_desired = unwind_dict_values(goal_info["desired"])

        goal_distance = np.linalg.norm(goal_achieved - goal_desired)
        return goal_distance < 0.05
Code example #6
    def process_responses(self, env_responses, mode: str):

        requests = []
        results_episodes = []

        for env_id, response in env_responses:
            func, data = response

            if func == "reset":
                if type(data) == AssertionError:
                    requests.append(
                        (env_id, "reset",
                         self.get_initial_state(mode != "test", env_id)))
                else:
                    # self.trajectories is assumed to be a defaultdict(list),
                    # so indexing right after pop() starts a fresh trajectory
                    self.trajectories.pop(env_id, None)
                    self.trajectories[env_id].append(data)
            elif func == "step":
                state, goal_info, done = data
                self.trajectories[env_id].append((state, goal_info))

                done |= self.success_criterion(goal_info)

                if done:
                    trajectory = self.trajectories.pop(env_id)

                    rewards = self.agent.add_experience_trajectory(trajectory)

                    self.writer.add_scalar(f'{mode} reward episode',
                                           sum(rewards),
                                           sum(self.steps.values()) + 1)

                    results_episodes.append(self.success_criterion(goal_info))

                    requests.append(
                        (env_id, "reset",
                         self.get_initial_state(mode != "test", env_id)))

                self.steps[env_id] += 1
            else:
                raise NotImplementedError(
                    f"Undefined behavior for {env_id} | {response}")

        required_predictions = [
            env_id for env_id in self.trajectories.keys()
            if len(self.trajectories[env_id]) % 2
        ]

        if required_predictions:

            if mode == "random":
                predictions = [
                    self.agent.action_space.sample()
                    for _ in range(len(required_predictions))
                ]
            else:
                states = []

                for env_id in required_predictions:
                    state, _ = self.trajectories[env_id][-1]

                    state = unwind_dict_values(state)

                    states.append(state)

                states = np.stack(states)

                predictions = self.agent.predict(states,
                                                 deterministic=mode == "test")

            for env_id, prediction in zip(required_predictions, predictions):
                self.trajectories[env_id].append(prediction)
                requests.append((env_id, "step", prediction))

        return requests, results_episodes
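The len(...) % 2 check relies on how the trajectories are built above: entries alternate between (state, goal_info) tuples and predicted actions, so an odd length means the latest state is still waiting for an action. A toy illustration of that invariant, using dummy values:

trajectory = []

trajectory.append(({"robot": [0.0]}, {"achieved": {}}))  # env reset/step -> state appended
assert len(trajectory) % 2 == 1   # odd: a prediction is required

trajectory.append([0.1, -0.2, 0.0])                      # agent prediction -> action appended
assert len(trajectory) % 2 == 0   # even: waiting for the next env step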