Example #1
 def act(self, ob):
     if self.policy is None:
         raise AttributeError('Policy attribute has to be set!')
     state = JointState(self.get_full_state(), ob)
     action = self.policy.predict(state)
     # print("self.policy.last_state: ",self.policy.last_state)
     return action
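Every snippet on this page follows the same pattern: bundle the agent's own full state with the other agents' observable states into a JointState, then hand it to a policy's predict(). Below is a self-contained sketch of that flow using namedtuple stand-ins for the CrowdNav-style classes these examples appear to use, plus a placeholder go-to-goal policy; every name in the sketch is illustrative, not taken from the examples.

from collections import namedtuple

# Lightweight stand-ins; field order mirrors the constructor calls in Examples #2 and #7.
FullState = namedtuple('FullState', 'px py vx vy radius gx gy v_pref theta')
ObservableState = namedtuple('ObservableState', 'px py vx vy radius')
# Some variants name the first field self_state (Examples #7, #17), others robot_state (Example #14).
JointState = namedtuple('JointState', 'self_state human_states')
ActionXY = namedtuple('ActionXY', 'vx vy')

class GoToGoalPolicy:
    """Placeholder policy: head straight for the goal at unit speed."""
    def predict(self, state):
        dx = state.self_state.gx - state.self_state.px
        dy = state.self_state.gy - state.self_state.py
        norm = max((dx ** 2 + dy ** 2) ** 0.5, 1e-6)
        return ActionXY(dx / norm, dy / norm)

robot = FullState(px=0.0, py=0.0, vx=0.0, vy=0.0, radius=0.3,
                  gx=4.0, gy=0.0, v_pref=1.0, theta=0.0)
ob = [ObservableState(px=2.0, py=0.5, vx=-0.5, vy=0.0, radius=0.3)]
state = JointState(robot, ob)
print(GoToGoalPolicy().predict(state))  # ActionXY(vx=1.0, vy=0.0)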
Example #2
    def state_callback(self, observe_info):
        robot_state = observe_info.robot_state
        robot_full_state = FullState(robot_state.pos_x, robot_state.pos_y,
                                     robot_state.vel_x, robot_state.vel_y,
                                     robot_state.radius, robot_state.goal_x,
                                     robot_state.goal_y, robot_state.vmax,
                                     robot_state.theta)
        peds_full_state = [
            ObservableState(ped_state.pos_x, ped_state.pos_y, ped_state.vel_x,
                            ped_state.vel_y, ped_state.radius)
            for ped_state in observe_info.ped_states
        ]
        observable_states = peds_full_state
        self.cur_state = JointState(robot_full_state, observable_states)
        action_cmd = ActionCmd()

        dis = np.sqrt((robot_full_state.px - robot_full_state.gx)**2 +
                      (robot_full_state.py - robot_full_state.gy)**2)
        if dis < 0.3:
            action_cmd.stop = True
            action_cmd.vel_x = 0.0
            action_cmd.vel_y = 0.0
        else:
            print("state callback")
            action_cmd.stop = False
            robot_action, robot_action_index = self.robot_policy.predict(
                self.cur_state)
            print('robot_action', robot_action.v, robot_action.r)
            action_cmd.vel_x = robot_action.v
            action_cmd.vel_y = robot_action.r  # note: vel_y carries the rotational component here
        self.robot_action_pub.publish(action_cmd)
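The callback above converts a ROS-style message into the planner's state classes and applies a 0.3 m goal tolerance before predicting. A rough sketch of just that distance check, with SimpleNamespace standing in for the incoming message (field names follow the snippet; the values are made up):

import numpy as np
from types import SimpleNamespace

# Hypothetical observe_info stand-in; field names follow state_callback above.
robot_msg = SimpleNamespace(pos_x=0.0, pos_y=0.0, goal_x=0.2, goal_y=0.1)
observe_info = SimpleNamespace(robot_state=robot_msg)

dis = np.hypot(observe_info.robot_state.pos_x - observe_info.robot_state.goal_x,
               observe_info.robot_state.pos_y - observe_info.robot_state.goal_y)
print(dis < 0.3)  # True: within the goal tolerance, so the command would be a stop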
Example #3
 def act(self, obs):
     robot_state, humans_state, local_map = obs
     state = JointState(robot_state, humans_state)
     action = self.policy.predict(state, local_map, None)
     action = ActionRot(robot_state.v_pref * action.v,
                        action.r)  # de-normalize
     return action
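Here the policy outputs a normalized speed that is rescaled by the robot's preferred velocity before execution. A one-line illustration of that de-normalization, assuming the network's v lies in [0, 1] and r is already a rotation in radians (an assumption consistent with the "de-normalize" comment, not stated in the code):

# Hypothetical numbers: policy output (v=0.6, r=0.2 rad), robot with v_pref = 1.2 m/s.
v_pref, v_norm, r = 1.2, 0.6, 0.2
action = (v_pref * v_norm, r)  # mirrors ActionRot(robot_state.v_pref * action.v, action.r)
print(action)  # approximately (0.72, 0.2)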
Example #4
    def act(self, ob):
        if self.policy is None:
            raise AttributeError("Policy attribute has to be set!")

        state = JointState(self.get_full_state(), ob)
        action = self.policy.predict(state)
        return action
Example #5
 def act(self, ob):
     """
     The state for a human is its own full state plus all other agents' observable states.
     :param ob: list of the other agents' observable states
     :return: the action predicted by the policy
     """
     state = JointState(self.get_full_state(), ob)
     action = self.policy.predict(state)
     return action
Example #6
 def act_static(self, ob):
     """
     The state for a human is its own full state plus all other agents' observable states.
     :param ob: list of the other agents' observable states
     :return: a zero ActionXY (this agent stays static)
     """
     state = JointState(self.get_full_state(), ob)
     action = ActionXY(0, 0)
     return action
Example #7
    def planner(self):
        # update robot
        robot.set(self.px, self.py, self.gx, self.gy, self.vx, self.vy,
                  self.theta)
        dist_to_goal = np.linalg.norm(
            np.array(robot.get_position()) -
            np.array(robot.get_goal_position()))

        # compute command velocity
        if robot.reached_destination():
            self.cmd_vel.linear.x = 0
            self.cmd_vel.linear.y = 0
            self.cmd_vel.linear.z = 0
            self.cmd_vel.angular.x = 0
            self.cmd_vel.angular.y = 0
            self.cmd_vel.angular.z = 0
            self.Is_goal_reached = True

        else:
            """
            self state: FullState(px, py, vx, vy, radius, gx, gy, v_pref, theta)
            ob:[ObservableState(px1, py1, vx1, vy1, radius1),
                ObservableState(px1, py1, vx1, vy1, radius1),
                   .......                    
                ObservableState(pxn, pyn, vxn, vyn, radiusn)]
            """
            if len(self.ob) == 0:
                self.ob = [
                    ObservableState(FAKE_HUMAN_PX, FAKE_HUMAN_PY, 0, 0,
                                    HUMAN_RADIUS)
                ]

            self.state = JointState(robot.get_full_state(), self.ob)
            action = policy.predict(self.state)  # max_action
            self.cmd_vel.linear.x = action.v
            self.cmd_vel.linear.y = 0
            self.cmd_vel.linear.z = 0
            self.cmd_vel.angular.x = 0
            self.cmd_vel.angular.y = 0
            self.cmd_vel.angular.z = action.r

        ########### for debug ##########
        # dist_to_goal = np.linalg.norm(np.array(robot.get_position()) - np.array(robot.get_goal_position()))
        # if self.plan_counter % 10 == 0:
        #     rospy.loginfo("robot position:(%s,%s)" % (self.px, self.py))
        #     rospy.loginfo("Distance to goal is %s" % dist_to_goal)
        #     rospy.loginfo("self state:\n %s" % self.state.self_state)
        #     for i in range(len(self.state.human_states)):
        #         rospy.loginfo("human %s :\n %s" % (i+1, self.state.human_states[i]))
        #     rospy.loginfo("%s-th action is planned: \n v: %s m/s \n r: %s rad/s"
        #                   % (self.plan_counter, self.cmd_vel.linear.x, self.cmd_vel.angular.z))

        # publish velocity
        self.cmd_vel_pub.publish(self.cmd_vel)
        self.plan_counter += 1
        self.visualize_action()
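When no pedestrians are visible, the planner above inserts a single distant, stationary fake human so the policy always receives a non-empty observation (Example #11 uses the same trick, and Example #14 explicitly guards against an empty human list). A minimal sketch of that padding step; the constants and the ObservableState stand-in are illustrative, not taken from the snippet:

from collections import namedtuple

ObservableState = namedtuple('ObservableState', 'px py vx vy radius')
FAKE_HUMAN_PX, FAKE_HUMAN_PY = -10.0, -10.0  # assumed placement far from the workspace
HUMAN_RADIUS = 0.3                           # assumed radius

def pad_observation(ob):
    """Return a non-empty observation list, mirroring the fallback in planner()."""
    if len(ob) == 0:
        ob = [ObservableState(FAKE_HUMAN_PX, FAKE_HUMAN_PY, 0.0, 0.0, HUMAN_RADIUS)]
    return ob

print(pad_observation([]))  # one distant, stationary fake human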
Example #8
    def act(self, ob):
        if self.policy is None:
            raise AttributeError('Policy attribute has to be set!')
        state = JointState(self.get_full_state(), ob)

        # print('----debug robot.py line 15 ')
        # print('robot cordinate = [',self.px,',',self.py,']')

        action = self.policy.predict(state)
        # IMPORTANT: change this if you do not want to create simulation data
        action = ActionXY(0, 0)  # override the policy's action with a zero action
        return action
Example #9
    def act(self,
            ob,
            imitation_learning=True,
            recorded_unrotated_state=None,
            occ=None):
        if self.policy is None:
            raise AttributeError('Policy attribute has to be set!')
        if imitation_learning:
            state = JointState(self.get_full_state(), ob)
            action = self.policy.predict(state)
        else:  # RL
            # recorded_unrotated_state: [hist_len, n_human, features]
            human_xy = recorded_unrotated_state[:, :, 9:11]   # [hist_len, n_human, 2] -> (x, y)
            robot_xy = recorded_unrotated_state[:, 0:1, 0:2]  # [hist_len, 1, 2] -> (x, y)
            joint_xy = torch.cat([robot_xy, human_xy], dim=-2)

            state = JointState(self.get_full_state(), ob)
            # joint_xy: [hist_len, n_human + 1, 2], e.g. [5, 3, 2]
            # occ: [hist_len - 1, 32], e.g. [4, 32]
            action = self.policy.predict(state, joint_xy, occ)
        return action
Example #10
 def predict(self, obs, env):
     self.simulator.time_step = env._get_dt()
     other_agent_states = [
         agent.get_observable_state()
         for agent in env.soadrl_sim.humans + env.soadrl_sim.other_robots
     ]
     action = self.simulator.predict(
         JointState(env.soadrl_sim.robot.get_full_state(),
                    other_agent_states),
         env.soadrl_sim.obstacle_vertices,
         env.soadrl_sim.robot,
     )
     if self.suicide_if_stuck:
         if action.v < 0.1:
             return Suicide()
     vx = action.v * np.cos(action.r)
     vy = action.v * np.sin(action.r)
     return np.array([vx, vy, 0.1 * (np.random.random() - 0.5)])
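This wrapper turns the simulator's unicycle-style action (speed v, heading r) into a holonomic velocity command, plus a small random angular term. The projection on its own:

import numpy as np

def rot_action_to_xy(v, r):
    """Project a (speed, heading) action onto world-frame x/y velocities, as above."""
    return v * np.cos(r), v * np.sin(r)

print(rot_action_to_xy(0.8, np.pi / 4))  # approximately (0.566, 0.566)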
Example #11
    def compute_coordinate_observation(self, with_fov=False):
        # TODO: only consider humans in FOV
        px = self.robot_states[-1].position.x_val
        py = self.robot_states[-1].position.y_val
        if len(self.robot_states) == 1:
            vx = vy = 0
        else:
            # TODO: use kinematics.linear_velocity?
            # Note: this is the per-step displacement, not divided by self.time_step
            # (the human velocities below are divided).
            vx = self.robot_states[-1].position.x_val - self.robot_states[-2].position.x_val
            vy = self.robot_states[-1].position.y_val - self.robot_states[-2].position.y_val
        r  = self.robot_radius
        gx = self.goal_position[0]
        gy = self.goal_position[1]
        v_pref = 1
        _, _, theta = airsim.to_eularian_angles(self.robot_states[-1].orientation)
        robot_state = FullState(px, py, vx, vy, r, gx, gy, v_pref, theta)

        human_states = []
        for i in range(self.human_num):
            if len(self.human_states[i]) == 1:
                vx = vy = 0
            else:
                vx = (self.human_states[i][-1].position.x_val - self.human_states[i][-2].position.x_val) / self.time_step
                vy = (self.human_states[i][-1].position.y_val - self.human_states[i][-2].position.y_val) / self.time_step
            px = self.human_states[i][-1].position.x_val
            py = self.human_states[i][-1].position.y_val

            if with_fov:
                angle = np.arctan2(py - robot_state.py, px - robot_state.px)
                # Note: this difference is not wrapped to [-pi, pi]; see the wrap-safe sketch after this example.
                if abs(angle - robot_state.theta) > self.fov / 2:
                    continue

            human_state = ObservableState(px, py, vx, vy, self.human_radius)
            human_states.append(human_state)

        if not human_states:
            human_states.append(ObservableState(-6, 0, 0, 0, self.human_radius))

        joint_state = JointState(robot_state, human_states)

        return joint_state
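A caveat on the FOV test above: it compares angle - robot_state.theta directly, so a human that is well inside the field of view can be rejected when the raw difference wraps past ±π. A hedged sketch of a wrap-safe variant (the helper names are mine, not from the example):

import numpy as np

def angle_diff(a, b):
    """Smallest signed difference a - b, wrapped into [-pi, pi]."""
    return (a - b + np.pi) % (2 * np.pi) - np.pi

def in_fov(px, py, robot_px, robot_py, robot_theta, fov):
    angle = np.arctan2(py - robot_py, px - robot_px)
    return abs(angle_diff(angle, robot_theta)) <= fov / 2

# Robot facing theta = pi; a human almost straight ahead at (-1.0, -0.1).
# The unwrapped check |angle - theta| ~ 6.18 rad would reject it; the wrapped check keeps it.
print(in_fov(-1.0, -0.1, 0.0, 0.0, np.pi, 2 * np.pi / 3))  # True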
Example #12
    def run_k_episodes(self,
                       k,
                       phase,
                       update_memory=False,
                       imitation_learning=False,
                       episode=None,
                       print_failure=False,
                       noise_explore=0.0,
                       progressbar=True,
                       notebook=False):
        self.robot.policy.set_phase(phase)
        success_times = []
        collision_times = []
        timeout_times = []
        success = 0
        collision = 0
        timeout = 0
        too_close = 0
        min_dist = []
        cumulative_rewards = []
        collision_cases = []
        timeout_cases = []

        if imitation_learning: human_obsv = []

        iter = range(k)
        if progressbar:
            if notebook:
                iter = tqdm_notebook(iter, leave=False)
            else:
                iter = tqdm(iter, leave=False)

        for i in iter:
            ob = self.env.reset(phase)
            done = False
            states = []
            actions = []
            rewards = []
            scene = []

            #print(self.robot.policy)

            while not done:

                # infer action from policy
                if 'RNN' in self.robot.policy.name:
                    action = self.robot.act(ob)
                elif self.num_frames == 1:
                    action = self.robot.act(ob)
                else:
                    last_ob = latest_frame(ob, self.num_frames)
                    action = self.robot.act(last_ob)
                actions.append(action)
                #print(self.target_policy.name)
                # state append
                if imitation_learning:
                    if 'RNN' in self.target_policy.name:
                        # TODO: enumerate and test different combinations of policies
                        joint_state = JointState(self.robot.get_full_state(),
                                                 ob)
                    elif 'SAIL' in self.target_policy.name or 'SARL' in self.target_policy.name:

                        joint_state = JointState(self.robot.get_full_state(),
                                                 ob)
                    else:
                        joint_state = self.robot.policy.last_state
                    last_state = self.target_policy.transform(joint_state)
                    states.append(last_state)
                    scene.append(obs_to_frame(ob))
                    #print(last_state)
                else:
                    states.append(self.robot.policy.last_state)

                # env step
                if imitation_learning and noise_explore > 0.0:
                    noise_magnitude = noise_explore * 2.0
                    dv = torch.rand(1).sub(0.5) * noise_magnitude
                    dr = torch.rand(1).sub(0.5) * noise_magnitude
                    if self.robot.policy.kinematics == 'holonomic':
                        action = ActionXY(action[0] + dv, action[1] + dr)
                    else:
                        action = ActionRot(action[0] + dv, action[1] + dr)

                ob, reward, done, info = self.env.step(action)
                rewards.append(reward)

                if isinstance(info, Danger):
                    too_close += 1
                    min_dist.append(info.min_dist)

            if imitation_learning: human_obsv.append(torch.FloatTensor(scene))

            if isinstance(info, ReachGoal):
                success += 1
                success_times.append(self.env.global_time)
            elif isinstance(info, Collision):
                collision += 1
                collision_cases.append(i)
                collision_times.append(self.env.global_time)
            elif isinstance(info, Timeout):
                timeout += 1
                timeout_cases.append(i)
                timeout_times.append(self.env.time_limit)
            else:
                raise ValueError('Invalid end signal from environment')

            if update_memory:
                self.update_memory(states, actions, rewards,
                                   imitation_learning)

            # episode result
            cumulative_rewards.append(
                sum([
                    pow(self.gamma,
                        t * self.robot.time_step * self.robot.v_pref) * reward
                    for t, reward in enumerate(rewards)
                ]))
            # if i % 100 == 0 and i > 0:
            #     logging.info("{:<5} Explore #{:d} / {:d} episodes".format(phase.upper(), i, k))

        # episodes statistics
        success_rate = success / k
        collision_rate = collision / k
        assert success + collision + timeout == k
        avg_nav_time = sum(success_times) / len(
            success_times) if success_times else self.env.time_limit

        extra_info = '' if episode is None else 'in episode {} '.format(
            episode)
        logging.info(
            '{:<5} {} success: {:.2f}, collision: {:.2f}, nav time: {:.2f}, reward: {:.4f} +- {:.4f}'
            .format(phase.upper(), extra_info, success_rate, collision_rate,
                    avg_nav_time, statistics.mean(cumulative_rewards),
                    (statistics.stdev(cumulative_rewards)
                     if len(cumulative_rewards) > 1 else 0.0)))
        print(
            '{:<5} {} success: {:.2f}, collision: {:.2f}, nav time: {:.2f}, reward: {:.4f} +- {:.4f}'
            .format(phase.upper(), extra_info, success_rate, collision_rate,
                    avg_nav_time, statistics.mean(cumulative_rewards),
                    (statistics.stdev(cumulative_rewards)
                     if len(cumulative_rewards) > 1 else 0.0)))

        if phase in ['val', 'test']:
            total_time = sum(success_times + collision_times +
                             timeout_times) * self.robot.time_step
            logging.info('Frequency of being in danger: %.2f',
                         too_close / total_time)

        if print_failure:
            logging.info('Collision cases: ' +
                         ' '.join([str(x) for x in collision_cases]))
            logging.info('Timeout cases: ' +
                         ' '.join([str(x) for x in timeout_cases]))

        if imitation_learning:
            return human_obsv
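Both run_k_episodes variants score an episode with a discount exponent scaled by the robot's time step and preferred velocity, i.e. gamma ** (t * time_step * v_pref). That return computation on its own, with made-up numbers:

def discounted_return(rewards, gamma, time_step, v_pref):
    """Matches the cumulative_rewards expression used in run_k_episodes above."""
    return sum(pow(gamma, t * time_step * v_pref) * reward
               for t, reward in enumerate(rewards))

# Illustrative values only: a single terminal reward of 1.0 after two empty steps.
print(discounted_return([0.0, 0.0, 1.0], gamma=0.9, time_step=0.25, v_pref=1.0))  # ~0.9487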
Example #13
    def run_k_episodes(self,
                       k,
                       phase,
                       update_memory=False,
                       imitation_learning=False,
                       episode=None,
                       traj_head=None,
                       print_failure=False,
                       noise_explore=0.0,
                       progressbar=False,
                       output_info=False,
                       notebook=False,
                       print_info=False,
                       return_states=False,
                       suffix=None):
        self.robot.policy.set_phase(phase)
        success_times = []
        collision_times = []
        timeout_times = []
        success = 0
        collision = 0
        timeout = 0
        too_close = 0
        min_dist = []
        cumulative_rewards = []
        collision_cases = []
        timeout_cases = []

        states_s = []

        if imitation_learning: human_obsv = []

        iter = range(k)
        if progressbar:
            if notebook:
                iter = tqdm_notebook(iter, leave=False)
            else:
                iter = tqdm(iter, leave=False)
        traj_acc_sum = 0
        for i in iter:
            ob = self.env.reset(phase)
            done = False
            states = []
            actions = []
            rewards = []
            scene = []

            while not done:

                # infer action from policy
                if 'TRAJ' in self.robot.policy.name:
                    action = self.robot.act(ob)
                elif self.num_frames == 1:
                    action = self.robot.act(ob)
                else:
                    last_ob = latest_frame(ob, self.num_frames)
                    action = self.robot.act(last_ob)
                actions.append(action)

                # state append
                if imitation_learning:
                    # if 'RNN' in self.target_policy.name:
                    #     # TODO: enumerate and test different combinations of policies
                    #     joint_state = JointState(self.robot.get_full_state(), ob)
                    # elif 'SAIL' in self.target_policy.name:
                    #     joint_state = JointState(self.robot.get_full_state(), ob)
                    # else:
                    #     joint_state = self.robot.policy.last_state

                    joint_state = JointState(self.robot.get_full_state(), ob)
                    last_state = ExplorerDs.transform_mult(joint_state)
                    states.append(last_state)
                    scene.append(obs_to_frame(ob))
                else:
                    states.append(self.robot.policy.last_state)

                # env step
                if imitation_learning and noise_explore > 0.0:
                    noise_magnitude = noise_explore * 2.0
                    dv = torch.rand(1).sub(0.5) * noise_magnitude
                    dr = torch.rand(1).sub(0.5) * noise_magnitude
                    if self.robot.policy.kinematics == 'holonomic':
                        action = ActionXY(action[0] + dv, action[1] + dr)
                    else:
                        action = ActionRot(action[0] + dv, action[1] + dr)

                ob, reward, done, info = self.env.step(action)
                rewards.append(reward)

                if isinstance(info, Danger):
                    too_close += 1
                    min_dist.append(info.min_dist)

            if imitation_learning: human_obsv.append(torch.FloatTensor(scene))

            if isinstance(info, ReachGoal):
                success += 1
                success_times.append(self.env.global_time)
            elif isinstance(info, Collision):
                collision += 1
                collision_cases.append(i)
                collision_times.append(self.env.global_time)
            elif isinstance(info, Timeout):
                timeout += 1
                timeout_cases.append(i)
                timeout_times.append(self.env.time_limit)
            else:
                raise ValueError('Invalid end signal from environment')
            traj_accuracy = 0
            if traj_head is not None:
                # t indexes the step within the episode (avoids shadowing the episode index i)
                for t, ([rob, hum], human_feats) in enumerate(states):
                    next_human_pos = []
                    for j in range(4):
                        # ground-truth human positions j + 1 steps ahead, clamped at the episode end
                        n_p = states[min(t + j + 1, len(states) - 1)][0][1][-1, :, :2]
                        next_human_pos.append(n_p)
                    with torch.no_grad():
                        next_human_pos = torch.cat(next_human_pos, dim=1)
                        next_human_pos_pred = traj_head(human_feats)

                        # print(hum[-1])
                        # print(next_human_pos)
                        # print(next_human_pos_pred)
                        # print('\n')
                        accuracy = ((next_human_pos -
                                     next_human_pos_pred)**2).mean().item()
                        traj_accuracy += accuracy
                    #print(traj_accuracy)
            traj_acc_sum += traj_accuracy

            if update_memory:
                self.update_memory(states, actions, rewards,
                                   imitation_learning)

            states_s.append(states)

            # episode result
            cumulative_rewards.append(
                sum([
                    pow(self.gamma,
                        t * self.robot.time_step * self.robot.v_pref) * reward
                    for t, reward in enumerate(rewards)
                ]))
            # if i % 100 == 0 and i > 0:
            #     logging.info("{:<5} Explore #{:d} / {:d} episodes".format(phase.upper(), i, k))
        traj_acc_sum = traj_acc_sum / sum(len(s) for s in states_s)
        # episodes statistics
        success_rate = success / k
        collision_rate = collision / k
        timeout_rate = timeout / k
        assert success + collision + timeout == k
        avg_nav_time = sum(success_times) / len(
            success_times) if success_times else self.env.time_limit
        if not print_info:
            extra_info = '' if episode is None else 'in episode {} '.format(
                episode)
            logging.info(
                '{:<5} {} success: {:.2f}, collision: {:.2f}, nav time: {:.2f}, reward: {:.4f} +- {:.4f}'
                .format(phase.upper(), extra_info, success_rate,
                        collision_rate, avg_nav_time,
                        statistics.mean(cumulative_rewards),
                        (statistics.stdev(cumulative_rewards)
                         if len(cumulative_rewards) > 1 else 0.0)))
        info = {
            'success': success_rate,
            'collision': collision_rate,
            'nav time': avg_nav_time,
            'reward': statistics.mean(cumulative_rewards)
        }
        if print_info:
            info = ('success: {:.4f}, collision: {:.4f}, nav time: {:.2f}, '
                    'reward: {:.4f}, timeout: {:.4f}').format(
                        success_rate, collision_rate, avg_nav_time,
                        statistics.mean(cumulative_rewards), timeout_rate)
            if traj_head is not None:
                info += ', traj accuracy : {:.4f}'.format(traj_acc_sum)
            if suffix is not None:
                info = suffix + info
            print(info)
        # if phase in ['val', 'test']:
        #     total_time = sum(success_times + collision_times + timeout_times) * self.robot.time_step
        #     logging.info('Frequency of being in danger: %.2f', too_close / total_time)

        if print_failure:
            logging.info('Collision cases: ' +
                         ' '.join([str(x) for x in collision_cases]))
            logging.info('Timeout cases: ' +
                         ' '.join([str(x) for x in timeout_cases]))

        info_dic = {
            'success': success_rate,
            'collision': collision_rate,
            'nav time': avg_nav_time,
            'reward': statistics.mean(cumulative_rewards),
            'traj accuracy': traj_acc_sum
        }

        if output_info:
            if return_states:
                return info_dic, states_s
            else:
                return info_dic

        if imitation_learning:
            return human_obsv
Example #14
    def predict(self, state):
        """
        Base-class predict for all methods that take the pairwise joint state as input to the value network.
        The input to the value network is always of shape (batch_size, # humans, rotated joint state length).

        """
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError('Epsilon attribute has to be set in training phase')

        if self.reach_destination(state):
            return ActionXY(0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
        if self.action_space is None:
            self.build_action_space(state.robot_state.v_pref)
        if not state.human_states:
            assert self.phase != 'train'
            if hasattr(self, 'attention_weights'):
                self.attention_weights = list()
            return self.select_greedy_action(state.robot_state)
        max_action_index = 0
        occupancy_maps = None
        probability = np.random.random()
        if self.phase == 'train' and probability < self.epsilon:
            max_action_index = np.random.choice(len(self.action_space))
            max_action = self.action_space[max_action_index]
        else:
            self.action_values = list()
            max_value = float('-inf')
            max_action = None
            rewards = []
            action_index = 0
            batch_input_tensor = None
            for action in self.action_space:
                next_robot_state = self.propagate(state.robot_state, action)
                if self.query_env:
                    next_human_states, reward, done, info = self.env.onestep_lookahead(action)
                else:
                    next_human_states = [self.propagate(human_state, ActionXY(human_state.vx, human_state.vy))
                                         for human_state in state.human_states]
                    next_state = JointState(next_robot_state, next_human_states)
                    reward, _ = self.reward_estimator.estimate_reward_on_predictor(state, next_state)
                # collect the reward in both branches so rewards_tensor matches the action space
                rewards.append(reward)
                batch_next_states = torch.cat([torch.Tensor([next_robot_state + next_human_state]).to(self.device)
                                              for next_human_state in next_human_states], dim=0)
                rotated_batch_input = self.rotate(batch_next_states).unsqueeze(0)
                if self.with_om:
                    if occupancy_maps is None:
                        occupancy_maps = self.build_occupancy_maps(next_human_states).unsqueeze(0)
                    rotated_batch_input = torch.cat([rotated_batch_input, occupancy_maps], dim=2)
                if batch_input_tensor is None:
                    batch_input_tensor = rotated_batch_input
                else:
                    batch_input_tensor = torch.cat([batch_input_tensor, rotated_batch_input], dim=0)
            next_value = self.model(batch_input_tensor).squeeze(1)
            rewards_tensor = torch.tensor(rewards).to(self.device)
            value = rewards_tensor + next_value * pow(self.gamma, self.time_step * state.robot_state.v_pref)
            max_action_index = value.argmax()
            best_value = value[max_action_index]
            if best_value > max_value:
                max_action = self.action_space[max_action_index]

            if max_action is None:
                raise ValueError('Value network is not well trained. ')

        if self.phase == 'train':
            self.last_state = self.transform(state)

        return max_action, int(max_action_index)
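The action scoring in this predict (and in Example #17) is a one-step lookahead: each candidate action is rated by its immediate reward plus the discounted value of the propagated next state, with the same v_pref-scaled discount as above. A stripped-down sketch of just that scoring rule, with stand-in numbers instead of a value network:

import numpy as np

def score_actions(rewards, next_values, gamma, time_step, v_pref):
    """value_i = reward_i + gamma ** (time_step * v_pref) * V(next_state_i); return the best index."""
    values = np.asarray(rewards) + pow(gamma, time_step * v_pref) * np.asarray(next_values)
    return int(np.argmax(values)), values

# Illustrative numbers: the second action trades a slightly worse reward for a better next state.
best, values = score_actions(rewards=[0.01, 0.0], next_values=[0.2, 0.5],
                             gamma=0.9, time_step=0.25, v_pref=1.0)
print(best, values)  # 1 [~0.205 ~0.487]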
Example #15
 def get_state(self, ob):
     state = JointState(self.get_full_state(), ob)
     if self.policy is None:
         raise AttributeError('Policy attribute has to be set!')
     return self.policy.transform(state)
Example #16
    def run_k_episodes(self,
                       k_train,
                       k_val,
                       phase,
                       update_memory=False,
                       episode=None,
                       imitation_learning=False,
                       print_failure=False,
                       with_render=False):

        self.robot.policy.set_phase(phase)
        success_times = []
        collision_times = []
        timeout_times = []
        success = 0
        collision = 0
        timeout = 0
        too_close = 0
        min_dist = []
        cumulative_rewards = []
        collision_cases = []
        timeout_cases = []
        k = k_val + k_train

        for i in range(k):

            ob = self.env.reset(phase='gen')
            robot_state = self.robot.get_full_state()
            done = False
            states = []
            actions = []
            rewards = []
            prev_obs = [JointState(robot_state, ob)]

            # If the policy is TLSGAN, either stay and observe or move according to ORCA
            for j in range(self.obs_len - 1):
                if isinstance(self.robot.policy, TLSGAN):
                    #action = ActionXY(0,0)
                    action = self.robot.act(prev_obs, test_policy=True)
                else:
                    action = self.robot.act(prev_obs)
                ob, reward, done, info = self.env.step(action)
                robot_state = self.robot.get_full_state()
                prev_obs.append(JointState(robot_state, ob))

            while not done:
                action = self.robot.act(prev_obs, test_policy=False)

                ob, reward, done, info = self.env.step(action)
                robot_state = self.robot.get_full_state()

                states.append(prev_obs)
                actions.append(action)
                rewards.append(reward)
                prev_obs = prev_obs[1:]
                prev_obs.append(JointState(robot_state, ob))

                if isinstance(info, Danger):
                    too_close += 1
                    min_dist.append(info.min_dist)

            # Render only the first episode
            if with_render and (i == 0):
                self.env.render(mode='video')

            if isinstance(info, ReachGoal):
                success += 1
                success_times.append(self.env.global_time)
            elif isinstance(info, Collision):
                collision += 1
                collision_cases.append(i)
                collision_times.append(self.env.global_time)
            elif isinstance(info, Timeout):
                timeout += 1
                timeout_cases.append(i)
                timeout_times.append(self.env.time_limit)
            else:
                raise ValueError('Invalid end signal from environment')

            if update_memory:
                if isinstance(info, ReachGoal) or isinstance(info, Collision):
                    # only add positive(success) or negative(collision) experience in experience set
                    if i < k_train:
                        memory = self.train_memory
                    else:
                        memory = self.val_memory
                    self.update_memory(memory, states, actions, rewards,
                                       imitation_learning)

            cumulative_rewards.append(
                sum([
                    pow(self.gamma,
                        t * self.robot.time_step * self.robot.v_pref) * reward
                    for t, reward in enumerate(rewards)
                ]))

        success_rate = success / k
        collision_rate = collision / k
        assert success + collision + timeout == k
        avg_nav_time = sum(success_times) / len(
            success_times) if success_times else self.env.time_limit

        extra_info = '' if episode is None else 'in episode {} '.format(
            episode)
        self.logger.info(
            '{:<5} {}has success rate: {:.2f}, collision rate: {:.2f}, nav time: {:.2f}, total reward: {:.4f}'
            .format(phase.upper(), extra_info, success_rate, collision_rate,
                    avg_nav_time, average(cumulative_rewards)))
        if phase in ['val', 'test']:
            total_time = sum(success_times + collision_times +
                             timeout_times) * self.robot.time_step
            self.logger.info(
                'Frequency of being in danger: %.2f and average min separate distance in danger: %.2f',
                too_close / total_time, average(min_dist))

        if print_failure:
            self.logger.info('Collision cases: ' +
                             ' '.join([str(x) for x in collision_cases]))
            self.logger.info('Timeout cases: ' +
                             ' '.join([str(x) for x in timeout_cases]))
Example #17
    def predict(self, states):
        """
        Base-class predict for all methods that take the pairwise joint state as input to the value network.
        The input to the value network is always of shape (batch_size, # humans, rotated joint state length).

        states is a list of joint states.

        """
        if self.phase is None or self.device is None:
            raise AttributeError('Phase, device attributes have to be set!')
        if self.phase == 'train' and self.epsilon is None:
            raise AttributeError(
                'Epsilon attribute has to be set in training phase')

        if self.reach_destination(states[-1]):
            return ActionXY(
                0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0)
        if self.action_space is None:
            self.build_action_space(states[-1].self_state.v_pref)

        if self.policy_learning:

            trajs, rel_trajs, self_info = stateToTrajTensors([states])
            next_position = self.model(
                (trajs, rel_trajs, self_info)).data.squeeze(0)
            action = ActionXY(next_position[0], next_position[1])
            return action

        else:
            occupancy_maps = None
            probability = np.random.random()
            if self.phase == 'train' and probability < self.epsilon:
                max_action = self.action_space[np.random.choice(
                    len(self.action_space))]
            else:
                self.action_values = list()
                max_value = float('-inf')
                max_action = None
                for action in self.action_space:
                    next_self_state = self.propagate(
                        states[-1].self_state, action
                    )  # 'px', 'py', 'vx', 'vy', 'radius', 'gx', 'gy', 'v_pref', 'theta'
                    if self.query_env:
                        next_human_states, reward, done, info = self.env.onestep_lookahead(
                            action)
                    else:
                        next_human_states = [
                            self.propagate(
                                human_state,
                                ActionXY(human_state.vx, human_state.vy))
                            # states is a list of joint states; use the latest one
                            for human_state in states[-1].human_states
                        ]  # 'px1', 'py1', 'vx1', 'vy1', 'radius1'
                        reward = self.compute_reward(next_self_state,
                                                     next_human_states)

                    # VALUE UPDATE
                    trajs, rel_trajs, self_info = stateToTrajTensors([
                        states +
                        [JointState(next_self_state, next_human_states)]
                    ])
                    next_state_value = self.model(
                        (trajs, rel_trajs, self_info)).data.item()
                    value = reward + pow(
                        self.gamma, self.time_step *
                        states[-1].self_state.v_pref) * next_state_value
                    self.action_values.append(value)
                    if value > max_value:
                        max_value = value
                        max_action = action
                if max_action is None:
                    raise ValueError('Value network is not well trained. ')

        if self.phase == 'train':
            self.last_state = states

        return max_action
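Both value-based predicts (Examples #14 and #17) gate exploration with the same epsilon-greedy test during training: with probability epsilon a random action index is drawn, otherwise every action in the discretized action space is scored as above. A minimal sketch of that selection with a hypothetical scoring callback:

import numpy as np

def epsilon_greedy(action_space, epsilon, score_fn, training=True):
    """Random action with probability epsilon (training only), else the best-scoring one."""
    if training and np.random.random() < epsilon:
        return action_space[np.random.choice(len(action_space))]
    return max(action_space, key=score_fn)

# Illustrative use: actions are (v, r) tuples, scored here by forward speed alone.
actions = [(0.0, 0.0), (0.5, 0.1), (1.0, -0.1)]
print(epsilon_greedy(actions, epsilon=0.1, score_fn=lambda a: a[0]))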