Example #1
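
# Imports reconstructed from the usage below; the project-specific module paths
# assume a CrowdNav-style layout and may differ in this fork.
import argparse
import configparser
import copy
import logging
import os
import statistics

import gym
import numpy as np
import torch
from tqdm import tqdm, tqdm_notebook

from crowd_nav.policy.policy_factory import policy_factory
from crowd_nav.utils.explorer import Explorer
from crowd_sim.envs.policy.orca import ORCA
from crowd_sim.envs.utils.action import ActionXY, ActionRot
from crowd_sim.envs.utils.info import Danger, ReachGoal, Collision, Timeout
from crowd_sim.envs.utils.robot import Robot
from crowd_sim.envs.utils.state import JointState
# FrameStack, latest_frame and obs_to_frame are this project's frame-stacking
# helpers; their module path is not visible in the snippet.
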
def main():
    parser = argparse.ArgumentParser(description='Parse configuration file')
    parser.add_argument('--env_config', type=str, default='configs/env.config')
    parser.add_argument('--policy_config', type=str, default='configs/policy.config')
    parser.add_argument('--policy', type=str, default='orca')
    parser.add_argument('--model_file', type=str, default=None)
    parser.add_argument('--result_dir', type=str, default='result')
    parser.add_argument('--gpu', default=False, action='store_true')
    parser.add_argument('--visualize', default=False, action='store_true')
    parser.add_argument('--phase', type=str, default='test')
    parser.add_argument('--num_frames', type=int, default=1)
    parser.add_argument('--test_case', type=int, default=None)
    parser.add_argument('--safety_space', type=float, default=0.15)
    parser.add_argument('--square', default=False, action='store_true')
    parser.add_argument('--circle', default=False, action='store_true')
    parser.add_argument('--traj', default=False, action='store_true')
    args = parser.parse_args()

    policy_config_file = args.policy_config

    if not os.path.exists(args.result_dir):
        os.makedirs(args.result_dir)

    # configure logging and device
    logging.basicConfig(level=logging.INFO, format='%(asctime)s, %(levelname)s: %(message)s',
                        datefmt="%Y-%m-%d %H:%M:%S")
    device = torch.device("cuda:0" if torch.cuda.is_available() and args.gpu else "cpu")
    logging.info(' =========== TEST %s ============ ', args.policy)
    logging.info('Using device: %s', device)

    # configure policy
    policy = policy_factory[args.policy]()
    policy_config = configparser.RawConfigParser()
    policy_config.read(policy_config_file)
    policy.configure(policy_config)
    if policy.trainable:
        if args.model_file is None:
            logging.warning('Trainable policy specified without a model weights file')
        else:
            try:
                policy.get_model().load_state_dict(
                    torch.load(args.model_file, map_location=device))
                logging.info('Loaded policy from %s', args.model_file)
            except Exception as e:
                logging.warning(e)
                # fall back to a partial, non-strict load when the checkpoint
                # does not exactly match the model
                policy.get_model().load_state_dict(
                    torch.load(args.model_file, map_location=device), strict=False)
                logging.info('Loaded policy partially from %s', args.model_file)

    # configure environment
    env_config = configparser.RawConfigParser()
    env_config.read(args.env_config)
    env = gym.make('CrowdSim-v0')
    env.configure(env_config)
    if args.square:
        env.test_sim = 'square_crossing'
    if args.circle:
        env.test_sim = 'circle_crossing'
    robot = Robot(env_config, 'robot')
    robot.set_policy(policy)
    env.set_robot(robot)

    # multi-frame env
    if args.num_frames > 1:
        logging.info("stack %d frames", args.num_frames)
        env = FrameStack(env, args.num_frames)

    gamma = policy_config.getfloat('rl', 'gamma')
    explorer = Explorer(env, robot, device, args.num_frames, gamma=gamma)

    policy.set_phase(args.phase)
    policy.set_device(device)
    # set safety space for ORCA in non-cooperative simulation
    if isinstance(robot.policy, ORCA):
        if robot.visible:
            robot.policy.safety_space = args.safety_space
        else:
            # the invisible case breaks ORCA's reciprocity assumption, so adding
            # some safety space improves its performance; tune this value as needed
            robot.policy.safety_space = args.safety_space
        logging.info('ORCA agent buffer: %f', robot.policy.safety_space)

    policy.set_env(env)
    robot.print_info()
    if args.visualize:
        ob = env.reset(args.phase, args.test_case)
        done = False
        last_pos = np.array(robot.get_position())
        while not done:
            # retrieve frame stack
            if 'RNN' not in robot.policy.name:
                ob = latest_frame(ob, args.num_frames)
            action = robot.act(ob)
            ob, _, done, info = env.step(action)
            current_pos = np.array(robot.get_position())
            logging.debug('Speed: %.2f', np.linalg.norm(current_pos - last_pos) / robot.time_step)
            last_pos = current_pos

        output_file = os.path.join(args.result_dir, 'traj_' + str(args.test_case))
        print("output_file", output_file)
        if args.traj:
            env.render('traj', output_file + '.png')
        else:
            env.render('video', output_file + '.mp4')

        logging.info('It takes %.2f seconds to finish. Final status is %s', env.get_global_time(), info)
        if robot.visible and isinstance(info, ReachGoal):
            human_times = env.get_human_times()
            logging.info('Average time for humans to reach goal: %.2f', sum(human_times) / len(human_times))
    else:
        explorer.run_k_episodes(env.case_size[args.phase], args.phase, print_failure=False)
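

# ExplorerDs appears to be a dataset-collecting variant of the Explorer used in
# main() above: in addition to the usual success/collision/timeout statistics it
# records per-step states (and, in imitation-learning mode, raw human
# observations) that can be pushed into a replay memory or returned to the
# caller. It relies on the same project helpers assumed in the imports above
# (FrameStack, latest_frame, obs_to_frame, JointState, ActionXY, ActionRot and
# the info classes).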
class ExplorerDs(object):
    def __init__(self,
                 env,
                 robot,
                 device,
                 num_frames,
                 memory=None,
                 gamma=None,
                 target_policy=None):
        self.env = env
        self.robot = robot
        self.device = device
        self.memory = memory
        self.gamma = gamma
        self.num_frames = num_frames
        self.target_policy = target_policy
        self.target_model = None

        if self.num_frames > 1:
            self.env = FrameStack(self.env, self.num_frames)

    def update_target_model(self, target_model):
        self.target_model = copy.deepcopy(target_model)

    @staticmethod
    def transform_mult(state):
        """ Transform state object to tensor input of RNN policy
        """

        robot_state = torch.Tensor([
            state.self_state.px, state.self_state.py, state.self_state.vx,
            state.self_state.vy, state.self_state.gx, state.self_state.gy
        ])

        num_human = len(state.human_states[0])
        memory_size = len(state.human_states)
        human_state = torch.empty([memory_size, num_human, 4])
        for k in range(num_human):
            for t in range(memory_size):
                human_state[t, k, 0] = state.human_states[t][k].px
                human_state[t, k, 1] = state.human_states[t][k].py
                human_state[t, k, 2] = state.human_states[t][k].vx
                human_state[t, k, 3] = state.human_states[t][k].vy

        return [robot_state, human_state]
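
    # Example usage (a sketch; `joint_state` is a JointState as built in
    # run_k_episodes below):
    #
    #     robot_state, human_state = ExplorerDs.transform_mult(joint_state)
    #     # robot_state: 6-dim tensor [px, py, vx, vy, gx, gy]
    #     # human_state: [memory_size, num_human, 4] tensor of [px, py, vx, vy]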

    # @profile
    def run_k_episodes(self,
                       k,
                       phase,
                       update_memory=False,
                       imitation_learning=False,
                       episode=None,
                       traj_head=None,
                       print_failure=False,
                       noise_explore=0.0,
                       progressbar=False,
                       output_info=False,
                       notebook=False,
                       print_info=False,
                       return_states=False,
                       suffix=None):
        self.robot.policy.set_phase(phase)
        success_times = []
        collision_times = []
        timeout_times = []
        success = 0
        collision = 0
        timeout = 0
        too_close = 0
        min_dist = []
        cumulative_rewards = []
        collision_cases = []
        timeout_cases = []

        states_s = []

        if imitation_learning:
            human_obsv = []

        episode_iter = range(k)
        if progressbar:
            if notebook:
                episode_iter = tqdm_notebook(episode_iter, leave=False)
            else:
                episode_iter = tqdm(episode_iter, leave=False)
        traj_acc_sum = 0
        for i in episode_iter:
            ob = self.env.reset(phase)
            done = False
            states = []
            actions = []
            rewards = []
            scene = []

            while not done:

                # infer action from policy
                if 'TRAJ' in self.robot.policy.name:
                    action = self.robot.act(ob)
                elif self.num_frames == 1:
                    action = self.robot.act(ob)
                else:
                    last_ob = latest_frame(ob, self.num_frames)
                    action = self.robot.act(last_ob)
                actions.append(action)

                # state append
                if imitation_learning:
                    # if 'RNN' in self.target_policy.name:
                    #     # TODO: enumerate and test different combinations of policies
                    #     joint_state = JointState(self.robot.get_full_state(), ob)
                    # elif 'SAIL' in self.target_policy.name:
                    #     joint_state = JointState(self.robot.get_full_state(), ob)
                    # else:
                    #     joint_state = self.robot.policy.last_state

                    joint_state = JointState(self.robot.get_full_state(), ob)
                    last_state = ExplorerDs.transform_mult(joint_state)
                    states.append(last_state)
                    scene.append(obs_to_frame(ob))
                else:
                    states.append(self.robot.policy.last_state)

                # env step
                if imitation_learning and noise_explore > 0.0:
                    # perturb the expert action with uniform noise in
                    # [-noise_explore, +noise_explore] on each component
                    noise_magnitude = noise_explore * 2.0
                    noise_1 = (torch.rand(1).item() - 0.5) * noise_magnitude
                    noise_2 = (torch.rand(1).item() - 0.5) * noise_magnitude
                    if self.robot.policy.kinematics == 'holonomic':
                        action = ActionXY(action[0] + noise_1, action[1] + noise_2)
                    else:
                        action = ActionRot(action[0] + noise_1, action[1] + noise_2)

                ob, reward, done, info = self.env.step(action)
                rewards.append(reward)

                if isinstance(info, Danger):
                    too_close += 1
                    min_dist.append(info.min_dist)

            if imitation_learning:
                human_obsv.append(torch.FloatTensor(scene))

            if isinstance(info, ReachGoal):
                success += 1
                success_times.append(self.env.global_time)
            elif isinstance(info, Collision):
                collision += 1
                collision_cases.append(i)
                collision_times.append(self.env.global_time)
            elif isinstance(info, Timeout):
                timeout += 1
                timeout_cases.append(i)
                timeout_times.append(self.env.time_limit)
            else:
                raise ValueError('Invalid end signal from environment')
            # evaluate the trajectory-prediction head, if one is given, by
            # comparing the predicted future human positions against the next
            # four observed steps (the 'accuracy' is a mean squared error)
            traj_accuracy = 0
            if traj_head is not None:
                for step, ([rob, hum], human_feats) in enumerate(states):
                    next_human_pos = []
                    for j in range(4):
                        n_p = states[min(step + j + 1,
                                         len(states) - 1)][0][1][-1, :, :2]
                        next_human_pos.append(n_p)
                    with torch.no_grad():
                        next_human_pos = torch.cat(next_human_pos, dim=1)
                        next_human_pos_pred = traj_head(human_feats)
                        accuracy = ((next_human_pos -
                                     next_human_pos_pred) ** 2).mean().item()
                        traj_accuracy += accuracy
            traj_acc_sum += traj_accuracy

            if update_memory:
                self.update_memory(states, actions, rewards,
                                   imitation_learning)

            states_s.append(states)

            # episode result: discounted return, with the discount exponent
            # scaled by the step duration and the robot's preferred velocity
            cumulative_rewards.append(
                sum(pow(self.gamma, t * self.robot.time_step * self.robot.v_pref) * reward
                    for t, reward in enumerate(rewards)))
            # if i % 100 == 0 and i > 0:
            #     logging.info("{:<5} Explore #{:d} / {:d} episodes".format(phase.upper(), i, k))
        traj_acc_sum = traj_acc_sum / sum(len(s) for s in states_s)
        # episodes statistics
        success_rate = success / k
        collision_rate = collision / k
        timeout_rate = timeout / k
        assert success + collision + timeout == k
        avg_nav_time = sum(success_times) / len(
            success_times) if success_times else self.env.time_limit
        if not print_info:
            extra_info = '' if episode is None else 'in episode {} '.format(
                episode)
            logging.info(
                '{:<5} {} success: {:.2f}, collision: {:.2f}, nav time: {:.2f}, reward: {:.4f} +- {:.4f}'
                .format(phase.upper(), extra_info, success_rate,
                        collision_rate, avg_nav_time,
                        statistics.mean(cumulative_rewards),
                        (statistics.stdev(cumulative_rewards)
                         if len(cumulative_rewards) > 1 else 0.0)))
        info = {
            'success': success_rate,
            'collision': collision_rate,
            'nav time': avg_nav_time,
            'reward': statistics.mean(cumulative_rewards)
        }
        if print_info:
            info = ('success: {:.4f}, collision: {:.4f}, nav time: {:.2f}, '
                    'reward: {:.4f}, timeout: {:.4f}').format(
                        success_rate, collision_rate, avg_nav_time,
                        statistics.mean(cumulative_rewards), timeout_rate)
            if traj_head is not None:
                info += ', traj accuracy: {:.4f}'.format(traj_acc_sum)
            if suffix is not None:
                info = suffix + info
            print(info)
        # if phase in ['val', 'test']:
        #     total_time = sum(success_times + collision_times + timeout_times) * self.robot.time_step
        #     logging.info('Frequency of being in danger: %.2f', too_close / total_time)

        if print_failure:
            logging.info('Collision cases: ' +
                         ' '.join([str(x) for x in collision_cases]))
            logging.info('Timeout cases: ' +
                         ' '.join([str(x) for x in timeout_cases]))

        info_dic = {
            'success': success_rate,
            'collision': collision_rate,
            'nav time': avg_nav_time,
            'reward': statistics.mean(cumulative_rewards),
            'traj accuracy': traj_acc_sum
        }

        if output_info:
            if return_states:
                return info_dic, states_s
            else:
                return info_dic

        if imitation_learning:
            return human_obsv

    def update_memory(self,
                      states,
                      actions,
                      rewards,
                      imitation_learning=False):
        if self.memory is None or self.gamma is None:
            raise ValueError('Memory or gamma value is not set!')

        for i, state in enumerate(states):
            # value target: discounted return from step i onward, with the
            # discount exponent scaled by step duration and preferred velocity
            cumulative_reward = sum([
                pow(self.gamma,
                    max(t - i, 0) * self.robot.time_step * self.robot.v_pref) *
                reward * (1 if t >= i else 0)
                for t, reward in enumerate(rewards)
            ])
            value = torch.tensor([cumulative_reward],
                                 dtype=torch.float32).to(self.device)

            action = torch.tensor([actions[i][0], actions[i][1]],
                                  dtype=torch.float32).to(self.device)

            self.memory.push((state, action, value))
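

# Minimal usage sketch (assumes env, robot, device and gamma are configured as
# in main() above, and that `memory` is a replay buffer exposing push()):
#
#     explorer = ExplorerDs(env, robot, device, num_frames=1,
#                           memory=memory, gamma=gamma)
#     stats = explorer.run_k_episodes(env.case_size['val'], 'val',
#                                     update_memory=True, output_info=True)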