def main(args):
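    # Create (and enter) a single-threaded TensorFlow session for this run.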
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    task_name = get_task_short_name(args)
    logger.configure(dir='log_trpo_cartpole/%s' % task_name)

    def policy_fn(name, ob_space, ac_space, reuse=False):
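        # Note: name/ob_space/ac_space/reuse are not used here; build_policy
        # constructs the MLP policy (with a copied value network) from `env`.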
        return build_policy(env, 'mlp', value_network='copy')

    import logging
    import os.path as osp
    import bench
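    # Wrap the env with a Monitor so per-episode stats go to monitor.json
    # under the configured logger directory.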
    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)

    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
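        # Load expert CartPole demonstrations (observations/actions) from CSV
        # and wrap them in a shuffled dataset.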
        from utils.mujoco_dset import Dset_gym
        expert_observations = np.genfromtxt(
            'expert_data/cartpole/observations.csv')
        expert_actions = np.genfromtxt('expert_data/cartpole/actions.csv',
                                       dtype=np.int32)
        expert_dataset = Dset_gym(inputs=expert_observations,
                                  labels=expert_actions,
                                  randomize=True)
        # expert_dataset = (expert_observations, expert_actions)
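        # reward_giver: adversarial discriminator reward; reward_guidance:
        # guidance network fit to the expert dataset.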
        reward_giver = Discriminator(env,
                                     args.adversary_hidden_size,
                                     entcoeff=args.adversary_entcoeff)
        reward_guidance = Guidance(env,
                                   args.policy_hidden_size,
                                   expert_dataset=expert_dataset)
        train(env, args.seed, policy_fn, reward_giver, reward_guidance,
              expert_dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter,
              args.checkpoint_dir, args.log_dir, args.pretrained,
              args.BC_max_iter, args.loss_percent, task_name)
    elif args.task == 'evaluate':
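        # Roll out the saved policy and report average episode length and return.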
        avg_len, avg_ret = runner(env,
                                  policy_fn,
                                  args.load_model_path,
                                  timesteps_per_batch=1024,
                                  number_trajs=100,
                                  stochastic_policy=args.stochastic_policy,
                                  save=args.save_sample)
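        # Write the averages to a text file named after the loaded model path.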
        result = np.array([avg_ret, avg_len])
        txt_name = args.load_model_path + 'result.txt'
        np.savetxt(txt_name, result, fmt="%d", delimiter=" ")
        print(args.load_model_path, avg_ret, avg_len)
        print('Saved successfully')
    else:
        raise NotImplementedError
    env.close()
Example 2
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    task_name = get_task_short_name(args)
    logger.configure(dir='log_trpo_mujoco/%s' % task_name)

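    # Two-hidden-layer MLP policy sized by args.policy_hidden_size.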
    def policy_fn(name, ob_space, ac_space, reuse=False):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         reuse=reuse,
                         hid_size=args.policy_hidden_size,
                         num_hid_layers=2)

    import logging
    import os.path as osp
    import bench
    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)

    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
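        # Load expert MuJoCo trajectories, optionally capped by traj_limitation.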
        expert_dataset = Mujoco_Dset(expert_path=args.expert_path,
                                     traj_limitation=args.traj_limitation)
        reward_giver = Discriminator(env,
                                     args.adversary_hidden_size,
                                     entcoeff=args.adversary_entcoeff)
        reward_guidance = Guidance(env,
                                   args.policy_hidden_size,
                                   expert_dataset=expert_dataset)
        train(env, args.seed, policy_fn, reward_giver, reward_guidance,
              expert_dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter,
              args.checkpoint_dir, args.log_dir, args.pretrained,
              args.BC_max_iter, args.loss_percent, task_name)
    elif args.task == 'evaluate':
        avg_len, avg_ret = runner(env,
                                  policy_fn,
                                  args.load_model_path,
                                  timesteps_per_batch=1024,
                                  number_trajs=100,
                                  stochastic_policy=args.stochastic_policy,
                                  save=args.save_sample)

        result = np.array([avg_ret, avg_len])
        txt_name = args.load_model_path + 'result.txt'
        np.savetxt(txt_name, result, fmt="%d", delimiter=" ")
        print(args.load_model_path, avg_ret, avg_len)
        print('Saved successfully')
    else:
        raise NotImplementedError
    env.close()