def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    task_name = get_task_short_name(args)
    logger.configure(dir='log_trpo_cartpole/%s' % task_name)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        # Note: the name/ob_space/ac_space/reuse arguments are unused here;
        # build_policy reads the observation and action spaces directly from env.
        return build_policy(env, 'mlp', value_network='copy')

    import logging
    import os.path as osp
    import bench
    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)

    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        from utils.mujoco_dset import Dset_gym
        expert_observations = np.genfromtxt(
            'expert_data/cartpole/observations.csv')
        expert_actions = np.genfromtxt('expert_data/cartpole/actions.csv',
                                       dtype=np.int32)
        expert_dataset = Dset_gym(inputs=expert_observations,
                                  labels=expert_actions,
                                  randomize=True)
        # expert_dataset = (expert_observations, expert_actions)
        reward_giver = Discriminator(env,
                                     args.adversary_hidden_size,
                                     entcoeff=args.adversary_entcoeff)
        reward_guidance = Guidance(env,
                                   args.policy_hidden_size,
                                   expert_dataset=expert_dataset)
        train(env, args.seed, policy_fn, reward_giver, reward_guidance,
              expert_dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter,
              args.checkpoint_dir, args.log_dir, args.pretrained,
              args.BC_max_iter, args.loss_percent, task_name)
    elif args.task == 'evaluate':
        avg_len, avg_ret = runner(env,
                                  policy_fn,
                                  args.load_model_path,
                                  timesteps_per_batch=1024,
                                  number_trajs=100,
                                  stochastic_policy=args.stochastic_policy,
                                  save=args.save_sample)
        result = np.array([avg_ret, avg_len])
        txt_name = args.load_model_path + 'result.txt'
        np.savetxt(txt_name, result, fmt="%d", delimiter=" ")
        print(args.load_model_path, avg_ret, avg_len)
        print('Results saved successfully')
    else:
        raise NotImplementedError
    env.close()
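
# The training branch above reads expert CartPole demonstrations from plain-text
# CSV files. Below is a minimal sketch of producing files in that layout; the
# episode count and the random action choice (standing in for a real expert
# policy) are assumptions, not part of this repo.
import gym
import numpy as np

env = gym.make('CartPole-v1')
observations, actions = [], []
for _ in range(50):                              # number of expert episodes (assumed)
    obs = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()       # stand-in for an actual expert policy
        observations.append(obs)
        actions.append(action)
        obs, _, done, _ = env.step(action)

# Flat CSV layout matching the np.genfromtxt calls in the training branch.
np.savetxt('expert_data/cartpole/observations.csv', np.array(observations))
np.savetxt('expert_data/cartpole/actions.csv',
           np.array(actions, dtype=np.int32), fmt='%d')
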
Example #2
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    task_name = get_task_short_name(args)
    logger.configure(dir='log_trpo_mujoco/%s' % task_name)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         reuse=reuse,
                         hid_size=args.policy_hidden_size,
                         num_hid_layers=2)

    import logging
    import os.path as osp
    import bench
    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)

    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        expert_dataset = Mujoco_Dset(expert_path=args.expert_path,
                                     traj_limitation=args.traj_limitation)
        reward_giver = Discriminator(env,
                                     args.adversary_hidden_size,
                                     entcoeff=args.adversary_entcoeff)
        reward_guidance = Guidance(env,
                                   args.policy_hidden_size,
                                   expert_dataset=expert_dataset)
        train(env, args.seed, policy_fn, reward_giver, reward_guidance,
              expert_dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter,
              args.checkpoint_dir, args.log_dir, args.pretrained,
              args.BC_max_iter, args.loss_percent, task_name)
    elif args.task == 'evaluate':
        avg_len, avg_ret = runner(env,
                                  policy_fn,
                                  args.load_model_path,
                                  timesteps_per_batch=1024,
                                  number_trajs=100,
                                  stochastic_policy=args.stochastic_policy,
                                  save=args.save_sample)

        result = np.array([avg_ret, avg_len])
        txt_name = args.load_model_path + 'result.txt'
        np.savetxt(txt_name, result, fmt="%d", delimiter=" ")
        print(args.load_model_path, avg_ret, avg_len)
        print('Results saved successfully')
    else:
        raise NotImplementedError
    env.close()
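
# Both main() functions above read a long list of attributes from args. A minimal
# argparse sketch exposing those attributes follows; the flag names mirror the
# attributes accessed above, while every default value is a placeholder
# assumption rather than this repo's actual setting.
import argparse

def argsparser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', default='Hopper-v2')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--task', choices=['train', 'evaluate'], default='train')
    parser.add_argument('--algo', default='trpo')
    parser.add_argument('--expert_path', default='expert_data/expert.npz')
    parser.add_argument('--traj_limitation', type=int, default=-1)
    parser.add_argument('--g_step', type=int, default=3)
    parser.add_argument('--d_step', type=int, default=1)
    parser.add_argument('--policy_hidden_size', type=int, default=100)
    parser.add_argument('--adversary_hidden_size', type=int, default=100)
    parser.add_argument('--policy_entcoeff', type=float, default=0.0)
    parser.add_argument('--adversary_entcoeff', type=float, default=1e-3)
    parser.add_argument('--num_timesteps', type=int, default=int(5e6))
    parser.add_argument('--save_per_iter', type=int, default=100)
    parser.add_argument('--checkpoint_dir', default='checkpoint')
    parser.add_argument('--log_dir', default='log')
    parser.add_argument('--pretrained', action='store_true')
    parser.add_argument('--BC_max_iter', type=int, default=10000)
    parser.add_argument('--loss_percent', type=float, default=0.0)
    parser.add_argument('--load_model_path', default='')
    parser.add_argument('--stochastic_policy', action='store_true')
    parser.add_argument('--save_sample', action='store_true')
    return parser

if __name__ == '__main__':
    main(argsparser().parse_args())
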
Example #3
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    from dp_env_v3 import DPEnv
    env = DPEnv()

    task_name = get_task_short_name(args)

    rank = MPI.COMM_WORLD.Get_rank()  # assumes mpi4py's MPI is imported at module level
    if rank == 0:
        logger.configure(dir='log_gail/%s' % task_name)
    if rank != 0:
        logger.set_level(logger.DISABLED)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         reuse=reuse,
                         hid_size=args.policy_hidden_size,
                         num_hid_layers=2)

    import logging
    import os.path as osp
    import bench
    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff,
              args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, task_name)
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
Example #4
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    # from dp_env_v2 import DPEnv
    from dp_env_v3 import DPEnv
    # from dp_env_test import DPEnv
    env = DPEnv()
    # env = gym.make('Humanoid-v2')

    task_name = get_task_short_name(args)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         reuse=reuse,
                         hid_size=args.policy_hidden_size,
                         num_hid_layers=2)

    if args.task == 'train':
        import logging
        import os.path as osp
        import bench
        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            logger.configure(dir='log_tmp/%s' % task_name)
        if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
            logger.set_level(logger.DISABLED)
        env = bench.Monitor(env, logger.get_dir() and
                            osp.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)
        task_name = get_task_short_name(args)
        args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
        args.log_dir = osp.join(args.log_dir, task_name)

        train(env,
              args.seed,
              policy_fn,
              args.g_step,
              args.policy_entcoeff,
              args.pretrained_weight_path,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              task_name)
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=100,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def load_policy(model_path,
                input_dim,
                output_dim,
                num_hidden,
                num_layers,
                init_logstd=1.,
                discrete=False,
                beta=1.0):
    observation_space = Box(low=-np.inf, high=np.inf, shape=(input_dim, ))
    if discrete:
        action_space = Discrete(n=output_dim)
    else:
        action_space = Box(low=-np.inf, high=np.inf, shape=(output_dim, ))
    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True,
                            inter_op_parallelism_threads=8,
                            intra_op_parallelism_threads=8,
                            device_count={'CPU': 8})
    config.gpu_options.allow_growth = True
    sess = U.make_session(make_default=True, config=config)
    network = mlp(num_hidden=num_hidden, num_layers=num_layers)
    # policy_train = build_policy(observation_space, action_space, network, trainable_variance=True,
    #                             state_dependent_variance=True, beta=beta, init_logstd=init_logstd)()
    # `env` is not defined in this scope; wrap the spaces built above in a minimal
    # env-like object, assuming the policy builder (like baselines' build_policy)
    # only reads observation_space and action_space from it.
    from types import SimpleNamespace
    dummy_env = SimpleNamespace(observation_space=observation_space,
                                action_space=action_space)

    ob = observation_placeholder(observation_space)
    policy_train = build_policy_trpo(
        dummy_env, network, value_network='copy')(observ_placeholder=ob)
    U.initialize()
    if model_path != '':
        policy_train.load(model_path)
    return policy_train
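
# A hedged usage sketch of load_policy: the checkpoint path, dimensions, and the
# baselines-style pi.step() call are assumptions, illustrating a 4-dimensional
# observation space with 2 discrete actions.
pi = load_policy(model_path='checkpoint/cartpole/model',   # placeholder path
                 input_dim=4,
                 output_dim=2,
                 num_hidden=64,
                 num_layers=2,
                 discrete=True)
obs = np.zeros(4, dtype=np.float32)       # dummy observation
action, value, _, _ = pi.step(obs)        # baselines policies return (a, v, state, neglogp)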