Example #1
def train(env_id, gpu, num_timesteps, seed, config):
    from ppo.ppo_rl import PPO
    set_global_seeds(seed, gpu)
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    if hasattr(config, 'wrap_env_fn'):
        env = config.wrap_env_fn(env)
        env.seed(seed)
    ppo_rl = PPO(env,
                 gpu=gpu,
                 policy=config.policy,
                 timesteps_per_batch=config.timesteps_per_batch,
                 clip_param=config.clip_param,
                 entcoeff=config.entcoeff,
                 optim_epochs=config.optim_epochs,
                 optim_stepsize=config.optim_stepsize,
                 optim_batchsize=config.optim_batchsize,
                 gamma=config.gamma,
                 lam=config.lam,
                 max_timesteps=num_timesteps,
                 schedule=config.schedule)
    ppo_rl.run()
    env.close()
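
A minimal sketch of how this entry point might be invoked. The config object below is a stand-in built with SimpleNamespace: the attribute names match what train() reads above, but the values are generic PPO placeholders rather than this project's actual settings, and the real codebase presumably ships its own config module.

from types import SimpleNamespace

# Hypothetical config, for illustration only.
config = SimpleNamespace(
    policy='mlp',
    timesteps_per_batch=2048,
    clip_param=0.2,
    entcoeff=0.0,
    optim_epochs=10,
    optim_stepsize=3e-4,
    optim_batchsize=64,
    gamma=0.99,
    lam=0.95,
    schedule='linear',
)

train(env_id='CartPole-v1', gpu=0, num_timesteps=int(1e6), seed=0, config=config)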
Example #2
def train(env, gpu, num_timesteps, seed, config, log_dir, load_path):
    from ppo.ppo_rl import PPO
    set_global_seeds(seed, gpu)
    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    if hasattr(config, 'wrap_env_fn'):
        env = config.wrap_env_fn(env)
        env.seed(seed)
    ppo_rl = PPO(env,
                 gpu=gpu,
                 policy=config.policy,
                 prob_dist=config.prob_dist,
                 num_hid_layers=config.num_hid_layers,
                 hid_size=config.hid_size,
                 timesteps_per_batch=config.timesteps_per_batch,
                 clip_param=config.clip_param,
                 beta=config.beta,
                 entcoeff=config.entcoeff,
                 optim_epochs=config.optim_epochs,
                 optim_stepsize=config.optim_stepsize,
                 optim_batchsize=config.optim_batchsize,
                 gamma=config.gamma,
                 lam=config.lam,
                 max_timesteps=num_timesteps,
                 schedule=config.schedule,
                 record_video_freq=config.record_video_freq,
                 log_dir=log_dir,
                 load_path=load_path)
    ppo_rl.run()
    env.close()
Example #3
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    task_name = get_task_short_name(args)
    logger.configure(dir='log_trpo_cartpole/%s' % task_name)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        # NOTE: the name/ob_space/ac_space arguments are ignored here; the
        # policy is built directly from `env` by build_policy.
        return build_policy(env, 'mlp', value_network='copy')

    import logging
    import os.path as osp
    import bench
    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)

    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        from utils.mujoco_dset import Dset_gym
        expert_observations = np.genfromtxt(
            'expert_data/cartpole/observations.csv')
        expert_actions = np.genfromtxt('expert_data/cartpole/actions.csv',
                                       dtype=np.int32)
        expert_dataset = Dset_gym(inputs=expert_observations,
                                  labels=expert_actions,
                                  randomize=True)
        # expert_dataset = (expert_observations, expert_actions)
        reward_giver = Discriminator(env,
                                     args.adversary_hidden_size,
                                     entcoeff=args.adversary_entcoeff)
        reward_guidance = Guidance(env,
                                   args.policy_hidden_size,
                                   expert_dataset=expert_dataset)
        train(env, args.seed, policy_fn, reward_giver, reward_guidance,
              expert_dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter,
              args.checkpoint_dir, args.log_dir, args.pretrained,
              args.BC_max_iter, args.loss_percent, task_name)
    elif args.task == 'evaluate':
        avg_len, avg_ret = runner(env,
                                  policy_fn,
                                  args.load_model_path,
                                  timesteps_per_batch=1024,
                                  number_trajs=100,
                                  stochastic_policy=args.stochastic_policy,
                                  save=args.save_sample)
        result = np.array([avg_ret, avg_len])
        txt_name = args.load_model_path + 'result.txt'
        np.savetxt(txt_name, result, fmt="%d", delimiter=" ")
        print(args.load_model_path, avg_ret, avg_len)
        print('Saved successfully')
    else:
        raise NotImplementedError
    env.close()
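
For reference, np.genfromtxt with default settings parses whitespace-delimited text, one row per line, so the two expert files read above are expected to hold one observation vector and one integer action per time step. A hedged sketch of a dump helper that would produce files in that format (this helper is illustrative and not part of the original code):

import numpy as np

def save_expert_data(observations, actions,
                     obs_path='expert_data/cartpole/observations.csv',
                     act_path='expert_data/cartpole/actions.csv'):
    # One observation vector per line, whitespace-delimited, so that
    # np.genfromtxt(obs_path) recovers a (num_steps, obs_dim) array.
    np.savetxt(obs_path, np.asarray(observations))
    # Discrete actions stored as integers, matching dtype=np.int32 on load.
    np.savetxt(act_path, np.asarray(actions, dtype=int), fmt='%d')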
Example #4
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    task_name = get_task_short_name(args)
    logger.configure(dir='log_trpo_mujoco/%s' % task_name)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         reuse=reuse,
                         hid_size=args.policy_hidden_size,
                         num_hid_layers=2)

    import logging
    import os.path as osp
    import bench
    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)

    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        expert_dataset = Mujoco_Dset(expert_path=args.expert_path,
                                     traj_limitation=args.traj_limitation)
        reward_giver = Discriminator(env,
                                     args.adversary_hidden_size,
                                     entcoeff=args.adversary_entcoeff)
        reward_guidance = Guidance(env,
                                   args.policy_hidden_size,
                                   expert_dataset=expert_dataset)
        train(env, args.seed, policy_fn, reward_giver, reward_guidance,
              expert_dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter,
              args.checkpoint_dir, args.log_dir, args.pretrained,
              args.BC_max_iter, args.loss_percent, task_name)
    elif args.task == 'evaluate':
        avg_len, avg_ret = runner(env,
                                  policy_fn,
                                  args.load_model_path,
                                  timesteps_per_batch=1024,
                                  number_trajs=100,
                                  stochastic_policy=args.stochastic_policy,
                                  save=args.save_sample)

        result = np.array([avg_ret, avg_len])
        txt_name = args.load_model_path + 'result.txt'
        np.savetxt(txt_name, result, fmt="%d", delimiter=" ")
        print(args.load_model_path, avg_ret, avg_len)
        print('Saved successfully')
    else:
        raise NotImplementedError
    env.close()
Example #5
def _thunk():
    env = gym.make(env_id)
    env.seed(seed + rank)
    env = bench.Monitor(
        env, os.path.join(log_dir, "{}.monitor.json".format(rank)))
    # Ugly hack to detect Atari: Atari environments expose an `ale`
    # attribute on the underlying unwrapped env.
    if hasattr(env.env, 'env') and hasattr(env.env.env, 'ale'):
        env = wrap_deepmind(env)
        env = WrapPyTorch(env)
    return env
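
The closure above reads env_id, seed, rank and log_dir from a surrounding scope, so in its original file it is presumably returned by a make_env-style factory and handed to a vectorized-environment wrapper. A minimal sketch of that usual pattern, assuming a baselines-style SubprocVecEnv (adjust the import to whatever vec-env implementation this project actually uses):

# Assumed import; the project may ship its own vec-env module instead.
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(env_id, seed, rank, log_dir):
    def _thunk():
        ...  # body exactly as in the example above
    return _thunk

envs = SubprocVecEnv(
    [make_env('PongNoFrameskip-v4', seed=0, rank=i, log_dir='logs')
     for i in range(8)])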
Example #6
def main():
    """
    Run the atari test
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--num-timesteps', type=int, default=int(1e7))

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)
    # NOTE: this partial is built but never used below; the DQN constructor is
    # given policy_class=CnnPolicy and dueling=True directly.
    policy = partial(CnnPolicy, dueling=args.dueling == 1)

    # model = DQN(
    #     env=env,
    #     policy=policy,
    #     learning_rate=1e-4,
    #     buffer_size=10000,
    #     exploration_fraction=0.1,
    #     exploration_final_eps=0.01,
    #     train_freq=4,
    #     learning_starts=10000,
    #     target_network_update_freq=1000,
    #     gamma=0.99,
    #     prioritized_replay=bool(args.prioritized),
    #     prioritized_replay_alpha=args.prioritized_replay_alpha,
    # )
    model = DQN(
        env=env,
        policy_class=CnnPolicy,
        learning_rate=1e-4,
        buffer_size=10000,
        double_q=False,
        prioritized_replay=True,
        prioritized_replay_alpha=0.6,
        dueling=True,
        train_freq=4,
        learning_starts=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        target_network_update_freq=1000,
        model_path='atari_Breakout_duel'
    )
    # model.learn(total_timesteps=args.num_timesteps, seed=args.seed)
    model.load('atari_Breakout_duel')
    model.evaluate(100)
    env.close()
Example #7
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    from dp_env_v3 import DPEnv
    env = DPEnv()

    task_name = get_task_short_name(args)

    # Only the rank-0 MPI worker writes logs; the others are silenced.
    from mpi4py import MPI
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure(dir='log_gail/%s' % task_name)
    else:
        logger.set_level(logger.DISABLED)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         reuse=reuse,
                         hid_size=args.policy_hidden_size,
                         num_hid_layers=2)

    import logging
    import os.path as osp
    import bench
    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff,
              args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, task_name)
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
Example #8
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    # from dp_env_v2 import DPEnv
    from dp_env_v3 import DPEnv
    # from dp_env_test import DPEnv
    env = DPEnv()
    # env = gym.make('Humanoid-v2')

    task_name = get_task_short_name(args)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         reuse=reuse, hid_size=args.policy_hidden_size,
                         num_hid_layers=2)

    if args.task == 'train':
        import logging
        import os.path as osp
        import bench
        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            logger.configure(dir='log_tmp/%s' % task_name)
        else:
            logger.set_level(logger.DISABLED)
        env = bench.Monitor(env, logger.get_dir() and
                            osp.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)
        task_name = get_task_short_name(args)
        args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
        args.log_dir = osp.join(args.log_dir, task_name)

        train(env,
              args.seed,
              policy_fn,
              args.g_step,
              args.policy_entcoeff,
              args.pretrained_weight_path,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              task_name)
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=100,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
Example #9
def create_gvgai_environment(env_id):
    from common.atari_wrappers import wrap_deepmind, make_atari, ActionDirectionEnv
    initial_direction = {'gvgai-testgame1': 3, 'gvgai-testgame2': 3}
    logger.configure()
    game_name = env_id.split('-lvl')[0]
    does_need_action_direction = False

    # Environment creation
    env = make_atari(env_id)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=False, scale=True)
    if game_name in initial_direction:
        print("We should model with action direction")
        env = ActionDirectionEnv(env, initial_direction=initial_direction[game_name])
        does_need_action_direction = True
    return env, does_need_action_direction, game_name
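
A brief caller sketch for the factory above; the level suffix in the env id is hypothetical, the point being that everything before '-lvl' is treated as the game name and looked up in initial_direction:

env, needs_direction, game_name = create_gvgai_environment('gvgai-testgame1-lvl0')
print(game_name, needs_direction)  # prints: gvgai-testgame1 True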
Example #10
def main():
    """
    Run the atari test
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--num-timesteps', type=int, default=int(1e7))

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)

    env = make_atari(args.env)
    env.action_space.seed(args.seed)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)

    model = DQN(env=env,
                policy_class=CnnPolicy,
                buffer_size=10000,
                learning_rate=1e-4,
                learning_starts=10000,
                target_network_update_freq=1000,
                train_freq=4,
                exploration_final_eps=0.01,
                exploration_fraction=0.1,
                prioritized_replay=True,
                model_path='atari_test_Breakout')
    model.learn(total_timesteps=args.num_timesteps)
    env.close()
Example #11
def main(args):
    from ppo1 import mlp_policy
    U.make_session(num_cpu=args.num_cpu).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            reuse=reuse, hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    dataset = Mujoco_Dset(expert_path=args.expert_path,
                          ret_threshold=args.ret_threshold,
                          traj_limitation=args.traj_limitation)
    pretrained_weight = None
    if (args.pretrained and args.task == 'train') or args.algo == 'bc':
        # Pretrain with behavior cloning
        from gailtf.algo import behavior_clone
        if args.algo == 'bc' and args.task == 'evaluate':
            behavior_clone.evaluate(env, policy_fn, args.load_model_path, stochastic_policy=args.stochastic_policy)
            sys.exit()
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
            max_iters=args.BC_max_iter, pretrained=args.pretrained,
            ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir, task_name=task_name)
        if args.algo == 'bc':
            sys.exit()

    from gailtf.network.adversary import TransitionClassifier
    # discriminator
    discriminator = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
    if args.algo == 'trpo':
        # Set up for MPI seed
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        from gailtf.algo import trpo_mpi
        if args.task == 'train':
            trpo_mpi.learn(env, policy_fn, discriminator, dataset,
                pretrained=args.pretrained, pretrained_weight=pretrained_weight,
                g_step=args.g_step, d_step=args.d_step,
                timesteps_per_batch=1024,
                max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                max_timesteps=args.num_timesteps,
                entcoeff=args.policy_entcoeff, gamma=0.995, lam=0.97,
                vf_iters=5, vf_stepsize=1e-3,
                ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir,
                save_per_iter=args.save_per_iter, load_model_path=args.load_model_path,
                task_name=task_name)
        elif args.task == 'evaluate':
            trpo_mpi.evaluate(env, policy_fn, args.load_model_path, timesteps_per_batch=1024,
                number_trajs=10, stochastic_policy=args.stochastic_policy)
        else: raise NotImplementedError
    elif args.algo == 'ppo':
        # Set up for MPI seed
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        from gailtf.algo import ppo_mpi
        if args.task == 'train':
            ppo_mpi.learn(env, policy_fn, discriminator, dataset,
                          # pretrained=args.pretrained, pretrained_weight=pretrained_weight,
                          timesteps_per_batch=1024,
                          g_step=args.g_step, d_step=args.d_step,
                          # max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                          clip_param=0.2, entcoeff=args.policy_entcoeff,
                          max_timesteps=args.num_timesteps,
                          gamma=0.99, lam=0.95,
                          # vf_iters=5, vf_stepsize=1e-3,
                          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                          d_stepsize=3e-4,
                          schedule='linear', ckpt_dir=args.checkpoint_dir,
                          save_per_iter=100, task=args.task,
                          sample_stochastic=args.stochastic_policy,
                          load_model_path=args.load_model_path,
                          task_name=task_name)
        elif args.task == 'evaluate':
            ppo_mpi.evaluate(env, policy_fn, args.load_model_path, timesteps_per_batch=1024,
                              number_trajs=10, stochastic_policy=args.stochastic_policy)
        else:
            raise NotImplementedError
    else: raise NotImplementedError

    env.close()
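
For completeness, a sketch of the argument parser this main() expects. The flag names mirror the args.* attributes read above; the defaults are illustrative guesses, not the original script's values.

import argparse

def argsparser():
    # Hypothetical parser: only the attributes consumed in main() are listed,
    # and every default is a placeholder.
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', default='Hopper-v2')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--num_cpu', type=int, default=1)
    parser.add_argument('--expert_path', default='expert_data/trajs.pkl')
    parser.add_argument('--ret_threshold', type=float, default=None)
    parser.add_argument('--traj_limitation', type=int, default=-1)
    parser.add_argument('--task', choices=['train', 'evaluate'], default='train')
    parser.add_argument('--algo', choices=['trpo', 'ppo', 'bc'], default='trpo')
    parser.add_argument('--pretrained', action='store_true')
    parser.add_argument('--BC_max_iter', type=int, default=10000)
    parser.add_argument('--checkpoint_dir', default='checkpoint')
    parser.add_argument('--log_dir', default='log')
    parser.add_argument('--load_model_path', default=None)
    parser.add_argument('--stochastic_policy', action='store_true')
    parser.add_argument('--adversary_hidden_size', type=int, default=100)
    parser.add_argument('--adversary_entcoeff', type=float, default=1e-3)
    parser.add_argument('--policy_entcoeff', type=float, default=0.0)
    parser.add_argument('--g_step', type=int, default=3)
    parser.add_argument('--d_step', type=int, default=1)
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--num_timesteps', type=int, default=int(5e6))
    parser.add_argument('--save_per_iter', type=int, default=100)
    return parser.parse_args()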