def train(env_id, gpu, num_timesteps, seed, config):
    from ppo.ppo_rl import PPO
    set_global_seeds(seed, gpu)
    env = gym.make(env_id)
    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    if hasattr(config, 'wrap_env_fn'):
        env = config.wrap_env_fn(env)
        env.seed(seed)
    ppo_rl = PPO(env, gpu=gpu,
                 policy=config.policy,
                 timesteps_per_batch=config.timesteps_per_batch,
                 clip_param=config.clip_param,
                 entcoeff=config.entcoeff,
                 optim_epochs=config.optim_epochs,
                 optim_stepsize=config.optim_stepsize,
                 optim_batchsize=config.optim_batchsize,
                 gamma=config.gamma,
                 lam=config.lam,
                 max_timesteps=num_timesteps,
                 schedule=config.schedule)
    ppo_rl.run()
    env.close()
def train(env, gpu, num_timesteps, seed, config, log_dir, load_path):
    from ppo.ppo_rl import PPO
    set_global_seeds(seed, gpu)
    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"),
        allow_early_resets=True)
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    if hasattr(config, 'wrap_env_fn'):
        env = config.wrap_env_fn(env)
        env.seed(seed)
    ppo_rl = PPO(env, gpu=gpu,
                 policy=config.policy,
                 prob_dist=config.prob_dist,
                 num_hid_layers=config.num_hid_layers,
                 hid_size=config.hid_size,
                 timesteps_per_batch=config.timesteps_per_batch,
                 clip_param=config.clip_param,
                 beta=config.beta,
                 entcoeff=config.entcoeff,
                 optim_epochs=config.optim_epochs,
                 optim_stepsize=config.optim_stepsize,
                 optim_batchsize=config.optim_batchsize,
                 gamma=config.gamma,
                 lam=config.lam,
                 max_timesteps=num_timesteps,
                 schedule=config.schedule,
                 record_video_freq=config.record_video_freq,
                 log_dir=log_dir,
                 load_path=load_path)
    ppo_rl.run()
    env.close()
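# A minimal usage sketch for the `train` entry point above. The config object,
# every attribute value in it, and the 'CartPole-v1' environment ID are
# illustrative assumptions; any object exposing the attributes read above
# would work the same way.
import gym
from types import SimpleNamespace

config = SimpleNamespace(
    policy='mlp', prob_dist='gaussian', num_hid_layers=2, hid_size=64,
    timesteps_per_batch=2048, clip_param=0.2, beta=1.0, entcoeff=0.0,
    optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
    gamma=0.99, lam=0.95, schedule='linear', record_video_freq=0)
env = gym.make('CartPole-v1')
train(env, gpu=0, num_timesteps=int(1e6), seed=0, config=config,
      log_dir='log_ppo/cartpole', load_path=None)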
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    task_name = get_task_short_name(args)
    logger.configure(dir='log_trpo_cartpole/%s' % task_name)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return build_policy(env, 'mlp', value_network='copy')

    import logging
    import os.path as osp
    import bench
    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    if args.task == 'train':
        from utils.mujoco_dset import Dset_gym
        expert_observations = np.genfromtxt('expert_data/cartpole/observations.csv')
        expert_actions = np.genfromtxt('expert_data/cartpole/actions.csv', dtype=np.int32)
        expert_dataset = Dset_gym(inputs=expert_observations,
                                  labels=expert_actions, randomize=True)
        # expert_dataset = (expert_observations, expert_actions)
        reward_giver = Discriminator(env, args.adversary_hidden_size,
                                     entcoeff=args.adversary_entcoeff)
        reward_guidance = Guidance(env, args.policy_hidden_size,
                                   expert_dataset=expert_dataset)
        train(env, args.seed, policy_fn, reward_giver, reward_guidance,
              expert_dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter,
              args.checkpoint_dir, args.log_dir, args.pretrained,
              args.BC_max_iter, args.loss_percent, task_name)
    elif args.task == 'evaluate':
        avg_len, avg_ret = runner(env, policy_fn, args.load_model_path,
                                  timesteps_per_batch=1024, number_trajs=100,
                                  stochastic_policy=args.stochastic_policy,
                                  save=args.save_sample)
        result = np.array([avg_ret, avg_len])
        txt_name = args.load_model_path + 'result.txt'
        np.savetxt(txt_name, result, fmt="%d", delimiter=" ")
        print(args.load_model_path, avg_ret, avg_len)
        print('Saved successfully')
    else:
        raise NotImplementedError
    env.close()
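# A sketch of how the expert CSVs consumed above could be produced. The
# trajectory content here is random placeholder data; the point is the format:
# np.genfromtxt with default settings reads back exactly what np.savetxt
# writes (one row per step, whitespace-delimited).
import numpy as np

observations = np.random.randn(1000, 4)        # CartPole observations are 4-dimensional
actions = np.random.randint(0, 2, size=1000)   # CartPole has 2 discrete actions
np.savetxt('expert_data/cartpole/observations.csv', observations)
np.savetxt('expert_data/cartpole/actions.csv', actions, fmt='%d')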
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    task_name = get_task_short_name(args)
    logger.configure(dir='log_trpo_mujoco/%s' % task_name)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         reuse=reuse, hid_size=args.policy_hidden_size,
                         num_hid_layers=2)

    import logging
    import os.path as osp
    import bench
    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    if args.task == 'train':
        expert_dataset = Mujoco_Dset(expert_path=args.expert_path,
                                     traj_limitation=args.traj_limitation)
        reward_giver = Discriminator(env, args.adversary_hidden_size,
                                     entcoeff=args.adversary_entcoeff)
        reward_guidance = Guidance(env, args.policy_hidden_size,
                                   expert_dataset=expert_dataset)
        train(env, args.seed, policy_fn, reward_giver, reward_guidance,
              expert_dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter,
              args.checkpoint_dir, args.log_dir, args.pretrained,
              args.BC_max_iter, args.loss_percent, task_name)
    elif args.task == 'evaluate':
        avg_len, avg_ret = runner(env, policy_fn, args.load_model_path,
                                  timesteps_per_batch=1024, number_trajs=100,
                                  stochastic_policy=args.stochastic_policy,
                                  save=args.save_sample)
        result = np.array([avg_ret, avg_len])
        txt_name = args.load_model_path + 'result.txt'
        np.savetxt(txt_name, result, fmt="%d", delimiter=" ")
        print(args.load_model_path, avg_ret, avg_len)
        print('Saved successfully')
    else:
        raise NotImplementedError
    env.close()
def _thunk():
    env = gym.make(env_id)
    env.seed(seed + rank)
    env = bench.Monitor(
        env, os.path.join(log_dir, "{}.monitor.json".format(rank)))
    # Ugly hack to detect atari.
    if hasattr(env.env, 'env') and hasattr(env.env.env, 'ale'):
        env = wrap_deepmind(env)
        env = WrapPyTorch(env)
    return env
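# A sketch of how thunks like `_thunk` are typically consumed: a factory closes
# over the per-worker arguments (env_id, seed, rank, log_dir) and the resulting
# callables are handed to a baselines-style SubprocVecEnv, which calls each one
# in its own subprocess. The import path and worker count are assumptions.
import os
import gym
from baselines import bench
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_env(env_id, seed, rank, log_dir):
    def _thunk():
        env = gym.make(env_id)
        env.seed(seed + rank)  # distinct seed per worker
        env = bench.Monitor(
            env, os.path.join(log_dir, "{}.monitor.json".format(rank)))
        return env
    return _thunk

envs = SubprocVecEnv([make_env('PongNoFrameskip-v4', seed=0, rank=i, log_dir='/tmp/gym')
                      for i in range(8)])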
def main(): """ Run the atari test """ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--prioritized', type=int, default=1) parser.add_argument('--dueling', type=int, default=1) parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6) parser.add_argument('--num-timesteps', type=int, default=int(1e7)) args = parser.parse_args() logger.configure() set_global_seeds(args.seed) env = make_atari(args.env) env = bench.Monitor(env, logger.get_dir()) env = wrap_atari_dqn(env) policy = partial(CnnPolicy, dueling=args.dueling == 1) # model = DQN( # env=env, # policy=policy, # learning_rate=1e-4, # buffer_size=10000, # exploration_fraction=0.1, # exploration_final_eps=0.01, # train_freq=4, # learning_starts=10000, # target_network_update_freq=1000, # gamma=0.99, # prioritized_replay=bool(args.prioritized), # prioritized_replay_alpha=args.prioritized_replay_alpha, # ) model = DQN( env=env, policy_class=CnnPolicy, learning_rate=1e-4, buffer_size=10000, double_q=False, prioritized_replay=True, prioritized_replay_alpha=0.6, dueling=True, train_freq=4, learning_starts=10000, exploration_fraction=0.1, exploration_final_eps=0.01, target_network_update_freq=1000, model_path='atari_Breakout_duel' ) # model.learn(total_timesteps=args.num_timesteps, seed=args.seed) model.load('atari_Breakout_duel') model.evaluate(100) env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    from dp_env_v3 import DPEnv
    env = DPEnv()  # instantiate; the original assigned the class itself
    task_name = get_task_short_name(args)
    # `rank` was used but never defined in the original; derive it from MPI
    # as the sibling scripts do.
    from mpi4py import MPI
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure(dir='log_gail/%s' % task_name)
    if rank != 0:
        logger.set_level(logger.DISABLED)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         reuse=reuse, hid_size=args.policy_hidden_size,
                         num_hid_layers=2)

    import logging
    import os.path as osp
    import bench
    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff,
              args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path, timesteps_per_batch=1024,
               number_trajs=10, stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    # from dp_env_v2 import DPEnv
    from dp_env_v3 import DPEnv
    # from dp_env_test import DPEnv
    env = DPEnv()
    # env = gym.make('Humanoid-v2')
    task_name = get_task_short_name(args)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         reuse=reuse, hid_size=args.policy_hidden_size,
                         num_hid_layers=2)

    if args.task == 'train':
        import logging
        import os.path as osp
        import bench
        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            logger.configure(dir='log_tmp/%s' % task_name)
        # Guard the rank check as well: the original crashed here when MPI was None.
        if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
            logger.set_level(logger.DISABLED)
        env = bench.Monitor(
            env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)
        task_name = get_task_short_name(args)
        args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
        args.log_dir = osp.join(args.log_dir, task_name)
        train(env, args.seed, policy_fn, args.g_step, args.policy_entcoeff,
              args.pretrained_weight_path, args.num_timesteps,
              args.save_per_iter, args.checkpoint_dir, args.log_dir, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path, timesteps_per_batch=1024,
               number_trajs=100, stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def create_gvgai_environment(env_id):
    from common.atari_wrappers import wrap_deepmind, make_atari, ActionDirectionEnv
    initial_direction = {'gvgai-testgame1': 3, 'gvgai-testgame2': 3}
    logger.configure()
    game_name = env_id.split('-lvl')[0]
    does_need_action_direction = False
    # Environment creation
    env = make_atari(env_id)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_deepmind(env, episode_life=False, clip_rewards=False,
                        frame_stack=False, scale=True)
    if game_name in initial_direction:
        print("We should model with action direction")
        env = ActionDirectionEnv(env, initial_direction=initial_direction[game_name])
        does_need_action_direction = True
    return env, does_need_action_direction, game_name
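# Example of consuming the helper above: unpack the returned tuple and branch
# on the action-direction flag. The env_id is an assumption chosen so that the
# '-lvl' split yields a key present in the initial_direction table.
env, needs_direction, game_name = create_gvgai_environment('gvgai-testgame1-lvl0')
print(game_name, env.observation_space, env.action_space)
if needs_direction:
    # The ActionDirectionEnv wrapper tracks the agent's facing direction
    print("This game is modeled with action direction")
env.close()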
def main(): """ Run the atari test """ parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--prioritized', type=int, default=1) parser.add_argument('--dueling', type=int, default=1) parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6) parser.add_argument('--num-timesteps', type=int, default=int(1e7)) args = parser.parse_args() logger.configure() set_global_seeds(args.seed) env = make_atari(args.env) env.action_space.seed(args.seed) env = bench.Monitor(env, logger.get_dir()) env = wrap_atari_dqn(env) model = DQN(env=env, policy_class=CnnPolicy, buffer_size=10000, learning_rate=1e-4, learning_starts=10000, target_network_update_freq=1000, train_freq=4, exploration_final_eps=0.01, exploration_fraction=0.1, prioritized_replay=True, model_path='atari_test_Breakout') model.learn(total_timesteps=args.num_timesteps) env.close()
def main(args):
    from ppo1 import mlp_policy
    U.make_session(num_cpu=args.num_cpu).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space,
                                    ac_space=ac_space, reuse=reuse,
                                    hid_size=64, num_hid_layers=2)

    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    dataset = Mujoco_Dset(expert_path=args.expert_path,
                          ret_threshold=args.ret_threshold,
                          traj_limitation=args.traj_limitation)
    pretrained_weight = None
    if (args.pretrained and args.task == 'train') or args.algo == 'bc':
        # Pretrain with behavior cloning
        from gailtf.algo import behavior_clone
        if args.algo == 'bc' and args.task == 'evaluate':
            behavior_clone.evaluate(env, policy_fn, args.load_model_path,
                                    stochastic_policy=args.stochastic_policy)
            sys.exit()
        pretrained_weight = behavior_clone.learn(env, policy_fn, dataset,
                                                 max_iters=args.BC_max_iter,
                                                 pretrained=args.pretrained,
                                                 ckpt_dir=args.checkpoint_dir,
                                                 log_dir=args.log_dir,
                                                 task_name=task_name)
        if args.algo == 'bc':
            sys.exit()

    from gailtf.network.adversary import TransitionClassifier
    # discriminator
    discriminator = TransitionClassifier(env, args.adversary_hidden_size,
                                         entcoeff=args.adversary_entcoeff)
    if args.algo == 'trpo':
        # Set up for MPI seed
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        from gailtf.algo import trpo_mpi
        if args.task == 'train':
            trpo_mpi.learn(env, policy_fn, discriminator, dataset,
                           pretrained=args.pretrained,
                           pretrained_weight=pretrained_weight,
                           g_step=args.g_step, d_step=args.d_step,
                           timesteps_per_batch=1024,
                           max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                           max_timesteps=args.num_timesteps,
                           entcoeff=args.policy_entcoeff,
                           gamma=0.995, lam=0.97,
                           vf_iters=5, vf_stepsize=1e-3,
                           ckpt_dir=args.checkpoint_dir, log_dir=args.log_dir,
                           save_per_iter=args.save_per_iter,
                           load_model_path=args.load_model_path,
                           task_name=task_name)
        elif args.task == 'evaluate':
            trpo_mpi.evaluate(env, policy_fn, args.load_model_path,
                              timesteps_per_batch=1024, number_trajs=10,
                              stochastic_policy=args.stochastic_policy)
        else:
            raise NotImplementedError
    elif args.algo == 'ppo':
        # Set up for MPI seed
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)
        workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(workerseed)
        env.seed(workerseed)
        from gailtf.algo import ppo_mpi
        if args.task == 'train':
            ppo_mpi.learn(env, policy_fn, discriminator, dataset,
                          # pretrained=args.pretrained,
                          pretrained_weight=pretrained_weight,
                          timesteps_per_batch=1024,
                          g_step=args.g_step, d_step=args.d_step,
                          # max_kl=args.max_kl, cg_iters=10, cg_damping=0.1,
                          clip_param=0.2, entcoeff=args.policy_entcoeff,
                          max_timesteps=args.num_timesteps,
                          gamma=0.99, lam=0.95,
                          # vf_iters=5, vf_stepsize=1e-3,
                          optim_epochs=10, optim_stepsize=3e-4,
                          optim_batchsize=64, d_stepsize=3e-4,
                          schedule='linear',
                          ckpt_dir=args.checkpoint_dir, save_per_iter=100,
                          task=args.task,
                          sample_stochastic=args.stochastic_policy,
                          load_model_path=args.load_model_path,
                          task_name=task_name)
        elif args.task == 'evaluate':
            ppo_mpi.evaluate(env, policy_fn, args.load_model_path,
                             timesteps_per_batch=1024, number_trajs=10,
                             stochastic_policy=args.stochastic_policy)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    env.close()
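# A hedged sketch of the argument parser implied by the attribute accesses in
# main(args) above. Flag names mirror the attributes read by the function;
# every default below is a placeholder assumption, not a value taken from the
# original project.
import argparse

def argsparser():
    parser = argparse.ArgumentParser("GAIL with TRPO/PPO/BC")
    parser.add_argument('--env_id', default='Hopper-v2')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--num_cpu', type=int, default=1)
    parser.add_argument('--task', choices=['train', 'evaluate'], default='train')
    parser.add_argument('--algo', choices=['trpo', 'ppo', 'bc'], default='trpo')
    parser.add_argument('--expert_path', default='expert_data/trajs.pkl')
    parser.add_argument('--ret_threshold', type=float, default=None)
    parser.add_argument('--traj_limitation', type=int, default=-1)
    parser.add_argument('--pretrained', action='store_true')
    parser.add_argument('--BC_max_iter', type=int, default=int(1e4))
    parser.add_argument('--g_step', type=int, default=3)
    parser.add_argument('--d_step', type=int, default=1)
    parser.add_argument('--max_kl', type=float, default=0.01)
    parser.add_argument('--policy_entcoeff', type=float, default=0.0)
    parser.add_argument('--adversary_hidden_size', type=int, default=100)
    parser.add_argument('--adversary_entcoeff', type=float, default=1e-3)
    parser.add_argument('--num_timesteps', type=int, default=int(5e6))
    parser.add_argument('--save_per_iter', type=int, default=100)
    parser.add_argument('--checkpoint_dir', default='checkpoint')
    parser.add_argument('--log_dir', default='log')
    parser.add_argument('--load_model_path', default=None)
    parser.add_argument('--stochastic_policy', action='store_true')
    parser.add_argument('--save_sample', action='store_true')
    return parser.parse_args()

if __name__ == '__main__':
    main(argsparser())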