def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    task_name = get_task_short_name(args)
    logger.configure(dir='log_trpo_cartpole/%s' % task_name)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        # baselines-style policy builder; name/ob_space/ac_space are unused here
        return build_policy(env, 'mlp', value_network='copy')

    import logging
    import os.path as osp
    import bench
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        from utils.mujoco_dset import Dset_gym
        expert_observations = np.genfromtxt('expert_data/cartpole/observations.csv')
        expert_actions = np.genfromtxt('expert_data/cartpole/actions.csv',
                                       dtype=np.int32)
        expert_dataset = Dset_gym(inputs=expert_observations,
                                  labels=expert_actions, randomize=True)
        # expert_dataset = (expert_observations, expert_actions)
        reward_giver = Discriminator(env, args.adversary_hidden_size,
                                     entcoeff=args.adversary_entcoeff)
        reward_guidance = Guidance(env, args.policy_hidden_size,
                                   expert_dataset=expert_dataset)
        train(env, args.seed, policy_fn, reward_giver, reward_guidance,
              expert_dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter,
              args.checkpoint_dir, args.log_dir, args.pretrained,
              args.BC_max_iter, args.loss_percent, task_name)
    elif args.task == 'evaluate':
        avg_len, avg_ret = runner(env, policy_fn, args.load_model_path,
                                  timesteps_per_batch=1024, number_trajs=100,
                                  stochastic_policy=args.stochastic_policy,
                                  save=args.save_sample)
        result = np.array([avg_ret, avg_len])
        txt_name = args.load_model_path + 'result.txt'
        np.savetxt(txt_name, result, fmt="%d", delimiter=" ")
        print(args.load_model_path, avg_ret, avg_len)
        print('Results saved successfully')
    else:
        raise NotImplementedError
    env.close()
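# A minimal sketch of producing the CSV files the cartpole branch above reads.
# It assumes a pre-trained `expert` object exposing an `act(obs)` method
# (hypothetical; not part of the original code). np.savetxt's default
# space-delimited output matches the np.genfromtxt calls in main().
import numpy as np
import gym

def save_expert_data(expert, n_steps=10000):
    env = gym.make('CartPole-v1')
    observations, actions = [], []
    obs = env.reset()
    for _ in range(n_steps):
        act = expert.act(obs)
        observations.append(obs)
        actions.append(act)
        obs, _, done, _ = env.step(act)
        if done:
            obs = env.reset()
    # one row per step, matching the np.genfromtxt calls in main()
    np.savetxt('expert_data/cartpole/observations.csv', np.array(observations))
    np.savetxt('expert_data/cartpole/actions.csv',
               np.array(actions, dtype=np.int32), fmt='%d')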
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    task_name = get_task_short_name(args)
    logger.configure(dir='log_trpo_mujoco/%s' % task_name)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         reuse=reuse, hid_size=args.policy_hidden_size,
                         num_hid_layers=2)

    import logging
    import os.path as osp
    import bench
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        expert_dataset = Mujoco_Dset(expert_path=args.expert_path,
                                     traj_limitation=args.traj_limitation)
        reward_giver = Discriminator(env, args.adversary_hidden_size,
                                     entcoeff=args.adversary_entcoeff)
        reward_guidance = Guidance(env, args.policy_hidden_size,
                                   expert_dataset=expert_dataset)
        train(env, args.seed, policy_fn, reward_giver, reward_guidance,
              expert_dataset, args.algo, args.g_step, args.d_step,
              args.policy_entcoeff, args.num_timesteps, args.save_per_iter,
              args.checkpoint_dir, args.log_dir, args.pretrained,
              args.BC_max_iter, args.loss_percent, task_name)
    elif args.task == 'evaluate':
        avg_len, avg_ret = runner(env, policy_fn, args.load_model_path,
                                  timesteps_per_batch=1024, number_trajs=100,
                                  stochastic_policy=args.stochastic_policy,
                                  save=args.save_sample)
        result = np.array([avg_ret, avg_len])
        txt_name = args.load_model_path + 'result.txt'
        np.savetxt(txt_name, result, fmt="%d", delimiter=" ")
        print(args.load_model_path, avg_ret, avg_len)
        print('Results saved successfully')
    else:
        raise NotImplementedError
    env.close()
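# A minimal command-line driver sketch for the MuJoCo entry point above.
# The flag names mirror the `args` attributes main() reads; every default
# value below is an assumption, not taken from the original code.
def argsparser():
    import argparse
    parser = argparse.ArgumentParser('Train or evaluate a TRPO/GAIL policy')
    parser.add_argument('--env_id', default='Hopper-v2')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--task', choices=['train', 'evaluate'], default='train')
    parser.add_argument('--algo', default='trpo')
    parser.add_argument('--expert_path', default='expert_data/hopper.npz')
    parser.add_argument('--traj_limitation', type=int, default=-1)
    parser.add_argument('--checkpoint_dir', default='checkpoint')
    parser.add_argument('--log_dir', default='log')
    parser.add_argument('--load_model_path', default='')
    parser.add_argument('--policy_hidden_size', type=int, default=100)
    parser.add_argument('--adversary_hidden_size', type=int, default=100)
    parser.add_argument('--policy_entcoeff', type=float, default=0.0)
    parser.add_argument('--adversary_entcoeff', type=float, default=1e-3)
    parser.add_argument('--g_step', type=int, default=3)
    parser.add_argument('--d_step', type=int, default=1)
    parser.add_argument('--num_timesteps', type=int, default=int(5e6))
    parser.add_argument('--save_per_iter', type=int, default=100)
    parser.add_argument('--pretrained', action='store_true')
    parser.add_argument('--BC_max_iter', type=int, default=int(1e4))
    parser.add_argument('--loss_percent', type=float, default=0.0)
    parser.add_argument('--stochastic_policy', action='store_true')
    parser.add_argument('--save_sample', action='store_true')
    return parser.parse_args()

if __name__ == '__main__':
    main(argsparser())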
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    from dp_env_v3 import DPEnv
    env = DPEnv()  # instantiate the environment (the original assigned the class itself)
    task_name = get_task_short_name(args)
    rank = MPI.COMM_WORLD.Get_rank()  # the original used `rank` without defining it
    if rank == 0:
        logger.configure(dir='log_gail/%s' % task_name)
    if rank != 0:
        logger.set_level(logger.DISABLED)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         reuse=reuse, hid_size=args.policy_hidden_size,
                         num_hid_layers=2)

    import logging
    import os.path as osp
    import bench
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)  # full task name used for checkpoint/log paths
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff,
              args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path,
               timesteps_per_batch=1024, number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    # from dp_env_v2 import DPEnv
    from dp_env_v3 import DPEnv
    # from dp_env_test import DPEnv
    env = DPEnv()
    # env = gym.make('Humanoid-v2')
    task_name = get_task_short_name(args)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         reuse=reuse, hid_size=args.policy_hidden_size,
                         num_hid_layers=2)

    if args.task == 'train':
        import logging
        import os.path as osp
        import bench
        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            logger.configure(dir='log_tmp/%s' % task_name)
        if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
            # guard added: the original dereferenced MPI even when it was None
            logger.set_level(logger.DISABLED)
        env = bench.Monitor(env, logger.get_dir() and
                            osp.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)
        task_name = get_task_short_name(args)
        args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
        args.log_dir = osp.join(args.log_dir, task_name)
        train(env, args.seed, policy_fn, args.g_step, args.policy_entcoeff,
              args.pretrained_weight_path, args.num_timesteps,
              args.save_per_iter, args.checkpoint_dir, args.log_dir, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path,
               timesteps_per_batch=1024, number_trajs=100,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def load_policy(model_path, input_dim, output_dim, num_hidden, num_layers,
                init_logstd=1., discrete=False, beta=1.0):
    observation_space = Box(low=-np.inf, high=np.inf, shape=(input_dim,))
    if discrete:
        action_space = Discrete(n=output_dim)
    else:
        action_space = Box(low=-np.inf, high=np.inf, shape=(output_dim,))

    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True,
                            inter_op_parallelism_threads=8,
                            intra_op_parallelism_threads=8,
                            device_count={'CPU': 8})
    config.gpu_options.allow_growth = True
    sess = U.make_session(make_default=True, config=config)

    network = mlp(num_hidden=num_hidden, num_layers=num_layers)
    # policy_train = build_policy(observation_space, action_space, network,
    #                             trainable_variance=True,
    #                             state_dependent_variance=True,
    #                             beta=beta, init_logstd=init_logstd)()

    # The original referenced an undefined `env`; a minimal stand-in exposing
    # the two spaces is assumed sufficient for build_policy_trpo here.
    from types import SimpleNamespace
    env = SimpleNamespace(observation_space=observation_space,
                          action_space=action_space)
    ob_space = env.observation_space
    ac_space = env.action_space
    ob = observation_placeholder(ob_space)
    policy_train = build_policy_trpo(env, network,
                                     value_network='copy')(observ_placeholder=ob)
    U.initialize()
    if model_path != '':
        policy_train.load(model_path)
    return policy_train
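# A usage sketch for load_policy. The checkpoint path, dimensions, and network
# sizes below are placeholders, and the call assumes the returned policy follows
# the baselines `step` interface (action, value, state, neglogp); both are
# assumptions, not taken from the original code.
import numpy as np

if __name__ == '__main__':
    pi = load_policy(model_path='checkpoints/policy_final',  # hypothetical path
                     input_dim=17, output_dim=6,
                     num_hidden=64, num_layers=2)
    obs = np.zeros((1, 17), dtype=np.float32)  # batch of one observation
    action, value, _, neglogp = pi.step(obs)
    print(action, value, neglogp)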