import os.path as osp

from mpi4py import MPI
from baselines import logger
from baselines.common import set_global_seeds

from env.env_util import make_env


def train(args):
    from algo import pposgd_simple, pposgd_origin
    from nn import cnn_policy, cnn_lstm_policy, mlp_policy
    import baselines.common.tf_util as U

    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()

    task_name = "ppo." + args.taskname + "." + args.env_id.split("-")[0] + \
        ".seed_" + ("%d" % args.seed)
    args.log_dir = osp.join(args.log_dir, task_name)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)

    # Only the rank-0 MPI worker writes full logs; the others stay silent.
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    # Derive a distinct seed per MPI worker from the base seed.
    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank() \
        if args.seed is not None else None
    set_global_seeds(workerseed)

    env = make_env(args.env_id, seed=args.seed, frame_stack=False,
                   save_camera=True, save_path="../saved_camera",
                   no_cnn=False)()

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name, ob_space, ac_space,
                                    hid_size=64, num_hid_layers=1)
        # return cnn_lstm_policy.CnnSenLSTMPolicy(name, ob_space, ac_space, hid_size=64, num_hid_layers=1)
        # return mlp_policy.MlpPolicy(name, ob_space, ac_space, hid_size=64, num_hid_layers=1)

    env.seed(workerseed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=int(args.num_timesteps * 1.1),
                        timesteps_per_actorbatch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3,
                        optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        schedule='linear',
                        save_per_iter=100,
                        ckpt_dir=args.checkpoint_dir,
                        log_dir=args.log_dir,
                        task_name=task_name,
                        task=args.task,
                        load_model_path=args.load_model_path,
                        sample_stochastic=args.sample_stochastic)
    env.close()
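# A minimal sketch of a command-line entry point for train(), assuming the
# argparse flags implied by the attributes read above (taskname, env_id, seed,
# log_dir, checkpoint_dir, num_timesteps, task, load_model_path,
# sample_stochastic). The defaults are illustrative, not the original values.
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--env_id", type=str, default="PipelineTrack-v1")
    parser.add_argument("--taskname", type=str, default="test")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--log_dir", type=str, default="log")
    parser.add_argument("--checkpoint_dir", type=str, default="checkpoint")
    parser.add_argument("--num_timesteps", type=int, default=int(1e6))
    parser.add_argument("--task", type=str, default="train")
    parser.add_argument("--load_model_path", type=str, default=None)
    parser.add_argument("--sample_stochastic", action="store_true")
    train(parser.parse_args())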
# Fragment of a second train() variant that drives a hand-rolled PPO class
# instead of pposgd_simple.learn; it continues inside train(args).
from collections import deque

ckpt_dir = osp.join(args.checkpoint_dir, task_name)
if rank == 0:
    logger.configure()
else:
    logger.configure(format_strs=[])
workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank() \
    if args.seed is not None else None
set_global_seeds(workerseed)

# Dense (MLP) policies bypass the CNN observation encoder.
args.no_cnn = args.policy_type == 'dense'
env = make_env(args.env_id, seed=args.seed, frame_stack=False,
               save_camera=False, remove_dyn=False,
               no_cnn=args.no_cnn)()

policy = PPO(env.observation_space, env.action_space, args.policy_type, args)
seg_gen = traj_segment_generator(policy.pi, env,
                                 args.timesteps_per_actorbatch,
                                 stochastic=True)

episodes_so_far = 0
timesteps_so_far = 0
iters_so_far = 0
lenbuffer = deque(maxlen=100)   # rolling buffer for episode lengths
rewbuffer = deque(maxlen=100)   # rolling buffer for episode rewards
distbuffer = deque(maxlen=100)  # rolling buffer for per-episode distances
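# A minimal sketch of the iteration loop that typically consumes seg_gen and
# these rolling buffers in baselines-style PPO. policy.update() and the
# seg["ep_rets"]/seg["ep_lens"] keys follow the usual traj_segment_generator
# convention and are assumptions here, not confirmed by this fragment.
import numpy as np

while timesteps_so_far < args.num_timesteps:
    seg = seg_gen.__next__()          # roll out one batch of experience
    policy.update(seg)                # hypothetical PPO update on the batch
    lenbuffer.extend(seg["ep_lens"])  # track recent episode lengths
    rewbuffer.extend(seg["ep_rets"])  # track recent episode returns
    episodes_so_far += len(seg["ep_lens"])
    timesteps_so_far += args.timesteps_per_actorbatch
    iters_so_far += 1
    if rank == 0:
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.dump_tabular()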
# Tail of the CnnPolicy module; tf, U (baselines.common.tf_util), and the
# CnnPolicy class are defined earlier in the file. The act() method is shown
# with its class indentation.

    # Run the policy network on one observation and strip the batch dimension
    # from the sampled action and the value estimate.
    def act(self, stochastic, ob):
        ac1, vpred1 = self._act(stochastic, ob)
        return ac1[0], vpred1[0]


def max_pool(img, k):
    # k x k max pooling with stride k; "SAME" padding rounds odd sizes up.
    return tf.nn.max_pool(img, ksize=[1, k, k, 1],
                          strides=[1, k, k, 1], padding="SAME")


# Smoke test: build the policy on a live environment and sample one action.
if __name__ == "__main__":
    from env.env_util import make_env
    import rospy

    rospy.init_node("sample")
    env = make_env("PipelineTrack-v1")()
    pol = CnnPolicy("pi", env.observation_space, env.action_space,
                    hid_size=256, num_hid_layers=1)
    ob = env.reset()
    sess = U.single_threaded_session()
    sess.__enter__()
    U.initialize()
    a, v = pol.act(True, ob)
    print(a)
    print(v)
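    # Quick shape check of max_pool() (illustrative, assuming the usual NHWC
    # layout): with k=2 and "SAME" padding the spatial dimensions are halved,
    # rounding up, while batch and channel dimensions are untouched.
    img = tf.placeholder(tf.float32, [None, 84, 84, 3])
    print(max_pool(img, 2).shape)  # -> (?, 42, 42, 3)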