Code example #1
# Module-level imports assumed by this excerpt (only the function body is shown);
# make_env comes from the project's env.env_util, as in code example #3.
import os.path as osp
from mpi4py import MPI
from baselines import logger
from baselines.common import set_global_seeds
from env.env_util import make_env


def train(args):
    from algo import pposgd_simple, pposgd_origin
    from nn import cnn_policy, cnn_lstm_policy, mlp_policy
    import baselines.common.tf_util as U

    # One process per MPI worker; each worker runs its own single-threaded TF session.
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()

    task_name = "ppo." + args.taskname + "." + args.env_id.split(
        "-")[0] + ".seed_" + ("%d" % args.seed)
    args.log_dir = osp.join(args.log_dir, task_name)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank(
    ) if args.seed is not None else None
    set_global_seeds(workerseed)
    # make_env returns a thunk, hence the trailing call to actually build the env.
    env = make_env(args.env_id,
                   seed=args.seed,
                   frame_stack=False,
                   save_camera=True,
                   save_path="../saved_camera",
                   no_cnn=False)()

    def policy_fn(name, ob_space, ac_space):  #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name,
                                    ob_space,
                                    ac_space,
                                    hid_size=64,
                                    num_hid_layers=1)
        #return cnn_lstm_policy.CnnSenLSTMPolicy(name, ob_space, ac_space, hid_size=64, num_hid_layers = 1)
        #return mlp_policy.MlpPolicy(name, ob_space, ac_space, hid_size=64, num_hid_layers = 1)

    env.seed(workerseed)

    # Clipped-surrogate PPO; schedule='linear' anneals the optimization over
    # max_timesteps, and a checkpoint is saved every save_per_iter iterations.
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=int(args.num_timesteps * 1.1),
                        timesteps_per_actorbatch=256,
                        clip_param=0.2,
                        entcoeff=0.01,
                        optim_epochs=4,
                        optim_stepsize=1e-3,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear',
                        save_per_iter=100,
                        ckpt_dir=args.checkpoint_dir,
                        log_dir=args.log_dir,
                        task_name=task_name,
                        task=args.task,
                        load_model_path=args.load_model_path,
                        sample_stochastic=args.sample_stochastic)
    env.close()
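
For context, here is a minimal sketch of the argument plumbing this train() function appears to expect. The flag names, defaults, and the __main__ entry point are assumptions added for illustration; only the attribute names read inside train() (env_id, seed, num_timesteps, taskname, log_dir, checkpoint_dir, task, load_model_path, sample_stochastic) come from the snippet itself.

import argparse

def parse_args():
    # Hypothetical CLI; defaults are placeholders, not values from the project.
    parser = argparse.ArgumentParser()
    parser.add_argument("--env_id", default="PipelineTrack-v1")
    parser.add_argument("--taskname", default="default")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--num_timesteps", type=int, default=int(1e6))
    parser.add_argument("--log_dir", default="log")
    parser.add_argument("--checkpoint_dir", default="checkpoint")
    parser.add_argument("--task", default="train")
    parser.add_argument("--load_model_path", default=None)
    parser.add_argument("--sample_stochastic", action="store_true")
    return parser.parse_args()

if __name__ == "__main__":
    train(parse_args())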
Code example #2
# Imports assumed by this excerpt; args, task_name and rank are defined earlier in
# the original script. The module providing PPO and traj_segment_generator is
# project-specific and not shown, so it is only indicated as a comment.
import os.path as osp
from collections import deque
from mpi4py import MPI
from baselines import logger
from baselines.common import set_global_seeds
from env.env_util import make_env
# from algo.<module> import PPO, traj_segment_generator  # project-specific

ckpt_dir = osp.join(args.checkpoint_dir, task_name)

if rank == 0:
    logger.configure()
else:
    logger.configure(format_strs=[])

# Per-worker seed, derived the same way as in code example #1.
workerseed = (args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
              if args.seed is not None else None)
set_global_seeds(workerseed)

# A dense (MLP) policy skips the CNN feature extractor.
args.no_cnn = (args.policy_type == 'dense')

env = make_env(args.env_id,
               seed=args.seed,
               frame_stack=False,
               save_camera=False,
               remove_dyn=False,
               no_cnn=args.no_cnn)()
policy = PPO(env.observation_space, env.action_space, args.policy_type, args)

# Generator that yields fixed-length rollout segments from the current policy.
seg_gen = traj_segment_generator(policy.pi,
                                 env,
                                 args.timesteps_per_actorbatch,
                                 stochastic=True)

episodes_so_far = 0
timesteps_so_far = 0
iters_so_far = 0
lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
distbuffer = deque(maxlen=100)  # rolling buffer, presumably for a per-episode distance metric
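
The counters and rolling buffers above are normally advanced by pulling segments from seg_gen. The loop below is a sketch of that pattern, modeled on baselines' pposgd_simple (which code example #1 imports); the segment keys "ep_lens" and "ep_rets" follow the baselines generator and are an assumption about this project's traj_segment_generator, and the PPO update itself is left out.

import numpy as np

while timesteps_so_far < args.num_timesteps:
    seg = seg_gen.__next__()          # one rollout of timesteps_per_actorbatch steps
    # ... advantage estimation and the PPO policy/value update on `seg` would go
    # here; that part is project-specific and not shown in the excerpt.

    lenbuffer.extend(seg["ep_lens"])  # lengths of episodes finished in this segment
    rewbuffer.extend(seg["ep_rets"])  # undiscounted returns of those episodes

    episodes_so_far += len(seg["ep_lens"])
    timesteps_so_far += sum(seg["ep_lens"])
    iters_so_far += 1

    logger.record_tabular("EpLenMean", np.mean(lenbuffer))
    logger.record_tabular("EpRewMean", np.mean(rewbuffer))
    logger.record_tabular("TimestepsSoFar", timesteps_so_far)
    logger.dump_tabular()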
Code example #3
    def act(self, stochastic, ob):
        # Signature restored for context: the original excerpt begins inside the
        # policy's act(); _act is the policy's compiled TF forward pass.
        ac1, vpred1 = self._act(stochastic, ob)
        return ac1[0], vpred1[0]



def max_pool(img, k):
    # k x k max pooling with stride k, NHWC layout (TF1-style op).
    return tf.nn.max_pool(img,
                          ksize=[1, k, k, 1],
                          strides=[1, k, k, 1],
                          padding="SAME")
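
A quick usage sketch for the max_pool helper above (TF1 static-graph style; the placeholder shape is illustrative only):

import tensorflow as tf

img = tf.placeholder(tf.float32, [None, 84, 84, 3])  # NHWC batch of frames, example shape
pooled = max_pool(img, 2)                             # -> shape [None, 42, 42, 3]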



if __name__ == "__main__":
    # Smoke test: build the environment and policy, then run one forward pass.
    from env.env_util import make_env
    import baselines.common.tf_util as U  # assumed to be the U used above
    import rospy

    # The ROS node is initialized first (the environment presumably talks to ROS).
    rospy.init_node("sample")

    env = make_env("PipelineTrack-v1")()

    # Graph construction does not require an active session in TF1.
    pol = CnnPolicy("pi", env.observation_space, env.action_space,
                    hid_size=256, num_hid_layers=1)
    ob = env.reset()

    sess = U.single_threaded_session()
    sess.__enter__()

    U.initialize()  # initialize all TF variables before calling act()

    a, v = pol.act(True, ob)  # sampled action and value estimate for the first observation
    print(a)
    print(v)