Example #1
def train(env_id, num_timesteps, seed, total_gen):
    import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    from baselines.common.cmd_util import make_mujoco_env
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='constant',
        total_gen=total_gen,
    )
    env.close()
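
These train functions are typically driven from a small command-line entry point. A minimal, hypothetical launcher for the variant above (argument names and defaults are illustrative, not taken from any of the repositories excerpted on this page):

def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='Hopper-v2')                   # MuJoCo env id
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--total-gen', type=int, default=1)             # extra argument specific to this variant
    args = parser.parse_args()
    train(args.env, args.num_timesteps, args.seed, args.total_gen)

if __name__ == '__main__':
    main()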
Example #2
def train(num_timesteps, seed):
    import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    from baselines.common import set_global_seeds
    from mpi4py import MPI
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = RobotPath.env(render=False, max_step=2000)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=512,
                                    num_hid_layers=3)

    pposgd_simple.learn(sess,
                        env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=5,
                        optim_stepsize=3e-4,
                        optim_batchsize=256,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear')
    env.close()
Example #3
def train(env_id, num_timesteps, seed):
    import mlp_policy, pposgd_simple
    import gym
    import baselines.common.tf_util as U
    U.make_session(num_cpu=1, num_gpu=0).__enter__()
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=128,
                                    num_hid_layers=2)

    env.seed(seed)
    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=1e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='constant',
    )
    env.close()
Example #4
def train(env_id, num_timesteps, seed):
    from baselines.pposgd import mlp_policy
    import pposgd_simple
    import baselines.common.tf_util as U
    from baselines.common import set_global_seeds
    from baselines import bench, logger
    import gym
    import logging
    import tensorflow as tf
    sess = U.make_session(num_cpu=1)
    sess.__enter__()
    logger.session().__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, "monitor.json")
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_batch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95,
            schedule='linear'
        )
    env.close()

    # save model
    saver = tf.train.Saver()
    saver.save(sess, "model/model.ckpt")
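
Example #4 is the only variant above that checkpoints the TensorFlow graph. A sketch of how such a checkpoint could be restored later for evaluation, assuming the policy graph is rebuilt with the same policy_fn and that a TF session has already been entered as in the training code (function and variable names here are illustrative):

def load_policy(env, policy_fn, ckpt_path="model/model.ckpt"):
    # Rebuild the "pi" policy with the same architecture used during training,
    # then restore the saved weights into the current default session.
    pi = policy_fn("pi", env.observation_space, env.action_space)
    saver = tf.train.Saver()
    saver.restore(tf.get_default_session(), ckpt_path)
    return pi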
Example #5
def train(env_id, num_timesteps, seed):
    import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    from baselines.common.cmd_util import make_mujoco_env
    from baselines import logger
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=32,
                                    num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    logger.log("========observation_space %s action_space %s" %
               (str(env.observation_space), str(env.action_space)))
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=1024,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear')
    env.close()
Example #6
def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch,
          dc, method, mpath):
    # from baselines.ppo1
    import mlp_policy
    import pposgd_simple
    import sys
    import logging
    import os.path as osp
    import gym
    import baselines.common.tf_util as U
    from baselines import bench, logger
    from baselines.common import set_global_seeds
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2,
                                    num_options=num_options,
                                    dc=dc)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    if num_options == 1:
        optimsize = 64
    elif num_options == 2:
        optimsize = 32
    else:
        print("Only two options or primitive actions is currently supported.")
        sys.exit()

    assert method in METHODS, "Method should be either of " + str(METHODS)
    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        #timesteps_per_batch=2048,
        timesteps_per_batch=(
            2048 * 5
        ),  # this part is changed to realize more stable learning 2019/01/31
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=optimsize,
        gamma=0.99,
        lam=0.95,
        schedule='constant',
        num_options=num_options,
        app=app,
        saves=saves,
        wsaves=wsaves,
        epoch=epoch,
        seed=seed,
        dc=dc,
        method=method,
        mpath=mpath)
    env.close()
Example #7
def train():
    env = RLCube()
    num_timesteps = 10000
    timesteps_per_actorbatch = 1000
    import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    pi = pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=timesteps_per_actorbatch,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
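
Example #7 keeps the policy returned by pposgd_simple.learn instead of discarding it. A short evaluation sketch using that policy, assuming the baselines ppo1 MlpPolicy interface where pi.act(stochastic, ob) returns an (action, value) pair:

def evaluate(pi, env, episodes=5):
    for _ in range(episodes):
        ob = env.reset()
        done, total = False, 0.0
        while not done:
            ac, _ = pi.act(False, ob)            # stochastic=False -> deterministic action
            ob, rew, done, _ = env.step(ac)
            total += rew
        print("episode reward:", total)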
Example #8
def train(num_timesteps, seed, resume):
    map_folder = "."
    map_name = "empty"
    map_ = Map2D(map_folder, map_name)
    print("Map '{}' loaded.".format(map_name))
    # RL multi-agent simulator
    import pposgd_simple
    import cnn_policy
    import baselines.common.tf_util as U
    from baselines.common import set_global_seeds
    from baselines import logger
    from mpi4py import MPI
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    env = PepperRLEnv(args)  # args is assumed to be defined at module level (e.g. parsed CLI options)
    workerseed = (seed + 10000 * MPI.COMM_WORLD.Get_rank()
                  if seed is not None else None)
    set_global_seeds(workerseed)

    def policy_fn(name, ob_space, ac_space):  #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space)

    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=int(num_timesteps * 1.1),
        timesteps_per_actorbatch=256,
        clip_param=0.2,
        entcoeff=0.01,
        optim_epochs=4,
        optim_stepsize=1e-3,  # original 1e-3
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        resume_training=resume,
    )
    env.close()
Example #9
def train(env, num_timesteps, seed):

    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    # mujoco_py.ignore_mujoco_warnings().__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)

    # env = make_robotics_env(env_id, workerseed, rank=rank)
    # def policy_fn(name, ob_space, ac_space):
    #     return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
    #         hid_size=256, num_hid_layers=3)

    # def policy_fn(name, ob_space, ac_space):
    #     return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_dims_p=[64, 64],
                         hid_dims_v=[64, 64])

    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.01,
        optim_epochs=5,
        optim_stepsize=3e-4,
        optim_batchsize=256,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
Example #10
def train(args):
    rank = MPI.COMM_WORLD.Get_rank()

    ncpu = get_cpu_per_task()
    ncpu //= 8

    sys.stdout.flush()

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = int(args.seed)
    set_global_seeds(workerseed)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_mujoco_env(args.env, args.seed)
    running_scores = pposgd_simple.learn(env,
                                         policy_fn,
                                         timesteps_per_actorbatch=2048,
                                         optim_stepsize=3e-4,
                                         optim_batchsize=64,
                                         gamma=0.99,
                                         lam=0.95,
                                         schedule='linear',
                                         args=args)

    env.close()

    # Save result for run
    if MPI.COMM_WORLD.Get_rank() == 0:
        pkl_res(running_scores, args)
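
pkl_res is not shown in this excerpt. A plausible stand-in that simply pickles the returned running scores for the run, purely an assumption about what the original helper does:

def pkl_res(running_scores, args):
    import pickle
    # Hypothetical: write the scores to a filename derived from the run configuration.
    fname = "ppo_{}_seed{}.pkl".format(args.env, args.seed)
    with open(fname, "wb") as f:
        pickle.dump(running_scores, f)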
Example #11
def train(self, num_timesteps, seed,
          model_path=None, model_iter=None, argtype=None):
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=128, num_hid_layers=3)

    self.set_seed(seed)
    pi = pposgd.learn(self, policy_fn,
                      max_timesteps=num_timesteps,
                      timesteps_per_actorbatch=512,
                      clip_param=0.2, entcoeff=0.0,
                      optim_epochs=10, optim_stepsize=1e-4, optim_batchsize=64,
                      gamma=0.99,
                      lam=0.95,
                      schedule='linear',
                      model_path=model_path,
                      model_iter=model_iter,
                      mode=argtype)
    return pi
Example #12
def main(args):
    # mpi communicator.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # seed.
    workerseed = args.seed + 10000 * comm.Get_rank() if args.seed is not None else None
    if workerseed is not None:
        tc.manual_seed(workerseed % 2 ** 32)
        np.random.seed(workerseed % 2 ** 32)
        random.seed(workerseed % 2 ** 32)

    # logger.
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    # env.
    env = make_atari(args.env_name)
    env.seed(workerseed)
    env = Monitor(env, logger.get_dir() and
                  os.path.join(logger.get_dir(), str(rank)))
    print(f"frame_stacking: {args.frame_stacking}")
    env = wrap_deepmind(env, frame_stack=args.frame_stacking,
                        clip_rewards=(args.mode =='train'),
                        episode_life=(args.mode =='train'))  # See Mnih et al., 2015 -> Methods -> Training Details.
    env.seed(workerseed)

    # agent.
    agent = CnnPolicy(
        img_channels=env.observation_space.shape[-1],
        num_actions=env.action_space.n,
        kind=args.model_type)

    # optimizer and scheduler.
    max_grad_steps = args.optim_epochs * args.env_steps // (comm.Get_size() * args.optim_batchsize)

    optimizer = tc.optim.Adam(agent.parameters(), lr=args.optim_stepsize, eps=1e-5)
    scheduler = tc.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer, max_lr=args.optim_stepsize, total_steps=max_grad_steps,
        pct_start=0.0, anneal_strategy='linear', cycle_momentum=False,
        div_factor=1.0)

    # checkpoint.
    if rank == 0:
        try:
            state_dict = tc.load(os.path.join(args.checkpoint_dir, args.model_name, 'model.pth'))
            agent.load_state_dict(state_dict)
            print(f"Continuing from checkpoint found at {os.path.join(args.checkpoint_dir, args.model_name, 'model.pth')}")
        except FileNotFoundError:
            print("Bad checkpoint or none on process 0. Continuing from scratch.")

    # sync.
    with tc.no_grad():
        for p in agent.parameters():
            p_data = p.data.numpy()
            comm.Bcast(p_data, root=0)
            p.data.copy_(tc.tensor(p_data).float())

    # operations.
    if args.mode == 'train':
        learn(env=env, agent=agent, optimizer=optimizer, scheduler=scheduler, comm=comm,
              timesteps_per_actorbatch=args.timesteps_per_actorbatch, max_timesteps=args.env_steps,
              optim_epochs=args.optim_epochs, optim_batchsize=args.optim_batchsize,
              gamma=args.gamma, lam=args.lam, clip_param=args.epsilon, entcoeff=args.ent_coef,
              checkpoint_dir=args.checkpoint_dir, model_name=args.model_name)
        env.close()

    elif args.mode == 'play':
        if comm.Get_rank() == 0:
            play(env=env, agent=agent, args=args)
            env.close()

    elif args.mode == 'movie':
        if comm.Get_rank() == 0:
            movie(env=env, agent=agent, args=args)
            env.close()

    else:
        raise NotImplementedError("Mode of operation not supported!")
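
The torch-based main above reads a number of fields off args. A hypothetical argparse definition that collects the attributes the function actually touches (defaults are illustrative, not taken from the original repository):

def parse_args():
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument('--mode', choices=['train', 'play', 'movie'], default='train')
    p.add_argument('--env_name', default='PongNoFrameskip-v4')
    p.add_argument('--seed', type=int, default=0)
    p.add_argument('--frame_stacking', action='store_true')    # passed to wrap_deepmind as frame_stack
    p.add_argument('--model_type', default='large')             # forwarded to CnnPolicy(kind=...)
    p.add_argument('--env_steps', type=int, default=int(10e6))
    p.add_argument('--timesteps_per_actorbatch', type=int, default=256)
    p.add_argument('--optim_epochs', type=int, default=4)
    p.add_argument('--optim_batchsize', type=int, default=64)
    p.add_argument('--optim_stepsize', type=float, default=1e-3)
    p.add_argument('--gamma', type=float, default=0.99)
    p.add_argument('--lam', type=float, default=0.95)
    p.add_argument('--epsilon', type=float, default=0.2)        # PPO clip parameter
    p.add_argument('--ent_coef', type=float, default=0.01)
    p.add_argument('--checkpoint_dir', default='checkpoints')
    p.add_argument('--model_name', default='model')
    return p.parse_args()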