def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    from baselines.ppo_linear import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_pybullet_env(env_id, seed)
    test_env = make_pybullet_env(env_id, seed)
    pposgd_simple.learn(env,
                        test_env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=1e-5,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear')
    test_env.close()
    env.close()
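Every snippet here assumes a make_pybullet_env helper defined elsewhere in the repository. A minimal sketch of what such a helper could look like, assuming the standard gym + pybullet_envs registration and the baselines Monitor/logger utilities (the actual wrapper choices may differ):

# Hypothetical sketch of the make_pybullet_env helper assumed by the snippets
# in this file; the original implementation may differ.
import os
import gym
import pybullet_envs  # noqa: F401 -- importing registers the PyBullet envs with gym
from baselines import logger
from baselines.bench import Monitor
from baselines.common import set_global_seeds

def make_pybullet_env(env_id, seed):
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = Monitor(env, os.path.join(logger.get_dir() or '.', 'monitor.csv'))
    env.seed(seed)
    return env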
Example #2
def main():
    """
    Train on CartPole.
    """

    args = pybullet_arg_parser().parse_args()

    logger.configure(format_strs=['stdout', 'log', 'csv'],
                     log_suffix="Uber-GA-" + args.env + "_seed_" +
                     str(args.seed))
    logger.log("Algorithm:Uber-GA-" + args.env + "_seed_" + str(args.seed))
    env_id = args.env
    seed = args.seed
    generation = 0
    with make_session() as sess:
        env = make_pybullet_env(env_id, seed)
        try:
            model = simple_mlp(sess, env)
            sess.run(tf.global_variables_initializer())
            learn_sess = LearningSession(sess, model)
            while True:
                # Stop after 10000 generations or 5 million environment steps.
                if generation >= 10000 or learn_sess.timesteps_so_far >= 5e6:
                    break
                pop = learn_sess.generation(env,
                                            trials=1,
                                            population=POPULATION)
                generation += 1
                # rewards = [x[0] for x in pop]
                # print('mean=%f best=%s' % (sum(rewards)/len(rewards), str(rewards[:10])))
        finally:
            env.close()
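The Uber-GA driver above omits its imports and the POPULATION constant; pybullet_arg_parser and make_pybullet_env are the repository's own helpers (see the sketch after the first example). Based on the uber-ga package API that the loop mirrors, a plausible, purely illustrative header would be:

# Illustrative imports for the Uber-GA driver; the exact module paths in the
# original repository may differ.
import tensorflow as tf
from uber_ga import LearningSession, make_session, simple_mlp
from baselines import logger

POPULATION = 64  # assumed value; the original constant is defined elsewhere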
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    # These imports follow the standard baselines layout; adjust if the fork relocates them.
    from mpi4py import MPI
    from baselines.ppo1.mlp_policy import MlpPolicy
    from baselines.trpo_mpi import trpo_mpi
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    # if rank == 0:
    #     logger.configure()
    # else:
    #     logger.configure(format_strs=[])
    #     logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=64,
                         num_hid_layers=2)

    env = make_pybullet_env(env_id, workerseed)  # seed each MPI worker differently
    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
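Because this trainer uses trpo_mpi, it is meant to be launched under MPI with one copy of the script per worker. A minimal driver sketch, assuming the pybullet_arg_parser helper used in the Uber-GA example exposes --env, --seed and --num-timesteps like the standard baselines parsers:

def main():
    # Hypothetical driver for the TRPO trainer above.
    args = pybullet_arg_parser().parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

if __name__ == '__main__':
    main()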
def train(env_id, num_timesteps, seed):
    max_fitness = -100000
    popsize = 32
    gensize = 30  # generations per outer iteration
    bounds = [-5.0, 5.0]
    sigma = 0.1
    eval_iters = 1
    import baselines.common.tf_util as U
    from baselines.cmaes_layer_entire import mlp_policy, cmaes_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    base_env = make_pybullet_env(env_id, seed)
    cmaes_simple.learn(base_env,
                       policy_fn,
                       max_fitness=max_fitness,  # must be negative: CMA-ES minimizes
                       popsize=popsize,
                       gensize=gensize,
                       bounds=bounds,
                       sigma=sigma,
                       eval_iters=eval_iters,
                       max_timesteps=num_timesteps,
                       timesteps_per_actorbatch=2048,
                       seed=seed)
    base_env.close()
def train(env_id, num_timesteps, seed):
    max_fitness = -100000
    popsize = 10
    gensize = 100  # generations per outer iteration
    max_v_train_iter = 10
    bounds = [-5.0, 5.0]
    sigma = 6e-5
    eval_iters = 1
    import baselines.common.tf_util as U
    from baselines.ppo_cmaes_surrogate1_uniform import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)

    env = make_pybullet_env(env_id, seed)
    # test_env = make_pybullet_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_fitness=max_fitness,  # must be negative: the search minimizes
                        popsize=popsize,
                        gensize=gensize,
                        bounds=bounds,
                        sigma=sigma,
                        eval_iters=eval_iters,
                        max_v_train_iter=max_v_train_iter,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4,
                        optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        seed=seed,
                        env_id=env_id)
    env.close()
Example #6
def train(env_id, num_timesteps, seed):
    max_fitness = -100000
    popsize = 32
    gensize = 10000
    alpha = 0.005
    sigma = 0.1
    eval_iters = 1
    import baselines.common.tf_util as U
    from baselines.openai_es import mlp_policy, es_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    base_env = make_pybullet_env(env_id, seed)
    # test_env.render(mode = "human")
    es_simple.learn(base_env,
                    policy_fn,
                    max_fitness=max_fitness,  # must be negative: fitness is treated as a minimization target
                    popsize=popsize,
                    gensize=gensize,
                    sigma=sigma,
                    alpha=alpha,
                    eval_iters=eval_iters,
                    max_timesteps=num_timesteps,
                    timesteps_per_actorbatch=2048,
                    seed=seed)
    base_env.close()
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    from baselines.ppo_dual_nac_advantage import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = make_pybullet_env(env_id, seed)
    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        rho=0.95,  # Gradient weighting factor
        update_step_threshold=25,  # Updating step threshold
        schedule='linear')
    env.close()
Example #8
def train(env_id, num_timesteps, seed):
    max_fitness = -100000
    popsize = 101
    gensize = 10000
    truncation_size = 20
    sigma = 0.1
    eval_iters = 1
    import baselines.common.tf_util as U
    from baselines.uber_ga import mlp_policy, ga_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    base_env = make_pybullet_env(env_id, seed)
    ga_simple.learn(base_env,
                    policy_fn,
                    max_fitness=max_fitness,  # must be negative: fitness is treated as a minimization target
                    popsize=popsize,
                    gensize=gensize,
                    truncation_size=truncation_size,
                    sigma=sigma,
                    eval_iters=eval_iters,
                    max_timesteps=num_timesteps,
                    timesteps_per_actorbatch=2048,
                    seed=seed)
    base_env.close()
def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.ars import ars
    main_loop_size = 1000
    horizon = 1000
    step_size = 0.005
    noise = 0.03
    hp = ars.Hp(main_loop_size, horizon, num_timesteps, step_size, noise)
    set_global_seeds(seed)
    env = make_pybullet_env(env_id, seed)
    # env = wrappers.Monitor(env, monitor_dir, force=True)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]
    policy = ars.Policy(num_inputs, num_outputs, hp)
    normalizer = ars.Ob_Normalizer(num_inputs)
    ars.train(env, policy, normalizer, hp)
    env.close()
Example #10
def train(env_id, num_timesteps, seed):
    # Imports follow the standard baselines ACKTR layout; adjust if the fork relocates them.
    import tensorflow as tf
    from baselines.acktr.acktr_cont import learn
    from baselines.acktr.policies import GaussianMlpPolicy
    from baselines.acktr.value_functions import NeuralNetValueFunction

    env = make_pybullet_env(env_id, seed)
    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env,
              policy=policy,
              vf=vf,
              gamma=0.99,
              lam=0.97,
              timesteps_per_batch=2500,
              desired_kl=0.002,
              num_timesteps=num_timesteps,
              animate=False)

        env.close()