Example #1
def render(hid_size, load_path, video_path, env_id, seed, hist_len, block_high, give_state):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()
    def policy_fn(name, ob_space, ac_space, ob_name):
        return CompatibleMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=hid_size, num_hid_layers=2, ob_name=ob_name)

    env = make_control_env(env_id, seed, hist_len=hist_len,
                           block_high=block_high, version0=True, give_state=give_state)
    pi = policy_fn("pi", env.observation_space, env.action_space, ob_name="ob")
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess, load_path)
    ob = env.reset()
    frames = []
    # Roll out one deterministic episode, collecting rendered frames for the video
    while True:
        frame = env.unwrapped.render(mode='rgb_array')
        frames.append(frame)
        ac, vpred = pi.act(stochastic=False, ob=ob)
        print(ob)
        ob, rwd, done, _ = env.step(ac)
        if done:
            imageio.mimsave(video_path+'result.mp4', frames, fps=20)
            break
    env.close()
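A hypothetical invocation of the helper above; every argument value here is illustrative (the real environment ids and checkpoint paths come from the surrounding project), and the module-level imports (tf, imageio, CompatibleMlpPolicy, make_control_env) are assumed to be in place.

# Sketch only: render one deterministic episode of a restored policy and
# write <video_path>result.mp4. All values below are placeholders.
render(hid_size=64,
       load_path='checkpoints/model.ckpt',   # tf.train.Saver checkpoint to restore
       video_path='videos/',                 # 'result.mp4' is appended by string concatenation
       env_id='YourControlEnv-v0',           # placeholder id for make_control_env
       seed=0,
       hist_len=4,
       block_high=0.5,
       give_state=True)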
Example #2
def train_copos(env_id, num_timesteps, seed, trial, hist_len, block_high,
                nsteps, method, hid_size, give_state, vf_iters):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    workerseed = seed * 10000

    def policy_fn(name, ob_space, ac_space):
        return CompatibleMlpPolicy(name=name,
                                   ob_space=ob_space,
                                   ac_space=ac_space,
                                   hid_size=hid_size,
                                   num_hid_layers=2)
        # return CompatiblecnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
        #      hid_size=hid_size, num_hid_layers=2)

    set_global_seeds(workerseed)
    # env = gym.make(env_id)

    env = make_control_env(env_id,
                           seed,
                           hist_len=hist_len,
                           block_high=block_high,
                           version0=True,
                           give_state=give_state)
    env.seed(workerseed)

    timesteps_per_batch = nsteps
    beta = -1
    if beta < 0:
        nr_episodes = num_timesteps // timesteps_per_batch
        # Automatically compute beta based on initial entropy and number of iterations
        tmp_pi = policy_fn("tmp_pi", env.observation_space, env.action_space)

        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1, ) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.ob: tmp_ob})
        beta = 2 * entropy / nr_episodes
        print("Initial entropy: " + str(entropy) + ", episodes: " +
              str(nr_episodes))
        print("Automatically set beta: " + str(beta))

    copos_mpi.learn(env,
                    policy_fn,
                    timesteps_per_batch=timesteps_per_batch,
                    epsilon=0.01,
                    beta=beta,
                    cg_iters=10,
                    cg_damping=0.1,
                    max_timesteps=num_timesteps,
                    gamma=0.99,
                    lam=0.98,
                    vf_iters=vf_iters,
                    vf_stepsize=1e-3,
                    trial=trial,
                    method=method)
    env.close()
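The block above sets beta automatically as 2 * H(pi_init) / nr_episodes, where H(pi_init) is the entropy of the freshly initialised policy evaluated on a dummy observation. A self-contained sketch of the same arithmetic for a diagonal Gaussian policy, assuming the log-std starts at 0 (sigma = 1):

import numpy as np

def initial_beta(ac_dim, num_timesteps, timesteps_per_batch, log_std=0.0):
    # Entropy of a diagonal Gaussian: 0.5 * d * log(2*pi*e) + sum(log(sigma_i))
    entropy = 0.5 * ac_dim * np.log(2.0 * np.pi * np.e) + ac_dim * log_std
    nr_episodes = num_timesteps // timesteps_per_batch   # same quantity as in the code above
    return 2.0 * entropy / nr_episodes

# e.g. a 1-D action space, 1e6 timesteps, batches of 2048 -> 488 batches
print(initial_beta(ac_dim=1, num_timesteps=int(1e6), timesteps_per_batch=2048))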
Example #3
def train(env_id, num_timesteps, seed, num_trials=5):
    from baselines.ppo1 import mlp_policy, ppo_guided, pporocksample, ppo_guided2, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)
        # return mlp_policy.MlpBetaPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
        #     hid_size=64, num_hid_layers=2)

    for i_trial in range(num_trials):

        env = make_control_env(env_id, seed)

        # normalized history stepsize 15 for field vision full position rocksample
        # env = make_rocksample_env(seed, map_name="5x7", observation_type="field_vision_full_pos",
        # observation_noise=True, n_steps=15)

        # normalized fully observable rocksample
        # env = make_rocksample_env(seed, map_name="5x7", observation_type="fully_observable",
        #                            observation_noise=False, n_steps=15)

        # # guided way of normalized fully observable rocksample with history timestep 15
        # genv = make_control_env(env_id, seed)
        #
        # ppo_guided.learn(env, genv, i_trial, policy_fn,
        #         max_iters=1000,
        #         timesteps_per_actorbatch=5000,
        #         clip_param=0.2, entp=0.5,
        #         optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=32,
        #         gamma=0.99, lam=0.95, schedule='linear', useentr=False, retrace=False
        #                     )

        # pposgd_simple.learn(env, i_trial, policy_fn,
        #         max_iters=1000,
        #         timesteps_per_actorbatch=2048,
        #         clip_param=0.2, entcoeff=0.5,
        #         optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=32,
        #         gamma=0.99, lam=0.95, schedule='linear')

        pporocksample.learn(env,
                            i_trial,
                            policy_fn,
                            max_iters=800,
                            timesteps_per_actorbatch=2048,
                            clip_param=0.2,
                            entp=0.3,
                            optim_epochs=10,
                            optim_stepsize=3e-4,
                            optim_batchsize=32,
                            gamma=0.99,
                            lam=0.95,
                            schedule='linear',
                            useentr=True,
                            retrace=False)
        env.close()
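A hypothetical command-line entry point for the trainer above; the flag names, defaults, and the environment id are illustrative and not taken from the original run script.

import argparse

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env-id', type=str, default='YourControlEnv-v0')  # placeholder id
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--num-trials', type=int, default=5)
    args = parser.parse_args()
    train(args.env_id, args.num_timesteps, args.seed, num_trials=args.num_trials)

if __name__ == '__main__':
    main()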
Example #4
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple, ppo_guided
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=32,
                                    num_hid_layers=2)

    # env = make_mujoco_env(env_id, seed)
    env = make_control_env(env_id, seed)
    i_trial = 1

    # genv = make_control_env(env_id, seed)
    #
    #
    # ppo_guided.learn(env, genv, i_trial, policy_fn,
    #         max_iters=100,
    #         timesteps_per_actorbatch=2048,
    #         clip_param=0.2, entp=0.5,
    #         optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
    #         gamma=0.99, lam=0.95, schedule='linear', useentr=False, retrace=False
    #                     )

    pposgd_simple.learn(env,
                        i_trial,
                        policy_fn,
                        max_iters=100,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear')
    env.close()
Example #5
def train_trpo(env_id, num_timesteps, seed, hist_len, block_high, nsteps,
               hid_size, give_state):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=hid_size,
                         num_hid_layers=2)

    set_global_seeds(workerseed)

    env = make_control_env(env_id,
                           workerseed,
                           hist_len=hist_len,
                           block_high=block_high,
                           not_guided=True,
                           give_state=False)
    env.seed(workerseed)

    timesteps_per_batch = nsteps

    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=timesteps_per_batch,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
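A hypothetical direct call to the TRPO trainer above, with illustrative hyperparameters. Note that the function body passes give_state=False to make_control_env regardless of the give_state argument, so the last keyword has no effect as written.

# Sketch only; normally launched under MPI, so the seed is offset by the worker rank.
train_trpo(env_id='YourControlEnv-v0',   # placeholder id for make_control_env
           num_timesteps=int(1e6),
           seed=0,
           hist_len=4,
           block_high=0.5,
           nsteps=2048,
           hid_size=64,
           give_state=True)              # ignored inside the function (see note above)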
Example #6
def train(env_id, num_timesteps, seed, num_trials=1):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    # if rank == 0:
    #     logger.configure()
    # else:
    #     logger.configure(format_strs=[])
    #     logger.set_level(logger.DISABLED)

    def policy_fn(name, ob_name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_name=ob_name, ob_space=ob_space, ac_space=ac_space,
            hid_size=32, num_hid_layers=2)
    for i_trial in range(num_trials):
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        # env = make_mujoco_env(env_id, workerseed)
        env = make_control_env(env_id, workerseed)
        trpo_guided.learn(env,
                          policy_fn,
                          timesteps_per_batch=1024,
                          max_kl=0.01,
                          cg_iters=20,
                          cg_damping=0.1,
                          max_timesteps=num_timesteps,
                          gamma=0.99,
                          lam=0.98,
                          vf_iters=5,
                          vf_stepsize=1e-3,
                          i_trial=i_trial)
        env.close()
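The commented-out logger block near the top of the function hints at the usual rank-aware setup: configure the baselines logger on rank 0 and silence the other MPI workers. Restored as a runnable fragment it would look roughly like this:

from mpi4py import MPI
from baselines import logger

if MPI.COMM_WORLD.Get_rank() == 0:
    logger.configure()                    # full logging on the master worker
else:
    logger.configure(format_strs=[])      # no output formats on the other workers
    logger.set_level(logger.DISABLED)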
Example #7
def train(env_id, num_timesteps, seed, trial, hist_len):
    env = make_control_env(env_id, seed, hist_len=hist_len)

    with tf.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env,
              policy=policy,
              vf=vf,
              gamma=0.99,
              lam=0.97,
              timesteps_per_batch=2048,
              desired_kl=0.002,
              trial=trial,
              num_timesteps=num_timesteps,
              animate=False)

        env.close()
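Because the value function and the policy are built under separate variable scopes ("vf" and "pi"), their parameters can be inspected or saved independently. A small TF1-style sketch, assuming it runs inside the same tf.Session block:

# List the trainable variables created under each scope.
vf_vars = tf.trainable_variables(scope="vf")
pi_vars = tf.trainable_variables(scope="pi")
print("value-function params:", [v.name for v in vf_vars])
print("policy params:", [v.name for v in pi_vars])

# e.g. a saver restricted to the policy parameters only
pi_saver = tf.train.Saver(var_list=pi_vars)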
Example #8
def train_copos(env_id, num_timesteps, seed, hist_len, block_high, nsteps,
                hid_size, give_state):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return CompatibleMlpPolicy(name=name,
                                   ob_space=ob_space,
                                   ac_space=ac_space,
                                   hid_size=hid_size,
                                   num_hid_layers=2)

    set_global_seeds(workerseed)

    env = make_control_env(env_id,
                           workerseed,
                           hist_len=hist_len,
                           block_high=block_high,
                           not_guided=True,
                           give_state=True)
    env.seed(workerseed)

    timesteps_per_batch = nsteps

    # TODO: the following commented-out lines are used for evaluation
    # pi = policy_fn('pi', env.observation_space, env.action_space)
    # sess.run(tf.global_variables_initializer())
    # saver = tf.train.Saver()
    # saver.restore(sess, '/work/scratch/rz97hoku/ReinforcementLearning/tmp/hist4/copos-ratio/copos-ratio-1-11-05-20-11/checkpoints/00976.ckpt')
    # for m in range(100):
    #     ob = env.reset()
    #     ep_rwd = []
    #     while True:
    #         ac, _ = pi.act(stochastic=False, ob=ob)
    #         ob, rew, new, _ = env.step(ac)
    #         ep_rwd.append(rew)
    #         if new:
    #             break
    #     logger.record_tabular("Reward", np.sum(ep_rwd))
    #     logger.record_tabular("Episode", m)
    #     logger.dump_tabular()

    beta = -1
    if beta < 0:
        nr_episodes = num_timesteps // timesteps_per_batch
        # Automatically compute beta based on initial entropy and number of iterations
        tmp_pi = policy_fn("tmp_pi", env.observation_space, env.action_space)

        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1, ) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.ob: tmp_ob})
        beta = 2 * entropy / nr_episodes
        print("Initial entropy: " + str(entropy) + ", episodes: " +
              str(nr_episodes))
        print("Automatically set beta: " + str(beta))

    #copos_mpi.learn(env, policy_fn, timesteps_per_batch=timesteps_per_batch, epsilon=0.01,
    #                beta=beta, cg_iters=10, cg_damping=0.1, sess=sess,
    #                max_timesteps=num_timesteps, gamma=0.99,
    #                lam=0.98, vf_iters=vf_iters, vf_stepsize=1e-3, trial=trial, method=method)
    copos_mpi.learn(env,
                    policy_fn,
                    timesteps_per_batch=timesteps_per_batch,
                    epsilon=0.01,
                    beta=beta,
                    cg_iters=10,
                    cg_damping=0.1,
                    max_timesteps=num_timesteps,
                    gamma=0.99,
                    lam=0.98,
                    vf_iters=5,
                    vf_stepsize=1e-3)
    env.close()
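For reference, the evaluation block that is commented out above, restored as a standalone helper. The checkpoint path and episode count are placeholders, and tf, np, and the baselines logger are assumed to be imported at module level as in the rest of the file.

def evaluate(pi, env, sess, ckpt_path, n_episodes=100):
    # Restore a trained policy and roll out deterministic episodes,
    # logging the undiscounted return of each one.
    saver = tf.train.Saver()
    saver.restore(sess, ckpt_path)
    for m in range(n_episodes):
        ob = env.reset()
        ep_rwd = []
        while True:
            ac, _ = pi.act(stochastic=False, ob=ob)
            ob, rew, new, _ = env.step(ac)
            ep_rwd.append(rew)
            if new:
                break
        logger.record_tabular("Reward", np.sum(ep_rwd))
        logger.record_tabular("Episode", m)
        logger.dump_tabular()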