Example 1
def main(env_id, policy_file, record, stochastic, extra_kwargs):
    import gym
    from gym import wrappers
    import tensorflow as tf
    from es_distributed.policies import ESAtariPolicy
    from es_distributed.atari_wrappers import ScaledFloatFrame, wrap_deepmind
    from es_distributed.es import get_ref_batch
    import numpy as np

    is_atari_policy = "NoFrameskip" in env_id

    env = gym.make(env_id)
    if is_atari_policy:
        env = wrap_deepmind(env)

    if record:
        import uuid
        env = wrappers.Monitor(env, '/tmp/' + str(uuid.uuid4()), force=True)

    if extra_kwargs:
        import json
        extra_kwargs = json.loads(extra_kwargs)

    with tf.Session():
        pi = ESAtariPolicy.Load(policy_file, extra_kwargs=extra_kwargs)
        # sample a fixed reference batch of observations from the env for the policy's normalization
        pi.set_ref_batch(get_ref_batch(env, batch_size=128))

        while True:
            if is_atari_policy:
                rews, t, novelty_vector = pi.rollout(env, render=True, random_stream=np.random if stochastic else None)
                # rews is a per-step reward array; the episode return is its sum
                print('return={:.4f} len={}'.format(rews.sum(), t))

            if record:
                env.close()
                return
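
The main() above is clearly written to be driven from a command line. A minimal sketch of such a wrapper using the click package (the wrapper itself is an assumption for illustration, not a confirmed interface of the repository):

import click

@click.command()
@click.argument('env_id')
@click.argument('policy_file')
@click.option('--record', is_flag=True, help='record video under /tmp')
@click.option('--stochastic', is_flag=True, help='sample actions stochastically')
@click.option('--extra_kwargs', default=None, help='JSON dict of extra policy kwargs')
def cli(env_id, policy_file, record, stochastic, extra_kwargs):
    main(env_id, policy_file, record, stochastic, extra_kwargs)

if __name__ == '__main__':
    cli()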
Example 2
def main(env_ids, policy_directory, record, stochastic, extra_kwargs):
    import gym
    from gym import wrappers
    import tensorflow as tf
    from es_distributed.policies import MujocoPolicy, ESAtariPolicy, GAAtariPolicy
    from es_distributed.atari_wrappers import ScaledFloatFrame, wrap_deepmind
    from es_distributed.es import get_ref_batch
    import es_distributed.ns as ns
    import numpy as np
    import os

    env_ids = env_ids.split()

    is_atari_policy = "NoFrameskip" in env_ids[0]

    files = 0

    for policy_name in os.listdir(policy_directory):
        files += 1
        policy_file = "%s/%s" % (policy_directory, policy_name)
        pid = os.fork()
        if pid == 0:
            # child process: build one (optionally deepmind-wrapped) env per id
            envs = []
            for env_id in env_ids:
                e = gym.make(env_id)
                if env_id.endswith('NoFrameskip-v4'):
                    e = wrap_deepmind(e)
                envs.append(e)

            if extra_kwargs:
                import json
                extra_kwargs = json.loads(extra_kwargs)

            with tf.Session():
                if is_atari_policy:
                    pi = GAAtariPolicy.Load(policy_file, extra_kwargs=extra_kwargs)
                    if pi.needs_ref_batch:
                        # some policies normalize against a fixed reference batch of observations
                        pi.set_ref_batch(get_ref_batch(envs[0], batch_size=128))
                else:
                    pi = MujocoPolicy.Load(policy_file, extra_kwargs=extra_kwargs)

                while True:
                    if is_atari_policy:
                        # renders forever; this rollout() variant receives the full list of envs
                        rews, t, novelty_vector = pi.rollout(envs, render=True, random_stream=np.random if stochastic else None)
            # safety net: a child must never fall through into the parent's fork loop
            os._exit(0)

    # reap one child per policy file; since the children render forever,
    # this blocks until they are killed externally
    for _ in range(files):
        os.wait()
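
Example 2 fans out one child process per policy file with os.fork() and reaps them with os.wait(). A self-contained sketch of that pattern, with children that actually terminate (unlike the render-forever loops above; file names are placeholders):

import os

def run_child(task):
    print('child %d handling %s' % (os.getpid(), task))

tasks = ['policy_a.h5', 'policy_b.h5']  # placeholder file names
for task in tasks:
    pid = os.fork()
    if pid == 0:         # child branch
        run_child(task)
        os._exit(0)      # exit the child without touching the parent's loop
for _ in tasks:          # parent branch: reap every child
    os.wait()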
Example 3
def evaluate_policy_on_levels(policy_file, ids, n_rep, record=False):

    import tensorflow as tf
    from es_distributed.policies import MujocoPolicy, ESAtariPolicy, GAAtariPolicy, GAGenesisPolicy
    from es_distributed.atari_wrappers import ScaledFloatFrame, wrap_deepmind
    from es_distributed.es import get_ref_batch
    import numpy as np

    is_atari_policy = True

    all_scores = []
    all_lengths = []
    all_percs = []
    all_rewards = []

    # start from a fresh default graph so repeated calls do not collide
    tf.reset_default_graph()

    with tf.Session():
        # create a baseline policy (BaselinePolicy and make_env below are assumed
        # to be defined elsewhere in this module; they are not imported here)
        if policy_file == 'baseline':
            pi = BaselinePolicy()
        else:
            # load the trained policy just once and reuse it across levels
            pi = GAGenesisPolicy.Load(policy_file)
        # play each env
        for level_id in ids:
            env = make_env(level_id, record=record)

            temp_all_scores = []
            temp_all_lengths = []
            temp_all_percs = []
            temp_all_rewards = []

            for _ in range(n_rep):

                if pi.needs_ref_batch:
                    pi.set_ref_batch(get_ref_batch(env, batch_size=128))
                # play on this env
                rews, t, res_dict = pi.rollout(env, render=False)
                temp_all_lengths.append(t)
                # record goal progress, in-game score, and summed reward for this episode
                temp_all_percs.append(res_dict.get('max_perc', 0))
                temp_all_scores.append(res_dict.get('max_score', 0))
                temp_all_rewards.append(rews.sum())
            env.close()  # close (and flush, when recording) the env before the next level

            print(temp_all_percs)

            all_scores.append(np.mean(temp_all_scores))
            all_rewards.append(np.mean(temp_all_rewards))
            all_lengths.append(np.mean(temp_all_lengths))
            all_percs.append(np.mean(temp_all_percs))

    # per-level means: in-game score, how far Sonic got toward the goal, episode length, and summed reward
    if len(all_scores) == 1:
        return all_scores[0], all_percs[0], all_lengths[0], all_rewards[0]
    else:
        return all_scores, all_percs, all_lengths, all_rewards
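
A hypothetical call to evaluate_policy_on_levels (the policy path and level ids below are placeholders; valid ids depend on how make_env resolves them):

scores, percs, lengths, rewards = evaluate_policy_on_levels(
    policy_file='snapshots/policy_0042.h5',            # placeholder path
    ids=['GreenHillZone.Act1', 'GreenHillZone.Act2'],  # placeholder level ids
    n_rep=5,
)
print('mean in-game score per level:', scores)
print('mean goal progress per level:', percs)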