Example #1
import gym
import tensorflow as tf

from tf_rl.common.monitor import Monitor
# GaussianNoise, DDPG, Actor, Critic, get_distance and params are provided by the surrounding project

env = gym.make(params.env_name)
env = Monitor(env, params.video_dir, force=True)

# sigma=0.0 disables exploration noise, so the agent acts deterministically during evaluation
random_process = GaussianNoise(mu=0.0, sigma=0.0)
agent = DDPG(Actor, Critic, env.action_space.shape[0], random_process, params)

global_timestep = tf.compat.v1.train.get_or_create_global_step()

all_distances, all_rewards, all_actions = list(), list(), list()
distance_func = get_distance(agent.params.env_name)  # create the distance measure function for this environment
print("=== Evaluation Mode ===")
for ep in range(params.n_trial):
    env.record_start()
    obs = env.reset()
    state = obs["flat_obs"]
    done = False
    episode_reward = 0
    while not done:
        action = agent.eval_predict(state)
        # action = env.action_space.sample()

        # scale for execution in the env (in DDPG, every action is clipped to [-1, 1] in agent.predict)
        obs, reward, done, info = env.step(action * env.action_space.high)
        # print(action, reward)
        next_flat_state, next_graph_state = obs["flat_obs"], obs["graph_obs"]
        distance = distance_func(action, reward, info)
        all_actions.append(action.mean()**2)  # mean squared action value
        all_distances.append(distance)
        episode_reward += reward
        state = next_flat_state
    all_rewards.append(episode_reward)
    env.record_end()
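Both examples scale the agent's output with env.action_space.high before stepping the environment, because the DDPG actor emits actions in [-1, 1]. The snippet below is a minimal sketch of that rescaling for a general gym.spaces.Box; the helper name and the Pendulum-v0 environment are illustrative and not part of the project above, and for a symmetric space (low == -high) the formula reduces to the action * env.action_space.high used here.

import numpy as np
import gym

def rescale_action(action, action_space):
    # map an action in [-1, 1] onto the environment's own bounds
    action = np.clip(action, -1.0, 1.0)
    return action_space.low + (action + 1.0) * 0.5 * (action_space.high - action_space.low)

env = gym.make("Pendulum-v0")
print(rescale_action(np.array([0.5]), env.action_space))  # e.g. [1.] for bounds [-2, 2]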
Example #2
# run this from the terminal and make sure you are loading appropriate environment variables
# $ echo $LD_LIBRARY_PATH

import gym
from tf_rl.common.monitor import Monitor
import environments.register as register

video_dir = "./video/"
record_interval = 5  # record a video every 5 episodes

env = gym.make("CentipedeSix-v1")
env = Monitor(env, video_dir, force=True)

for ep in range(10):
    if ep % record_interval == 0:
        print("recording")
        env.record_start()

    env.reset()
    done = False
    while not done:
        # env.render()
        action = env.action_space.sample()
        s, r, done, info = env.step(action)  # take a random action
    if ep % record_interval == 0:
        env.record_end()
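The training loop below stores transitions with replay_buffer.add(state, action, reward, next_state, done); the buffer itself is constructed elsewhere in the project. Here is a minimal uniform-sampling sketch with that interface (the class name and capacity are illustrative, not the project's actual implementation):

import random
from collections import deque

import numpy as np

class ReplayBuffer:
    def __init__(self, capacity=100000):
        self._storage = deque(maxlen=capacity)  # oldest transitions are dropped when full

    def add(self, state, action, reward, next_state, done):
        self._storage.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(list(self._storage), batch_size)
        states, actions, rewards, next_states, dones = map(np.asarray, zip(*batch))
        return states, actions, rewards, next_states, dones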
import itertools
import time
from collections import deque

import tensorflow as tf

# Instantiate the agent (GGNN actor and Critic; node_info, params, get_ready, logger,
# summary_writer and replay_buffer are provided by the surrounding project)
agent = DDPG(GGNN, Critic, node_info, env.action_space.shape[0], params)

""" === Training Phase === """
get_ready(agent.params)

global_timestep = tf.compat.v1.train.get_or_create_global_step()
time_buffer = deque(maxlen=agent.params.reward_buffer_ep)
log = logger(agent.params)
action_buffer, distance_buffer, eval_epochs = list(), list(), list()

with summary_writer.as_default():
    # run all training code inside this context so that TensorBoard summaries are recorded
    with tf.contrib.summary.always_record_summaries():

        for i in itertools.count():
            state = env.reset()
            total_reward = 0
            start = time.time()
            done = False
            episode_len = 0
            while not done:
                if global_timestep.numpy() < agent.params.learning_start:
                    action = env.action_space.sample()
                else:
                    action = agent.predict(state)
                # scale for execution in the env (in DDPG, every action is clipped to [-1, 1] in agent.predict)
                next_state, reward, done, info = env.step(
                    action * env.action_space.high)
                replay_buffer.add(state, action, reward, next_state, done)
                """
                === Update the models