env = gym.make(params.env_name)
env = Monitor(env, params.video_dir, force=True)
random_process = GaussianNoise(mu=0.0, sigma=0.0)  # no exploration noise at evaluation time
agent = DDPG(Actor, Critic, env.action_space.shape[0], random_process, params)
global_timestep = tf.compat.v1.train.get_or_create_global_step()
all_distances, all_rewards, all_actions = list(), list(), list()
distance_func = get_distance(agent.params.env_name)  # create the distance measure func

print("=== Evaluation Mode ===")

for ep in range(params.n_trial):
    env.record_start()
    obs = env.reset()
    state = obs["flat_obs"]
    done = False
    episode_reward = 0

    while not done:
        action = agent.eval_predict(state)
        # action = env.action_space.sample()
        # scale for execution in env (in DDPG, every action is clipped to [-1, 1] in agent.predict)
        obs, reward, done, info = env.step(action * env.action_space.high)
        # print(action, reward)
        next_flat_state, next_graph_state = obs["flat_obs"], obs["graph_obs"]
        distance = distance_func(action, reward, info)
        all_actions.append(action.mean() ** 2)  # mean squared of the action values
        all_distances.append(distance)
        episode_reward += reward
        state = next_flat_state

    all_rewards.append(episode_reward)
    env.record_end()
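# --- Hypothetical sketch, not part of the project ----------------------------
# GaussianNoise above is assumed to be a thin wrapper around numpy's normal
# sampler used as the DDPG exploration process; with sigma=0.0 it adds no
# noise, which is what we want at evaluation time. A minimal version of that
# idea (class name and interface are assumptions) could look like this:
import numpy as np


class GaussianNoiseSketch:
    """Additive Gaussian action noise; sigma=0.0 disables exploration."""

    def __init__(self, mu=0.0, sigma=0.1):
        self.mu = mu
        self.sigma = sigma

    def sample(self, shape):
        # Draw one noise vector with the same shape as the action.
        return np.random.normal(loc=self.mu, scale=self.sigma, size=shape)


# Example usage at training time:
#   noise = GaussianNoiseSketch(mu=0.0, sigma=0.1)
#   action = agent.predict(state) + noise.sample(env.action_space.shape)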
# run this from the terminal and make sure you are loading appropriate environment variables
# $ echo $LD_LIBRARY_PATH
import gym

from tf_rl.common.monitor import Monitor
import environments.register as register  # registers the custom environments (e.g. CentipedeSix-v1)

video_dir = "./video/"
temp = 5  # record every `temp`-th episode

env = gym.make("CentipedeSix-v1")
env = Monitor(env, video_dir, force=True)

for ep in range(10):
    if ep % temp == 0:
        print("recording")
        env.record_start()

    env.reset()
    done = False
    while not done:
        # env.render()
        action = env.action_space.sample()
        s, r, done, info = env.step(action)  # take a random action

    if ep % temp == 0:
        env.record_end()
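# --- Hypothetical sketch, not the actual tf_rl.common.monitor implementation --
# record_start()/record_end() assume a wrapper that toggles frame capture per
# episode and writes the collected frames out when recording stops. A minimal
# sketch of that idea, assuming the env's render() can return an RGB array:
import os

import gym
import numpy as np


class ToggleRecorder(gym.Wrapper):
    def __init__(self, env, video_dir):
        super().__init__(env)
        self.video_dir = video_dir
        self.recording = False
        self.frames = []
        os.makedirs(video_dir, exist_ok=True)

    def record_start(self):
        self.recording = True
        self.frames = []

    def record_end(self):
        self.recording = False
        # Persist the frames however you like (e.g. encode to mp4); here we
        # simply dump the raw array as a placeholder.
        np.save(os.path.join(self.video_dir, "episode_frames.npy"),
                np.asarray(self.frames))

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        if self.recording:
            self.frames.append(self.env.render(mode="rgb_array"))
        return obs, reward, done, info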
# Invoke the agent
agent = DDPG(GGNN, Critic, node_info, env.action_space.shape[0], params)

"""
=== Training Phase ===
"""
get_ready(agent.params)

global_timestep = tf.compat.v1.train.get_or_create_global_step()
time_buffer = deque(maxlen=agent.params.reward_buffer_ep)
log = logger(agent.params)
action_buffer, distance_buffer, eval_epochs = list(), list(), list()

with summary_writer.as_default():
    # keep all training code inside this context so that summaries are recorded
    with tf.contrib.summary.always_record_summaries():
        for i in itertools.count():
            state = env.reset()
            total_reward = 0
            start = time.time()
            done = False
            episode_len = 0

            while not done:
                # warm-up: act randomly until the agent has collected enough transitions
                if global_timestep.numpy() < agent.params.learning_start:
                    action = env.action_space.sample()
                else:
                    action = agent.predict(state)

                # scale for execution in env (in DDPG, every action is clipped to [-1, 1] in agent.predict)
                next_state, reward, done, info = env.step(action * env.action_space.high)
                replay_buffer.add(state, action, reward, next_state, done)

                """
                === Update the models
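# --- Hypothetical sketch, assumed interface (not the project's buffer) --------
# The replay_buffer used in the loop above only needs an add() matching the
# call signature shown there, plus a sample() for the update step. A minimal
# uniform replay buffer with that interface might look like this:
import random
from collections import deque

import numpy as np


class ReplayBufferSketch:
    def __init__(self, capacity):
        self.storage = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        # Store one transition; old transitions are dropped once at capacity.
        self.storage.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a batch and stack each field into an array.
        batch = random.sample(self.storage, batch_size)
        states, actions, rewards, next_states, dones = map(np.asarray, zip(*batch))
        return states, actions, rewards, next_states, dones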