def policyEval(envs, model_path, log_dir, algo_class, algo_args, num_timesteps=251, num_cpu=1):
    """
    Evaluate a policy in the given environments.

    :param envs: (gym.Env or VecEnv) the environment(s) to evaluate the policy on
    :param model_path: (str) path to the policy checkpoint
    :param log_dir: (str) path to the gym log folder (monitor files used to compute the mean reward)
    :param algo_class: the RL algorithm class (must implement load() and getAction())
    :param algo_args: (dict) arguments passed to algo_class.load()
    :param num_timesteps: (int) number of timesteps to evaluate the policy for
    :param num_cpu: (int) number of parallel processes
    :return: (np.ndarray) mean reward per evaluated episode
    """
    tf.reset_default_graph()
    method = algo_class.load(model_path, args=algo_args)

    # detect the custom vec env by checking if the outermost wrapper is WrapFrameStack
    using_custom_vec_env = isinstance(envs, WrapFrameStack)
    obs = envs.reset()
    if using_custom_vec_env:
        obs = obs.reshape((1,) + obs.shape)

    n_done = 0
    last_n_done = 0
    episode_reward = []
    dones = [False for _ in range(num_cpu)]

    for i in range(num_timesteps):
        actions = method.getAction(obs, dones)
        obs, rewards, dones, _ = envs.step(actions)
        if using_custom_vec_env:
            obs = obs.reshape((1,) + obs.shape)

        if using_custom_vec_env:
            # the custom vec env returns a single done flag, so reset manually when the episode ends
            if dones:
                obs = envs.reset()
                obs = obs.reshape((1,) + obs.shape)

        n_done += np.sum(dones)
        if (n_done - last_n_done) > 1:
            last_n_done = n_done
            _, mean_reward = computeMeanReward(log_dir, n_done)
            episode_reward.append(mean_reward)
            printRed('Episode:{} Reward:{}'.format(n_done, mean_reward))

    # final evaluation after the last timestep
    _, mean_reward = computeMeanReward(log_dir, n_done)
    printRed('Episode:{} Reward:{}'.format(n_done, mean_reward))
    episode_reward.append(mean_reward)

    episode_reward = np.array(episode_reward)
    envs.close()
    return episode_reward
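
# Hedged usage sketch, not part of the original project: policyEval() returns an array of
# per-episode mean rewards, and the helper below only illustrates one way to summarize it.
# The function name is hypothetical; nothing beyond numpy (already used as np in this module)
# is assumed.
def summarizePolicyEval(episode_rewards):
    """
    Summarize the reward array returned by policyEval().

    :param episode_rewards: (np.ndarray) mean reward per evaluated episode
    :return: (dict) count, mean, std, min and max of the rewards
    """
    episode_rewards = np.asarray(episode_rewards, dtype=np.float64)
    if episode_rewards.size == 0:
        return {"n": 0, "mean": None, "std": None, "min": None, "max": None}
    return {
        "n": int(episode_rewards.size),
        "mean": float(np.mean(episode_rewards)),
        "std": float(np.std(episode_rewards)),
        "min": float(np.min(episode_rewards)),
        "max": float(np.max(episode_rewards)),
    }
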
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2).

    :param _locals: (dict)
    :param _globals: (dict)
    :return: (bool) True to continue training
    """
    global win, win_smooth, win_episodes, n_steps, viz, params_saved, best_mean_reward
    # Create the Visdom object only if needed
    if viz is None:
        viz = Visdom(port=VISDOM_PORT)

    is_es = registered_rl[ALGO_NAME][1] == AlgoType.EVOLUTION_STRATEGIES

    # Save RL agent parameters
    if not params_saved:
        # Filter locals
        params = filterJSONSerializableObjects(_locals)
        with open(LOG_DIR + "rl_locals.json", "w") as f:
            json.dump(params, f)
        params_saved = True

    # Save the RL model if it has improved
    if (n_steps + 1) % SAVE_INTERVAL == 0:
        # Evaluate network performance
        ok, mean_reward = computeMeanReward(LOG_DIR, N_EPISODES_EVAL, is_es=is_es, return_n_episodes=True)
        if ok:
            # Unpack mean reward and number of episodes
            mean_reward, n_episodes = mean_reward
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(
                best_mean_reward, mean_reward))
        else:
            # Not enough episodes yet
            mean_reward = -10000
            n_episodes = 0

        # Save the best model
        if mean_reward > best_mean_reward and n_episodes >= MIN_EPISODES_BEFORE_SAVE:
            # Try saving the running average (only valid for mlp policies)
            try:
                if 'env' in _locals:
                    _locals['env'].save_running_average(LOG_DIR)
                else:
                    _locals['self'].env.save_running_average(LOG_DIR)
            except AttributeError:
                pass

            best_mean_reward = mean_reward
            printGreen("Saving new best model")
            ALGO.save(LOG_DIR + ALGO_NAME + "_model.pkl", _locals)

    # Plots in Visdom
    if viz and (n_steps + 1) % LOG_INTERVAL == 0:
        win = timestepsPlot(viz, win, LOG_DIR, ENV_NAME, ALGO_NAME, bin_size=1, smooth=0,
                            title=PLOT_TITLE, is_es=is_es)
        win_smooth = timestepsPlot(viz, win_smooth, LOG_DIR, ENV_NAME, ALGO_NAME,
                                   title=PLOT_TITLE + " smoothed", is_es=is_es)
        win_episodes = episodePlot(viz, win_episodes, LOG_DIR, ENV_NAME, ALGO_NAME, window=EPISODE_WINDOW,
                                   title=PLOT_TITLE + " [Episodes]", is_es=is_es)
    n_steps += 1
    return True
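
# Hedged sketch, not part of the original file: callback() depends on module-level state
# (win, win_smooth, win_episodes, n_steps, viz, params_saved, best_mean_reward). The helper
# below shows one possible way to (re)initialize that state before a new training run; the
# function name and the -np.inf default are assumptions, not the project's actual setup code.
def resetCallbackState():
    """Reset the module-level variables used by callback()."""
    global win, win_smooth, win_episodes, n_steps, viz, params_saved, best_mean_reward
    win, win_smooth, win_episodes = None, None, None
    viz = None
    n_steps = 0
    params_saved = False
    best_mean_reward = -np.inf
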
def main():
    load_args = parseArguments()
    train_args, load_path, algo_name, algo_class, srl_model_path, env_kwargs = loadConfigAndSetup(load_args)
    log_dir, envs, algo_args = createEnv(load_args, train_args, algo_name, algo_class, env_kwargs)

    assert (not load_args.plotting and not load_args.action_proba) \
        or load_args.num_cpu == 1, "Error: cannot run plotting with more than 1 CPU"

    tf.reset_default_graph()
    set_global_seeds(load_args.seed)
    # createTensorflowSession()

    printYellow("Compiling Policy function....")
    printYellow(load_path)
    method = algo_class.load(load_path, args=algo_args)

    dones = [False for _ in range(load_args.num_cpu)]
    # HACK: check for the custom vec env by checking if the last wrapper is WrapFrameStack.
    # This is used for detecting algorithms that have a similar wrapping to deepq.
    # It is considered a hack because we are unable to detect if this wrapper was added
    # earlier to the environment object.
    using_custom_vec_env = isinstance(envs, WrapFrameStack)

    obs = envs.reset()
    if using_custom_vec_env:
        obs = obs.reshape((1,) + obs.shape)

    # plotting init
    if load_args.plotting:
        plt.pause(0.1)
        fig = plt.figure()
        old_obs = []
        if registered_env[train_args["env"]][2] == PlottingType.PLOT_3D:
            ax = fig.add_subplot(111, projection='3d')
            line, = ax.plot([], [], [], c=[1, 0, 0, 1], label="episode 0")
            point = ax.scatter([0], [0], [0], c=[1, 0, 0, 1])
            min_zone = [+np.inf, +np.inf, +np.inf]
            max_zone = [-np.inf, -np.inf, -np.inf]
            amplitude = [0, 0, 0]
            min_state_dim = 3
        else:
            ax = fig.add_subplot(111)
            line, = ax.plot([], [], c=[1, 0, 0, 1], label="episode 0")
            point = ax.scatter([0], [0], c=[1, 0, 0, 1])
            min_zone = [+np.inf, +np.inf]
            max_zone = [-np.inf, -np.inf]
            amplitude = [0, 0]
            min_state_dim = 2
        fig.legend()

        if train_args["srl_model"] in ["ground_truth", "supervised"]:
            delta_obs = [envs.get_original_obs()[0]]
        else:
            # we need to rebuild the PCA representation, in order to visualize correctly in 3D
            # load the saved representations (stored next to the SRL model)
            path = "/".join(srl_model_path.split("/")[:-1]) + "/image_to_state.json"
            X = np.array(list(json.load(open(path, 'r')).values()))

            X = fixStateDim(X, min_state_dim=min_state_dim)

            # estimate the PCA
            if registered_env[train_args["env"]][2] == PlottingType.PLOT_3D:
                pca = PCA(n_components=3)
            else:
                pca = PCA(n_components=2)
            pca.fit(X)
            delta_obs = [pca.transform(fixStateDim([obs[0]], min_state_dim=min_state_dim))[0]]
        plt.pause(0.00001)

    # check if the algorithm has a defined getActionProba function before allowing action_proba plotting
    if load_args.action_proba:
        if not hasattr(method, "getActionProba"):
            printYellow("Warning: requested flag --action-proba, "
                        "but the algorithm {} does not implement 'getActionProba'".format(algo_name))
        else:
            fig_prob = plt.figure()
            ax_prob = fig_prob.add_subplot(111)
            old_obs = []
            if train_args["continuous_actions"]:
                ax_prob.set_ylim(np.min(envs.action_space.low), np.max(envs.action_space.high))
                bar = ax_prob.bar(np.arange(np.prod(envs.action_space.shape)),
                                  np.array([0] * np.prod(envs.action_space.shape)),
                                  color=plt.get_cmap('viridis')(int(1 / np.prod(envs.action_space.shape) * 255)))
            else:
                ax_prob.set_ylim(0, 1)
                bar = ax_prob.bar(np.arange(envs.action_space.n), np.array([0] * envs.action_space.n),
                                  color=plt.get_cmap('viridis')(int(1 / envs.action_space.n * 255)))
            plt.pause(1)
            background_prob = fig_prob.canvas.copy_from_bbox(ax_prob.bbox)

    n_done = 0
    last_n_done = 0
    episode = 0
    for i in range(load_args.num_timesteps):
        actions = method.getAction(obs, dones)
        obs, rewards, dones, _ = envs.step(actions)
        if using_custom_vec_env:
            obs = obs.reshape((1,) + obs.shape)

        # plotting
        if load_args.plotting:
            if train_args["srl_model"] in ["ground_truth", "supervised"]:
                adjusted_obs = envs.get_original_obs()[0]
            else:
                adjusted_obs = pca.transform(fixStateDim([obs[0]], min_state_dim=min_state_dim))[0]

            # create a new line, if the episode is finished
            if np.sum(dones) > 0:
                old_obs.append(np.array(delta_obs))
                line.set_c(sns.color_palette()[episode % len(sns.color_palette())])
                episode += 1
                if registered_env[train_args["env"]][2] == PlottingType.PLOT_3D:
                    line, = ax.plot([], [], [], c=[1, 0, 0, 1], label="episode " + str(episode))
                else:
                    line, = ax.plot([], [], c=[1, 0, 0, 1], label="episode " + str(episode))
                fig.legend()
                delta_obs = [adjusted_obs]
            else:
                delta_obs.append(adjusted_obs)

            coor_plt = fixStateDim(np.array(delta_obs), min_state_dim=min_state_dim)[1:]
            unstack_val = coor_plt.shape[1] // train_args.get("num_stack", 1)
            coor_plt = coor_plt[:, -unstack_val:]

            # updating the 3d vertices for the line and the dot drawing, to avoid redrawing the entire image
            if registered_env[train_args["env"]][2] == PlottingType.PLOT_3D:
                line._verts3d = (coor_plt[:, 0], coor_plt[:, 1], coor_plt[:, 2])
                point._offsets3d = (coor_plt[-1:, 0], coor_plt[-1:, 1], coor_plt[-1:, 2])
                if coor_plt.shape[0] > 0:
                    min_zone = np.minimum(np.amin(coor_plt, axis=0), min_zone)
                    max_zone = np.maximum(np.amax(coor_plt, axis=0), max_zone)
                    amplitude = max_zone - min_zone + 1e-10
                ax.set_xlim(min_zone[0] - abs(amplitude[0] * 0.2), max_zone[0] + abs(amplitude[0] * 0.2))
                ax.set_ylim(min_zone[1] - abs(amplitude[1] * 0.2), max_zone[1] + abs(amplitude[1] * 0.2))
                ax.set_zlim(min_zone[2] - abs(amplitude[2] * 0.2), max_zone[2] + abs(amplitude[2] * 0.2))
            else:
                line.set_xdata(coor_plt[:, 0])
                line.set_ydata(coor_plt[:, 1])
                point._offsets = coor_plt[-1:, :]
                if coor_plt.shape[0] > 0:
                    min_zone = np.minimum(np.amin(coor_plt, axis=0), min_zone)
                    max_zone = np.maximum(np.amax(coor_plt, axis=0), max_zone)
                    amplitude = max_zone - min_zone + 1e-10
                ax.set_xlim(min_zone[0] - abs(amplitude[0] * 0.2), max_zone[0] + abs(amplitude[0] * 0.2))
                ax.set_ylim(min_zone[1] - abs(amplitude[1] * 0.2), max_zone[1] + abs(amplitude[1] * 0.2))

            # Draw every 5 frames to avoid UI freezing
            if i % 5 == 0:
                fig.canvas.draw()
                plt.pause(0.000001)

        if load_args.action_proba and hasattr(method, "getActionProba"):
            # When continuous actions are used, we cannot plot the action probability of every action
            # in the action space, so we show the action directly instead
            if train_args["continuous_actions"]:
                pi = method.getAction(obs, dones)
            else:
                pi = method.getActionProba(obs, dones)

            fig_prob.canvas.restore_region(background_prob)
            for act, rect in enumerate(bar):
                if train_args["continuous_actions"]:
                    rect.set_height(pi[0][act])
                    color_val = np.abs(pi[0][act]) / max(np.max(envs.action_space.high),
                                                         np.max(np.abs(envs.action_space.low)))
                else:
                    rect.set_height(softmax(pi[0])[act])
                    color_val = softmax(pi[0])[act]
                rect.set_color(plt.get_cmap('viridis')(int(color_val * 255)))
                ax_prob.draw_artist(rect)
            fig_prob.canvas.blit(ax_prob.bbox)

        if using_custom_vec_env:
            if dones:
                obs = envs.reset()
                obs = obs.reshape((1,) + obs.shape)

        n_done += np.sum(dones)
        if (n_done - last_n_done) > 1:
            last_n_done = n_done
            _, mean_reward = computeMeanReward(log_dir, n_done)
            print("{} episodes - Mean reward: {:.2f}".format(n_done, mean_reward))

    print("print: ", n_done, log_dir)
    _, mean_reward = computeMeanReward(log_dir, n_done)
    print("{} episodes - Mean reward: {:.2f}".format(n_done, mean_reward))