def policyCrossEval(log_dir, task, episode, model_path, num_timesteps=2000, num_cpu=1, seed=0):
    train_args, algo_name, algo_class, srl_model_path, env_kwargs = loadConfigAndSetup(log_dir)
    env_kwargs = EnvsKwargs(task, env_kwargs)

    OK = True
    if not OK:  # no latest model saved yet
        return None, False

    printGreen("Evaluation from the model saved at: {}, with evaluation time steps: {}"
               .format(model_path, num_timesteps))

    log_dir, environment, algo_args = createEnv(log_dir, train_args, algo_name, algo_class,
                                                env_kwargs, num_cpu=num_cpu, seed=seed)
    reward = policyEval(environment, model_path, log_dir, algo_class, algo_args, num_timesteps, num_cpu)

    # Prepend the episode number to the reward so both can be stored together
    # (needs a little bit more space to store)
    reward = np.append(episode, reward)
    return reward, True
def run(self):
    for step in reversed(range(self.max_steps + 1)):
        max_n_param_sampled = int(math.ceil(self.budget / self.max_iter * self.eta ** step / (step + 1)))
        max_iters = self.max_iter * self.eta ** (-step)
        all_parameters = np.array([self.param_sampler() for _ in range(max_n_param_sampled)])

        for i in range(step + 1):
            printGreen("\npop_itt:{}/{}, itt:{}/{}, pop_size:{}".format(
                self.max_steps - step, self.max_steps + 1, i, step + 1, len(all_parameters)))
            n_param_sampled = int(math.floor(max_n_param_sampled * self.eta ** (-i)))
            num_iters = max_iters * self.eta ** i
            losses = [self.train(params, num_iters, train_id)
                      for train_id, params in enumerate(all_parameters)]

            self.history.extend(zip([(params, num_iters) for params in all_parameters], losses))
            all_parameters = all_parameters[np.argsort(losses)[:int(math.floor(n_param_sampled / self.eta))]]

    return self.history[int(np.argmin([val[1] for val in self.history]))]
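# Hypothetical usage sketch of the successive-halving search above. The class name "Hyperband" and its
# constructor signature are assumptions, not taken from the source; only the attributes actually used by
# run() (budget, max_iter, max_steps, eta, param_sampler, train, history) are required.
#
#   opt = Hyperband(param_sampler=sample_params, train=_train, budget=100, max_iter=100, eta=3)
#   (best_params, best_num_iters), best_loss = opt.run()
#
# run() returns the history entry with the smallest recorded loss, i.e. a ((params, num_iters), loss) tuple.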
def callback(_locals, _globals):
    """
    Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
    :param _locals: (dict)
    :param _globals: (dict)
    """
    global win, win_smooth, win_episodes, n_steps, viz, params_saved, best_mean_reward
    # Create visdom object only if needed
    if viz is None:
        viz = Visdom(port=VISDOM_PORT)

    is_es = registered_rl[ALGO_NAME][1] == AlgoType.EVOLUTION_STRATEGIES

    # Save RL agent parameters
    if not params_saved:
        # Filter locals
        params = filterJSONSerializableObjects(_locals)
        with open(LOG_DIR + "rl_locals.json", "w") as f:
            json.dump(params, f)
        params_saved = True

    # Save the RL model if it has improved
    if (n_steps + 1) % SAVE_INTERVAL == 0:
        # Evaluate network performance
        ok, mean_reward = computeMeanReward(LOG_DIR, N_EPISODES_EVAL, is_es=is_es, return_n_episodes=True)
        if ok:
            # Unpack mean reward and number of episodes
            mean_reward, n_episodes = mean_reward
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(
                best_mean_reward, mean_reward))
        else:
            # Not enough episodes
            mean_reward = -10000
            n_episodes = 0

        # Save best model
        if mean_reward > best_mean_reward and n_episodes >= MIN_EPISODES_BEFORE_SAVE:
            # Try saving the running average (only valid for mlp policy)
            try:
                if 'env' in _locals:
                    _locals['env'].save_running_average(LOG_DIR)
                else:
                    _locals['self'].env.save_running_average(LOG_DIR)
            except AttributeError:
                pass

            best_mean_reward = mean_reward
            printGreen("Saving new best model")
            ALGO.save(LOG_DIR + ALGO_NAME + "_model.pkl", _locals)

    # Plots in visdom
    if viz and (n_steps + 1) % LOG_INTERVAL == 0:
        win = timestepsPlot(viz, win, LOG_DIR, ENV_NAME, ALGO_NAME, bin_size=1, smooth=0,
                            title=PLOT_TITLE, is_es=is_es)
        win_smooth = timestepsPlot(viz, win_smooth, LOG_DIR, ENV_NAME, ALGO_NAME,
                                   title=PLOT_TITLE + " smoothed", is_es=is_es)
        win_episodes = episodePlot(viz, win_episodes, LOG_DIR, ENV_NAME, ALGO_NAME,
                                   window=EPISODE_WINDOW, title=PLOT_TITLE + " [Episodes]", is_es=is_es)
    n_steps += 1
    return True
def loadRunningAverage(envs, load_path_normalise=None):
    if load_path_normalise is not None:
        try:
            printGreen("Loading saved running average")
            envs.load_running_average(load_path_normalise)
            envs.training = False
        except FileNotFoundError:
            envs.training = True
            printYellow("Running Average files not found for VecNormalize, switching to training mode")
    return envs
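# Hypothetical usage sketch. The VecNormalize wrapping below is an assumption (not taken from the source);
# loadRunningAverage only requires an env wrapper exposing load_running_average() and a training flag.
#
#   envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
#   envs = loadRunningAverage(envs, load_path_normalise="logs/my_experiment/")
#
# If the saved statistics are missing, the wrapper simply keeps updating its running average online.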
def load_weight(self):
    """
    Copy the current values of the tensors listed in self.params
    :return: ([np.ndarray]) the pretrained weights
    """
    # Creation of a new variable for the PPO2 class: evaluate every parameter tensor and keep the result
    pretrained_weight = [self.sess.run(var) for var in self.params]
    printGreen("Pretrained weight loaded")
    return pretrained_weight
def loadConfigAndSetup(load_args):
    """
    Get the training config and setup the parameters
    :param load_args: (Arguments)
    :return: (dict, str, str, class, str, dict)
    """
    algo_name = ""
    for algo in list(registered_rl.keys()):
        if algo in load_args.log_dir:
            algo_name = algo
            break
    algo_class, algo_type, _ = registered_rl[algo_name]
    if algo_type == AlgoType.OTHER:
        raise ValueError(algo_name + " is not supported for replay")
    printGreen("\n" + algo_name + "\n")

    load_path = "{}/{}_model.pkl".format(load_args.log_dir, algo_name)

    env_globals = json.load(open(load_args.log_dir + "env_globals.json", 'r'))
    train_args = json.load(open(load_args.log_dir + "args.json", 'r'))

    env_kwargs = {
        "renders": load_args.render,
        "shape_reward": load_args.shape_reward,  # Reward sparse or shaped
        "action_joints": train_args["action_joints"],
        "is_discrete": not train_args["continuous_actions"],
        "random_target": train_args.get('random_target', False),
        "srl_model": train_args["srl_model"]
    }

    # Load it, if it was defined
    if "action_repeat" in env_globals:
        env_kwargs["action_repeat"] = env_globals['action_repeat']

    # Remove up action
    if train_args["env"] == "Kuka2ButtonGymEnv-v0":
        env_kwargs["force_down"] = env_globals.get('force_down', True)
    else:
        env_kwargs["force_down"] = env_globals.get('force_down', False)

    srl_model_path = None
    if train_args["srl_model"] != "raw_pixels":
        train_args["policy"] = "mlp"
        path = env_globals.get('srl_model_path')
        if path is not None:
            env_kwargs["use_srl"] = True
            # Check that the saved srl model exists on the disk
            assert os.path.isfile(env_globals['srl_model_path']), \
                "{} does not exist".format(env_globals['srl_model_path'])
            srl_model_path = env_globals['srl_model_path']
            env_kwargs["srl_model_path"] = srl_model_path

    return train_args, load_path, algo_name, algo_class, srl_model_path, env_kwargs
def comparePlots(path, algo, y_limits, title="Learning Curve", timesteps=False, truncate_x=-1,
                 no_display=False, normalization=False):
    """
    :param path: (str) path to the folder where the plots are stored
    :param algo: (str) name of the RL algorithm (subfolder to look into)
    :param y_limits: ([float]) y-limits for the plot
    :param title: (str) plot title
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param truncate_x: (int) Truncate the experiments after n ticks on the x-axis
    :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot)
    :param normalization: (bool) Normalize the rewards between 0 and 1
    """
    folders = []
    other = []
    legends = []
    for folder in os.listdir(path):
        folders_srl = []
        other_srl = []
        tmp_path = "{}/{}/{}/".format(path, folder, algo)
        legends.append(folder)
        for f in os.listdir(tmp_path):
            paths = "{}/{}/{}/{}/".format(path, folder, algo, f)
            env_globals = json.load(open(paths + "env_globals.json", 'r'))
            train_args = json.load(open(paths + "args.json", 'r'))
            if train_args["shape_reward"] == args.shape_reward:
                folders_srl.append(paths)
            else:
                other_srl.append(paths)
        folders.append(folders_srl)
        other.append(other_srl)

    x_list, y_list = [], []
    for folders_srl in folders:
        printGreen("Folder name {}".format(folders_srl))
        x, y = GatherExperiments(folders_srl, algo, window=40, title=title, min_num_x=-1,
                                 timesteps=timesteps, output_file="")
        print(len(x))
        x_list.append(x)
        y_list.append(y)
    printGreen(np.array(x_list).shape)
    # printGreen('y_list shape {}'.format(np.array(y_list[1]).shape))

    plotGatheredData(x_list, y_list, y_limits, timesteps, title, legends, no_display,
                     truncate_x, normalization)
def plotGatheredData(x_list, y_list, y_limits, timesteps, title, legends, no_display,
                     truncate_x=-1, normalization=False):
    assert len(legends) == len(y_list)
    printGreen("{} Experiments".format(len(y_list)))

    lengths = list(map(len, x_list))
    min_x, max_x = np.min(lengths), np.max(lengths)
    if truncate_x > 0:
        min_x = min(truncate_x, min_x)
    x = np.array(x_list[0][:min_x])

    # Truncate every experiment to the shortest common length (min_x)
    for i in range(len(y_list)):
        y_list[i] = y_list[i][:, :min_x]
    y_list = np.array(y_list)
    # print("Min, Max rewards:", np.min(y_list), np.max(y_list))

    # Normalize the data between 0 and 1.
    if normalization:
        y_limits = [-0.05, 1.05]
        y_list = (y_list - np.min(y_list)) / (np.max(y_list) - np.min(y_list))

    fig = plt.figure(title)
    for i in range(len(y_list)):
        label = legends[i]
        y = y_list[i][:, :min_x]
        print('{}: {} experiments'.format(label, len(y)))

        # Compute mean for different seeds
        m = np.mean(y, axis=0)
        # Compute standard error
        s = np.squeeze(np.asarray(np.std(y, axis=0)))
        n = y.shape[0]
        plt.fill_between(x, m - s / np.sqrt(n), m + s / np.sqrt(n),
                         color=lightcolors[i % len(lightcolors)], alpha=0.5)
        plt.plot(x, m, color=darkcolors[i % len(darkcolors)], label=label, linewidth=2)

    if timesteps:
        formatter = FuncFormatter(millions)
        plt.xlabel('Number of Timesteps')
        fig.axes[0].xaxis.set_major_formatter(formatter)
    else:
        plt.xlabel('Number of Episodes')

    if normalization:
        plt.ylabel('Normalized Rewards')
    else:
        plt.ylabel('Rewards')

    plt.title(title, **fontstyle)
    plt.ylim(y_limits)
    plt.legend(framealpha=0.8, frameon=True, labelspacing=0.01, loc='lower right', fontsize=16)

    if not no_display:
        plt.show()
def _train(params, num_iters=None, train_id=None):
    # Generate a print string
    print_str = "\nID_num={}, "
    format_args = []
    if train_id is None:
        if not hasattr(_train, "current_id"):
            _train.current_id = 0
        train_id = _train.current_id
        _train.current_id += 1
    format_args.append(train_id)

    if num_iters is not None:
        print_str += "Num-timesteps={}, "
        format_args.append(int(max(MIN_ITERATION, num_iters * ITERATION_SCALE)))

    print_str += "Param:"
    printGreen(print_str.format(*format_args))
    pprint.pprint(params)

    # Cleanup old files
    if os.path.exists(args.log_dir):
        shutil.rmtree(args.log_dir)

    # Add the training args that were parsed for the hyperparameter optimizers
    if num_iters is not None:
        loop_args = ['--num-timesteps', str(int(max(MIN_ITERATION, num_iters * ITERATION_SCALE)))]
    else:
        loop_args = ['--num-timesteps', str(int(args.num_timesteps))]

    # Redefine the hyperparameter args for rl_baselines.train
    if len(params) > 0:
        loop_args.append("--hyperparam")
        for param_name, param_val in params.items():
            loop_args.append("{}:{}".format(param_name, param_val))

    # Call the training
    ok = subprocess.call(['python', '-m', 'rl_baselines.train'] + train_args + loop_args, stdout=stdout)
    if ok != 0:
        # Throw the error down to the terminal
        raise ChildProcessError("An error occurred, error code: {}".format(ok))

    # Load the logging of the training, and extract the reward
    folders = glob.glob("{}/{}/{}/{}/*".format(args.log_dir, args.env, args.srl_model, args.algo))
    assert len(folders) != 0, "Error: Could not find generated directory, halting {} search.".format(args.optimizer)
    rewards = []
    for monitor_path in glob.glob(folders[0] + "/*.monitor.csv"):
        rewards.append(np.mean(pd.read_csv(monitor_path, skiprows=1)["r"][-10:]))
    if np.isnan(rewards).any():
        rewards = -np.inf
    print("reward: ", np.mean(rewards))

    # Negative reward, as we are minimizing with hyperparameter search
    return -np.mean(rewards)
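# Hypothetical usage sketch: a hyperparameter optimizer (e.g. the successive-halving run() above) would
# call _train as its loss function. The sampler below is an assumption, not taken from the source.
#
#   def sample_params():
#       return {"learning_rate": 10 ** np.random.uniform(-5, -3), "n_steps": int(np.random.choice([128, 256]))}
#
#   loss = _train(sample_params(), num_iters=10, train_id=0)  # lower is better (negative mean reward)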
def policyCrossEval(log_dir, task, episode, model_path, num_timesteps=2000, num_cpu=1):
    """
    Do a cross evaluation of a given policy on different tasks.
    A version of real-time evaluation, but with some bugs left to fix.

    :param log_dir: (str) log directory of the trained policy
    :param task: (str) task identifier used to build the environment kwargs
    :param episode: (int) episode number of the checkpoint being evaluated
    :param model_path: (str) path to the saved model
    :param num_timesteps: (int) how many timesteps to evaluate the policy
    :param num_cpu: (int) number of processes
    :return: (np.ndarray, bool)
    """
    train_args, algo_name, algo_class, srl_model_path, env_kwargs = loadConfigAndSetup(log_dir)
    env_kwargs = EnvsKwargs(task, env_kwargs)

    OK = True
    if not OK:  # no latest model saved yet
        return None, False

    printGreen("Evaluation from the model saved at: {}, with evaluation time steps: {}"
               .format(model_path, num_timesteps))

    log_dir, environment, algo_args = createEnv(log_dir, train_args, algo_name, algo_class,
                                                env_kwargs, num_cpu=num_cpu)
    reward = policyEval(environment, model_path, log_dir, algo_class, algo_args, num_timesteps, num_cpu)

    # Prepend the episode number to the reward so both can be stored together
    # (needs a little bit more space to store)
    reward = np.append(episode, reward)
    return reward, True
def makeTable(input_dir, rl_algo_name="ppo2", checkpoints=[1e6, 2*1e6, 3*1e6, 4*1e6, 5*1e6],
              episode_len=100, caption="my-caption", filepath=None):
    r"""
    ---------- Latex table example -----------
    \begin{table}[h!]
    \centering
    \begin{tabular}{c|ccc}  % c: center, l: left
    \hline
    0 & 0 & 0 & 1 \\ \hline
    1 & 1 & 1 & 2 \\
    2 & 2 & 2 & 3 \\ \hline
    \end{tabular}
    \caption{}
    \label{tab:my-table}
    \end{table}
    -----------------------------------------
    """
    ext = filepath.split(".")[-1]
    assert ext in ["tex", "md"], "Only support Latex (tex) or Markdown (md) extension"
    if ext == "md":
        raise NotImplementedError

    srl_algo_dirs = glob.glob(os.path.join(input_dir, "*"))  # list of subfolders in input_dir
    table = defaultdict(lambda: [])
    for folder in srl_algo_dirs:
        srl_name = folder.split("/")[-1]
        srl_algo_exps = glob.glob(os.path.join(os.path.join(folder, rl_algo_name, "*")))
        printGreen("Found srl model: {} with {} experiments.".format(srl_name.ljust(20), len(srl_algo_exps)))
        for exp_dir in srl_algo_exps:
            _, rewards_history, total_timesteps = loadEpisodesData(exp_dir)
            mean_rwd = meanEpisodesReward(rewards_history, total_timesteps,
                                          checkpoints=checkpoints, episode_len=episode_len)
            table[srl_name].append(mean_rwd)

    with open(filepath, "w") as file:
        file.writelines("\\begin{table}[h!]\n")
        file.writelines("\\centering\n")
        file.writelines("\\begin{tabular}{c|" + len(checkpoints) * "c" + "}\n")
        file.writelines("\\hline\n")
        x_axis = timesteps2str(checkpoints)
        file.writelines(" & {} \\\\ \\hline \n".format(" & ".join(x_axis)))
        srl_names_list = sorted(list(table.keys()))
        if "ground_truth" in srl_names_list:
            # Put ground truth at the top of the table
            srl_names_list.remove("ground_truth")
            srl_names_list.insert(0, "ground_truth")
        for ind, srl_name in enumerate(srl_names_list):
            res = results2str_latex(table[srl_name])
            if ind == len(table) - 1 or srl_name == "ground_truth":
                file.writelines("{} & {} \\\\ \\hline \n".format(processStrLatex(srl_name), " & ".join(res)))
            else:
                file.writelines("{} & {} \\\\ \n".format(processStrLatex(srl_name), " & ".join(res)))
        file.writelines("\\end{tabular}\n")
        file.writelines("\\caption{{{}}}\n".format(caption))
        file.writelines("\\end{table}\n")

    return table
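# Hypothetical usage sketch. The directory layout below is an assumption: input_dir contains one subfolder
# per SRL model, each holding <rl_algo_name>/<experiment> run folders readable by loadEpisodesData.
#
#   table = makeTable("logs/OmnirobotEnv-v0", rl_algo_name="ppo2", checkpoints=[1e6, 2e6],
#                     caption="Mean reward per checkpoint", filepath="results/reward_table.tex")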
def comparePlots(path, plots, y_limits, title="Learning Curve", timesteps=False, truncate_x=-1, no_display=False):
    """
    :param path: (str) path to the folder where the plots are stored
    :param plots: ([str]) List of saved plots as npz file
    :param y_limits: ([float]) y-limits for the plot
    :param title: (str) plot title
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param truncate_x: (int) Truncate the experiments after n ticks on the x-axis
    :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot)
    """
    y_list = []
    x_list = []
    for plot in plots:
        saved_plot = np.load('{}/{}'.format(path, plot))
        x_list.append(saved_plot['x'])
        y_list.append(saved_plot['y'])

    lengths = list(map(len, x_list))
    min_x, max_x = np.min(lengths), np.max(lengths)
    print("Min x: {}".format(min_x))
    print("Max x: {}".format(max_x))

    if truncate_x > 0:
        min_x = min(truncate_x, min_x)
        print("Truncating the x-axis at {}".format(min_x))

    x = np.array(x_list[0][:min_x])
    printGreen("{} Experiments".format(len(y_list)))
    # print("Min, Max rewards:", np.min(y), np.max(y))

    fig = plt.figure(title)
    for i in range(len(y_list)):
        label = plots[i].split('.npz')[0]
        y = y_list[i][:, :min_x]
        print('{}: {} experiments'.format(label, len(y)))

        # Compute mean for different seeds
        m = np.mean(y, axis=0)
        # Compute standard error
        s = np.squeeze(np.asarray(np.std(y, axis=0)))
        n = y.shape[0]
        plt.fill_between(x, m - s / np.sqrt(n), m + s / np.sqrt(n),
                         color=lightcolors[i % len(lightcolors)], alpha=0.5)
        plt.plot(x, m, color=darkcolors[i % len(darkcolors)], label=label, linewidth=2)

    if timesteps:
        formatter = FuncFormatter(millions)
        plt.xlabel('Number of Timesteps', fontsize=20, fontweight='bold')
        fig.axes[0].xaxis.set_major_formatter(formatter)
    else:
        plt.xlabel('Number of Episodes')

    plt.ylabel('Rewards', fontsize=20, fontweight='bold')
    plt.title(title, **fontstyle)
    plt.ylim(y_limits)
    plt.legend(framealpha=0.8, frameon=True, labelspacing=0.01, loc='lower right', fontsize=18)

    if not no_display:
        plt.show()
        index_to_begin = episodes.astype(int).tolist().index(max_eps) + 1
    else:
        task_labels = ['cc', 'sc']
        rewards = {}
        rewards['episode'] = []
        rewards['policy'] = []
        for t in ['cc', 'sc']:
            rewards[t] = []

    for policy_path in policy_paths[index_to_begin:]:
        copyfile(log_dir + '/args.json', policy_path + '/args.json')
        copyfile(log_dir + '/env_globals.json', policy_path + '/env_globals.json')

    printGreen("The evaluation will begin from {}".format(episodes[index_to_begin]))

    last_mean = [250., 1900.]
    run_mean = [0, 0]

    for k in range(index_to_begin, len(episodes), interval_len):
        # if interval_len > 1 and int(episodes[k]) >= episode_schedule:
        #     k += interval_len - 1
        printGreen("Evaluation for episode: {}".format(episodes[k]))
        increase_interval = True
        model_path = policy_paths[k]

        for t, task_label in enumerate(["-sc", "-cc"]):
            local_reward = [int(episodes[k])]
def loadConfigAndSetup(load_args):
    """
    Get the training config and setup the parameters
    :param load_args: (Arguments)
    :return: (dict, str, str, class, str, dict)
    """
    algo_name = ""
    for algo in list(registered_rl.keys()):
        if algo in load_args.log_dir:
            algo_name = algo
            break
    algo_class, algo_type, _ = registered_rl[algo_name]
    if algo_type == AlgoType.OTHER:
        raise ValueError(algo_name + " is not supported for replay")
    printGreen("\n" + algo_name + "\n")

    try:
        # If args contains episode information, this is for student evaluation (distillation)
        if not load_args.episode == -1:
            load_path = "{}/{}_{}_model.pkl".format(load_args.log_dir, algo_name, load_args.episode)
        else:
            load_path = "{}/{}_model.pkl".format(load_args.log_dir, algo_name)
    except AttributeError:
        printYellow("No episode or checkpoint specified, go for the default policy model: {}_model.pkl"
                    .format(algo_name))
        if load_args.log_dir[-3:] != 'pkl':
            load_path = "{}/{}_model.pkl".format(load_args.log_dir, algo_name)
        else:
            load_path = load_args.log_dir
            load_args.log_dir = os.path.dirname(load_path) + '/'

    env_globals = json.load(open(load_args.log_dir + "env_globals.json", 'r'))
    train_args = json.load(open(load_args.log_dir + "args.json", 'r'))

    env_kwargs = {
        "renders": load_args.render,
        "shape_reward": load_args.shape_reward,  # Reward sparse or shaped
        "action_joints": train_args["action_joints"],
        "is_discrete": not train_args["continuous_actions"],
        "random_target": train_args.get('random_target', False),
        "srl_model": train_args["srl_model"]
    }

    # Load it, if it was defined
    if "action_repeat" in env_globals:
        env_kwargs["action_repeat"] = env_globals['action_repeat']

    # Remove up action
    if train_args["env"] == "Kuka2ButtonGymEnv-v0":
        env_kwargs["force_down"] = env_globals.get('force_down', True)
    else:
        env_kwargs["force_down"] = env_globals.get('force_down', False)

    if train_args["env"] == "OmnirobotEnv-v0":
        env_kwargs["simple_continual_target"] = env_globals.get("simple_continual_target", False)
        env_kwargs["circular_continual_move"] = env_globals.get("circular_continual_move", False)
        env_kwargs["square_continual_move"] = env_globals.get("square_continual_move", False)
        env_kwargs["eight_continual_move"] = env_globals.get("eight_continual_move", False)

    # If overriding the environment for specific Continual Learning tasks
    if sum([load_args.simple_continual, load_args.circular_continual, load_args.square_continual]) >= 1:
        env_kwargs["simple_continual_target"] = load_args.simple_continual
        env_kwargs["circular_continual_move"] = load_args.circular_continual
        env_kwargs["square_continual_move"] = load_args.square_continual
        env_kwargs["random_target"] = not (load_args.circular_continual or load_args.square_continual)

    srl_model_path = None
    if train_args["srl_model"] != "raw_pixels":
        train_args["policy"] = "mlp"
        path = env_globals.get('srl_model_path')
        if path is not None:
            env_kwargs["use_srl"] = True
            # Check that the saved srl model exists on the disk
            assert os.path.isfile(env_globals['srl_model_path']), \
                "{} does not exist".format(env_globals['srl_model_path'])
            srl_model_path = env_globals['srl_model_path']
            env_kwargs["srl_model_path"] = srl_model_path

    return train_args, load_path, algo_name, algo_class, srl_model_path, env_kwargs
def main():
    # Global variables for callback
    global ENV_NAME, ALGO, ALGO_NAME, LOG_INTERVAL, VISDOM_PORT, viz
    global SAVE_INTERVAL, EPISODE_WINDOW, MIN_EPISODES_BEFORE_SAVE

    parser = argparse.ArgumentParser(description="Train script for RL algorithms")
    parser.add_argument('--algo', default='ppo2', choices=list(registered_rl.keys()), help='RL algo to use', type=str)
    parser.add_argument('--env', type=str, help='environment ID', default='KukaButtonGymEnv-v0',
                        choices=list(registered_env.keys()))
    parser.add_argument('--seed', type=int, default=0, help='random seed (default: 0)')
    parser.add_argument('--episode-window', type=int, default=40,
                        help='Episode window for moving average plot (default: 40)')
    parser.add_argument('--log-dir', default='/tmp/gym/', type=str,
                        help='directory to save agent logs and model (default: /tmp/gym)')
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--srl-model', type=str, default='raw_pixels', choices=list(registered_srl.keys()),
                        help='SRL model to use')
    parser.add_argument('--num-stack', type=int, default=1, help='number of frames to stack (default: 1)')
    parser.add_argument('--action-repeat', type=int, default=1,
                        help='number of times an action will be repeated (default: 1)')
    parser.add_argument('--port', type=int, default=8097, help='visdom server port (default: 8097)')
    parser.add_argument('--no-vis', action='store_true', default=False, help='disables visdom visualization')
    parser.add_argument('--shape-reward', action='store_true', default=False,
                        help='Shape the reward (reward = - distance) instead of a sparse reward')
    parser.add_argument('-c', '--continuous-actions', action='store_true', default=False)
    parser.add_argument('-joints', '--action-joints', action='store_true', default=False,
                        help='set actions to the joints of the arm directly, instead of inverse kinematics')
    parser.add_argument('-r', '--random-target', action='store_true', default=False,
                        help='Set the button to a random position')
    parser.add_argument('--srl-config-file', type=str, default="config/srl_models.yaml",
                        help='Set the location of the SRL model path configuration.')
    parser.add_argument('--hyperparam', type=str, nargs='+', default=[])
    parser.add_argument('--min-episodes-save', type=int, default=100,
                        help="Min number of episodes before saving best model")
    parser.add_argument('--latest', action='store_true', default=False,
                        help='load the latest learned model (location: srl_zoo/logs/DatasetName/)')
    parser.add_argument('--load-rl-model-path', type=str, default=None,
                        help="load the trained RL model, should be with the same algorithm type")
    parser.add_argument('-sc', '--simple-continual', action='store_true', default=False,
                        help='Simple red square target for task 1 of continual learning scenario. ' +
                             'The task is: robot should reach the target.')
    parser.add_argument('-cc', '--circular-continual', action='store_true', default=False,
                        help='Blue square target for task 2 of continual learning scenario. ' +
                             'The task is: robot should turn in circle around the target.')
    parser.add_argument('-sqc', '--square-continual', action='store_true', default=False,
                        help='Green square target for task 3 of continual learning scenario. ' +
                             'The task is: robot should turn in square around the target.')
    parser.add_argument('-ec', '--eight-continual', action='store_true', default=False,
                        help='Green square target for task 4 of continual learning scenario. ' +
                             'The task is: robot should do the eight with the target as center of the shape.')
    parser.add_argument('--teacher-data-folder', type=str, default="",
                        help='Dataset folder of the teacher(s) policy(ies)', required=False)
    parser.add_argument('--epochs-distillation', type=int, default=30, metavar='N',
                        help='number of epochs to train for distillation (default: 30)')
    parser.add_argument('--distillation-training-set-size', type=int, default=-1,
                        help='Limit size (number of samples) of the training set (default: -1)')
    parser.add_argument('--perform-cross-evaluation-cc', action='store_true', default=False,
                        help='A cross evaluation from the latest stored model to all tasks')
    parser.add_argument('--eval-episode-window', type=int, default=400, metavar='N',
                        help='Episode window for saving each policy checkpoint for future distillation (default: 400)')
    parser.add_argument('--new-lr', type=float, default=1.e-4,
                        help="New learning rate ratio to train a pretrained agent")
    parser.add_argument('--img-shape', type=str, default="(3,64,64)", help="Image shape of environment.")
    parser.add_argument("--gpu-num", help="Choose the number of GPU (CUDA_VISIBLE_DEVICES).", type=str,
                        default="1", choices=["0", "1", "2", "3", "5", "6", "7", "8"])
    parser.add_argument("--srl-model-path", help="SRL model weights path", type=str, default=None)
    parser.add_argument("--relative-pos", action='store_true', default=False,
                        help="For 'ground_truth': use relative position or not.")

    # Ignore unknown args for now
    args, unknown = parser.parse_known_args()
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_num
    env_kwargs = {}

    if args.img_shape is None:
        img_shape = None  # (3, 224, 224)
    else:
        img_shape = tuple(map(int, args.img_shape[1:-1].split(",")))
    env_kwargs['img_shape'] = img_shape

    # Load the SRL model list
    assert os.path.exists(args.srl_config_file), \
        "Error: cannot load \"--srl-config-file {}\", file not found!".format(args.srl_config_file)
    with open(args.srl_config_file, 'rb') as f:
        all_models = yaml.load(f)

    # Sanity checks
    assert args.episode_window >= 1, "Error: --episode-window cannot be less than 1"
    assert args.num_timesteps >= 1, "Error: --num-timesteps cannot be less than 1"
    assert args.num_stack >= 1, "Error: --num-stack cannot be less than 1"
    assert args.action_repeat >= 1, "Error: --action-repeat cannot be less than 1"
    assert 0 <= args.port < 65535, "Error: invalid visdom port number {}, ".format(args.port) + \
        "port number must be an unsigned 16bit number [0,65535]."
    assert registered_srl[args.srl_model][0] == SRLType.ENVIRONMENT or args.env in all_models, \
        "Error: the environment {} has no srl_model defined in 'srl_models.yaml'. Cannot continue.".format(args.env)

    # Check that the SRL model can be run on the environment
    if registered_srl[args.srl_model][1] is not None:
        found = False
        for compatible_class in registered_srl[args.srl_model][1]:
            if issubclass(compatible_class, registered_env[args.env][0]):
                found = True
                break
        assert found, "Error: srl_model {}, is not compatible with the {} environment.".format(
            args.srl_model, args.env)

    assert not (sum([args.simple_continual, args.circular_continual, args.square_continual,
                     args.eight_continual]) > 1 and args.env == "OmnirobotEnv-v0"), \
        "For continual SRL and RL, please provide only one scenario at a time and use the OmnirobotEnv-v0 environment!"
    assert not (args.algo == "distillation" and (args.teacher_data_folder == '' or args.continuous_actions is True)), \
        "For performing policy distillation, make sure you specify a valid teacher dataset and discrete actions!"

    ENV_NAME = args.env
    ALGO_NAME = args.algo
    VISDOM_PORT = args.port
    EPISODE_WINDOW = args.episode_window
    MIN_EPISODES_BEFORE_SAVE = args.min_episodes_save
    CROSS_EVAL = args.perform_cross_evaluation_cc
    EPISODE_WINDOW_DISTILLATION_WIN = args.eval_episode_window
    NEW_LR = args.new_lr
    print("EPISODE_WINDOW_DISTILLATION_WIN: ", EPISODE_WINDOW_DISTILLATION_WIN)

    if args.no_vis:
        viz = False

    algo_class, algo_type, action_type = registered_rl[args.algo]
    algo = algo_class()
    ALGO = algo

    # If callback frequency needs to be changed
    LOG_INTERVAL = algo.LOG_INTERVAL
    SAVE_INTERVAL = algo.SAVE_INTERVAL

    if not args.continuous_actions and ActionType.DISCRETE not in action_type:
        raise ValueError(args.algo + " does not support discrete actions, please use the '--continuous-actions' " +
                         "(or '-c') flag.")
    if args.continuous_actions and ActionType.CONTINUOUS not in action_type:
        raise ValueError(args.algo + " does not support continuous actions, please remove the '--continuous-actions' " +
                         "(or '-c') flag.")

    env_kwargs["is_discrete"] = not args.continuous_actions

    printGreen("\nAgent = {} \n".format(args.algo))

    env_kwargs["action_repeat"] = args.action_repeat
    # Random init position for button
    env_kwargs["random_target"] = args.random_target
    # In the simple continual scenario, the target should be initialized randomly.
    if args.simple_continual is True:
        env_kwargs["random_target"] = True

    # Allow up action
    # env_kwargs["force_down"] = False

    # Allow multi-view
    env_kwargs['multi_view'] = args.srl_model == "multi_view_srl"
    parser = algo.customArguments(parser)
    args = parser.parse_args()

    args, env_kwargs = configureEnvAndLogFolder(args, env_kwargs, all_models)
    args_dict = filterJSONSerializableObjects(vars(args))
    # Save args
    with open(LOG_DIR + "args.json", "w") as f:
        json.dump(args_dict, f)

    env_class = registered_env[args.env][0]
    # Env default kwargs
    default_env_kwargs = {k: v.default
                          for k, v in inspect.signature(env_class.__init__).parameters.items()
                          if v is not None}

    globals_env_param = sys.modules[env_class.__module__].getGlobals()
    # HACK: reset the image shape in the environment globals
    globals_env_param['RENDER_HEIGHT'] = img_shape[1]
    globals_env_param['RENDER_WIDTH'] = img_shape[2]
    globals_env_param['RELATIVE_POS'] = args.relative_pos

    super_class = registered_env[args.env][1]
    # Recursive search through all the super classes of the asked environment, in order to get all the arguments.
    rec_super_class_lookup = {dict_class: dict_super_class
                              for _, (dict_class, dict_super_class, _, _) in registered_env.items()}
    while super_class != SRLGymEnv:
        assert super_class in rec_super_class_lookup, "Error: could not find super class of {}".format(super_class) + \
            ", are you sure \"registered_env\" is correctly defined?"
        super_env_kwargs = {k: v.default
                            for k, v in inspect.signature(super_class.__init__).parameters.items()
                            if v is not None}
        default_env_kwargs = {**super_env_kwargs, **default_env_kwargs}
        globals_env_param = {**sys.modules[super_class.__module__].getGlobals(), **globals_env_param}
        super_class = rec_super_class_lookup[super_class]

    # Print variables
    printYellow("Arguments:")
    pprint(args_dict)
    printYellow("Env Globals:")
    pprint(filterJSONSerializableObjects({**globals_env_param, **default_env_kwargs, **env_kwargs}))
    # Save env params
    saveEnvParams(globals_env_param, {**default_env_kwargs, **env_kwargs})
    # Seed tensorflow, python and numpy random generators
    set_global_seeds(args.seed)
    # Augment the number of timesteps (when using multiprocessing this number is not always reached)
    args.num_timesteps = int(1.1 * args.num_timesteps)

    # Get the hyperparameters, if given (Hyperband)
    hyperparams = {param.split(":")[0]: param.split(":")[1] for param in args.hyperparam}
    hyperparams = algo.parserHyperParam(hyperparams)

    if args.load_rl_model_path is not None:
        # Use a small learning rate when fine-tuning a pretrained agent
        print("use a small learning rate: {:f}".format(1.0e-4))
        hyperparams["learning_rate"] = lambda f: f * 1.0e-4

    # Train the agent
    if args.load_rl_model_path is not None:
        algo.setLoadPath(args.load_rl_model_path)
    algo.train(args, callback, env_kwargs=env_kwargs, train_kwargs=hyperparams)
def main():
    parser = argparse.ArgumentParser(description="OpenAI RL Baselines Benchmark",
                                     epilog='After the arguments are parsed, the rest are assumed to be arguments for' +
                                            ' rl_baselines.train')
    parser.add_argument('--algo', type=str, default='ppo2', help='OpenAI baseline to use',
                        choices=list(registered_rl.keys()))
    parser.add_argument('--env', type=str, nargs='+', default=["KukaButtonGymEnv-v0"], help='environment ID(s)',
                        choices=list(registered_env.keys()))
    parser.add_argument('--srl-model', type=str, nargs='+', default=["raw_pixels"], help='SRL model(s) to use',
                        choices=list(registered_srl.keys()))
    parser.add_argument('--num-timesteps', type=int, default=1e6, help='number of timesteps the baseline should run')
    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Display baseline STDOUT')
    parser.add_argument('--num-iteration', type=int, default=15,
                        help='number of times each algorithm should be run for each unique combination of environment '
                             'and srl-model.')
    parser.add_argument('--seed', type=int, default=0,
                        help='initial seed for each unique combination of environment and srl-model.')
    parser.add_argument('--srl-config-file', type=str, default="config/srl_models.yaml",
                        help='Set the location of the SRL model path configuration.')

    # Returns the parsed arguments; the rest are assumed to be arguments for rl_baselines.train
    args, train_args = parser.parse_known_args()

    # Sanity checks
    assert args.num_timesteps >= 1, "Error: --num-timesteps cannot be less than 1"
    assert args.num_iteration >= 1, "Error: --num-iteration cannot be less than 1"

    # Remove duplicates and sort
    srl_models = list(set(args.srl_model))
    envs = list(set(args.env))
    srl_models.sort()
    envs.sort()

    # Load the SRL model list
    assert os.path.exists(args.srl_config_file), \
        "Error: cannot load \"--srl-config-file {}\", file not found!".format(args.srl_config_file)
    with open(args.srl_config_file, 'rb') as f:
        all_models = yaml.load(f)

    # Check the definition and presence of all requested srl_models
    valid = True
    for env in envs:
        # Validate the env definition
        if env not in all_models:
            printRed("Error: 'srl_models.yaml' missing definition for environment {}".format(env))
            valid = False
            continue  # skip to the next env, this one is not valid

        # Check log_folder for current env
        missing_log = "log_folder" not in all_models[env]
        if missing_log:
            printRed("Error: 'srl_models.yaml' missing definition for log_folder in environment {}".format(env))
            valid = False

        # Validate each model for the current env definition
        for model in srl_models:
            if registered_srl[model][0] == SRLType.ENVIRONMENT:
                continue  # not an srl model, skip to the next model
            elif model not in all_models[env]:
                printRed("Error: 'srl_models.yaml' missing srl_model {} for environment {}".format(model, env))
                valid = False
            elif (not missing_log) and \
                    (not os.path.exists(all_models[env]["log_folder"] + all_models[env][model])):
                # Check the presence of the srl_model path, if and only if log_folder exists
                printRed("Error: srl_model {} for environment {} was defined in ".format(model, env) +
                         "'srl_models.yaml', however the file {} it was targeting does not exist.".format(
                             all_models[env]["log_folder"] + all_models[env][model]))
                valid = False

    assert valid, "Errors occurred due to malformed 'srl_models.yaml', cannot continue."

    # Check that all the SRL models can be run on all the environments
    valid = True
    for env in envs:
        for model in srl_models:
            if registered_srl[model][1] is not None:
                found = False
                for compatible_class in registered_srl[model][1]:
                    if issubclass(compatible_class, registered_env[env][0]):
                        found = True
                        break
                if not found:
                    valid = False
                    printRed("Error: srl_model {}, is not compatible with the {} environment.".format(model, env))
    assert valid, "Errors occurred due to an incompatible combination of srl_model and environment, cannot continue."

    # The seeds used in training the baseline.
    seeds = list(np.arange(args.num_iteration) + args.seed)

    if args.verbose:
        # None here means stdout of terminal for subprocess.call
        stdout = None
    else:
        stdout = open(os.devnull, 'w')

    printGreen("\nRunning {} benchmarks {} times...".format(args.algo, args.num_iteration))
    print("\nSRL-Models:\t{}".format(srl_models))
    print("environments:\t{}".format(envs))
    print("verbose:\t{}".format(args.verbose))
    print("timesteps:\t{}".format(args.num_timesteps))

    for model in srl_models:
        for env in envs:
            for i in range(args.num_iteration):
                printGreen("\nIteration_num={} (seed: {}), Environment='{}', SRL-Model='{}'".format(
                    i, seeds[i], env, model))

                # Redefine the parsed args for rl_baselines.train
                loop_args = ['--srl-model', model, '--seed', str(seeds[i]), '--algo', args.algo, '--env', env,
                             '--num-timesteps', str(int(args.num_timesteps)),
                             '--srl-config-file', args.srl_config_file]
                ok = subprocess.call(['python', '-m', 'rl_baselines.train'] + train_args + loop_args, stdout=stdout)
                if ok != 0:
                    # Throw the error down to the terminal
                    raise ChildProcessError("An error occurred, error code: {}".format(ok))
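# Hypothetical invocation sketch for the benchmark main() above. The module name "rl_baselines.pipeline"
# is an assumption (run whichever file defines this main()); the flags match the parser declared above.
#
#   python -m rl_baselines.pipeline --algo ppo2 --env OmnirobotEnv-v0 \
#       --srl-model ground_truth raw_pixels --num-timesteps 1000000 --num-iteration 3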
def compute_fisher(self, num_timesteps, runner):
    """
    Get the diagonal of the accumulated Fisher information matrix
    :param num_timesteps: (int) timesteps for the sampling
    :param runner: the runner used to collect samples
    :return:
    """
    num_samples = num_timesteps // self.n_batch
    # Creation of a new variable for the PPO2 class
    self.Fisher_accum = [np.zeros_like(var) for var in self.pretrained_weight]
    F_prev = deepcopy(self.Fisher_accum)
    mean_diffs = np.zeros(0)

    for iteration in range(1, num_samples + 1):
        obs, returns, masks, actions, values, neglogpacs, states, ep_infos, true_reward = runner.run()

        # Randomly sample from the action, value and q-value
        step_ind = np.random.randint(self.n_steps)
        action_ind = tf.to_int32(tf.random.categorical(tf.log(self.train_model.policy_proba), 1))[:, 0]
        n_action = self.train_model.policy_proba.shape[1]
        action_mask = tf.one_hot(action_ind, depth=n_action, dtype=tf.bool, on_value=True, off_value=False)
        action_prob = tf.boolean_mask(self.train_model.policy_proba, action_mask)
        q_value = tf.boolean_mask(self.train_model.q_value, action_mask)

        # Compute the accumulated Fisher information
        for v in range(len(self.params)):
            # First-order derivative of the action probability w.r.t. the parameters (weight matrix)
            obs_sample = obs[step_ind:step_ind + 1]
            grad_action, grad_value, grad_q = self.sess.run(
                [tf.gradients(action_prob, self.params[v], unconnected_gradients='zero')[0],
                 tf.gradients(self.train_model._value, self.params[v], unconnected_gradients='zero')[0],
                 tf.gradients(q_value, self.params[v], unconnected_gradients='zero')[0]],
                feed_dict={self.train_model.obs_ph: obs_sample, self.params[v]: self.pretrained_weight[v]})
            # Add penalization only on the action space, or do the regularization on all outputs
            # if len(np.unique(grad_action)) > 1:
            self.Fisher_accum[v] += np.square(grad_action + grad_value + grad_q)

        # Monitor the convergence of the running Fisher estimate
        if iteration % (num_samples // 10) == 0:
            F_diff = 0
            Fisher_total = 0
            for v in range(len(self.Fisher_accum)):
                F_diff += np.sum(np.absolute(self.Fisher_accum[v] / (iteration + 1) - F_prev[v]))
                Fisher_total += np.sum(np.absolute(self.Fisher_accum[v] / (iteration + 1)))
            mean_diff = np.mean(F_diff)
            mean_diffs = np.append(mean_diffs, mean_diff)
            for v in range(len(self.Fisher_accum)):
                F_prev[v] = self.Fisher_accum[v] / (iteration + 1)
            printGreen("At iteration: {}, the new added information difference {}, total Fisher value {}"
                       .format(iteration, F_diff, Fisher_total))

    printGreen("Fisher information computation complete")
    for v in range(len(self.Fisher_accum)):
        self.Fisher_accum[v] /= num_samples
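# Hypothetical sketch of how the accumulated Fisher diagonal is typically used: as an Elastic Weight
# Consolidation (EWC) penalty added to the training loss. The lambda value and the loss-assembly code
# below are assumptions, not taken from the source.
#
#   ewc_lambda = 100.0
#   ewc_penalty = tf.add_n([
#       tf.reduce_sum(tf.constant(fisher) * tf.square(param - tf.constant(old_weight)))
#       for fisher, param, old_weight in zip(self.Fisher_accum, self.params, self.pretrained_weight)])
#   loss = loss + (ewc_lambda / 2.0) * ewc_penalty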
def comparePlots(path, algo, y_limits, title="Learning Curve", timesteps=False, truncate_x=-1,
                 no_display=False, normalization=False, figpath=None, exclude_list=None):
    """
    :param path: (str) path to the folder where the plots are stored
    :param algo: (str) name of the RL algorithm (subfolder to look into)
    :param y_limits: ([float]) y-limits for the plot
    :param title: (str) plot title
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param truncate_x: (int) Truncate the experiments after n ticks on the x-axis
    :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot)
    :param normalization: (bool) Normalize the rewards between 0 and 1
    :param figpath: (str) Path where the figure will be saved
    :param exclude_list: ([str]) Experiment folders to exclude from the plot
    """
    if exclude_list is None:
        exclude_list = []
    folders = []
    legends = []
    for folder in os.listdir(path):
        folders_srl = []
        tmp_path = "{}/{}/{}/".format(path, folder, algo)
        # Only keep folders that contain the algo (e.g. ppo2) subfolder and are not in the excluded list
        if os.path.exists(tmp_path) and (folder not in exclude_list):
            printRed(folder)
            legends.append(folder)
            for f in os.listdir(tmp_path):
                paths = "{}/{}/{}/{}/".format(path, folder, algo, f)
                folders_srl.append(paths)
            folders.append(folders_srl)
        else:
            continue

    x_list, y_list = [], []
    exp_name_dict = {}
    for ind, folders_srl in enumerate(folders):
        printGreen("Folder name {}".format(folders_srl))
        x, y = GatherExperiments(folders_srl, algo, window=40, title=title, min_num_x=-1,
                                 timesteps=timesteps, output_file="")
        print(len(x))
        x_list.append(x)
        y_list.append(y)
        # HACK: recover the experiment name from the folder path (ugly and not robust) TODO
        exp_name_dict[ind] = folders_srl[0].split("/")[-4]
    printGreen(np.array(x_list).shape)
    # printGreen('y_list shape {}'.format(np.array(y_list[1]).shape))

    plotGatheredData(x_list, y_list, y_limits, timesteps, title, legends, no_display, truncate_x,
                     normalization, figpath=figpath, exp_name_dict=exp_name_dict)
def plotGatheredData(x_list, y_list, y_limits, timesteps, title, legends, no_display, truncate_x=-1,
                     normalization=False, figpath=None, exp_name_dict=None):
    assert len(legends) == len(y_list)
    printGreen("{} Experiments".format(len(y_list)))

    lengths = list(map(len, x_list))
    min_x, max_x = np.min(lengths), np.max(lengths)
    if truncate_x > 0:
        min_x = min(truncate_x, min_x)
    x = np.array(x_list[0][:min_x])

    # Truncate every experiment to the shortest common length (min_x)
    for i in range(len(y_list)):
        y_list[i] = y_list[i][:, :min_x]
    y_list = np.array(y_list)
    # print("Min, Max rewards:", np.min(y_list), np.max(y_list))

    # Normalize the data between 0 and 1.
    if normalization:
        y_limits = [-0.05, 1.05]
        y_list = (y_list - np.min(y_list)) / (np.max(y_list) - np.min(y_list))

    colormap = plt.cm.tab20.colors
    registered_indexes = [0, 4, 6]
    registered_color = {
        'ground_truth': colormap[4],          # green
        'raw_pixels': colormap[0],            # blue
        'AE_ifr2_spcls_split': colormap[6],   # red
        'supervised': (0.0, 0.0, 0.0)         # black
    }
    # import ipdb; ipdb.set_trace()
    new_colormap = tuple([colormap[k] for k in range(len(colormap)) if k not in registered_indexes])

    fig = plt.figure(title, figsize=(20, 10))
    for i in range(len(y_list)):
        label = legends[i]
        y = y_list[i][:, :min_x]
        print('{}: {} experiments'.format(label, len(y)))

        # Compute mean for different seeds
        m = np.mean(y, axis=0)
        # Compute standard error
        s = np.squeeze(np.asarray(np.std(y, axis=0)))
        n = y.shape[0]

        exp_name = exp_name_dict[i]
        # Get the fixed color if exp_name is registered, otherwise pick a new color
        color = registered_color.get(exp_name, new_colormap[i])
        plt.fill_between(x, m - s / np.sqrt(n), m + s / np.sqrt(n), color=color, alpha=0.3)
        plt.plot(x, m, color=color, label=label, linewidth=2)

    if timesteps:
        formatter = FuncFormatter(millions)
        plt.xlabel('Number of Timesteps')
        fig.axes[0].xaxis.set_major_formatter(formatter)
    else:
        plt.xlabel('Number of Episodes')

    if normalization:
        plt.ylabel('Normalized Rewards')
    else:
        plt.ylabel('Rewards')

    plt.title(title, **fontstyle)
    plt.ylim(y_limits)
    plt.legend(framealpha=0.8, frameon=True, labelspacing=0.01, loc='lower right', fontsize=16)

    if figpath is not None:
        plt.savefig(figpath)
    if not no_display:
        plt.show()
def loadSRLModel(path=None, cuda=False, state_dim=None, env_object=None, img_shape=None):
    """
    Load a trained SRL model, it will try to guess the model type from the path
    :param path: (str) Path to a srl model
    :param cuda: (bool)
    :param state_dim: (int)
    :param env_object: (gym env object)
    :param img_shape: (tuple) shape of the input images
    :return: (srl model)
    """
    model_type, losses, n_actions, model = None, None, None, None

    if path is not None:
        # Get path to the log folder
        log_folder = '/'.join(path.split('/')[:-1]) + '/'
        with open(log_folder + 'exp_config.json', 'r') as f:
            # IMPORTANT: keep the order for the losses
            # so the json is loaded as an OrderedDict
            exp_config = json.load(f, object_pairs_hook=OrderedDict)

        state_dim = exp_config.get('state-dim', None)
        losses = exp_config.get('losses', None)  # None in the case of baseline models (pca)
        n_actions = exp_config.get('n_actions', None)  # None in the case of baseline models (pca)
        model_type = exp_config.get('model-type', None)
        use_multi_view = exp_config.get('multi-view', False)
        inverse_model_type = exp_config.get('inverse-model-type', 'linear')
        num_dataset_episodes = exp_config.get('num_dataset_episodes', 100)
        assert state_dim is not None, \
            "Please make sure you are loading an up to date model with a conform exp_config file."

        split_dimensions = exp_config.get('split-dimensions')
        if isinstance(split_dimensions, OrderedDict):
            n_dims = sum(split_dimensions.values())
            # Combine losses instead of splitting
            if n_dims == 0:
                split_dimensions = None
    else:
        assert env_object is not None or state_dim > 0, \
            "When learning states, state_dim must be > 0. Otherwise, set SRL_MODEL_PATH " \
            "to a srl_model.pth file with learned states."

    if path is not None:
        if 'baselines' in path:
            if 'pca' in path:
                model_type = 'pca'
                model = SRLPCA(state_dim)

    assert model_type is not None or model is not None, \
        "Model type not supported. In order to use loadSRLModel, a path to an SRL model must be given."
    assert not (losses is None and not model_type == 'pca'), \
        "Please make sure you are loading an up to date model with a conform exp_config file."
    assert not (n_actions is None and not (model_type == 'pca')), \
        "Please make sure you are loading an up to date model with a conform exp_config file."

    if model is None:
        if use_multi_view:
            new_img_shape = (6,) + img_shape[1:]
        else:
            new_img_shape = img_shape
        model = SRLNeuralNetwork(state_dim, cuda, img_shape=new_img_shape, model_type=model_type,
                                 n_actions=n_actions, losses=losses, split_dimensions=split_dimensions,
                                 spcls_num_classes=num_dataset_episodes, inverse_model_type=inverse_model_type)

    model_name = model_type
    if 'baselines' not in path:
        model_name += " with " + ", ".join(losses)
    printGreen("\nSRL: Using {} \n".format(model_name))

    if path is not None:
        printYellow("Loading trained model...{}".format(path))
        model.load(path)
    return model
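# Hypothetical usage sketch. The path below is an assumption about the srl_zoo log layout, and getState()
# is assumed to be the inference call exposed by the returned SRL model wrapper.
#
#   srl_model = loadSRLModel(path="srl_zoo/logs/my_dataset/srl_model.pth", cuda=True, img_shape=(3, 64, 64))
#   state = srl_model.getState(observation)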
def prog_mlp_extractor(flat_observations, net_arch, act_fun, dict_res_tensor_ph, n_col=0):
    latent = flat_observations
    policy_only_layers = []  # Layer sizes of the network that only belong to the policy network
    value_only_layers = []   # Layer sizes of the network that only belong to the value network

    # Build the shared part of the network
    for idx, layer in enumerate(net_arch):
        if isinstance(layer, int):  # Check that this is a shared layer
            layer_size = layer
            latent = act_fun(linear(latent, "shared_fc{}".format(idx), layer_size, init_scale=np.sqrt(2)))
        else:
            if 'pi' in layer:
                assert isinstance(layer['pi'], list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                policy_only_layers = layer['pi']
            if 'vf' in layer:
                assert isinstance(layer['vf'], list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                value_only_layers = layer['vf']
            break  # From here on the network splits up in policy and value network

    # Build the non-shared part of the network
    latent_policy = latent
    latent_value = latent
    for idx, (pi_layer_size, vf_layer_size) in enumerate(zip_longest(policy_only_layers, value_only_layers)):
        if pi_layer_size is not None:
            assert isinstance(pi_layer_size, int), "Error: net_arch[-1]['pi'] must only contain integers."
            latent_policy = linear(latent_policy, "pi_fc{}".format(idx), pi_layer_size, init_scale=np.sqrt(2))
            if n_col > 0:
                with tf.variable_scope("pi_res_{}".format(idx), reuse=tf.AUTO_REUSE):
                    print(latent_policy.name)
                    # and "train_model" in latent_policy.name
                    # Lateral connection: add the previous columns' activations through a learned adapter U
                    res_pi_ph = dict_res_tensor_ph[latent_policy.name.split(":")[0]]
                    printGreen(res_pi_ph)
                    res_len = res_pi_ph.shape[1]
                    U = tf.get_variable(name="U{}".format(idx), shape=[res_len, pi_layer_size],
                                        initializer=tf.constant_initializer(1.))
                    latent_policy += tf.matmul(res_pi_ph, U)
            latent_policy = act_fun(latent_policy)

        if vf_layer_size is not None:
            assert isinstance(vf_layer_size, int), "Error: net_arch[-1]['vf'] must only contain integers."
            latent_value = linear(latent_value, "vf_fc{}".format(idx), vf_layer_size, init_scale=np.sqrt(2))
            if n_col > 0:
                with tf.variable_scope("vf_res_{}".format(idx), reuse=tf.AUTO_REUSE):
                    res_vf_ph = dict_res_tensor_ph[latent_value.name.split(":")[0]]
                    res_len = res_vf_ph.shape[1]
                    U = tf.get_variable(name="U{}".format(idx), shape=[res_len, vf_layer_size],
                                        initializer=tf.constant_initializer(1.))
                    latent_value += tf.matmul(res_vf_ph, U)
            latent_value = act_fun(latent_value)

    return latent_policy, latent_value
def loadConfigAndSetup(load_args):
    """
    Get the training config and setup the parameters
    :param load_args: (Arguments)
    :return: (dict, str, str, class, str, dict)
    """
    algo_name = ""
    for algo in list(registered_rl.keys()):
        if algo in load_args.log_dir:
            algo_name = algo
            break
    algo_class, algo_type, _ = registered_rl[algo_name]
    if algo_type == AlgoType.OTHER:
        raise ValueError(algo_name + " is not supported for replay")
    printGreen("\n" + algo_name + "\n")

    load_path = "{}/{}_model.pkl".format(load_args.log_dir, algo_name)

    env_globals = json.load(open(load_args.log_dir + "env_globals.json", 'r'))
    train_args = json.load(open(load_args.log_dir + "args.json", 'r'))

    if train_args.get("img_shape", None) is None:
        img_shape = None  # (3, 224, 224)
    else:
        img_shape = tuple(map(int, train_args.get("img_shape", None)[1:-1].split(",")))

    env_kwargs = {
        "renders": load_args.render,
        "shape_reward": load_args.shape_reward,  # Reward sparse or shaped
        "action_joints": train_args["action_joints"],
        "is_discrete": not train_args["continuous_actions"],
        "random_target": train_args.get('random_target', False),
        "srl_model": train_args["srl_model"],
        "img_shape": img_shape
        # "img_shape": train_args.get("img_shape", None)
    }

    # Load it, if it was defined
    if "action_repeat" in env_globals:
        env_kwargs["action_repeat"] = env_globals['action_repeat']

    # Remove up action
    if train_args["env"] == "Kuka2ButtonGymEnv-v0":
        env_kwargs["force_down"] = env_globals.get('force_down', True)
    else:
        env_kwargs["force_down"] = env_globals.get('force_down', False)

    if train_args["env"] == "OmnirobotEnv-v0":
        env_kwargs["simple_continual_target"] = env_globals.get("simple_continual_target", False)
        env_kwargs["circular_continual_move"] = env_globals.get("circular_continual_move", False)
        env_kwargs["square_continual_move"] = env_globals.get("square_continual_move", False)
        env_kwargs["eight_continual_move"] = env_globals.get("eight_continual_move", False)

    # If overriding the environment for specific Continual Learning tasks
    if sum([load_args.simple_continual, load_args.circular_continual, load_args.square_continual]) >= 1:
        env_kwargs["simple_continual_target"] = load_args.simple_continual
        env_kwargs["circular_continual_move"] = load_args.circular_continual
        env_kwargs["square_continual_move"] = load_args.square_continual
        env_kwargs["random_target"] = not (load_args.circular_continual or load_args.square_continual)

    srl_model_path = None
    if train_args["srl_model"] != "raw_pixels":
        train_args["policy"] = "mlp"
        path = env_globals.get('srl_model_path')
        if path is not None:
            env_kwargs["use_srl"] = True
            # Check that the saved srl model exists on the disk
            assert os.path.isfile(env_globals['srl_model_path']), \
                "{} does not exist".format(env_globals['srl_model_path'])
            srl_model_path = env_globals['srl_model_path']
            env_kwargs["srl_model_path"] = srl_model_path

    return train_args, load_path, algo_name, algo_class, srl_model_path, env_kwargs
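# Hypothetical usage sketch for replaying a trained policy. The argparse.Namespace below is an assumption
# about the fields loadConfigAndSetup actually reads; the log path is illustrative.
#
#   load_args = argparse.Namespace(log_dir="logs/OmnirobotEnv-v0/ground_truth/ppo2/19-05-10_11h30_00/",
#                                  render=True, shape_reward=False, simple_continual=False,
#                                  circular_continual=False, square_continual=False)
#   train_args, load_path, algo_name, algo_class, srl_model_path, env_kwargs = loadConfigAndSetup(load_args)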
def plotGatheredExperiments(folders, algo, y_limits, window=40, title="", min_num_x=-1,
                            timesteps=False, output_file="", no_display=False):
    """
    Compute mean and standard error for several experiments and plot the learning curve
    :param folders: ([str]) Log folders, where the monitor.csv are stored
    :param algo: (str) name of the RL algo
    :param y_limits: ([float]) y-limits for the plot
    :param window: (int) Smoothing window
    :param title: (str) plot title
    :param min_num_x: (int) Minimum number of episode/timesteps to keep an experiment (default: -1, no minimum)
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param output_file: (str) Path to a file where the plot data will be saved
    :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot)
    """
    y_list = []
    x_list = []
    ok = False
    for folder in folders:
        if timesteps:
            x, y = loadData(folder, smooth=1, bin_size=100)
            if x is not None:
                x, y = np.array(x), np.array(y)
        else:
            x, y = loadEpisodesData(folder)

        if x is None or (min_num_x > 0 and y.shape[0] < min_num_x):
            printYellow("Skipping {}".format(folder))
            continue

        if y.shape[0] <= window:
            printYellow("Folder {}".format(folder))
            printYellow("Not enough episodes for current window size = {}".format(window))
            continue
        ok = True

        y = movingAverage(y, window)
        y_list.append(y)
        # Truncate x
        x = x[len(x) - len(y):]
        x_list.append(x)

    if not ok:
        printRed("Not enough data to plot anything with current config. Consider decreasing --min-x")
        return

    lengths = list(map(len, x_list))
    min_x, max_x = np.min(lengths), np.max(lengths)
    print("Min x: {}".format(min_x))
    print("Max x: {}".format(max_x))

    for i in range(len(x_list)):
        x_list[i] = x_list[i][:min_x]
        y_list[i] = y_list[i][:min_x]

    x = np.array(x_list)[0]
    y = np.array(y_list)
    printGreen("{} Experiments".format(y.shape[0]))
    print("Min, Max rewards:", np.min(y), np.max(y))

    fig = plt.figure(title)
    # Compute mean for different seeds
    m = np.mean(y, axis=0)
    # Compute standard error
    s = np.squeeze(np.asarray(np.std(y, axis=0)))
    n = y.shape[0]
    plt.fill_between(x, m - s / np.sqrt(n), m + s / np.sqrt(n), color=lightcolors[0])
    plt.plot(x, m, color=darkcolors[0], label=algo, linewidth=1)

    if timesteps:
        formatter = FuncFormatter(millions)
        plt.xlabel('Number of Timesteps')
        fig.axes[0].xaxis.set_major_formatter(formatter)
    else:
        plt.xlabel('Number of Episodes')
    plt.ylabel('Rewards')

    plt.title(title, **fontstyle)
    plt.ylim(y_limits)
    plt.legend(framealpha=0.5, labelspacing=0.01, loc='lower right', fontsize=16)

    if output_file != "":
        printGreen("Saving aggregated data to {}.npz".format(output_file))
        np.savez(output_file, x=x, y=y)

    if not no_display:
        plt.show()
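# Hypothetical usage sketch. The folder glob below is an assumption about where the monitor.csv files live.
#
#   folders = glob.glob("logs/KukaButtonGymEnv-v0/raw_pixels/ppo2/*")
#   plotGatheredExperiments(folders, "ppo2", y_limits=[-1, 250], window=40,
#                           title="PPO2 on KukaButton", timesteps=True, output_file="ppo2_kuka")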
def main():
    # Global variables for callback
    global ENV_NAME, ALGO, ALGO_NAME, LOG_INTERVAL, VISDOM_PORT, viz
    global SAVE_INTERVAL, EPISODE_WINDOW, MIN_EPISODES_BEFORE_SAVE

    parser = argparse.ArgumentParser(description="Train script for RL algorithms")
    parser.add_argument('--algo', default='ppo2', choices=list(registered_rl.keys()), help='RL algo to use', type=str)
    parser.add_argument('--env', type=str, help='environment ID', default='KukaButtonGymEnv-v0',
                        choices=list(registered_env.keys()))
    parser.add_argument('--seed', type=int, default=0, help='random seed (default: 0)')
    parser.add_argument('--episode_window', type=int, default=40,
                        help='Episode window for moving average plot (default: 40)')
    parser.add_argument('--log-dir', default='/tmp/gym/', type=str,
                        help='directory to save agent logs and model (default: /tmp/gym)')
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--srl-model', type=str, default='raw_pixels', choices=list(registered_srl.keys()),
                        help='SRL model to use')
    parser.add_argument('--num-stack', type=int, default=1, help='number of frames to stack (default: 1)')
    parser.add_argument('--action-repeat', type=int, default=1,
                        help='number of times an action will be repeated (default: 1)')
    parser.add_argument('--port', type=int, default=8097, help='visdom server port (default: 8097)')
    parser.add_argument('--no-vis', action='store_true', default=False, help='disables visdom visualization')
    parser.add_argument('--shape-reward', action='store_true', default=False,
                        help='Shape the reward (reward = - distance) instead of a sparse reward')
    parser.add_argument('-c', '--continuous-actions', action='store_true', default=False)
    parser.add_argument('-joints', '--action-joints', action='store_true', default=False,
                        help='set actions to the joints of the arm directly, instead of inverse kinematics')
    parser.add_argument('-r', '--random-target', action='store_true', default=False,
                        help='Set the button to a random position')
    parser.add_argument('--srl-config-file', type=str, default="config/srl_models.yaml",
                        help='Set the location of the SRL model path configuration.')
    parser.add_argument('--hyperparam', type=str, nargs='+', default=[])
    parser.add_argument('--min-episodes-save', type=int, default=100,
                        help="Min number of episodes before saving best model")
    parser.add_argument('--latest', action='store_true', default=False,
                        help='load the latest learned model (location: srl_zoo/logs/DatasetName/)')
    parser.add_argument('--load-rl-model-path', type=str, default=None,
                        help="load the trained RL model, should be with the same algorithm type")

    # Ignore unknown args for now
    args, unknown = parser.parse_known_args()
    env_kwargs = {}

    # Load the SRL model list
    assert os.path.exists(args.srl_config_file), \
        "Error: cannot load \"--srl-config-file {}\", file not found!".format(args.srl_config_file)
    with open(args.srl_config_file, 'rb') as f:
        all_models = yaml.load(f)

    # Sanity checks
    assert args.episode_window >= 1, "Error: --episode_window cannot be less than 1"
    assert args.num_timesteps >= 1, "Error: --num-timesteps cannot be less than 1"
    assert args.num_stack >= 1, "Error: --num-stack cannot be less than 1"
    assert args.action_repeat >= 1, "Error: --action-repeat cannot be less than 1"
    assert 0 <= args.port < 65535, "Error: invalid visdom port number {}, ".format(args.port) + \
        "port number must be an unsigned 16bit number [0,65535]."
    assert registered_srl[args.srl_model][0] == SRLType.ENVIRONMENT or args.env in all_models, \
        "Error: the environment {} has no srl_model defined in 'srl_models.yaml'. Cannot continue.".format(args.env)

    # Check that the SRL model can be run on the environment
    if registered_srl[args.srl_model][1] is not None:
        found = False
        for compatible_class in registered_srl[args.srl_model][1]:
            if issubclass(compatible_class, registered_env[args.env][0]):
                found = True
                break
        assert found, "Error: srl_model {}, is not compatible with the {} environment.".format(args.srl_model, args.env)

    ENV_NAME = args.env
    ALGO_NAME = args.algo
    VISDOM_PORT = args.port
    EPISODE_WINDOW = args.episode_window
    MIN_EPISODES_BEFORE_SAVE = args.min_episodes_save

    if args.no_vis:
        viz = False

    algo_class, algo_type, action_type = registered_rl[args.algo]
    algo = algo_class()
    ALGO = algo

    # If callback frequency needs to be changed
    LOG_INTERVAL = algo.LOG_INTERVAL
    SAVE_INTERVAL = algo.SAVE_INTERVAL

    if not args.continuous_actions and ActionType.DISCRETE not in action_type:
        raise ValueError(args.algo + " does not support discrete actions, please use the '--continuous-actions' " +
                         "(or '-c') flag.")
    if args.continuous_actions and ActionType.CONTINUOUS not in action_type:
        raise ValueError(args.algo + " does not support continuous actions, please remove the '--continuous-actions' " +
                         "(or '-c') flag.")

    env_kwargs["is_discrete"] = not args.continuous_actions

    printGreen("\nAgent = {} \n".format(args.algo))

    env_kwargs["action_repeat"] = args.action_repeat
    # Random init position for button
    env_kwargs["random_target"] = args.random_target
    # Allow up action
    # env_kwargs["force_down"] = False

    # Allow multi-view
    env_kwargs['multi_view'] = args.srl_model == "multi_view_srl"
    parser = algo.customArguments(parser)
    args = parser.parse_args()

    args, env_kwargs = configureEnvAndLogFolder(args, env_kwargs, all_models)
    args_dict = filterJSONSerializableObjects(vars(args))
    # Save args
    with open(LOG_DIR + "args.json", "w") as f:
        json.dump(args_dict, f)

    env_class = registered_env[args.env][0]
    # Env default kwargs
    default_env_kwargs = {k: v.default
                          for k, v in inspect.signature(env_class.__init__).parameters.items()
                          if v is not None}

    globals_env_param = sys.modules[env_class.__module__].getGlobals()

    super_class = registered_env[args.env][1]
    # Recursive search through all the super classes of the asked environment, in order to get all the arguments.
    rec_super_class_lookup = {dict_class: dict_super_class
                              for _, (dict_class, dict_super_class, _, _) in registered_env.items()}
    while super_class != SRLGymEnv:
        assert super_class in rec_super_class_lookup, "Error: could not find super class of {}".format(super_class) + \
            ", are you sure \"registered_env\" is correctly defined?"
        super_env_kwargs = {k: v.default
                            for k, v in inspect.signature(super_class.__init__).parameters.items()
                            if v is not None}
        default_env_kwargs = {**super_env_kwargs, **default_env_kwargs}
        globals_env_param = {**sys.modules[super_class.__module__].getGlobals(), **globals_env_param}
        super_class = rec_super_class_lookup[super_class]

    # Print variables
    printYellow("Arguments:")
    pprint(args_dict)
    printYellow("Env Globals:")
    pprint(filterJSONSerializableObjects({**globals_env_param, **default_env_kwargs, **env_kwargs}))
    # Save env params
    saveEnvParams(globals_env_param, {**default_env_kwargs, **env_kwargs})
    # Seed tensorflow, python and numpy random generators
    set_global_seeds(args.seed)
    # Augment the number of timesteps (when using multiprocessing this number is not always reached)
    args.num_timesteps = int(1.1 * args.num_timesteps)

    # Get the hyperparameters, if given (Hyperband)
    hyperparams = {param.split(":")[0]: param.split(":")[1] for param in args.hyperparam}
    hyperparams = algo.parserHyperParam(hyperparams)

    if args.load_rl_model_path is not None:
        # Use a small learning rate when fine-tuning a pretrained agent
        print("use a small learning rate: {:f}".format(1.0e-4))
        hyperparams["learning_rate"] = lambda f: f * 1.0e-4

    # Train the agent
    if args.load_rl_model_path is not None:
        algo.setLoadPath(args.load_rl_model_path)
    algo.train(args, callback, env_kwargs=env_kwargs, train_kwargs=hyperparams)