def policyEval(envs, model_path, log_dir, algo_class, algo_args, num_timesteps=251, num_cpu=1):
    """
    Evaluate a policy in the given envs.

    :param envs: (gym.Env) the environment(s) to evaluate the policy on
    :param model_path: (str) path to the policy checkpoint
    :param log_dir: (str) path to a gym temporary log folder
    :param algo_class: (BaseRLObject) class of the RL algorithm to load
    :param algo_args: arguments passed to the algorithm's load function
    :param num_timesteps: (int) number of timesteps to evaluate the policy for
    :param num_cpu: (int) number of parallel environments
    :return: (np.ndarray) mean reward per finished episode
    """
    tf.reset_default_graph()
    method = algo_class.load(model_path, args=algo_args)
    using_custom_vec_env = isinstance(envs, WrapFrameStack)

    obs = envs.reset()
    if using_custom_vec_env:
        obs = obs.reshape((1,) + obs.shape)

    n_done = 0
    last_n_done = 0
    episode_reward = []
    dones = [False for _ in range(num_cpu)]

    for i in range(num_timesteps):
        actions = method.getAction(obs, dones)
        obs, rewards, dones, _ = envs.step(actions)
        if using_custom_vec_env:
            obs = obs.reshape((1,) + obs.shape)
            if dones:
                obs = envs.reset()
                obs = obs.reshape((1,) + obs.shape)

        n_done += np.sum(dones)
        if (n_done - last_n_done) > 1:
            last_n_done = n_done
            _, mean_reward = computeMeanReward(log_dir, n_done)
            episode_reward.append(mean_reward)
            printRed('Episode:{} Reward:{}'.format(n_done, mean_reward))

    # log the final mean reward once more after the evaluation loop ends
    _, mean_reward = computeMeanReward(log_dir, n_done)
    printRed('Episode:{} Reward:{}'.format(n_done, mean_reward))
    episode_reward.append(mean_reward)

    episode_reward = np.array(episode_reward)
    envs.close()
    return episode_reward
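# Usage sketch (hypothetical paths and algo class; the RL wrappers in this repo expose
# .load(model_path, args=...) and .getAction(obs, dones) as used above):
#   algo_class, algo_args = PPO2, {}
#   episode_rewards = policyEval(envs, "logs/ppo2/model.pkl", "/tmp/gym/",
#                                algo_class, algo_args, num_timesteps=251, num_cpu=1)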
                '--num-timesteps', '251', '--log-dir', model_path,
                task_label, "--seed", str(seed_i)]
            ok = subprocess.check_output(command_line_enjoy_student)
            ok = ok.decode('utf-8')
            str_before = "Mean reward: "
            str_after = "\npybullet"
            idx_before = ok.find(str_before) + len(str_before)
            idx_after = ok.find(str_after)
            seed_reward = float(ok[idx_before:idx_after])
            local_reward.append(seed_reward)
            printGreen(local_reward)
            printRed("current rewards {} at episode {} with random seed: {} for task {}"
                     .format(np.mean(seed_reward), episodes[k], seed_i, task_label))
        rewards[task_label[1:]].append(local_reward)
        run_mean[t] = np.mean(local_reward[1:])

    # If one of the two mean rewards varies by more than 1%, do not increase the evaluation interval
    for t in range(len(run_mean)):
        if run_mean[t] > 1.01 * last_mean[t] or run_mean[t] < 0.99 * last_mean[t]:
            increase_interval = False

    printGreen("Reward now: {}, last Rewards: {} for sc and cc respectively".format(run_mean, last_mean))

    # If the mean reward varies slowly, we increase the length of the evaluation interval
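# The stdout scraping above assumes replay.enjoy_baselines prints a line of the form
# "Mean reward: <float>" right before pybullet's banner. A hypothetical helper that
# factors out the same trick:
def parseMeanReward(output, str_before="Mean reward: ", str_after="\npybullet"):
    """Extract the float printed between `str_before` and `str_after` in `output`."""
    idx_before = output.find(str_before) + len(str_before)
    idx_after = output.find(str_after)
    return float(output[idx_before:idx_after])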
                        default=False, help='Plot timesteps instead of episodes')
    parser.add_argument('--no-display', action='store_true', default=False,
                        help='Do not display plot')
    args = parser.parse_args()

    y_limits = args.y_lim
    if y_limits[0] == y_limits[1]:
        if args.shape_reward:
            y_limits = Y_LIM_SHAPED_REWARD
        else:
            y_limits = Y_LIM_SPARSE_REWARD
        print("Using default limits:", y_limits)

    plots = [f for f in os.listdir(args.input_dir) if f.endswith('.npz')]
    plots.sort()

    if len(plots) == 0:
        printRed("No npz files found in {}".format(args.input_dir))
        exit(-1)

    comparePlots(args.input_dir, plots, title=args.title, y_limits=y_limits,
                 no_display=args.no_display, timesteps=args.timesteps,
                 truncate_x=args.truncate_x)
def main(): parser = argparse.ArgumentParser( description="OpenAI RL Baselines Benchmark", epilog= 'After the arguments are parsed, the rest are assumed to be arguments for' + ' rl_baselines.train') parser.add_argument('--algo', type=str, default='ppo2', help='OpenAI baseline to use', choices=list(registered_rl.keys())) parser.add_argument('--env', type=str, nargs='+', default=["KukaButtonGymEnv-v0"], help='environment ID(s)', choices=list(registered_env.keys())) parser.add_argument('--srl-model', type=str, nargs='+', default=["raw_pixels"], help='SRL model(s) to use', choices=list(registered_srl.keys())) parser.add_argument('--num-timesteps', type=int, default=1e6, help='number of timesteps the baseline should run') parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Display baseline STDOUT') parser.add_argument( '--num-iteration', type=int, default=15, help= 'number of time each algorithm should be run for each unique combination of environment ' + ' and srl-model.') parser.add_argument( '--seed', type=int, default=0, help= 'initial seed for each unique combination of environment and srl-model.' ) parser.add_argument( '--srl-config-file', type=str, default="config/srl_models.yaml", help='Set the location of the SRL model path configuration.') # returns the parsed arguments, and the rest are assumed to be arguments for rl_baselines.train args, train_args = parser.parse_known_args() # Sanity check assert args.num_timesteps >= 1, "Error: --num-timesteps cannot be less than 1" assert args.num_iteration >= 1, "Error: --num-iteration cannot be less than 1" # Removing duplicates and sort srl_models = list(set(args.srl_model)) envs = list(set(args.env)) srl_models.sort() envs.sort() # LOAD SRL models list assert os.path.exists(args.srl_config_file), \ "Error: cannot load \"--srl-config-file {}\", file not found!".format(args.srl_config_file) with open(args.srl_config_file, 'rb') as f: all_models = yaml.load(f) # Checking definition and presence of all requested srl_models valid = True for env in envs: # validated the env definition if env not in all_models: printRed( "Error: 'srl_models.yaml' missing definition for environment {}" .format(env)) valid = False continue # skip to the next env, this one is not valid # checking log_folder for current env missing_log = "log_folder" not in all_models[env] if missing_log: printRed( "Error: 'srl_models.yaml' missing definition for log_folder in environment {}" .format(env)) valid = False # validate each model for the current env definition for model in srl_models: if registered_srl[model][0] == SRLType.ENVIRONMENT: continue # not an srl model, skip to the next model elif model not in all_models[env]: printRed( "Error: 'srl_models.yaml' missing srl_model {} for environment {}" .format(model, env)) valid = False elif (not missing_log) and ( not os.path.exists(all_models[env]["log_folder"] + all_models[env][model])): # checking presence of srl_model path, if and only if log_folder exists printRed( "Error: srl_model {} for environment {} was defined in ". format(model, env) + "'srl_models.yaml', however the file {} it was tagetting does not exist." .format(all_models[env]["log_folder"] + all_models[env][model])) valid = False assert valid, "Errors occured due to malformed 'srl_models.yaml', cannot continue." 
    # check that all the SRL_models can be run on all the environments
    valid = True
    for env in envs:
        for model in srl_models:
            if registered_srl[model][1] is not None:
                found = False
                for compatible_class in registered_srl[model][1]:
                    if issubclass(compatible_class, registered_env[env][0]):
                        found = True
                        break
                if not found:
                    valid = False
                    printRed("Error: srl_model {} is not compatible with the {} environment."
                             .format(model, env))
    assert valid, "Errors occurred due to an incompatible combination of srl_model and environment, cannot continue."

    # the seeds used in training the baseline
    seeds = list(np.arange(args.num_iteration) + args.seed)

    if args.verbose:
        # None here means stdout of terminal for subprocess.call
        stdout = None
    else:
        stdout = open(os.devnull, 'w')

    printGreen("\nRunning {} benchmarks {} times...".format(args.algo, args.num_iteration))
    print("\nSRL-Models:\t{}".format(srl_models))
    print("environments:\t{}".format(envs))
    print("verbose:\t{}".format(args.verbose))
    print("timesteps:\t{}".format(args.num_timesteps))

    for model in srl_models:
        for env in envs:
            for i in range(args.num_iteration):
                printGreen("\nIteration_num={} (seed: {}), Environment='{}', SRL-Model='{}'"
                           .format(i, seeds[i], env, model))

                # redefine the parsed args for rl_baselines.train
                loop_args = ['--srl-model', model, '--seed', str(seeds[i]), '--algo', args.algo,
                             '--env', env, '--num-timesteps', str(int(args.num_timesteps)),
                             '--srl-config-file', args.srl_config_file]

                ok = subprocess.call(['python', '-m', 'rl_baselines.train'] + train_args + loop_args,
                                     stdout=stdout)

                if ok != 0:
                    # propagate the error up to the terminal
                    raise ChildProcessError("An error occurred, error code: {}".format(ok))
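# Example invocation (assuming this script is exposed as rl_baselines.pipeline;
# adjust the module path to wherever this file lives in your checkout):
#   python -m rl_baselines.pipeline --algo ppo2 --env KukaButtonGymEnv-v0 \
#       --srl-model raw_pixels --num-timesteps 1000000 --num-iteration 15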
def main():
    parser = argparse.ArgumentParser(
        description='Deterministic dataset generator for SRL training (can be used for environment testing)')
    parser.add_argument('--num-cpu', type=int, default=1, help='number of cpu to run on')
    parser.add_argument('--num-episode', type=int, default=50, help='number of episode to run')
    parser.add_argument('--save-path', type=str, default='srl_zoo/data/',
                        help='Folder where the environments will save the output')
    parser.add_argument('--name', type=str, default='kuka_button', help='Folder name for the output')
    parser.add_argument('--env', type=str, default='KukaButtonGymEnv-v0', help='The environment wanted',
                        choices=list(registered_env.keys()))
    parser.add_argument('--display', action='store_true', default=False)
    parser.add_argument('--no-record-data', action='store_true', default=False)
    parser.add_argument('--max-distance', type=float, default=0.28,
                        help='Beyond this distance from the goal, the agent gets a negative reward')
    parser.add_argument('-c', '--continuous-actions', action='store_true', default=False)
    parser.add_argument('--seed', type=int, default=0, help='the seed')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='Force the save, even if it overrides something else,'
                             ' including partial parts if they exist')
    parser.add_argument('-r', '--random-target', action='store_true', default=False,
                        help='Set the button to a random position')
    parser.add_argument('--multi-view', action='store_true', default=False,
                        help='Set a second camera to the scene')
    parser.add_argument('--shape-reward', action='store_true', default=False,
                        help='Shape the reward (reward = - distance) instead of a sparse reward')
    parser.add_argument('--reward-dist', action='store_true', default=False,
                        help='Prints out the reward distribution when the dataset generation is finished')
    parser.add_argument('--run-policy', type=str, default="random", choices=VALID_POLICIES,
                        help='Policy to run for data collection '
                             '(random, locally pretrained ppo2, pretrained custom policy)')
    parser.add_argument('--log-custom-policy', type=str, default='',
                        help='Logs of the custom pretrained policy to run for data collection')
    parser.add_argument('--latest', action='store_true', default=False,
                        help='load the latest learned model (location: args.log-custom-policy)')
    parser.add_argument('-rgm', '--replay-generative-model', type=str, default="", choices=['vae'],
                        help='Generative model to replay for generating a dataset (for Continual Learning purposes)')
    parser.add_argument('--log-generative-model', type=str, default='',
                        help='Logs of the generative model to run for data collection')
    parser.add_argument('--ppo2-timesteps', type=int, default=1000,
                        help='number of timesteps to run PPO2 on before generating the dataset')
    parser.add_argument('--toward-target-timesteps-proportion', type=float, default=0.0,
                        help="proportion of timesteps that use the simple toward-target policy, should be 0.0 to 1.0")
    parser.add_argument('-sc', '--simple-continual', action='store_true', default=False,
                        help='Simple red square target for task 1 of the continual learning scenario. '
                             'The task is: the robot should reach the target.')
    parser.add_argument('-cc', '--circular-continual', action='store_true', default=False,
                        help='Blue square target for task 2 of the continual learning scenario. '
                             'The task is: the robot should turn in a circle around the target.')
    parser.add_argument('-sqc', '--square-continual', action='store_true', default=False,
                        help='Green square target for task 3 of the continual learning scenario. '
                             'The task is: the robot should turn in a square around the target.')
    parser.add_argument('--short-episodes', action='store_true', default=False,
                        help='Generate short episodes (only 10 contacts with the target allowed).')
    parser.add_argument('--episode', type=int, default=-1,
                        help='Model saved at episode N that we want to load')

    args = parser.parse_args()

    assert args.num_cpu > 0, "Error: number of cpu must be positive and non zero"
    assert args.max_distance > 0, "Error: max distance must be positive and non zero"
    assert args.num_episode > 0, "Error: number of episodes must be positive and non zero"
    assert not args.reward_dist or not args.shape_reward, \
        "Error: cannot display the reward distribution for continuous reward"
    assert not (registered_env[args.env][3] is ThreadingType.NONE and args.num_cpu != 1), \
        "Error: cannot have more than 1 CPU for the environment {}".format(args.env)
    if args.num_cpu > args.num_episode:
        args.num_cpu = args.num_episode
        printYellow("num_cpu cannot be greater than num_episode, defaulting to {} cpus.".format(args.num_cpu))

    assert sum([args.simple_continual, args.circular_continual, args.square_continual]) <= 1, \
        "For continual SRL and RL, please provide only one scenario at a time!"
    assert not (args.log_custom_policy == '' and args.run_policy in ['walker', 'custom']), \
        "If using a custom policy, please specify a valid log folder for loading it."
    assert not (args.log_generative_model == '' and args.replay_generative_model == 'custom'), \
        "If using a generative model, please specify a valid log folder for loading it."

    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # this is done so seed 0 and 1 are different, and not simply offsets of the same dataset
    args.seed = np.random.RandomState(args.seed).randint(int(1e10))

    # File exists, need to deal with it
    if not args.no_record_data and os.path.exists(args.save_path + args.name):
        assert args.force, "Error: save directory '{}' already exists".format(args.save_path + args.name)

        shutil.rmtree(args.save_path + args.name)
        for part in glob.glob(args.save_path + args.name + "_part-[0-9]*"):
            shutil.rmtree(part)
    if not args.no_record_data:
        # create the output folder
        os.mkdir(args.save_path + args.name)

    if args.num_cpu == 1:
        env_thread(args, 0, partition=False)
    else:
        # try to divide the work into multiple processes, with one environment each
        try:
            jobs = []
            for i in range(args.num_cpu):
                process = multiprocessing.Process(target=env_thread, args=(args, i, True))
                jobs.append(process)

            for j in jobs:
                j.start()

            try:
                for j in jobs:
                    j.join()
            except Exception as e:
                printRed("Error: unable to join thread")
                raise e

        except Exception as e:
            printRed("Error: unable to start thread")
            raise e

    if not args.no_record_data and args.num_cpu > 1:
        # sleep 1 second, to avoid concurrency issues from multiprocessing (e.g., files still being written)
        time.sleep(1)
        # get all the parts
        file_parts = sorted(glob.glob(args.save_path + args.name + "_part-[0-9]*"),
                            key=lambda a: int(a.split("-")[-1]))

        # move the config files from any part, as they are identical
        os.rename(file_parts[0] + "/dataset_config.json",
                  args.save_path + args.name + "/dataset_config.json")
        os.rename(file_parts[0] + "/env_globals.json",
                  args.save_path + args.name + "/env_globals.json")

        ground_truth = None
        preprocessed_data = None

        # used to convert the part record_id to the fused record_id
        record_id = 0
        for part in file_parts:
            # sort the record names alphabetically, then numerically
            records = sorted(glob.glob(part + "/record_[0-9]*"),
                             key=lambda a: int(a.split("_")[-1]))

            record_id_start = record_id
            for record in records:
                os.renames(record, args.save_path + args.name + "/record_{:03d}".format(record_id))
                record_id += 1

            # fuse the npz files together, in the right order
            if ground_truth is None:
                # init
                ground_truth = {}
                preprocessed_data = {}
                ground_truth_load = np.load(part + "/ground_truth.npz")
                preprocessed_data_load = np.load(part + "/preprocessed_data.npz")

                for arr in ground_truth_load.files:
                    if arr == "images_path":
                        ground_truth[arr] = np.array(
                            [convertImagePath(args, path, record_id_start)
                             for path in ground_truth_load[arr]])
                    else:
                        ground_truth[arr] = ground_truth_load[arr]
                for arr in preprocessed_data_load.files:
                    preprocessed_data[arr] = preprocessed_data_load[arr]

            else:
                ground_truth_load = np.load(part + "/ground_truth.npz")
                preprocessed_data_load = np.load(part + "/preprocessed_data.npz")

                for arr in ground_truth_load.files:
                    if arr == "images_path":
                        sanitised_paths = np.array(
                            [convertImagePath(args, path, record_id_start)
                             for path in ground_truth_load[arr]])
                        ground_truth[arr] = np.concatenate((ground_truth[arr], sanitised_paths))
                    else:
                        ground_truth[arr] = np.concatenate((ground_truth[arr], ground_truth_load[arr]))
                for arr in preprocessed_data_load.files:
                    preprocessed_data[arr] = np.concatenate(
                        (preprocessed_data[arr], preprocessed_data_load[arr]))

            # remove the current part folder
            shutil.rmtree(part)

        # save the fused outputs
        np.savez(args.save_path + args.name + "/ground_truth.npz", **ground_truth)
        np.savez(args.save_path + args.name + "/preprocessed_data.npz", **preprocessed_data)

    if args.reward_dist:
        rewards, counts = np.unique(
            np.load(args.save_path + args.name + "/preprocessed_data.npz")['rewards'],
            return_counts=True)
        counts = ["{:.2f}%".format(val * 100) for val in counts / np.sum(counts)]
        print("reward distribution:")
        for reward, count in zip(rewards, counts):
            print(" ", reward, count)
def comparePlots(path, algo, y_limits, title="Learning Curve", timesteps=False, truncate_x=-1,
                 no_display=False, normalization=False, figpath=None, exclude_list=None):
    """
    :param path: (str) path to the folder where the experiment folders are stored
    :param algo: (str) name of the RL algo
    :param y_limits: ([float]) y-limits for the plot
    :param title: (str) plot title
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param truncate_x: (int) Truncate the experiments after n ticks on the x-axis
    :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot)
    :param normalization: (bool) normalization flag, passed on to plotGatheredData
    :param figpath: (str) optional path where the figure will be saved
    :param exclude_list: ([str]) list of folders to exclude
    """
    if exclude_list is None:
        exclude_list = []
    folders = []
    legends = []
    for folder in os.listdir(path):
        folders_srl = []
        tmp_path = "{}/{}/{}/".format(path, folder, algo)
        # keep folders that contain an algo (e.g. ppo2) subfolder and are not excluded
        if os.path.exists(tmp_path) and folder not in exclude_list:
            printRed(folder)
            legends.append(folder)
            for f in os.listdir(tmp_path):
                paths = "{}/{}/{}/{}/".format(path, folder, algo, f)
                folders_srl.append(paths)
            folders.append(folders_srl)
        else:
            continue

    x_list, y_list = [], []
    exp_name_dict = {}
    for ind, folders_srl in enumerate(folders):
        printGreen("Folder name {}".format(folders_srl))
        x, y = GatherExperiments(folders_srl, algo, window=40, title=title, min_num_x=-1,
                                 timesteps=timesteps, output_file="")
        print(len(x))
        x_list.append(x)
        y_list.append(y)
        # HACK: extract the experiment name from the folder path (not robust!) TODO
        exp_name_dict[ind] = folders_srl[0].split("/")[-4]
    printGreen(np.array(x_list).shape)

    plotGatheredData(x_list, y_list, y_limits, timesteps, title, legends, no_display,
                     truncate_x, normalization, figpath=figpath, exp_name_dict=exp_name_dict)
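# Usage sketch (hypothetical layout: <path>/<srl_model>/<algo>/<experiment>/, which is
# what the folder scan above expects):
#   comparePlots("logs/OmnirobotEnv-v0/", "ppo2", y_limits=[0, 250],
#                title="Learning Curve", timesteps=True)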
def GatherExperiments(folders, algo, window=40, title="", min_num_x=-1, timesteps=False, output_file=""):
    """
    Gather the learning curves of several experiments onto a common x-axis

    :param folders: ([str]) Log folders, where the monitor.csv are stored
    :param algo: (str) name of the RL algo
    :param window: (int) Smoothing window
    :param title: (str) plot title
    :param min_num_x: (int) Minimum number of episode/timesteps to keep an experiment (default: -1, no minimum)
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param output_file: (str) Path to a file where the plot data will be saved
    :return: (np.ndarray, np.ndarray) the common x-axis, and the smoothed y-values (one row per experiment)
    """
    y_list = []
    x_list = []
    ok = False
    for folder in folders:
        if timesteps:
            x, y = loadData(folder, smooth=1, bin_size=100)
            if x is not None:
                x, y = np.array(x), np.array(y)
        else:
            x, y = loadEpisodesData(folder)

        if x is None or (min_num_x > 0 and y.shape[0] < min_num_x):
            printRed("Skipping {}".format(folder))
            continue

        if y.shape[0] <= window:
            printYellow("Folder {}".format(folder))
            printRed("Not enough episodes for current window size = {}".format(window))
            continue
        ok = True
        y = movingAverage(y, window)
        y_list.append(y)
        # Truncate x to match the smoothed y
        x = x[len(x) - len(y):]
        x_list.append(x)

    if not ok:
        printRed("Not enough data to plot anything with current config. Consider decreasing --min-x")
        return

    lengths = list(map(len, x_list))
    min_x, max_x = np.min(lengths), np.max(lengths)
    print("Min x: {}".format(min_x))
    print("Max x: {}".format(max_x))

    # Truncate every experiment to the shortest one, so they share a common x-axis
    for i in range(len(x_list)):
        x_list[i] = x_list[i][:min_x]
        y_list[i] = y_list[i][:min_x]

    x = np.array(x_list)[0]
    y = np.array(y_list)
    # if output_file != "":
    #     printGreen("Saving aggregated data to {}.npz".format(output_file))
    #     np.savez(output_file, x=x, y=y)
    return x, y
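# A minimal usage sketch (hypothetical log folders, each containing monitor.csv files):
#   x, y = GatherExperiments(["logs/exp_1/", "logs/exp_2/"], "ppo2", window=40)
#   print(x.shape, y.shape)  # common x-axis, one smoothed reward row per experiment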
def main():
    # Global variables for callback
    parser = argparse.ArgumentParser(
        description="Evaluation script for distillation from two teacher policies")
    parser.add_argument('--seed', type=int, default=0, help='random seed (default: 0)')
    parser.add_argument('--env', type=str, help='environment ID', default='OmnirobotEnv-v0',
                        choices=list(registered_env.keys()))
    parser.add_argument('--episode_window', type=int, default=40,
                        help='Episode window for moving average plot (default: 40)')
    parser.add_argument('--log-dir-teacher-one', default='/tmp/gym/', type=str,
                        help='directory to load an optimal agent for task 1 (default: /tmp/gym)')
    parser.add_argument('--log-dir-teacher-two', default='/tmp/gym/', type=str,
                        help='directory to load an optimal agent for task 2 (default: /tmp/gym)')
    parser.add_argument('--log-dir-student', default='/tmp/gym/', type=str,
                        help='directory to save the student agent logs and model (default: /tmp/gym)')
    parser.add_argument('--srl-config-file-one', type=str, default="config/srl_models_one.yaml",
                        help='Set the location of the SRL model path configuration.')
    parser.add_argument('--srl-config-file-two', type=str, default="config/srl_models_two.yaml",
                        help='Set the location of the SRL model path configuration.')
    parser.add_argument('--epochs-distillation', type=int, default=30, metavar='N',
                        help='number of epochs to train for distillation (default: 30)')
    parser.add_argument('--distillation-training-set-size', type=int, default=-1,
                        help='Limit size (number of samples) of the training set (default: -1)')
    parser.add_argument('--eval-tasks', type=str, nargs='+', default=['cc', 'sqc', 'sc'],
                        help='A cross evaluation from the latest stored model to all tasks')
    parser.add_argument('--continual-learning-labels', type=str, nargs=2,
                        metavar=('label_1', 'label_2'), default=argparse.SUPPRESS,
                        help='Labels for the continual learning RL distillation task.')
    parser.add_argument('--student-srl-model', type=str, default='raw_pixels',
                        choices=list(registered_srl.keys()),
                        help='SRL model to use for the student RL policy')
    parser.add_argument('--epochs-teacher-datasets', type=int, default=30, metavar='N',
                        help='number of epochs for generating both RL teacher datasets (default: 30)')
    parser.add_argument('--num-iteration', type=int, default=1,
                        help='number of times each algorithm should be run for the eval (N seeds).')
    parser.add_argument('--eval-episode-window', type=int, default=400, metavar='N',
                        help='Episode window for saving each policy checkpoint for future distillation (default: 400)')

    args, unknown = parser.parse_known_args()

    if 'continual_learning_labels' in args:
        assert args.continual_learning_labels[0] in CONTINUAL_LEARNING_LABELS \
            and args.continual_learning_labels[1] in CONTINUAL_LEARNING_LABELS, \
            "Please specify a valid Continual learning label for each dataset to be used for RL distillation!"
    print(args.continual_learning_labels)
    assert os.path.exists(args.srl_config_file_one), \
        "Error: cannot load \"--srl-config-file-one {}\", file not found!".format(args.srl_config_file_one)
    assert os.path.exists(args.srl_config_file_two), \
        "Error: cannot load \"--srl-config-file-two {}\", file not found!".format(args.srl_config_file_two)
    if not (args.log_dir_teacher_one == "None"):
        assert os.path.exists(args.log_dir_teacher_one), \
            "Error: cannot load \"--log-dir-teacher-one {}\", folder not found!".format(args.log_dir_teacher_one)
    assert os.path.exists(args.log_dir_teacher_two), \
        "Error: cannot load \"--log-dir-teacher-two {}\", folder not found!".format(args.log_dir_teacher_two)

    teacher_pro = args.log_dir_teacher_one
    teacher_learn = args.log_dir_teacher_two

    # The output paths generated from the continual learning labels
    teacher_pro_data = args.continual_learning_labels[0] + '/'
    teacher_learn_data = args.continual_learning_labels[1] + '/'
    merge_path = "data/on_policy_merged"
    print(teacher_pro_data, teacher_learn_data)

    episodes, policy_path = allPolicy(teacher_learn)

    rewards_at_episode = {}
    episodes_to_test = [e for e in episodes
                        if (int(e) < 2000 and int(e) % 200 == 0) or (int(e) > 2000 and int(e) % 1000 == 0)]

    # Generate data from the professional teacher
    printYellow("\nGenerating on-policy data for the optimal teacher: " + args.continual_learning_labels[0])
    if not (args.log_dir_teacher_one == "None"):
        OnPolicyDatasetGenerator(teacher_pro, args.continual_learning_labels[0] + '_copy/',
                                 task_id=args.continual_learning_labels[0],
                                 num_eps=args.epochs_teacher_datasets, episode=-1, env_name=args.env)
    print("Eval on eps list: ", episodes_to_test)
    for eps in episodes_to_test:
        student_path = args.log_dir_student
        printBlue("\n\nEvaluation at episode " + str(eps))

        if not (args.log_dir_teacher_one == "None"):
            # Use a copy of the optimal teacher
            ok = subprocess.call(['cp', '-r', 'data/' + args.continual_learning_labels[0] + '_copy/',
                                  'data/' + teacher_pro_data, '-f'])
            assert ok == 0
            time.sleep(2)

        # Generate data from the learning teacher
        printYellow("\nGenerating on-policy data from the learning teacher: " + args.continual_learning_labels[1])
        OnPolicyDatasetGenerator(teacher_learn, teacher_learn_data,
                                 task_id=args.continual_learning_labels[1], episode=eps,
                                 num_eps=args.epochs_teacher_datasets, env_name=args.env)

        if args.log_dir_teacher_one == "None":
            merge_path = 'data/' + teacher_learn_data
            ok = subprocess.call(['cp', '-r', merge_path, 'srl_zoo/data/', '-f'])
        else:
            # merge the teachers' datasets
            mergeData('data/' + teacher_pro_data, 'data/' + teacher_learn_data, merge_path, force=True)
            ok = subprocess.call(['cp', '-r', 'data/on_policy_merged/', 'srl_zoo/data/', '-f'])
        assert ok == 0
        time.sleep(2)

        # Train a policy with distillation on the merged teachers' datasets
        trainStudent('srl_zoo/' + merge_path, args.continual_learning_labels[1],
                     yaml_file=args.srl_config_file_one, log_dir=args.log_dir_student,
                     srl_model=args.student_srl_model, env_name=args.env,
                     training_size=args.distillation_training_set_size,
                     epochs=args.epochs_distillation)
        student_path += args.env + '/' + args.student_srl_model + "/distillation/"
        latest_student_path = max(
            [student_path + "/" + d for d in os.listdir(student_path)
             if os.path.isdir(student_path + "/" + d)],
            key=os.path.getmtime) + '/'
        rewards = {}
        printRed("\nSaving the student at path: " + latest_student_path)
        for task_label in ["-sc", "-cc"]:
            rewards[task_label] = []

            for seed_i in range(args.num_iteration):
                printYellow("\nEvaluating student on task: " + task_label + " for seed: " + str(seed_i))
                command_line_enjoy_student = ['python', '-m', 'replay.enjoy_baselines',
                                              '--num-timesteps', '251', '--log-dir', latest_student_path,
                                              task_label, "--seed", str(seed_i)]
                ok = subprocess.check_output(command_line_enjoy_student)
                ok = ok.decode('utf-8')
                str_before = "Mean reward: "
                str_after = "\npybullet"
                idx_before = ok.find(str_before) + len(str_before)
                idx_after = ok.find(str_after)
                seed_reward = float(ok[idx_before:idx_after])
                rewards[task_label].append(seed_reward)

        print("rewards at eps ", eps, ": ", rewards)
        rewards_at_episode[eps] = rewards
    print("All rewards: ", rewards_at_episode)
    json_dict = json.dumps(rewards_at_episode)
    json_dict_name = args.log_dir_student + "/reward_at_episode_" + \
        datetime.datetime.now().strftime("%y-%m-%d_%Hh%M_%S") + '.json'
    with open(json_dict_name, "w") as f:
        f.write(json_dict)
    printRed("\nSaving the evaluation at path: " + json_dict_name)
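# To inspect the saved results later (a minimal sketch; note that json.dumps
# stringifies the episode keys):
#   with open(json_dict_name) as f:
#       rewards_at_episode = json.load(f)  # {"<eps>": {"-sc": [...], "-cc": [...]}}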
def plotGatheredExperiments(folders, algo, y_limits, window=40, title="", min_num_x=-1,
                            timesteps=False, output_file="", no_display=False):
    """
    Compute mean and standard error for several experiments and plot the learning curve

    :param folders: ([str]) Log folders, where the monitor.csv are stored
    :param algo: (str) name of the RL algo
    :param y_limits: ([float]) y-limits for the plot
    :param window: (int) Smoothing window
    :param title: (str) plot title
    :param min_num_x: (int) Minimum number of episode/timesteps to keep an experiment (default: -1, no minimum)
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param output_file: (str) Path to a file where the plot data will be saved
    :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot)
    """
    y_list = []
    x_list = []
    ok = False
    for folder in folders:
        if timesteps:
            x, y = loadData(folder, smooth=1, bin_size=100)
            if x is not None:
                x, y = np.array(x), np.array(y)
        else:
            x, y = loadEpisodesData(folder)

        if x is None or (min_num_x > 0 and y.shape[0] < min_num_x):
            printYellow("Skipping {}".format(folder))
            continue

        if y.shape[0] <= window:
            printYellow("Folder {}".format(folder))
            printYellow("Not enough episodes for current window size = {}".format(window))
            continue
        ok = True
        y = movingAverage(y, window)
        y_list.append(y)
        # Truncate x to match the smoothed y
        x = x[len(x) - len(y):]
        x_list.append(x)

    if not ok:
        printRed("Not enough data to plot anything with current config. Consider decreasing --min-x")
        return

    lengths = list(map(len, x_list))
    min_x, max_x = np.min(lengths), np.max(lengths)
    print("Min x: {}".format(min_x))
    print("Max x: {}".format(max_x))

    # Truncate every experiment to the shortest one, so they share a common x-axis
    for i in range(len(x_list)):
        x_list[i] = x_list[i][:min_x]
        y_list[i] = y_list[i][:min_x]

    x = np.array(x_list)[0]
    y = np.array(y_list)

    printGreen("{} Experiments".format(y.shape[0]))
    print("Min, Max rewards:", np.min(y), np.max(y))

    fig = plt.figure(title)
    # Compute mean across seeds
    m = np.mean(y, axis=0)
    # Compute standard error: std / sqrt(number of experiments)
    s = np.squeeze(np.asarray(np.std(y, axis=0)))
    n = y.shape[0]
    plt.fill_between(x, m - s / np.sqrt(n), m + s / np.sqrt(n), color=lightcolors[0])
    plt.plot(x, m, color=darkcolors[0], label=algo, linewidth=1)

    if timesteps:
        formatter = FuncFormatter(millions)
        plt.xlabel('Number of Timesteps')
        fig.axes[0].xaxis.set_major_formatter(formatter)
    else:
        plt.xlabel('Number of Episodes')
    plt.ylabel('Rewards')

    plt.title(title, **fontstyle)
    plt.ylim(y_limits)

    plt.legend(framealpha=0.5, labelspacing=0.01, loc='lower right', fontsize=16)

    if output_file != "":
        printGreen("Saving aggregated data to {}.npz".format(output_file))
        np.savez(output_file, x=x, y=y)

    if not no_display:
        plt.show()
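# Usage sketch (hypothetical folders, each holding the monitor.csv files of one seed):
#   plotGatheredExperiments(["logs/ppo2/exp_1/", "logs/ppo2/exp_2/"], "ppo2",
#                           y_limits=[-100, 250], window=40, title="KukaButton",
#                           timesteps=True)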
def prog_mlp_extractor(flat_observations, net_arch, act_fun, dict_res_tensor=None):
    if dict_res_tensor is None:
        dict_res_tensor = {}
    printRed(net_arch)
    latent = flat_observations
    policy_only_layers = []  # Layer sizes of the network that only belong to the policy network
    value_only_layers = []  # Layer sizes of the network that only belong to the value network

    # Test purpose
    # if len(prev_cols) > 0:
    #     model = prev_cols[0]
    #     sess = model.sess
    #     graph = sess.graph
    #     model_variables = graph.get_operations()
    #     train_ops = []
    #     for op in model_variables:
    #         if "loss" not in str(op.name) and "model" in str(op.name) and "add" in str(op.name):
    #             print(op.name)
    #             train_ops.append(op.values())
    #         # try:
    #         #     tmp = sess.run(op.values())
    #         #     if tmp is not None:
    #         #         print(op.name, op.values(), tmp[0].shape)
    #         # except:
    #         #     print("feed_dict not compatible", op.name)
    #         # model/pi_fc0/w:0

    # Iterate through the shared layers and build the shared parts of the network
    for idx, layer in enumerate(net_arch):
        if isinstance(layer, int):  # Check that this is a shared layer
            layer_size = layer
            latent = act_fun(linear(latent, "shared_fc{}".format(idx), layer_size, init_scale=np.sqrt(2)))
        else:
            if 'pi' in layer:
                assert isinstance(layer['pi'], list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                policy_only_layers = layer['pi']

            if 'vf' in layer:
                assert isinstance(layer['vf'], list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                value_only_layers = layer['vf']
            break  # From here on the network splits up in policy and value network

    # Build the non-shared part of the network
    latent_policy = latent
    latent_value = latent
    for idx, (pi_layer_size, vf_layer_size) in enumerate(zip_longest(policy_only_layers, value_only_layers)):
        if pi_layer_size is not None:
            assert isinstance(pi_layer_size, int), "Error: net_arch[-1]['pi'] must only contain integers."
            latent_policy = linear(latent_policy, "pi_fc{}".format(idx), pi_layer_size, init_scale=np.sqrt(2))
            print(latent_policy.name)
            with tf.variable_scope("pi_res_{}".format(idx)):
                try:
                    print(dict_res_tensor[latent_policy.name])
                except KeyError:
                    printRed("Exception: residual insertion from previous model failed.")
            latent_policy = act_fun(latent_policy)

        if vf_layer_size is not None:
            assert isinstance(vf_layer_size, int), "Error: net_arch[-1]['vf'] must only contain integers."
            latent_value = linear(latent_value, "vf_fc{}".format(idx), vf_layer_size, init_scale=np.sqrt(2))
            with tf.variable_scope("vf_res_{}".format(idx)):
                latent_value = act_fun(latent_value)

    return latent_policy, latent_value
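# net_arch follows the stable-baselines mlp_extractor convention: shared layer sizes
# first, then a dict splitting into 'pi' and 'vf' branches. A hypothetical call
# (flat_obs stands for the already-flattened observation tensor; sizes are arbitrary):
#   latent_pi, latent_vf = prog_mlp_extractor(flat_obs,
#                                             net_arch=[64, dict(pi=[64, 64], vf=[64])],
#                                             act_fun=tf.nn.tanh)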
def main():
    parser = argparse.ArgumentParser(
        description='Deterministic dataset generator for SRL training (can be used for environment testing)')
    parser.add_argument('--num-cpu', type=int, default=1, help='number of cpu to run on')
    parser.add_argument('--num-episode', type=int, default=50, help='number of episode to run')
    parser.add_argument('--max_steps_per_epoch', type=int, default=200, help='max num steps per epoch')
    # CUSTOM ARGS. We want to update these eventually, i.e., specify a specific path for dr
    parser.add_argument('--dr', action='store_true', default=False,
                        help="Include this flag to use the chosen environment with domain randomization")
    parser.add_argument('--alt', action='store_true', default=False,
                        help="Include this flag to use the chosen environment with the alternate view")
    parser.add_argument('--special_start', action='store_true', default=False,
                        help="Include this flag to use the chosen environment with the special start")
    parser.add_argument('--save-path', type=str, default='robotics-rl-srl/data/',
                        help='Folder where the environments will save the output')
    parser.add_argument('--name', type=str, default='UNSETNAME', help='Folder name for the output')
    parser.add_argument('--env', type=str, default='push_rotate', help='The environment wanted',
                        choices=list(envs.keys()))
    parser.add_argument('--display', action='store_true', default=False)
    parser.add_argument('--no-record-data', action='store_true', default=False)
    parser.add_argument('--seed', type=int, default=0, help='the seed')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='Force the save, even if it overrides something else,'
                             ' including partial parts if they exist')
    # TODO: Change this argument to be for the diff types of tasks
    parser.add_argument('--multi-view', action='store_true', default=False,
                        help='Set a second camera to the scene')
    parser.add_argument('--reward-dist', action='store_true', default=False,
                        help='Prints out the reward distribution when the dataset generation is finished')
    parser.add_argument('--run-ppo2', action='store_true', default=False,
                        help='runs a ppo2 agent instead of a random agent')
    parser.add_argument('--ppo2-timesteps', type=int, default=1000,
                        help='number of timesteps to run PPO2 on before generating the dataset')

    args = parser.parse_args()

    assert args.num_cpu > 0, "Error: number of cpu must be positive and non zero"
    assert args.num_episode > 0, "Error: number of episodes must be positive and non zero"
    # assert not (registered_env[args.env][3] is ThreadingType.NONE and args.num_cpu != 1), \
    #     "Error: cannot have more than 1 CPU for the environment {}".format(args.env)
    if args.num_cpu > args.num_episode:
        args.num_cpu = args.num_episode
        printYellow("num_cpu cannot be greater than num_episode, defaulting to {} cpus.".format(args.num_cpu))

    # this is done so seed 0 and 1 are different, and not simply offsets of the same dataset
    args.seed = np.random.RandomState(args.seed).randint(int(1e10))

    # File exists, need to deal with it
    if not args.no_record_data and os.path.exists(args.save_path + args.name):
        assert args.force, "Error: save directory '{}' already exists".format(args.save_path + args.name)

        shutil.rmtree(args.save_path + args.name)
        for part in glob.glob(args.save_path + args.name + "_part-[0-9]*"):
            shutil.rmtree(part)
    if not args.no_record_data:
        # create the output folder
        os.makedirs(args.save_path + args.name, exist_ok=True)

    if args.num_cpu == 1:
        env_thread(args, 0, partition=False, use_ppo2=args.run_ppo2)
    else:
        # try to divide the work into multiple processes, with one environment each
        try:
            jobs = []
            for i in range(args.num_cpu):
                process = multiprocessing.Process(target=env_thread, args=(args, i, True, args.run_ppo2))
                jobs.append(process)

            for j in jobs:
                j.start()

            try:
                for j in jobs:
                    j.join()
            except Exception as e:
                printRed("Error: unable to join thread")
                raise e

        except Exception as e:
            printRed("Error: unable to start thread")
            raise e

    if not args.no_record_data and args.num_cpu > 1:
        # sleep 1 second, to avoid concurrency issues from multiprocessing (e.g., files still being written)
        time.sleep(1)
        # get all the parts
        file_parts = sorted(glob.glob(args.save_path + args.name + "_part-[0-9]*"),
                            key=lambda a: int(a.split("-")[-1]))

        # move the config files from any part, as they are identical
        os.rename(file_parts[0] + "/dataset_config.json",
                  args.save_path + args.name + "/dataset_config.json")
        os.rename(file_parts[0] + "/env_globals.json",
                  args.save_path + args.name + "/env_globals.json")

        ground_truth = None
        preprocessed_data = None

        # used to convert the part record_id to the fused record_id
        record_id = 0
        for part in file_parts:
            # sort the record names alphabetically, then numerically
            records = sorted(glob.glob(part + "/record_[0-9]*"),
                             key=lambda a: int(a.split("_")[-1]))

            record_id_start = record_id
            for record in records:
                os.renames(record, args.save_path + args.name + "/record_{:03d}".format(record_id))
                record_id += 1

            # fuse the npz files together, in the right order
            if ground_truth is None:
                # init
                ground_truth = {}
                preprocessed_data = {}
                ground_truth_load = np.load(part + "/ground_truth.npz")
                preprocessed_data_load = np.load(part + "/preprocessed_data.npz")

                for arr in ground_truth_load.files:
                    if arr == "images_path":
                        ground_truth[arr] = np.array(
                            [convertImagePath(args, path, record_id_start)
                             for path in ground_truth_load[arr]])
                    else:
                        ground_truth[arr] = ground_truth_load[arr]
                for arr in preprocessed_data_load.files:
                    preprocessed_data[arr] = preprocessed_data_load[arr]

            else:
                ground_truth_load = np.load(part + "/ground_truth.npz")
                preprocessed_data_load = np.load(part + "/preprocessed_data.npz")

                for arr in ground_truth_load.files:
                    if arr == "images_path":
                        sanitised_paths = np.array(
                            [convertImagePath(args, path, record_id_start)
                             for path in ground_truth_load[arr]])
                        ground_truth[arr] = np.concatenate((ground_truth[arr], sanitised_paths))
                    else:
                        ground_truth[arr] = np.concatenate((ground_truth[arr], ground_truth_load[arr]))
                for arr in preprocessed_data_load.files:
                    preprocessed_data[arr] = np.concatenate(
                        (preprocessed_data[arr], preprocessed_data_load[arr]))

            # remove the current part folder
            shutil.rmtree(part)

        # save the fused outputs
        np.savez(args.save_path + args.name + "/ground_truth.npz", **ground_truth)
        np.savez(args.save_path + args.name + "/preprocessed_data.npz", **preprocessed_data)

    if args.reward_dist:
        rewards, counts = np.unique(
            np.load(args.save_path + args.name + "/preprocessed_data.npz")['rewards'],
            return_counts=True)
        counts = ["{:.2f}%".format(val * 100) for val in counts / np.sum(counts)]
        print("reward distribution:")
        for reward, count in zip(rewards, counts):
            print(" ", reward, count)