def policyEval(envs,
               model_path,
               log_dir,
               algo_class,
               algo_args,
               num_timesteps=251,
               num_cpu=1):
    """
    evaluation for the policy in the given envs
    :param envs: the environment we want to evaluate
    :param model_path: (str)the path to the policy ckp
    :param log_dir: (str) the path from a gym temporal file
    :param algo_class:
    :param algo_args:
    :param num_timesteps: (int) numbers of the timesteps we want to evaluate the policy
    :param num_cpu:
    :return:
    """
    tf.reset_default_graph()

    method = algo_class.load(model_path, args=algo_args)

    using_custom_vec_env = isinstance(envs, WrapFrameStack)

    obs = envs.reset()

    if using_custom_vec_env:
        obs = obs.reshape((1, ) + obs.shape)
    n_done = 0
    last_n_done = 0
    episode_reward = []
    dones = [False for _ in range(num_cpu)]

    for i in range(num_timesteps):
        actions = method.getAction(obs, dones)
        obs, rewards, dones, _ = envs.step(actions)
        if using_custom_vec_env:
            obs = obs.reshape((1, ) + obs.shape)
            if dones:
                obs = envs.reset()
                obs = obs.reshape((1, ) + obs.shape)

        n_done += np.sum(dones)
        if (n_done - last_n_done) > 1:
            last_n_done = n_done
            _, mean_reward = computeMeanReward(log_dir, n_done)
            episode_reward.append(mean_reward)
            printRed('Episode:{} Reward:{}'.format(n_done, mean_reward))
    _, mean_reward = computeMeanReward(log_dir, n_done)
    printRed('Episode:{} Reward:{}'.format(n_done, mean_reward))

    episode_reward.append(mean_reward)

    episode_reward = np.array(episode_reward)
    envs.close()
    return episode_reward
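
# A minimal, self-contained illustration (an assumption about the observation type: an
# image-shaped numpy array) of the reshape used above for the WrapFrameStack case: a single
# observation gets a leading batch dimension of size 1 before being fed to the model.
import numpy as np

single_obs = np.zeros((224, 224, 12))                   # e.g. 4 stacked RGB frames
batched_obs = single_obs.reshape((1, ) + single_obs.shape)
print(batched_obs.shape)                                # -> (1, 224, 224, 12)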
Example #2
                    '--num-timesteps', '251', '--log-dir', model_path,
                    task_label, "--seed",
                    str(seed_i)
                ]
                ok = subprocess.check_output(command_line_enjoy_student)
                ok = ok.decode('utf-8')
                str_before = "Mean reward: "
                str_after = "\npybullet"
                idx_before = ok.find(str_before) + len(str_before)
                idx_after = ok.find(str_after)
                seed_reward = float(ok[idx_before:idx_after])

                local_reward.append(seed_reward)
                printGreen(local_reward)
                printRed(
                    "current rewards {} at episode {} with random seed: {} for task {}"
                    .format(np.mean(seed_reward), episodes[k], seed_i,
                            task_label))

            rewards[task_label[1:]].append(local_reward)
            run_mean[t] = np.mean(local_reward[1:])

        # If either of the two mean rewards varies by more than 1%, we do not increase the evaluation interval
        for t in range(len(run_mean)):
            if (run_mean[t] > 1.01 * last_mean[t]
                    or run_mean[t] < 0.99 * last_mean[t]):
                increase_interval = False

        printGreen(
            "Reward now: {}, last Rewards: {} for sc and cc respectively".
            format(run_mean, last_mean))
        # If the mean reward varies slowly, we increase the length of the evaluation interval
Example #3
                        default=False,
                        help='Plot timesteps instead of episodes')
    parser.add_argument('--no-display',
                        action='store_true',
                        default=False,
                        help='Do not display plot')
    args = parser.parse_args()

    y_limits = args.y_lim
    if y_limits[0] == y_limits[1]:
        if args.shape_reward:
            y_limits = Y_LIM_SHAPED_REWARD
        else:
            y_limits = Y_LIM_SPARSE_REWARD
        print("Using default limits:", y_limits)

    plots = [f for f in os.listdir(args.input_dir) if f.endswith('.npz')]
    plots.sort()

    if len(plots) == 0:
        printRed("No npz files found in {}".format(args.input_dir))
        exit(-1)

    comparePlots(args.input_dir,
                 plots,
                 title=args.title,
                 y_limits=y_limits,
                 no_display=args.no_display,
                 timesteps=args.timesteps,
                 truncate_x=args.truncate_x)
Example #4
def main():
    parser = argparse.ArgumentParser(
        description="OpenAI RL Baselines Benchmark",
        epilog=
        'After the arguments are parsed, the rest are assumed to be arguments for'
        + ' rl_baselines.train')
    parser.add_argument('--algo',
                        type=str,
                        default='ppo2',
                        help='OpenAI baseline to use',
                        choices=list(registered_rl.keys()))
    parser.add_argument('--env',
                        type=str,
                        nargs='+',
                        default=["KukaButtonGymEnv-v0"],
                        help='environment ID(s)',
                        choices=list(registered_env.keys()))
    parser.add_argument('--srl-model',
                        type=str,
                        nargs='+',
                        default=["raw_pixels"],
                        help='SRL model(s) to use',
                        choices=list(registered_srl.keys()))
    parser.add_argument('--num-timesteps',
                        type=int,
                        default=int(1e6),
                        help='number of timesteps the baseline should run')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        default=False,
                        help='Display baseline STDOUT')
    parser.add_argument(
        '--num-iteration',
        type=int,
        default=15,
        help='number of times each algorithm should be run for each unique combination of '
             'environment and srl-model.')
    parser.add_argument(
        '--seed',
        type=int,
        default=0,
        help=
        'initial seed for each unique combination of environment and srl-model.'
    )
    parser.add_argument(
        '--srl-config-file',
        type=str,
        default="config/srl_models.yaml",
        help='Set the location of the SRL model path configuration.')

    # returns the parsed arguments, and the rest are assumed to be arguments for rl_baselines.train
    args, train_args = parser.parse_known_args()

    # Sanity check
    assert args.num_timesteps >= 1, "Error: --num-timesteps cannot be less than 1"
    assert args.num_iteration >= 1, "Error: --num-iteration cannot be less than 1"

    # Removing duplicates and sort
    srl_models = list(set(args.srl_model))
    envs = list(set(args.env))
    srl_models.sort()
    envs.sort()

    # LOAD SRL models list
    assert os.path.exists(args.srl_config_file), \
        "Error: cannot load \"--srl-config-file {}\", file not found!".format(args.srl_config_file)
    with open(args.srl_config_file, 'rb') as f:
        all_models = yaml.load(f, Loader=yaml.SafeLoader)

    # Checking definition and presence of all requested srl_models
    valid = True
    for env in envs:
        # validate the env definition
        if env not in all_models:
            printRed(
                "Error: 'srl_models.yaml' missing definition for environment {}"
                .format(env))
            valid = False
            continue  # skip to the next env, this one is not valid

        # checking log_folder for current env
        missing_log = "log_folder" not in all_models[env]
        if missing_log:
            printRed(
                "Error: 'srl_models.yaml' missing definition for log_folder in environment {}"
                .format(env))
            valid = False

        # validate each model for the current env definition
        for model in srl_models:
            if registered_srl[model][0] == SRLType.ENVIRONMENT:
                continue  # not an srl model, skip to the next model
            elif model not in all_models[env]:
                printRed(
                    "Error: 'srl_models.yaml' missing srl_model {} for environment {}"
                    .format(model, env))
                valid = False
            elif (not missing_log) and (
                    not os.path.exists(all_models[env]["log_folder"] +
                                       all_models[env][model])):
                # checking presence of srl_model path, if and only if log_folder exists
                printRed(
                    "Error: srl_model {} for environment {} was defined in ".
                    format(model, env) +
                    "'srl_models.yaml', however the file {} it was tagetting does not exist."
                    .format(all_models[env]["log_folder"] +
                            all_models[env][model]))
                valid = False

    assert valid, "Errors occured due to malformed 'srl_models.yaml', cannot continue."

    # check that all the SRL_models can be run on all the environments
    valid = True
    for env in envs:
        for model in srl_models:
            if registered_srl[model][1] is not None:
                found = False
                for compatible_class in registered_srl[model][1]:
                    if issubclass(compatible_class, registered_env[env][0]):
                        found = True
                        break
                if not found:
                    valid = False
                    printRed(
                        "Error: srl_model {}, is not compatible with the {} environment."
                        .format(model, env))
    assert valid, "Errors occured due to an incompatible combination of srl_model and environment, cannot continue."

    # the seeds used in training the baseline.
    seeds = list(np.arange(args.num_iteration) + args.seed)

    if args.verbose:
        # None here means stdout of terminal for subprocess.call
        stdout = None
    else:
        stdout = open(os.devnull, 'w')

    printGreen("\nRunning {} benchmarks {} times...".format(
        args.algo, args.num_iteration))
    print("\nSRL-Models:\t{}".format(srl_models))
    print("environments:\t{}".format(envs))
    print("verbose:\t{}".format(args.verbose))
    print("timesteps:\t{}".format(args.num_timesteps))
    for model in srl_models:
        for env in envs:
            for i in range(args.num_iteration):

                printGreen(
                    "\nIteration_num={} (seed: {}), Environment='{}', SRL-Model='{}'"
                    .format(i, seeds[i], env, model))

                # redefine the parsed args for rl_baselines.train
                loop_args = [
                    '--srl-model', model, '--seed',
                    str(seeds[i]), '--algo', args.algo, '--env', env,
                    '--num-timesteps',
                    str(int(args.num_timesteps)), '--srl-config-file',
                    args.srl_config_file
                ]

                ok = subprocess.call(['python', '-m', 'rl_baselines.train'] +
                                     train_args + loop_args,
                                     stdout=stdout)

                if ok != 0:
                    # throw the error down to the terminal
                    raise ChildProcessError(
                        "An error occured, error code: {}".format(ok))
Example #5
def main():
    parser = argparse.ArgumentParser(
        description='Deterministic dataset generator for SRL training ' +
        '(can be used for environment testing)')
    parser.add_argument('--num-cpu',
                        type=int,
                        default=1,
                        help='number of CPUs to run on')
    parser.add_argument('--num-episode',
                        type=int,
                        default=50,
                        help='number of episodes to run')
    parser.add_argument(
        '--save-path',
        type=str,
        default='srl_zoo/data/',
        help='Folder where the environments will save the output')
    parser.add_argument('--name',
                        type=str,
                        default='kuka_button',
                        help='Folder name for the output')
    parser.add_argument('--env',
                        type=str,
                        default='KukaButtonGymEnv-v0',
                        help='The environment to use',
                        choices=list(registered_env.keys()))
    parser.add_argument('--display', action='store_true', default=False)
    parser.add_argument('--no-record-data', action='store_true', default=False)
    parser.add_argument(
        '--max-distance',
        type=float,
        default=0.28,
        help=
        'Beyond this distance from the goal, the agent gets a negative reward')
    parser.add_argument('-c',
                        '--continuous-actions',
                        action='store_true',
                        default=False)
    parser.add_argument('--seed', type=int, default=0, help='the seed')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='Force the save, even if it overwrites something else,' +
        ' including partial parts if they exist')
    parser.add_argument('-r',
                        '--random-target',
                        action='store_true',
                        default=False,
                        help='Set the button to a random position')
    parser.add_argument('--multi-view',
                        action='store_true',
                        default=False,
                        help='Set a second camera to the scene')
    parser.add_argument(
        '--shape-reward',
        action='store_true',
        default=False,
        help='Shape the reward (reward = - distance) instead of a sparse reward'
    )
    parser.add_argument(
        '--reward-dist',
        action='store_true',
        default=False,
        help=
        'Prints out the reward distribution when the dataset generation is finished'
    )
    parser.add_argument(
        '--run-policy',
        type=str,
        default="random",
        choices=VALID_POLICIES,
        help='Policy to run for data collection ' +
        '(random, locally pretrained ppo2, pretrained custom policy)')
    parser.add_argument(
        '--log-custom-policy',
        type=str,
        default='',
        help='Logs of the custom pretrained policy to run for data collection')
    parser.add_argument(
        '--latest',
        action='store_true',
        default=False,
        help='load the latest learned model (location: args.log-custom-policy)'
    )
    parser.add_argument(
        '-rgm',
        '--replay-generative-model',
        type=str,
        default="",
        choices=['vae'],
        help=
        'Generative model to replay for generating a dataset (for Continual Learning purposes)'
    )
    parser.add_argument(
        '--log-generative-model',
        type=str,
        default='',
        help='Logs of the pretrained generative model to use for data generation')
    parser.add_argument(
        '--ppo2-timesteps',
        type=int,
        default=1000,
        help='number of timesteps to run PPO2 on before generating the dataset'
    )
    parser.add_argument(
        '--toward-target-timesteps-proportion',
        type=float,
        default=0.0,
        help=
        "proportion of timesteps that simply use the towards-target policy; should be between 0.0 and 1.0"
    )
    parser.add_argument(
        '-sc',
        '--simple-continual',
        action='store_true',
        default=False,
        help=
        'Simple red square target for task 1 of continual learning scenario. '
        + 'The task is: the robot should reach the target.')
    parser.add_argument(
        '-cc',
        '--circular-continual',
        action='store_true',
        default=False,
        help='Blue square target for task 2 of continual learning scenario. ' +
        'The task is: the robot should circle around the target.')
    parser.add_argument(
        '-sqc',
        '--square-continual',
        action='store_true',
        default=False,
        help='Green square target for task 3 of continual learning scenario. '
        + 'The task is: the robot should move in a square around the target.')
    parser.add_argument(
        '--short-episodes',
        action='store_true',
        default=False,
        help=
        'Generate short episodes (only 10 contacts with the target allowed).')
    parser.add_argument('--episode',
                        type=int,
                        default=-1,
                        help='Model saved at episode N that we want to load')

    args = parser.parse_args()

    assert (args.num_cpu >
            0), "Error: number of cpu must be positive and non zero"
    assert (args.max_distance >
            0), "Error: max distance must be positive and non zero"
    assert (args.num_episode >
            0), "Error: number of episodes must be positive and non zero"
    assert not args.reward_dist or not args.shape_reward, \
        "Error: cannot display the reward distribution for continuous reward"
    assert not(registered_env[args.env][3] is ThreadingType.NONE and args.num_cpu != 1), \
        "Error: cannot have more than 1 CPU for the environment {}".format(args.env)
    if args.num_cpu > args.num_episode:
        args.num_cpu = args.num_episode
        printYellow(
            "num_cpu cannot be greater than num_episode, defaulting to {} cpus."
            .format(args.num_cpu))

    assert sum([args.simple_continual, args.circular_continual, args.square_continual]) <= 1, \
        "For continual SRL and RL, please provide only one scenario at a time!"

    assert not (args.log_custom_policy == '' and args.run_policy in ['walker', 'custom']), \
        "If using a custom policy, please specify a valid log folder for loading it."

    assert not (args.log_generative_model == '' and args.replay_generative_model == 'custom'), \
        "If using a generative model for replay, please specify a valid log folder for loading it."

    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # this is done so that seeds 0 and 1 are different, and not simply offsets of the same dataset.
    args.seed = np.random.RandomState(args.seed).randint(int(1e10))

    # File exists, need to deal with it
    if not args.no_record_data and os.path.exists(args.save_path + args.name):
        assert args.force, "Error: save directory '{}' already exists".format(
            args.save_path + args.name)

        shutil.rmtree(args.save_path + args.name)
        for part in glob.glob(args.save_path + args.name + "_part-[0-9]*"):
            shutil.rmtree(part)
    if not args.no_record_data:
        # create the output
        os.mkdir(args.save_path + args.name)

    if args.num_cpu == 1:
        env_thread(args, 0, partition=False)
    else:
        # try and divide into multiple processes, with an environment each
        try:
            jobs = []
            for i in range(args.num_cpu):
                process = multiprocessing.Process(target=env_thread,
                                                  args=(args, i, True))
                jobs.append(process)

            for j in jobs:
                j.start()

            try:
                for j in jobs:
                    j.join()
            except Exception as e:
                printRed("Error: unable to join thread")
                raise e

        except Exception as e:
            printRed("Error: unable to start thread")
            raise e

    if not args.no_record_data and args.num_cpu > 1:
        # sleep 1 second to avoid concurrency issues from multiprocessing (e.g., files still being written)
        time.sleep(1)
        # get all the parts
        file_parts = sorted(glob.glob(args.save_path + args.name +
                                      "_part-[0-9]*"),
                            key=lambda a: int(a.split("-")[-1]))

        # move the config files from any part, as they are identical across parts
        os.rename(file_parts[0] + "/dataset_config.json",
                  args.save_path + args.name + "/dataset_config.json")
        os.rename(file_parts[0] + "/env_globals.json",
                  args.save_path + args.name + "/env_globals.json")

        ground_truth = None
        preprocessed_data = None

        # used to convert the part record_id to the fused record_id
        record_id = 0
        for part in file_parts:
            # sort the record folders numerically by their index
            records = sorted(glob.glob(part + "/record_[0-9]*"),
                             key=lambda a: int(a.split("_")[-1]))

            record_id_start = record_id
            for record in records:
                os.renames(
                    record, args.save_path + args.name +
                    "/record_{:03d}".format(record_id))
                record_id += 1

            # fuse the npz files together, in the right order
            if ground_truth is None:
                # init
                ground_truth = {}
                preprocessed_data = {}
                ground_truth_load = np.load(part + "/ground_truth.npz")
                preprocessed_data_load = np.load(part +
                                                 "/preprocessed_data.npz")

                for arr in ground_truth_load.files:
                    if arr == "images_path":
                        ground_truth[arr] = np.array([
                            convertImagePath(args, path, record_id_start)
                            for path in ground_truth_load[arr]
                        ])
                    else:
                        ground_truth[arr] = ground_truth_load[arr]
                for arr in preprocessed_data_load.files:
                    preprocessed_data[arr] = preprocessed_data_load[arr]

            else:
                ground_truth_load = np.load(part + "/ground_truth.npz")
                preprocessed_data_load = np.load(part +
                                                 "/preprocessed_data.npz")

                for arr in ground_truth_load.files:
                    if arr == "images_path":
                        sanitised_paths = np.array([
                            convertImagePath(args, path, record_id_start)
                            for path in ground_truth_load[arr]
                        ])
                        ground_truth[arr] = np.concatenate(
                            (ground_truth[arr], sanitised_paths))
                    else:
                        ground_truth[arr] = np.concatenate(
                            (ground_truth[arr], ground_truth_load[arr]))
                for arr in preprocessed_data_load.files:
                    preprocessed_data[arr] = np.concatenate(
                        (preprocessed_data[arr], preprocessed_data_load[arr]))

            # remove the current part folder
            shutil.rmtree(part)

        # save the fused outputs
        np.savez(args.save_path + args.name + "/ground_truth.npz",
                 **ground_truth)
        np.savez(args.save_path + args.name + "/preprocessed_data.npz",
                 **preprocessed_data)

    if args.reward_dist:
        rewards, counts = np.unique(
            np.load(args.save_path + args.name +
                    "/preprocessed_data.npz")['rewards'],
            return_counts=True)
        counts = [
            "{:.2f}%".format(val * 100) for val in counts / np.sum(counts)
        ]
        print("reward distribution:")
        for reward, count in zip(rewards, counts):
            print(" ", reward, count)
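
# convertImagePath is not part of this listing; the sketch below is an assumption about what a
# compatible helper would do when fusing dataset parts: rewrite an image path recorded inside a
# part folder so that it points at the corresponding record folder of the fused dataset.
def convertImagePathSketch(args, path, record_id_start):
    image_name = path.split("/")[-1]
    # offset the per-part record index by the first fused record id assigned to this part
    part_record_id = int(path.split("/")[-2].split("_")[-1])
    return args.name + "/record_{:03d}/".format(record_id_start + part_record_id) + image_name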
Example #6
def comparePlots(path,
                 algo,
                 y_limits,
                 title="Learning Curve",
                 timesteps=False,
                 truncate_x=-1,
                 no_display=False,
                 normalization=False,
                 figpath=None,
                 exclude_list=None):
    """
    :param path: (str) path to the folder where the plots are stored
    :param plots: ([str]) List of saved plots as npz file
    :param y_limits: ([float]) y-limits for the plot
    :param title: (str) plot title
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param truncate_x: (int) Truncate the experiments after n ticks on the x-axis
    :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot)
    """
    if exclude_list is None:
        exclude_list = []
    folders = []
    legends = []
    for folder in os.listdir(path):
        folders_srl = []
        tmp_path = "{}/{}/{}/".format(path, folder, algo)
        if os.path.exists(tmp_path) and (
                folder not in exclude_list
        ):  # folder contains algo (e.g. ppo2) subfolder and not in excluded list
            printRed(folder)
            legends.append(folder)
            for f in os.listdir(tmp_path):
                paths = "{}/{}/{}/{}/".format(path, folder, algo, f)
                folders_srl.append(paths)
            folders.append(folders_srl)
        else:
            continue

    x_list, y_list = [], []
    exp_name_dict = {}
    for ind, folders_srl in enumerate(folders):
        printGreen("Folder name {}".format(folders_srl))
        x, y = GatherExperiments(folders_srl,
                                 algo,
                                 window=40,
                                 title=title,
                                 min_num_x=-1,
                                 timesteps=timesteps,
                                 output_file="")
        print(len(x))
        x_list.append(x)
        y_list.append(y)
        # HACK: the line below is ugly and not robust code! TODO
        exp_name_dict[ind] = folders_srl[0].split("/")[-4]
    printGreen(np.array(x_list).shape)
    # printGreen('y_list shape {}'.format(np.array(y_list[1]).shape))

    plotGatheredData(x_list,
                     y_list,
                     y_limits,
                     timesteps,
                     title,
                     legends,
                     no_display,
                     truncate_x,
                     normalization,
                     figpath=figpath,
                     exp_name_dict=exp_name_dict)
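
# Hypothetical usage sketch for comparePlots (folder layout and values below are assumptions):
# `path` is expected to contain one sub-folder per compared setting, each holding an `algo`
# sub-folder with the individual experiment runs.
#
#   comparePlots("logs/OmnirobotEnv-v0", "ppo2",
#                y_limits=[0, 250],
#                title="Learning Curve",
#                timesteps=True,
#                exclude_list=["raw_pixels"])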
Example #7
def GatherExperiments(
    folders,
    algo,
    window=40,
    title="",
    min_num_x=-1,
    timesteps=False,
    output_file="",
):
    """
    Compute mean and standard error for several experiments and plot the learning curve
    :param folders: ([str]) Log folders, where the monitor.csv are stored
    :param window: (int) Smoothing window
    :param algo: (str) name of the RL algo
    :param title: (str) plot title
    :param min_num_x: (int) Minimum number of episode/timesteps to keep an experiment (default: -1, no minimum)
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param y_limits: ([float]) y-limits for the plot
    :param output_file: (str) Path to a file where the plot data will be saved
    :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot)
    """
    y_list = []
    x_list = []
    ok = False
    for folder in folders:
        if timesteps:
            x, y = loadData(folder, smooth=1, bin_size=100)
            if x is not None:
                x, y = np.array(x), np.array(y)
        else:
            x, y = loadEpisodesData(folder)

        if x is None or (min_num_x > 0 and y.shape[0] < min_num_x):
            printRed("Skipping {}".format(folder))
            continue

        if y.shape[0] <= window:
            printYellow("Folder {}".format(folder))
            printRed("Not enough episodes for current window size = {}".format(
                window))
            continue
        ok = True
        y = movingAverage(y, window)
        y_list.append(y)
        print(len(x))
        # Truncate x
        x = x[len(x) - len(y):]
        x_list.append(x)

    if not ok:
        printRed("Not enough data to plot anything with current config." +
                 " Consider decreasing --min-x")
        return

    lengths = list(map(len, x_list))
    min_x, max_x = np.min(lengths), np.max(lengths)

    print("Min x: {}".format(min_x))
    print("Max x: {}".format(max_x))

    for i in range(len(x_list)):
        x_list[i] = x_list[i][:min_x]
        y_list[i] = y_list[i][:min_x]

    x = np.array(x_list)[0]
    y = np.array(y_list)
    # if output_file != "":
    #     printGreen("Saving aggregated data to {}.npz".format(output_file))
    #     np.savez(output_file, x=x, y=y)
    return x, y
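
# movingAverage is defined elsewhere in the repository; the helper below is only a sketch of a
# compatible implementation (an assumption), kept here to make the smoothing step above concrete:
# with mode 'valid' the output has len(values) - window + 1 points, matching the x truncation.
import numpy as np

def movingAverageSketch(values, window):
    weights = np.repeat(1.0, window) / window
    return np.convolve(values, weights, 'valid')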
Example #8
def main():
    # Global variables for callback
    parser = argparse.ArgumentParser(
        description=
        "Evaluation script for distillation from two teacher policies")
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='random seed (default: 0)')
    parser.add_argument('--env',
                        type=str,
                        help='environment ID',
                        default='OmnirobotEnv-v0',
                        choices=list(registered_env.keys()))
    parser.add_argument(
        '--episode_window',
        type=int,
        default=40,
        help='Episode window for moving average plot (default: 40)')
    parser.add_argument(
        '--log-dir-teacher-one',
        default='/tmp/gym/',
        type=str,
        help=
        'directory to load an optimal agent for task 1 (default: /tmp/gym)')
    parser.add_argument(
        '--log-dir-teacher-two',
        default='/tmp/gym/',
        type=str,
        help=
        'directory to load an optimal agent for task 2 (default: /tmp/gym)')
    parser.add_argument(
        '--log-dir-student',
        default='/tmp/gym/',
        type=str,
        help=
        'directory to save the student agent logs and model (default: /tmp/gym)'
    )
    parser.add_argument(
        '--srl-config-file-one',
        type=str,
        default="config/srl_models_one.yaml",
        help='Set the location of the SRL model path configuration.')
    parser.add_argument(
        '--srl-config-file-two',
        type=str,
        default="config/srl_models_two.yaml",
        help='Set the location of the SRL model path configuration.')
    parser.add_argument(
        '--epochs-distillation',
        type=int,
        default=30,
        metavar='N',
        help='number of epochs to train for distillation (default: 30)')
    parser.add_argument(
        '--distillation-training-set-size',
        type=int,
        default=-1,
        help='Limit size (number of samples) of the training set (default: -1)'
    )
    parser.add_argument(
        '--eval-tasks',
        type=str,
        nargs='+',
        default=['cc', 'sqc', 'sc'],
        help='A cross evaluation from the latest stored model to all tasks')
    parser.add_argument(
        '--continual-learning-labels',
        type=str,
        nargs=2,
        metavar=('label_1', 'label_2'),
        default=argparse.SUPPRESS,
        help='Labels for the continual learning RL distillation task.')
    parser.add_argument('--student-srl-model',
                        type=str,
                        default='raw_pixels',
                        choices=list(registered_srl.keys()),
                        help='SRL model to use for the student RL policy')
    parser.add_argument(
        '--epochs-teacher-datasets',
        type=int,
        default=30,
        metavar='N',
        help=
        'number of epochs for generating both RL teacher datasets (default: 30)'
    )
    parser.add_argument(
        '--num-iteration',
        type=int,
        default=1,
        help='number of times each algorithm should be run for the eval (N seeds).')
    parser.add_argument(
        '--eval-episode-window',
        type=int,
        default=400,
        metavar='N',
        help=
        'Episode window for saving each policy checkpoint for future distillation (default: 400)'
    )

    args, unknown = parser.parse_known_args()

    if 'continual_learning_labels' in args:
        assert args.continual_learning_labels[0] in CONTINUAL_LEARNING_LABELS and args.continual_learning_labels[1] \
               in CONTINUAL_LEARNING_LABELS, "Please specify a valid continual learning label for each dataset " \
                                             "to be used for RL distillation!"
    print(args.continual_learning_labels)
    assert os.path.exists(args.srl_config_file_one), \
        "Error: cannot load \"--srl-config-file {}\", file not found!".format(args.srl_config_file_one)

    assert os.path.exists(args.srl_config_file_two), \
        "Error: cannot load \"--srl-config-file {}\", file not found!".format(args.srl_config_file_two)
    if not (args.log_dir_teacher_one == "None"):
        assert os.path.exists(args.log_dir_teacher_one), \
            "Error: cannot load \"--srl-config-file {}\", file not found!".format(args.log_dir_teacher_one)
    assert os.path.exists(args.log_dir_teacher_two), \
        "Error: cannot load \"--srl-config-file {}\", file not found!".format(args.srl_config_file_two)

    teacher_pro = args.log_dir_teacher_one
    teacher_learn = args.log_dir_teacher_two

    # The output paths generated from the continual learning labels
    teacher_pro_data = args.continual_learning_labels[0] + '/'
    teacher_learn_data = args.continual_learning_labels[1] + '/'
    merge_path = "data/on_policy_merged"

    print(teacher_pro_data, teacher_learn_data)
    episodes, policy_path = allPolicy(teacher_learn)

    rewards_at_episode = {}
    episodes_to_test = [
        e for e in episodes if (int(e) < 2000 and int(e) % 200 == 0) or (
            int(e) > 2000 and int(e) % 1000 == 0)
    ]

    # generate data from the professional (optimal) teacher
    printYellow("\nGenerating on-policy data for the optimal teacher: " +
                args.continual_learning_labels[0])

    if not (args.log_dir_teacher_one == "None"):
        OnPolicyDatasetGenerator(teacher_pro,
                                 args.continual_learning_labels[0] + '_copy/',
                                 task_id=args.continual_learning_labels[0],
                                 num_eps=args.epochs_teacher_datasets,
                                 episode=-1,
                                 env_name=args.env)
    print("Eval on eps list: ", episodes_to_test)
    for eps in episodes_to_test:
        student_path = args.log_dir_student
        printBlue("\n\nEvaluation at episode " + str(eps))

        if not (args.log_dir_teacher_one == "None"):
            # Use a copy of the optimal teacher
            ok = subprocess.call([
                'cp', '-r',
                'data/' + args.continual_learning_labels[0] + '_copy/',
                'data/' + teacher_pro_data, '-f'
            ])
            assert ok == 0
            time.sleep(2)

        # Generate data from learning teacher
        printYellow("\nGenerating on-policy data from the optimal teacher: " +
                    args.continual_learning_labels[1])
        OnPolicyDatasetGenerator(teacher_learn,
                                 teacher_learn_data,
                                 task_id=args.continual_learning_labels[1],
                                 episode=eps,
                                 num_eps=args.epochs_teacher_datasets,
                                 env_name=args.env)

        if args.log_dir_teacher_one == "None":
            merge_path = 'data/' + teacher_learn_data
            ok = subprocess.call(
                ['cp', '-r', merge_path, 'srl_zoo/data/', '-f'])
        else:
            # merge the data
            mergeData('data/' + teacher_pro_data,
                      'data/' + teacher_learn_data,
                      merge_path,
                      force=True)

            ok = subprocess.call(
                ['cp', '-r', 'data/on_policy_merged/', 'srl_zoo/data/', '-f'])
        assert ok == 0
        time.sleep(2)

        # Train a policy with distillation on the merged teachers' datasets
        trainStudent('srl_zoo/' + merge_path,
                     args.continual_learning_labels[1],
                     yaml_file=args.srl_config_file_one,
                     log_dir=args.log_dir_student,
                     srl_model=args.student_srl_model,
                     env_name=args.env,
                     training_size=args.distillation_training_set_size,
                     epochs=args.epochs_distillation)
        student_path += args.env + '/' + args.student_srl_model + "/distillation/"
        latest_student_path = max([
            student_path + "/" + d for d in os.listdir(student_path)
            if os.path.isdir(student_path + "/" + d)
        ],
                                  key=os.path.getmtime) + '/'
        rewards = {}
        printRed("\nSaving the student at path: " + latest_student_path)
        for task_label in ["-sc", "-cc"]:
            rewards[task_label] = []

            for seed_i in range(args.num_iteration):
                printYellow("\nEvaluating student on task: " + task_label +
                            " for seed: " + str(seed_i))
                command_line_enjoy_student = [
                    'python', '-m', 'replay.enjoy_baselines',
                    '--num-timesteps', '251', '--log-dir', latest_student_path,
                    task_label, "--seed",
                    str(seed_i)
                ]
                ok = subprocess.check_output(command_line_enjoy_student)
                ok = ok.decode('utf-8')
                str_before = "Mean reward: "
                str_after = "\npybullet"
                idx_before = ok.find(str_before) + len(str_before)
                idx_after = ok.find(str_after)
                seed_reward = float(ok[idx_before:idx_after])
                rewards[task_label].append(seed_reward)
        print("rewards at eps ", eps, ": ", rewards)
        rewards_at_episode[eps] = rewards
    print("All rewards: ", rewards_at_episode)
    json_dict = json.dumps(rewards_at_episode)
    json_dict_name = \
        args.log_dir_student + "/reward_at_episode_" + datetime.datetime.now().strftime("%y-%m-%d_%Hh%M_%S") + '.json'
    with open(json_dict_name, "w") as f:
        f.write(json_dict)
    printRed("\nSaving the evalation at path: " + json_dict_name)
Example #9
def plotGatheredExperiments(folders,
                            algo,
                            y_limits,
                            window=40,
                            title="",
                            min_num_x=-1,
                            timesteps=False,
                            output_file="",
                            no_display=False):
    """
    Compute mean and standard error for several experiments and plot the learning curve
    :param folders: ([str]) Log folders, where the monitor.csv are stored
    :param window: (int) Smoothing window
    :param algo: (str) name of the RL algo
    :param title: (str) plot title
    :param min_num_x: (int) Minimum number of episode/timesteps to keep an experiment (default: -1, no minimum)
    :param timesteps: (bool) Plot timesteps instead of episodes
    :param y_limits: ([float]) y-limits for the plot
    :param output_file: (str) Path to a file where the plot data will be saved
    :param no_display: (bool) Set to true, the plot won't be displayed (useful when only saving plot)
    """
    y_list = []
    x_list = []
    ok = False
    for folder in folders:
        if timesteps:
            x, y = loadData(folder, smooth=1, bin_size=100)
            if x is not None:
                x, y = np.array(x), np.array(y)
        else:
            x, y = loadEpisodesData(folder)

        if x is None or (min_num_x > 0 and y.shape[0] < min_num_x):
            printYellow("Skipping {}".format(folder))
            continue

        if y.shape[0] <= window:
            printYellow("Folder {}".format(folder))
            printYellow(
                "Not enough episodes for current window size = {}".format(
                    window))
            continue
        ok = True
        y = movingAverage(y, window)
        y_list.append(y)

        # Truncate x
        x = x[len(x) - len(y):]
        x_list.append(x)

    if not ok:
        printRed("Not enough data to plot anything with current config." +
                 " Consider decreasing --min-x")
        return

    lengths = list(map(len, x_list))
    min_x, max_x = np.min(lengths), np.max(lengths)

    print("Min x: {}".format(min_x))
    print("Max x: {}".format(max_x))

    for i in range(len(x_list)):
        x_list[i] = x_list[i][:min_x]
        y_list[i] = y_list[i][:min_x]

    x = np.array(x_list)[0]
    y = np.array(y_list)

    printGreen("{} Experiments".format(y.shape[0]))
    print("Min, Max rewards:", np.min(y), np.max(y))

    fig = plt.figure(title)
    # Compute mean for different seeds
    m = np.mean(y, axis=0)
    # Compute standard deviation (the standard error is s / sqrt(n) below)
    s = np.squeeze(np.asarray(np.std(y, axis=0)))
    n = y.shape[0]
    plt.fill_between(x,
                     m - s / np.sqrt(n),
                     m + s / np.sqrt(n),
                     color=lightcolors[0])
    plt.plot(x, m, color=darkcolors[0], label=algo, linewidth=1)

    if timesteps:
        formatter = FuncFormatter(millions)
        plt.xlabel('Number of Timesteps')
        fig.axes[0].xaxis.set_major_formatter(formatter)
    else:
        plt.xlabel('Number of Episodes')
    plt.ylabel('Rewards')

    plt.title(title, **fontstyle)
    plt.ylim(y_limits)

    plt.legend(framealpha=0.5,
               labelspacing=0.01,
               loc='lower right',
               fontsize=16)

    if output_file != "":
        printGreen("Saving aggregated data to {}.npz".format(output_file))
        np.savez(output_file, x=x, y=y)

    if not no_display:
        plt.show()
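
# `millions` is imported from a plotting utility; a plausible FuncFormatter-compatible sketch
# (an assumption, not necessarily the repository's exact implementation):
def millionsSketch(x, pos):
    # matplotlib passes the tick value and its position; 2_000_000 -> '2.0M'
    return '{:.1f}M'.format(x * 1e-6)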
def prog_mlp_extractor(flat_observations, net_arch, act_fun, dict_res_tensor=None):
    # dict_res_tensor maps tensor names to residual tensors coming from a previous (progressive) column
    if dict_res_tensor is None:
        dict_res_tensor = {}
    printRed(net_arch)
    latent = flat_observations
    policy_only_layers = []  # Layer sizes of the network that only belong to the policy network
    value_only_layers = []  # Layer sizes of the network that only belong to the value network

    # Test purpose
    #
    # if(len(prev_cols)>0):
    #     model = prev_cols[0]
    #     sess = model.sess
    #     graph =sess.graph
    #
    #     model_variables = graph.get_operations()
    #     train_ops = []
    #     for op in model_variables:
    #         if("loss" not in str(op.name) and "model" in str(op.name) and "add" in str(op.name)):
    #             print(op.name)
    #             train_ops.append(op.values())
    #             # try:
    #             #     tmp = sess.run(op.values())
    #             #     if(tmp !=None):
    #             #         print(op.name,op.values(), tmp[0].shape)
    #             # except:
    #             #     print("feed_dict no compitible", op.name)
    #     #model / pi_fc0 / w: 0
    # Iterate through the shared layers and build the shared parts of the network
    for idx, layer in enumerate(net_arch):
        if isinstance(layer, int):  # Check that this is a shared layer
            layer_size = layer
            latent = act_fun(linear(latent, "shared_fc{}".format(idx), layer_size, init_scale=np.sqrt(2)))
        else:
            if 'pi' in layer:
                assert isinstance(layer['pi'], list), "Error: net_arch[-1]['pi'] must contain a list of integers."
                policy_only_layers = layer['pi']

            if 'vf' in layer:
                assert isinstance(layer['vf'], list), "Error: net_arch[-1]['vf'] must contain a list of integers."
                value_only_layers = layer['vf']
            break  # From here on the network splits up in policy and value network

    # Build the non-shared part of the network
    latent_policy = latent
    latent_value = latent
    for idx, (pi_layer_size, vf_layer_size) in enumerate(zip_longest(policy_only_layers, value_only_layers)):
        if pi_layer_size is not None:
            assert isinstance(pi_layer_size, int), "Error: net_arch[-1]['pi'] must only contain integers."
            latent_policy = (linear(latent_policy, "pi_fc{}".format(idx), pi_layer_size, init_scale=np.sqrt(2)))
            print(latent_policy.name)
            with tf.variable_scope("pi_res_{}".format(idx)):
                try:
                    print(dict_res_tensor[latent_policy.name])
                except KeyError:
                    printRed("Exception: residual insertion from previous model failed.")
            latent_policy = act_fun(latent_policy)
        if vf_layer_size is not None:
            assert isinstance(vf_layer_size, int), "Error: net_arch[-1]['vf'] must only contain integers."
            latent_value = (linear(latent_value, "vf_fc{}".format(idx), vf_layer_size, init_scale=np.sqrt(2)))
            with tf.variable_scope("vf_res_{}".format(idx)):
                latent_value = act_fun(latent_value + 2)
    return latent_policy, latent_value
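
# Usage sketch (an assumption, following the stable-baselines net_arch convention used above):
# integer entries build shared fully-connected layers, and the final dict splits the network
# into policy-only ('pi') and value-only ('vf') branches.
#
#   net_arch = [64, dict(pi=[64, 64], vf=[64, 64])]
#   latent_pi, latent_vf = prog_mlp_extractor(flat_obs, net_arch, tf.nn.relu, dict_res_tensor={})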
def main():
    parser = argparse.ArgumentParser(
        description='Deterministic dataset generator for SRL training ' +
        '(can be used for environment testing)')
    parser.add_argument('--num-cpu',
                        type=int,
                        default=1,
                        help='number of CPUs to run on')
    parser.add_argument('--num-episode',
                        type=int,
                        default=50,
                        help='number of episodes to run')
    parser.add_argument('--max_steps_per_epoch',
                        type=int,
                        default=200,
                        help='max num steps per epoch')

    # CUSTOM ARGS. Want to update eventually, i.e., specify a specific path for dr
    parser.add_argument(
        '--dr',
        action='store_true',
        default=False,
        help=
        "Include this flag to use the chosen environment with domain randomization"
    )
    parser.add_argument(
        '--alt',
        action='store_true',
        default=False,
        help=
        "Include this flag to use the chosen environment with alternate view")
    parser.add_argument(
        '--special_start',
        action='store_true',
        default=False,
        help=
        "Include this flag to use the chosen environment with the special start"
    )

    parser.add_argument(
        '--save-path',
        type=str,
        default='robotics-rl-srl/data/',
        help='Folder where the environments will save the output')
    parser.add_argument('--name',
                        type=str,
                        default='UNSETNAME',
                        help='Folder name for the output')
    parser.add_argument('--env',
                        type=str,
                        default='push_rotate',
                        help='The environment to use',
                        choices=list(envs.keys()))

    parser.add_argument('--display', action='store_true', default=False)
    parser.add_argument('--no-record-data', action='store_true', default=False)

    parser.add_argument('--seed', type=int, default=0, help='the seed')
    parser.add_argument(
        '-f',
        '--force',
        action='store_true',
        default=False,
        help='Force the save, even if it overwrites something else,' +
        ' including partial parts if they exist')

    # TODO: Change this argument to cover the different types of tasks
    parser.add_argument('--multi-view',
                        action='store_true',
                        default=False,
                        help='Set a second camera to the scene')

    parser.add_argument(
        '--reward-dist',
        action='store_true',
        default=False,
        help=
        'Prints out the reward distribution when the dataset generation is finished'
    )
    parser.add_argument('--run-ppo2',
                        action='store_true',
                        default=False,
                        help='runs a ppo2 agent instead of a random agent')
    parser.add_argument(
        '--ppo2-timesteps',
        type=int,
        default=1000,
        help='number of timesteps to run PPO2 on before generating the dataset'
    )

    args = parser.parse_args()

    assert (args.num_cpu >
            0), "Error: number of cpu must be positive and non zero"

    assert (args.num_episode >
            0), "Error: number of episodes must be positive and non zero"
    # assert not(registered_env[args.env][3] is ThreadingType.NONE and args.num_cpu != 1), \
    # "Error: cannot have more than 1 CPU for the environment {}".format(args.env)

    if args.num_cpu > args.num_episode:
        args.num_cpu = args.num_episode
        printYellow(
            "num_cpu cannot be greater than num_episode, defaulting to {} cpus."
            .format(args.num_cpu))

    # this is done so that seeds 0 and 1 are different, and not simply offsets of the same dataset.
    args.seed = np.random.RandomState(args.seed).randint(int(1e10))

    # File exists, need to deal with it
    if not args.no_record_data and os.path.exists(args.save_path + args.name):
        assert args.force, "Error: save directory '{}' already exists".format(
            args.save_path + args.name)

        shutil.rmtree(args.save_path + args.name)
        for part in glob.glob(args.save_path + args.name + "_part-[0-9]*"):
            shutil.rmtree(part)
    if not args.no_record_data:
        # create the output
        os.makedirs(args.save_path + args.name, exist_ok=True)

    if args.num_cpu == 1:
        env_thread(args, 0, partition=False, use_ppo2=args.run_ppo2)
    else:
        # try and divide into multiple processes, with an environment each
        try:
            jobs = []
            for i in range(args.num_cpu):
                process = multiprocessing.Process(target=env_thread,
                                                  args=(args, i, True,
                                                        args.run_ppo2))
                jobs.append(process)

            for j in jobs:
                j.start()

            try:
                for j in jobs:
                    j.join()
            except Exception as e:
                printRed("Error: unable to join thread")
                raise e

        except Exception as e:
            printRed("Error: unable to start thread")
            raise e

    if not args.no_record_data and args.num_cpu > 1:
        # sleep 1 second to avoid concurrency issues from multiprocessing (e.g., files still being written)
        time.sleep(1)
        # get all the parts
        file_parts = sorted(glob.glob(args.save_path + args.name +
                                      "_part-[0-9]*"),
                            key=lambda a: int(a.split("-")[-1]))

        # move the config files from any part, as they are identical across parts
        os.rename(file_parts[0] + "/dataset_config.json",
                  args.save_path + args.name + "/dataset_config.json")
        os.rename(file_parts[0] + "/env_globals.json",
                  args.save_path + args.name + "/env_globals.json")

        ground_truth = None
        preprocessed_data = None

        # used to convert the part record_id to the fused record_id
        record_id = 0
        for part in file_parts:
            # sort the record folders numerically by their index
            records = sorted(glob.glob(part + "/record_[0-9]*"),
                             key=lambda a: int(a.split("_")[-1]))

            record_id_start = record_id
            for record in records:
                os.renames(
                    record, args.save_path + args.name +
                    "/record_{:03d}".format(record_id))
                record_id += 1

            # fuse the npz files together, in the right order
            if ground_truth is None:
                # init
                ground_truth = {}
                preprocessed_data = {}
                ground_truth_load = np.load(part + "/ground_truth.npz")
                preprocessed_data_load = np.load(part +
                                                 "/preprocessed_data.npz")

                for arr in ground_truth_load.files:
                    if arr == "images_path":
                        ground_truth[arr] = np.array([
                            convertImagePath(args, path, record_id_start)
                            for path in ground_truth_load[arr]
                        ])
                    else:
                        ground_truth[arr] = ground_truth_load[arr]
                for arr in preprocessed_data_load.files:
                    preprocessed_data[arr] = preprocessed_data_load[arr]

            else:
                ground_truth_load = np.load(part + "/ground_truth.npz")
                preprocessed_data_load = np.load(part +
                                                 "/preprocessed_data.npz")

                for arr in ground_truth_load.files:
                    if arr == "images_path":
                        sanitised_paths = np.array([
                            convertImagePath(args, path, record_id_start)
                            for path in ground_truth_load[arr]
                        ])
                        ground_truth[arr] = np.concatenate(
                            (ground_truth[arr], sanitised_paths))
                    else:
                        ground_truth[arr] = np.concatenate(
                            (ground_truth[arr], ground_truth_load[arr]))
                for arr in preprocessed_data_load.files:
                    preprocessed_data[arr] = np.concatenate(
                        (preprocessed_data[arr], preprocessed_data_load[arr]))

            # remove the current part folder
            shutil.rmtree(part)

        # save the fused outputs
        np.savez(args.save_path + args.name + "/ground_truth.npz",
                 **ground_truth)
        np.savez(args.save_path + args.name + "/preprocessed_data.npz",
                 **preprocessed_data)

    if args.reward_dist:
        rewards, counts = np.unique(
            np.load(args.save_path + args.name +
                    "/preprocessed_data.npz")['rewards'],
            return_counts=True)
        counts = [
            "{:.2f}%".format(val * 100) for val in counts / np.sum(counts)
        ]
        print("reward distribution:")
        for reward, count in zip(rewards, counts):
            print(" ", reward, count)