def start_exp(config_path):
    """Set up logging, seeding and run storage, then launch the experiment.

    Args:
        config_path: Configuration source handed to ``ex.add_config``.

    """
    # Send log records to standard output.
    stdout_sink = dowel.StdOutput()
    logger.add_output(stdout_sink)

    # The seed is hard-coded to 21 for every run started through this helper.
    deterministic.set_seed(21)

    # Register a file storage observer constructed with "runs" before the
    # configuration is loaded and the experiment is started.
    run_store = FileStorageObserver("runs")
    ex.observers.append(run_store)
    ex.add_config(config_path)
    ex.run()
Ejemplo n.º 2
0
def run_experiment(argv):
    """Parse launcher arguments, configure logging/snapshots and run a job.

    Depending on the parsed arguments this either resumes a saved run through
    ``LocalRunner`` or decodes and executes the payload given in
    ``--args_data``. The snapshotter directory/mode and the logger outputs and
    prefix configured here are restored/removed in a ``finally`` block, so
    they are cleaned up even when the experiment raises.

    Args:
        argv (list[str]): Command-line arguments. ``argv[0]`` is skipped and
            the remaining items are parsed with ``argparse``.

    """
    now = datetime.datetime.now(dateutil.tz.tzlocal())

    # avoid name clashes when running distributed jobs
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')

    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--n_parallel',
        type=int,
        default=1,
        help=('Number of parallel workers to perform rollouts. '
              "0 => don't start any workers"))
    parser.add_argument('--exp_name',
                        type=str,
                        default=default_exp_name,
                        help='Name of the experiment.')
    parser.add_argument('--log_dir',
                        type=str,
                        default=None,
                        help='Path to save the log and iteration snapshot.')
    parser.add_argument('--snapshot_mode',
                        type=str,
                        default='all',
                        help='Mode to save the snapshot. Can be either "all" '
                        '(all iterations will be saved), "last" (only '
                        'the last iteration will be saved), "gap" (every'
                        '`snapshot_gap` iterations are saved), or "none" '
                        '(do not save snapshots)')
    parser.add_argument('--snapshot_gap',
                        type=int,
                        default=1,
                        help='Gap between snapshot iterations.')
    parser.add_argument('--tabular_log_file',
                        type=str,
                        default='progress.csv',
                        help='Name of the tabular log file (in csv).')
    parser.add_argument('--text_log_file',
                        type=str,
                        default='debug.log',
                        help='Name of the text log file (in pure text).')
    parser.add_argument('--tensorboard_step_key',
                        type=str,
                        default=None,
                        help='Name of the step key in tensorboard_summary.')
    parser.add_argument('--params_log_file',
                        type=str,
                        default='params.json',
                        help='Name of the parameter log file (in json).')
    parser.add_argument('--variant_log_file',
                        type=str,
                        default='variant.json',
                        help='Name of the variant log file (in json).')
    parser.add_argument(
        '--resume_from_dir',
        type=str,
        default=None,
        help='Directory of the pickle file to resume experiment from.')
    parser.add_argument('--resume_epoch',
                        type=str,
                        default=None,
                        help='Index of iteration to restore from. '
                        'Can be "first", "last" or a number. '
                        'Not applicable when snapshot_mode="last"')
    parser.add_argument('--plot',
                        type=ast.literal_eval,
                        default=False,
                        help='Whether to plot the iteration results')
    parser.add_argument(
        '--log_tabular_only',
        type=ast.literal_eval,
        default=False,
        help='Print only the tabular log information (in a horizontal format)')
    parser.add_argument('--seed', type=int, help='Random seed for numpy')
    parser.add_argument('--args_data',
                        type=str,
                        help='Pickled data for objects')
    parser.add_argument('--variant_data',
                        type=str,
                        help='Pickled data for variant configuration')
    parser.add_argument('--use_cloudpickle',
                        type=ast.literal_eval,
                        default=False)

    args = parser.parse_args(argv[1:])

    if args.seed is not None:
        deterministic.set_seed(args.seed)

    # SIGINT is blocked for all processes created in parallel_sampler to avoid
    # the creation of sleeping and zombie processes.
    #
    # If the user interrupts run_experiment, there's a chance some processes
    # won't die due to a deadlock condition where one of the children in the
    # parallel sampler exits without releasing a lock after it catches
    # SIGINT.
    #
    # Later the parent tries to acquire the same lock to proceed with its
    # cleanup, but it remains sleeping waiting for the lock to be released.
    # In the meantime, all the processes in the parallel sampler remain in the
    # zombie state since the parent cannot proceed with their clean up.
    with mask_signals([signal.SIGINT]):
        if args.n_parallel > 0:
            parallel_sampler.initialize(n_parallel=args.n_parallel)
            if args.seed is not None:
                parallel_sampler.set_seed(args.seed)

    if not args.plot:
        garage.plotter.Plotter.disable()
        garage.tf.plotter.Plotter.disable()

    # Log directory precedence: explicit --log_dir, then the directory being
    # resumed from, then ./data/<exp_name>.
    if args.log_dir is None:
        if args.resume_from_dir is None:
            log_dir = osp.join(osp.join(os.getcwd(), 'data'), args.exp_name)
        else:
            log_dir = args.resume_from_dir
    else:
        log_dir = args.log_dir

    tabular_log_file = osp.join(log_dir, args.tabular_log_file)
    text_log_file = osp.join(log_dir, args.text_log_file)
    params_log_file = osp.join(log_dir, args.params_log_file)

    # SECURITY NOTE: --variant_data and --args_data are unpickled below.
    # Unpickling executes arbitrary code, so these arguments must only ever
    # come from a trusted launcher.
    if args.variant_data is not None:
        variant_data = pickle.loads(base64.b64decode(args.variant_data))
        variant_log_file = osp.join(log_dir, args.variant_log_file)
        dump_variant(variant_log_file, variant_data)
    else:
        variant_data = None

    if not args.use_cloudpickle:
        log_parameters(params_log_file, args)

    logger.add_output(dowel.TextOutput(text_log_file))
    logger.add_output(dowel.CsvOutput(tabular_log_file))
    logger.add_output(dowel.TensorBoardOutput(log_dir))
    logger.add_output(dowel.StdOutput())

    # Remember the snapshotter state so it can be put back afterwards.
    # NOTE: only snapshot_dir and snapshot_mode are saved and restored;
    # snapshot_gap keeps the value assigned here.
    prev_snapshot_dir = snapshotter.snapshot_dir
    prev_mode = snapshotter.snapshot_mode
    snapshotter.snapshot_dir = log_dir
    snapshotter.snapshot_mode = args.snapshot_mode
    snapshotter.snapshot_gap = args.snapshot_gap
    logger.push_prefix('[%s] ' % args.exp_name)

    try:
        if args.resume_from_dir is not None:
            with LocalRunner() as runner:
                runner.restore(args.resume_from_dir,
                               from_epoch=args.resume_epoch)
                runner.resume()
        else:
            # The job payload is taken from the --args_data argument.
            if args.use_cloudpickle:
                import cloudpickle
                method_call = cloudpickle.loads(
                    base64.b64decode(args.args_data))
                try:
                    method_call(variant_data)
                except BaseException:
                    # Shut down plotter and sampler children before the
                    # exception propagates.
                    children = garage.plotter.Plotter.get_plotters()
                    children += garage.tf.plotter.Plotter.get_plotters()
                    if args.n_parallel > 0:
                        children += [parallel_sampler]
                    child_proc_shutdown(children)
                    raise
            else:
                data = pickle.loads(base64.b64decode(args.args_data))
                maybe_iter = concretize(data)
                if is_iterable(maybe_iter):
                    # Exhaust the iterable; its items are not used.
                    for _ in maybe_iter:
                        pass
    finally:
        # Always undo the global logger/snapshotter changes, including when
        # the experiment fails, so a later run in the same process does not
        # inherit stale outputs, prefixes or snapshot settings.
        snapshotter.snapshot_mode = prev_mode
        snapshotter.snapshot_dir = prev_snapshot_dir
        logger.remove_all()
        logger.pop_prefix()
Ejemplo n.º 3
0
def run_metarl(env, envs, tasks, seed, log_dir):
    """Train RL2 with a PPO inner algorithm and evaluate on ML45 test tasks.

    Args:
        env: Environment whose ``spec`` is used to build the policy,
            baseline and inner algorithm.
        envs: Environments handed to ``runner.setup``.
        tasks: Task sampler handed to ``RL2``.
        seed (int): Seed passed to ``deterministic.set_seed``.
        log_dir (str): Directory used for snapshots and log outputs.

    Returns:
        str: Path to the tabular (csv) log file.

    """
    deterministic.set_seed(seed)
    hp = hyper_parameters
    snapshot_config = SnapshotConfig(snapshot_dir=log_dir,
                                     snapshot_mode='gap',
                                     snapshot_gap=10)
    with LocalTFRunner(snapshot_config) as runner:
        policy = GaussianGRUPolicy(hidden_dims=hp['hidden_sizes'],
                                   env_spec=env.spec,
                                   state_include_action=False)

        baseline = MetaRLLinearFeatureBaseline(env_spec=env.spec)

        # The inner algorithm's path length is max_path_length multiplied by
        # the number of rollouts per task.
        inner_path_length = hp['max_path_length'] * hp['rollout_per_task']
        inner_optimizer_args = {
            'max_epochs': hp['optimizer_max_epochs'],
            'tf_optimizer_args': {
                'learning_rate': hp['optimizer_lr'],
            },
        }
        inner_algo = RL2PPO(env_spec=env.spec,
                            policy=policy,
                            baseline=baseline,
                            max_path_length=inner_path_length,
                            discount=hp['discount'],
                            gae_lambda=hp['gae_lambda'],
                            lr_clip_range=hp['lr_clip_range'],
                            optimizer_args=inner_optimizer_args)

        # Task names are only passed when meta_batch_size < number of tasks.
        train_task_names = list(ML45_ENVS['train'])
        meta_batch_size = hp['meta_batch_size']
        if meta_batch_size >= len(train_task_names):
            rl2_task_names = None
        else:
            rl2_task_names = train_task_names
        algo = RL2(policy=policy,
                   inner_algo=inner_algo,
                   max_path_length=hp['max_path_length'],
                   meta_batch_size=meta_batch_size,
                   task_sampler=tasks,
                   task_names=rl2_task_names)

        # run_experiment is not used here, so the logger is wired up by hand.
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        text_log_file = osp.join(log_dir, 'debug.log')
        add_sink = dowel_logger.add_output
        add_sink(dowel.TextOutput(text_log_file))
        add_sink(dowel.CsvOutput(tabular_log_file))
        add_sink(dowel.StdOutput())
        add_sink(dowel.TensorBoardOutput(log_dir))

        sampler_args = {'use_all_workers': hp['use_all_workers']}
        worker_args = {'n_paths_per_trial': hp['rollout_per_task']}
        runner.setup(algo,
                     envs,
                     sampler_cls=hp['sampler_cls'],
                     n_workers=hp['meta_batch_size'],
                     worker_class=RL2Worker,
                     sampler_args=sampler_args,
                     worker_args=worker_args)

        # Meta evaluator: the largest first observation dimension across the
        # test environments is passed to every RL2Env wrapper.
        max_obs_dim = max(env_cls().observation_space.shape[0]
                          for env_cls in ML45_ENVS['test'].values())
        ml_test_envs = []
        for task_id, (task, env_cls) in enumerate(ML45_ENVS['test'].items()):
            raw_env = env_cls(*ML45_ARGS['test'][task]['args'],
                              **ML45_ARGS['test'][task]['kwargs'])
            wrapped_env = TaskIdWrapper(RL2Env(raw_env, max_obs_dim),
                                        task_id=task_id,
                                        task_name=task)
            ml_test_envs.append(wrapped_env)
        test_tasks = task_sampler.EnvPoolSampler(ml_test_envs)
        test_tasks.grow_pool(hp['n_test_tasks'])

        # As for training: names are only passed when there are fewer test
        # tasks sampled than test environments available.
        test_task_names = list(ML45_ENVS['test'])
        if hp['n_test_tasks'] >= len(test_task_names):
            evaluator_task_names = None
        else:
            evaluator_task_names = test_task_names

        runner.setup_meta_evaluator(
            test_task_sampler=test_tasks,
            n_exploration_traj=hp['rollout_per_task'],
            n_test_rollouts=hp['test_rollout_per_task'],
            n_test_tasks=hp['n_test_tasks'],
            n_workers=hp['n_test_tasks'],
            test_task_names=evaluator_task_names)

        total_batch_size = (hp['meta_batch_size'] * hp['rollout_per_task'] *
                            hp['max_path_length'])
        runner.train(n_epochs=hp['n_itr'], batch_size=total_batch_size)

        dowel_logger.remove_all()

        return tabular_log_file
Ejemplo n.º 4
0
def main(args):
    """Evaluate benchmark checkpoints and record per-environment scores.

    For every entry in ``stages``, every environment, every method and every
    rival run index, this configures ``args.params``, iterates a
    ``training.Experiment`` collecting up to ``num_traj`` scores per run,
    prints and records summary statistics, and saves the collected scores
    with ``np.save``.

    Args:
        args: Parsed options. The code reads ``num_runs``, ``ping_every``,
            ``resume_runs``, ``logdir`` and ``params`` from it.
    """
    import dowel
    from dowel import logger, tabular
    training.utility.set_up_logging()
    # Maps a stage label (used in output file names) to a checkpoint name.
    stages = {'500k': 'model.ckpt-2502500'}
    # stages = {'1000k': 'model.ckpt-5005000'}
    # Maximum number of scores collected from each run.
    num_traj = 10
    # stages = {'100k': 'model.ckpt-500500', '500k': 'model.ckpt-2502500'}
    # stages = {'1M': 'model.ckpt-5005000'}
    # stages = {'final': 'model.ckpt-2652650'}
    # stages = {'100k': 'model.ckpt-500500', '500k': 'model.ckpt-2502500', '1M': 'model.ckpt-5005000'}
    # stages = {'100k': 'model.ckpt-600500', '500k': 'model.ckpt-3002500',
    #           'final':'model.ckpt-3182650'}
    # methods = ['weighted_100']
    # methods = ['aug7']
    methods = ['baseline3']
    # rival_method = 'baseline3'
    rival_method = 'aug7'
    rival_runs = 5
    base_dir = 'benchmark'
    envs = ['finger_spin']
    # envs = ['cartpole_swingup']
    # envs = ['finger_spin', 'cartpole_swingup','cheetah_run', 'cup_catch']
    # envs = ['finger_spin', 'cartpole_swingup', 'reacher_easy', 'cheetah_run']
    # envs = ['cartpole_swingup', 'cheetah_run', 'walker_walk', 'cup_catch']
    # envs = ['finger_spin', 'cartpole_swingup', 'reacher_easy', 'cheetah_run', 'walker_walk', 'cup_catch']
    # Stop immediately when check_finish reports a falsy result.
    # NOTE(review): exit() is the interactive-site helper; sys.exit() is the
    # usual choice in scripts.
    if not check_finish(base_dir, stages, methods, envs, args.num_runs):
        exit()

    for pref, chkpt in stages.items():
        print(pref, 'begin')
        # One stdout sink and one CSV file per stage; both are removed again
        # at the end of this iteration.
        logger.add_output(dowel.StdOutput())
        logger.add_output(dowel.CsvOutput('benchmark_{}.csv'.format(pref)))
        for env in envs:
            tabular.record('Env', env)
            for method in methods:
                # NOTE(review): `id` shadows the builtin of the same name.
                for id in range(rival_runs):
                    # Accumulators are reset for every rival run index.
                    means, stds, all_scores = [], [], []
                    # presumably `unlocked` temporarily permits mutating the
                    # params object — verify against its definition.
                    with args.params.unlocked:
                        args.params.chkpt = chkpt
                        args.params.tasks = [env]
                        args.params.planner_horizon = 12
                        args.params.eval_ratio = 1 / num_traj
                        # args.params.r_loss = 'contra'
                        # args.params.aug = 'rad'
                        args.params.planner = 'dual2'
                        # Rival path uses a 1-based index: <env>/<rival_method>/00<id + 1>.
                        args.params.rival = '{}/{}/00{}'.format(
                            env, rival_method, id + 1)

                    experiment = training.Experiment(
                        os.path.join(base_dir, env, method),
                        process_fn=functools.partial(process, args=args),
                        num_runs=args.num_runs,
                        ping_every=args.ping_every,
                        resume_runs=args.resume_runs,
                        planner=args.params.planner,
                        task_str=env)
                    for i, run in enumerate(experiment):
                        scores = []
                        # NOTE(review): this inner loop rebinds `i`, so after
                        # it finishes `i` is the index of the last score taken
                        # (not the run index) whenever `run` yields anything.
                        for i, unused_score in enumerate(run):
                            print('unused', unused_score)
                            scores.append(unused_score)
                            # Stop after num_traj scores.
                            if i == num_traj - 1:
                                break
                        means.append(np.mean(scores))
                        stds.append(np.std(scores))
                        all_scores.append(scores)
                        print(means)
                        # if args.params.planner != 'cem':
                        #     exit()
                        # NOTE(review): planner is set to 'dual2' above, so
                        # this branch is not taken with the current settings.
                        # The `i` in the path is the rebound inner-loop index
                        # — TODO confirm whether the run index was intended.
                        if args.params.planner == 'cem_eval':
                            np.save(
                                os.path.join(
                                    args.logdir, env, method,
                                    '00{}/scores_{}_cem.npy'.format(i, pref)),
                                np.array(all_scores))
                    # Mean and standard deviation of the per-run mean scores.
                    mean, std = np.mean(means), np.std(means)
                    print('{}    {}+/-{}'.format(method, int(mean), int(std)))
                    # Only strictly positive means are recorded in the table.
                    if mean > 0:
                        tabular.record(method,
                                       '{}+/-{}'.format(int(mean), int(std)))
                    # NOTE(review): the file name does not include the rival
                    # run index, so every `id` iteration writes to the same
                    # path (and records under the same tabular key).
                    np.save(
                        os.path.join(args.logdir, env, method,
                                     'scores_{}.npy'.format(pref)),
                        np.array(all_scores))
            logger.log(tabular)
            logger.dump_all()
        logger.remove_all()
Ejemplo n.º 5
0
def main(args):
    """Evaluate a trained policy on test tasks and log the results.

    Loads ``config.json``, ``policy.th`` and (when a dynamics model is
    enabled in the config) ``dynamics.th`` from ``args.output_folder``,
    rebuilds the sampler, then samples ``args.num_batches`` batches of test
    tasks, logging returns and trajectories for each batch. The ``test``
    sub-folder of the output folder is deleted and recreated on every call.

    Args:
        args: Parsed options. The code reads ``output_folder``, ``seed``,
            ``device``, ``num_workers``, ``num_batches`` and ``num_steps``.

    Raises:
        ValueError: If ``args.output_folder`` is set but does not exist.

    """
    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            raise ValueError(
                "The folder with the training files does not exist")

    policy_filename = os.path.join(args.output_folder, 'policy.th')
    dynamics_filename = os.path.join(args.output_folder, 'dynamics.th')
    config_filename = os.path.join(args.output_folder, 'config.json')

    text_log_file = os.path.join(args.output_folder, 'test_log.txt')
    tabular_log_file = os.path.join(args.output_folder, 'test_result.csv')

    # os.path.join copes with an output folder given with or without a
    # trailing separator.
    output_test_folder = os.path.join(args.output_folder, "test")

    # Start from an empty test folder on every invocation.
    if os.path.exists(output_test_folder):
        shutil.rmtree(output_test_folder)
    os.makedirs(output_test_folder)

    # Set up logger
    logger.add_output(dowel.StdOutput())
    logger.add_output(dowel.TextOutput(text_log_file))
    logger.add_output(dowel.CsvOutput(tabular_log_file))
    logger.add_output(
        dowel.TensorBoardOutput(output_test_folder, x_axis='Batch'))
    logger.log('Logging to {}'.format(output_test_folder))

    with open(config_filename, 'r') as f:
        config = json.load(f)

    # The seed stored in the training config takes precedence over the
    # command-line seed. The resolved value is what seeds the RNGs: the
    # previous code passed args.seed here, which ignored the config seed and
    # raised TypeError when only the config provided one.
    seed = config.get("seed", args.seed)
    if seed is not None:
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)

    # Metaworld
    if config['env-name'].startswith('Metaworld'):
        env_name = config['env-name'].replace("Metaworld-", "")
        metaworld = __import__('metaworld')
        class_ = getattr(metaworld, env_name)
        metaworld_benchmark = class_()
        # Instantiate and close every training class; `env` ends up bound to
        # the last one and is used below to size the policy and baseline.
        for env_cls in metaworld_benchmark.train_classes.values():
            env = env_cls()
            env.close()
        benchmark = metaworld_benchmark
    # Other gym envs
    else:
        env_name = config['env-name']
        env = gym.make(config['env-name'], **config.get('env-kwargs', {}))
        env.close()
        benchmark = None

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])

    with open(policy_filename, 'rb') as f:
        state_dict = torch.load(f, map_location=torch.device(args.device))
        policy.load_state_dict(state_dict)
    policy.share_memory()

    # Dynamics
    dynamics = get_dynamics_for_env(env,
                                    config['use_vime'],
                                    config['use_inv_vime'],
                                    args.device,
                                    config,
                                    benchmark=benchmark)
    inverse_dynamics = config['use_inv_vime']
    use_dynamics = config["use_vime"] or config["use_inv_vime"]

    # Dynamics weights are only loaded when one of the two flags is set.
    if use_dynamics:
        with open(dynamics_filename, 'rb') as f:
            state_dict = torch.load(f, map_location=torch.device(args.device))
            dynamics.load_state_dict(state_dict)
        dynamics.share_memory()

    # Eta: take the adapted value when adaptation was enabled, then convert
    # it with log(eta / (1 - eta)) before wrapping it in EtaParameter.
    if config['adapt_eta']:
        eta_value = torch.Tensor([config["adapted-eta"]])
    else:
        eta_value = torch.Tensor([config["eta"]])
    eta_value = torch.log(eta_value / (1 - eta_value))
    eta = EtaParameter(eta_value, adapt_eta=config['adapt_eta'])
    eta.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Normalization statistics shared with the sampler; normalization is on
    # unless the config says otherwise.
    normalize_spaces = config.get("normalize-spaces", True)
    act_prev_mean = mp.Manager().list()
    obs_prev_mean = mp.Manager().list()

    if normalize_spaces:
        obs_prev_mean.append({
            "mean": torch.Tensor(config["obs_mean"]),
            "std": torch.Tensor(config["obs_std"])
        })
        act_prev_mean.append({
            "mean": torch.Tensor(config["act_mean"]),
            "std": torch.Tensor(config["act_std"])
        })

    epochs_counter = mp.Value('i', 100)

    # NOTE(review): the sampler still receives args.seed rather than the
    # resolved `seed` — confirm whether it should follow the config seed too.
    sampler = MultiTaskSampler(
        config['env-name'],
        env_kwargs=config.get('env-kwargs', {}),
        batch_size=config['fast-batch-size'],  # TODO
        policy=policy,
        baseline=baseline,
        dynamics=dynamics,
        inverse_dynamics=inverse_dynamics,
        env=env,
        seed=args.seed,
        num_workers=args.num_workers,
        epochs_counter=epochs_counter,
        act_prev_mean=act_prev_mean,
        obs_prev_mean=obs_prev_mean,
        eta=eta,
        benchmark=benchmark,
        normalize_spaces=normalize_spaces)

    # `logs` accumulates tasks and returns across batches; it is kept in
    # memory only (nothing below writes it to disk).
    logs = {'tasks': []}
    train_returns, valid_returns = [], []
    for batch in trange(args.num_batches):
        tasks = sampler.sample_test_tasks(num_tasks=config['meta-batch-size'])
        train_episodes, valid_episodes = sampler.sample(
            tasks,
            num_steps=args.num_steps,
            fast_lr=config['fast-lr'],
            gamma=config['gamma'],
            gae_lambda=config['gae-lambda'],
            device=args.device)

        logs['tasks'].extend(tasks)
        train_returns.append(get_returns(train_episodes[0]))
        valid_returns.append(get_returns(valid_episodes))

        logs['train_returns'] = np.concatenate(train_returns, axis=0)
        logs['valid_returns'] = np.concatenate(valid_returns, axis=0)

        tabular.record("Batch", batch)

        log_returns(train_episodes,
                    valid_episodes,
                    batch,
                    log_dynamics=use_dynamics,
                    benchmark=benchmark,
                    env=env,
                    env_name=env_name,
                    is_testing=True)
        log_trajectories(config['env-name'], output_test_folder,
                         train_episodes, valid_episodes, batch)

        logger.log(tabular)

        logger.dump_all()

    logger.remove_all()
Ejemplo n.º 6
0
def run_garage_tf(env, seed, log_dir):
    """Train a garage TensorFlow PPO agent and log its progress.

    Args:
        env: Environment of the task; it is wrapped with ``normalize`` and
            ``TfEnv`` before use.
        seed (int): Seed passed to ``deterministic.set_seed``.
        log_dir (str): Directory that receives the log outputs.

    Returns:
        str: Path to the tabular (csv) log file.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))
        spec = env.spec

        policy = TF_GMP(env_spec=spec,
                        hidden_sizes=(32, 32),
                        hidden_nonlinearity=tf.nn.tanh,
                        output_nonlinearity=None)

        # Regressor settings for the baseline.
        regressor_args = {
            'hidden_sizes': (32, 32),
            'use_trust_region': False,
            'optimizer': FirstOrderOptimizer,
            'optimizer_args': {
                'batch_size': 32,
                'max_epochs': 10,
                'tf_optimizer_args': {'learning_rate': 3e-4},
            },
        }
        baseline = TF_GMB(env_spec=spec, regressor_args=regressor_args)

        # Optimizer settings for PPO itself.
        ppo_optimizer_args = {
            'batch_size': 32,
            'max_epochs': 10,
            'tf_optimizer_args': {'learning_rate': 3e-4},
            'verbose': True,
        }
        algo = TF_PPO(env_spec=spec,
                      policy=policy,
                      baseline=baseline,
                      max_path_length=hyper_parameters['max_path_length'],
                      discount=0.99,
                      gae_lambda=0.95,
                      center_adv=True,
                      lr_clip_range=0.2,
                      optimizer_args=ppo_optimizer_args)

        # run_experiment is not used here, so the logger is wired up by hand.
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        add_sink = dowel_logger.add_output
        add_sink(dowel.StdOutput())
        add_sink(dowel.CsvOutput(tabular_log_file))
        add_sink(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        n_epochs = hyper_parameters['n_epochs']
        batch_size = hyper_parameters['batch_size']
        runner.train(n_epochs=n_epochs, batch_size=batch_size)

        dowel_logger.remove_all()

        return tabular_log_file
Ejemplo n.º 7
0
def run_garage(env, seed, log_dir):
    """
    Build a TD3 agent with garage, train it and log the progress.

    Swap TD3 for another algorithm to benchmark something else.

    :param env: Environment of the task; it is wrapped with ``normalize``
        and ``TfEnv`` before use.
    :param seed: Random seed for the trial.
    :param log_dir: Directory that receives the log outputs.
    :return: Path to the tabular (csv) log file.
    """
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))
        spec = env.spec

        # Exploration noise: max and min sigma are set to the same value.
        sigma = params['sigma']
        exploration_noise = GaussianStrategy(spec,
                                             max_sigma=sigma,
                                             min_sigma=sigma)

        policy = ContinuousMLPPolicy(
            env_spec=spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        # Both Q-functions share every setting except their name.
        qf_kwargs = {
            'env_spec': spec,
            'hidden_sizes': params['qf_hidden_sizes'],
            'action_merge_layer': 0,
            'hidden_nonlinearity': tf.nn.relu,
        }
        qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                    **qf_kwargs)
        qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                     **qf_kwargs)

        replay_buffer = SimpleReplayBuffer(
            env_spec=spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'])

        adam = tf.compat.v1.train.AdamOptimizer
        td3 = TD3(spec,
                  policy=policy,
                  qf=qf,
                  qf2=qf2,
                  replay_buffer=replay_buffer,
                  steps_per_epoch=params['steps_per_epoch'],
                  policy_lr=params['policy_lr'],
                  qf_lr=params['qf_lr'],
                  target_update_tau=params['tau'],
                  n_train_steps=params['n_train_steps'],
                  discount=params['discount'],
                  smooth_return=params['smooth_return'],
                  min_buffer_size=params['min_buffer_size'],
                  buffer_batch_size=params['buffer_batch_size'],
                  exploration_strategy=exploration_noise,
                  policy_optimizer=adam,
                  qf_optimizer=adam)

        # run_experiment is not used here, so the logger is wired up by hand.
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        add_sink = dowel_logger.add_output
        add_sink(dowel.StdOutput())
        add_sink(dowel.CsvOutput(tabular_log_file))
        add_sink(dowel.TensorBoardOutput(log_dir))

        runner.setup(td3, env)
        n_epochs = params['n_epochs']
        rollout_steps = params['n_rollout_steps']
        runner.train(n_epochs=n_epochs, batch_size=rollout_steps)

        dowel_logger.remove_all()

        return tabular_log_file
Ejemplo n.º 8
0
def run_garage(env, seed, log_dir):
    """Build a DDPG agent with garage, train it and log the progress.

    Swap DDPG for another algorithm to benchmark something else.

    Args:
        env (gym.Env): Environment of the task; it is wrapped with
            ``normalize`` and ``TfEnv`` before use.
        seed (int): Random seed for the trial.
        log_dir (str): Directory that receives the log outputs.

    Returns:
        str: The log file path.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))
        spec = env.spec

        # Exploration noise used by DDPG.
        action_noise = OUStrategy(spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicy(
            env_spec=spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        qf = ContinuousMLPQFunction(env_spec=spec,
                                    hidden_sizes=params['qf_hidden_sizes'],
                                    hidden_nonlinearity=tf.nn.relu)

        replay_buffer = SimpleReplayBuffer(
            env_spec=spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'])

        adam = tf.compat.v1.train.AdamOptimizer
        algo = DDPG(env_spec=spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    steps_per_epoch=params['steps_per_epoch'],
                    policy_lr=params['policy_lr'],
                    qf_lr=params['qf_lr'],
                    target_update_tau=params['tau'],
                    n_train_steps=params['n_train_steps'],
                    discount=params['discount'],
                    min_buffer_size=int(1e4),
                    exploration_strategy=action_noise,
                    policy_optimizer=adam,
                    qf_optimizer=adam)

        # run_experiment is not used here, so the logger is wired up by hand.
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        tensorboard_log_dir = osp.join(log_dir)
        add_sink = dowel_logger.add_output
        add_sink(dowel.StdOutput())
        add_sink(dowel.CsvOutput(tabular_log_file))
        add_sink(dowel.TensorBoardOutput(tensorboard_log_dir))

        runner.setup(algo, env)
        n_epochs = params['n_epochs']
        rollout_steps = params['n_rollout_steps']
        runner.train(n_epochs=n_epochs, batch_size=rollout_steps)

        dowel_logger.remove_all()

        return tabular_log_file
Ejemplo n.º 9
0
def run_metarl(env, seed, log_dir):
    """Train RL2 with a PPO inner algorithm on a meta environment.

    Args:
        env: Environment description handed to ``_prepare_meta_env``, which
            returns the environment and task sampler used for training.
        seed (int): Seed passed to ``deterministic.set_seed``.
        log_dir (str): Directory used for snapshots and log outputs.

    Returns:
        str: Path to the tabular (csv) log file.

    """
    deterministic.set_seed(seed)
    hp = hyper_parameters
    snapshot_config = SnapshotConfig(snapshot_dir=log_dir,
                                     snapshot_mode='gap',
                                     snapshot_gap=10)
    with LocalTFRunner(snapshot_config) as runner:
        env, task_samplers = _prepare_meta_env(env)

        policy = GaussianGRUPolicy(hidden_dims=hp['hidden_sizes'],
                                   env_spec=env.spec,
                                   state_include_action=False)

        baseline = MetaRLLinearFeatureBaseline(env_spec=env.spec)

        # The inner algorithm's path length is max_path_length multiplied by
        # the number of rollouts per task.
        inner_path_length = hp['max_path_length'] * hp['rollout_per_task']
        inner_optimizer_args = {
            'max_epochs': hp['optimizer_max_epochs'],
            'tf_optimizer_args': {
                'learning_rate': hp['optimizer_lr'],
            },
        }
        inner_algo = RL2PPO(env_spec=env.spec,
                            policy=policy,
                            baseline=baseline,
                            max_path_length=inner_path_length,
                            discount=hp['discount'],
                            gae_lambda=hp['gae_lambda'],
                            lr_clip_range=hp['lr_clip_range'],
                            optimizer_args=inner_optimizer_args)

        algo = RL2(policy=policy,
                   inner_algo=inner_algo,
                   max_path_length=hp['max_path_length'],
                   meta_batch_size=hp['meta_batch_size'],
                   task_sampler=task_samplers)

        # run_experiment is not used here, so the logger is wired up by hand.
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        text_log_file = osp.join(log_dir, 'debug.log')
        add_sink = dowel_logger.add_output
        add_sink(dowel.TextOutput(text_log_file))
        add_sink(dowel.CsvOutput(tabular_log_file))
        add_sink(dowel.StdOutput())
        add_sink(dowel.TensorBoardOutput(log_dir))

        # The training environments are sampled from the same task sampler
        # that is later reused for meta evaluation.
        train_envs = task_samplers.sample(hp['meta_batch_size'])
        sampler_args = {'use_all_workers': hp['use_all_workers']}
        worker_args = {'n_paths_per_trial': hp['rollout_per_task']}
        runner.setup(algo,
                     train_envs,
                     sampler_cls=hp['sampler_cls'],
                     n_workers=hp['meta_batch_size'],
                     worker_class=RL2Worker,
                     sampler_args=sampler_args,
                     worker_args=worker_args)

        runner.setup_meta_evaluator(
            test_task_sampler=task_samplers,
            n_exploration_traj=hp['rollout_per_task'],
            n_test_rollouts=hp['test_rollout_per_task'],
            n_test_tasks=hp['n_test_tasks'],
            n_workers=hp['n_test_tasks'])

        total_batch_size = (hp['meta_batch_size'] * hp['rollout_per_task'] *
                            hp['max_path_length'])
        runner.train(n_epochs=hp['n_itr'], batch_size=total_batch_size)

        dowel_logger.remove_all()

        return tabular_log_file
Ejemplo n.º 10
0
def run_garage(env, seed, log_dir):
    """Build a DDPG learner backed by a HER replay buffer and train it.

    Hyperparameters are read from the module-level ``params`` mapping and
    the snapshot settings from the module-level ``snapshot_config``.

    :param env: Environment of the task. It is reset once and then wrapped
        as ``TfEnv(normalize(env))``.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path; receives ``progress.csv`` and the
        TensorBoard output.
    :return: Path of the tabular (csv) log file.
    """
    deterministic.set_seed(seed)
    env.reset()

    with LocalTFRunner(snapshot_config) as runner:
        env = TfEnv(normalize(env))

        ou_noise = OUStrategy(env.spec, sigma=params['sigma'])

        actor_kwargs = dict(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )
        actor = ContinuousMLPPolicy(**actor_kwargs)

        critic_kwargs = dict(
            env_spec=env.spec,
            hidden_sizes=params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )
        critic = ContinuousMLPQFunction(**critic_kwargs)

        her_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        ddpg_kwargs = dict(
            env_spec=env.spec,
            policy=actor,
            qf=critic,
            replay_buffer=her_buffer,
            policy_lr=params['policy_lr'],
            qf_lr=params['qf_lr'],
            target_update_tau=params['tau'],
            n_train_steps=params['n_train_steps'],
            discount=params['discount'],
            exploration_strategy=ou_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )
        learner = DDPG(**ddpg_kwargs)

        # run_experiment is not used here, so the logger outputs are
        # attached by hand: stdout, csv, then TensorBoard.
        csv_path = osp.join(log_dir, 'progress.csv')
        for make_output, ctor_args in ((dowel.StdOutput, ()),
                                       (dowel.CsvOutput, (csv_path, )),
                                       (dowel.TensorBoardOutput,
                                        (log_dir, ))):
            logger.add_output(make_output(*ctor_args))

        runner.setup(learner, env)
        runner.train(n_epochs=params['n_epochs'],
                     n_epoch_cycles=params['n_epoch_cycles'],
                     batch_size=params['n_rollout_steps'])

        logger.remove_all()

        return csv_path
Ejemplo n.º 11
0
def standard_eval(env,
                  policy,
                  n_episodes=20,
                  greedy=True,
                  load_from_file=False,
                  render=False,
                  recorder=None,
                  max_steps=10000):
    """Roll out ``policy`` in ``env`` and log the average episode return.

    Each episode accumulates ``np.mean(reward)`` per step. The mean over
    all episodes is logged as ``EvalAvgReturn`` and, unless
    ``load_from_file`` is set, also recorded to the tabular log.

    Args:
        env: Environment providing ``reset``, ``step``,
            ``get_avail_actions``, ``render`` and a ``centralized`` flag
            that selects between the torch (decentralized) and numpy
            (centralized) input paths.
        policy: Object providing ``reset`` and ``get_actions``.
        n_episodes: Number of evaluation episodes.
        greedy: Forwarded to ``policy.get_actions``.
        load_from_file: When true, a stdout logger output is added and the
            result is not recorded to the tabular log.
        render: Call ``env.render()`` before every step. Forced off when a
            ``recorder`` is given.
        recorder: Optional object whose ``capture_frame()`` is called
            before every step.
        max_steps: Maximum number of environment steps per episode.

    Returns:
        None. The result is only logged.
    """
    if recorder is not None:
        render = False  # force off
    if load_from_file:
        logger.add_output(dowel.StdOutput())
    logger.log('Evaluating policy, {} episodes, greedy = {} ...'.format(
        n_episodes, greedy))
    episode_rewards = []
    pbar = ProgBarCounter(n_episodes)
    for _ in range(n_episodes):
        obs = env.reset()
        policy.reset([True])
        terminated = False
        t = 0
        episode_rewards.append(0)
        while not terminated:
            if render:
                env.render()
            if recorder is not None:
                recorder.capture_frame()
            if not env.centralized:
                # obs.shape = (n_agents, n_envs, obs_dim)
                obs = torch.Tensor(obs).unsqueeze(1)  # add n_envs dim
                avail_actions = torch.Tensor(
                    env.get_avail_actions()).unsqueeze(1)
                actions, agent_infos = policy.get_actions(obs,
                                                          avail_actions,
                                                          greedy=greedy)
                # Drop the singleton n_envs dim again before stepping.
                if len(actions.shape) == 3:  # n-d action
                    actions = actions[:, 0, :]
                elif len(actions.shape) == 2:  # 1-d action
                    actions = actions[:, 0]
                obs, reward, terminated, info = env.step(actions)  # n_env = 1
                # The episode ends only once every entry reports done.
                terminated = all(terminated)
            else:
                # obs.shape = (n_envs, n_agents * obs_dim)
                obs = np.array([obs])
                avail_actions = np.array([env.get_avail_actions()])
                actions, agent_infos = policy.get_actions(obs,
                                                          avail_actions,
                                                          greedy=greedy)
                obs, reward, terminated, info = env.step(
                    actions[0])  # n_env = 1
            t += 1
            # Stop after exactly max_steps steps. The previous `t > max_steps`
            # test let an episode run one step past the limit.
            if t >= max_steps:
                terminated = True
            episode_rewards[-1] += np.mean(reward)
        pbar.inc(1)
    pbar.stop()
    policy.reset([True])
    avg_return = np.mean(episode_rewards)
    logger.log('EvalAvgReturn: {}'.format(avg_return))
    if not load_from_file:
        tabular.record('EvalAvgReturn', avg_return)
Ejemplo n.º 12
0
    def eval(self,
             policy,
             n_episodes=20,
             greedy=True,
             load_from_file=False,
             max_steps=60):
        """Evaluate ``policy`` on this environment; log return and success.

        Args:
            policy: Object providing ``reset`` and ``get_actions``.
            n_episodes: Number of evaluation episodes.
            greedy: Forwarded to ``policy.get_actions``.
            load_from_file: When true, a stdout logger output is added and
                nothing is recorded to the tabular log.
            max_steps: Step count at which an episode is ended even if the
                environment has not reported termination.
        """
        import dowel
        from dowel import logger, tabular
        from garage.misc.prog_bar_counter import ProgBarCounter

        if load_from_file:
            logger.add_output(dowel.StdOutput())
        logger.log('Evaluating policy, {} episodes, greedy = {} ...'.format(
            n_episodes, greedy))

        returns = []
        n_success = 0
        progress = ProgBarCounter(n_episodes)
        for _episode in range(n_episodes):
            obs = self.reset()
            policy.reset([True])
            steps_taken = 0
            episode_return = 0
            while True:
                if self.centralized:
                    # obs.shape = (n_envs, n_agents * obs_dim)
                    obs = np.array([obs])
                    avail = np.array([self.get_avail_actions()])
                    actions, _agent_infos = policy.get_actions(obs,
                                                               avail,
                                                               greedy=greedy)
                    obs, reward, done, _info = self.step(
                        actions[0])  # n_env = 1
                else:
                    # obs.shape = (n_agents, n_envs, obs_dim)
                    obs = torch.Tensor(obs).unsqueeze(1)  # add n_envs dim
                    avail = torch.Tensor(
                        self.get_avail_actions()).unsqueeze(1)
                    actions, _agent_infos = policy.get_actions(obs,
                                                               avail,
                                                               greedy=greedy)
                    action_rank = len(actions.shape)
                    if action_rank == 3:  # n-d action
                        actions = actions[:, 0, :]
                    elif action_rank == 2:  # 1-d action
                        actions = actions[:, 0]
                    obs, reward, done, _info = self.step(
                        actions)  # n_env = 1
                    done = all(done)
                steps_taken += 1
                episode_return += np.mean(reward)
                # The step cap is tested first so that, once it is hit, the
                # environment's done flag is not evaluated.
                if steps_taken >= max_steps or done:
                    break
            # episode end
            returns.append(episode_return)
            n_success += self.stat['success']
            progress.inc(1)
        progress.stop()
        policy.reset([True])

        avg_return = np.mean(returns)
        success_rate = n_success / n_episodes
        logger.log('EvalAvgReturn: {}'.format(avg_return))
        logger.log('EvalSucessRate: {}'.format(success_rate))
        if load_from_file:
            return
        tabular.record('EvalAvgReturn', avg_return)
        tabular.record('EvalSucessRate', success_rate)
Ejemplo n.º 13
0
def run_experiment(argv):
    """Parse launcher arguments, configure logging and run the experiment.

    The experiment callable is decoded from ``--args_data`` and invoked with
    the snapshot configuration, the decoded variant (or ``None``) and the
    two resume options. If the call raises, plotter and sampler children
    are shut down before the exception is re-raised.

    Args:
        argv: Full argument vector; ``argv[0]`` is skipped.
    """
    now = datetime.datetime.now(dateutil.tz.tzlocal())

    # avoid name clashes when running distributed jobs
    rand_id = str(uuid.uuid4())[:5]
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z')
    default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id)

    # (flag, add_argument keyword arguments), in registration order.
    cli_options = (
        ('--n_parallel',
         dict(type=int,
              default=1,
              help=('Number of parallel workers to perform rollouts. '
                    "0 => don't start any workers"))),
        ('--exp_name',
         dict(type=str,
              default=default_exp_name,
              help='Name of the experiment.')),
        ('--log_dir',
         dict(type=str,
              default=None,
              help='Path to save the log and iteration snapshot.')),
        ('--snapshot_mode',
         dict(type=str,
              default='last',
              help='Mode to save the snapshot. Can be either "all" '
              '(all iterations will be saved), "last" (only '
              'the last iteration will be saved), "gap" (every'
              '`snapshot_gap` iterations are saved), or "none" '
              '(do not save snapshots)')),
        ('--snapshot_gap',
         dict(type=int, default=1, help='Gap between snapshot iterations.')),
        ('--resume_from_dir',
         dict(type=str,
              default=None,
              help='Directory of the pickle file to resume experiment from.')
         ),
        ('--resume_from_epoch',
         dict(type=str,
              default=None,
              help='Index of iteration to restore from. '
              'Can be "first", "last" or a number. '
              'Not applicable when snapshot_mode="last"')),
        ('--tabular_log_file',
         dict(type=str,
              default='progress.csv',
              help='Name of the tabular log file (in csv).')),
        ('--text_log_file',
         dict(type=str,
              default='debug.log',
              help='Name of the text log file (in pure text).')),
        ('--tensorboard_step_key',
         dict(type=str,
              default=None,
              help='Name of the step key in tensorboard_summary.')),
        ('--params_log_file',
         dict(type=str,
              default='params.json',
              help='Name of the parameter log file (in json).')),
        ('--variant_log_file',
         dict(type=str,
              default='variant.json',
              help='Name of the variant log file (in json).')),
        ('--plot',
         dict(type=ast.literal_eval,
              default=False,
              help='Whether to plot the iteration results')),
        ('--log_tabular_only',
         dict(type=ast.literal_eval,
              default=False,
              help='Print only the tabular log information '
              '(in a horizontal format)')),
        ('--seed',
         dict(type=int, default=None, help='Random seed for numpy')),
        ('--args_data', dict(type=str, help='Pickled data for objects')),
        ('--variant_data',
         dict(type=str, help='Pickled data for variant configuration')),
    )
    parser = argparse.ArgumentParser()
    for flag, option_kwargs in cli_options:
        parser.add_argument(flag, **option_kwargs)

    args = parser.parse_args(argv[1:])

    has_seed = args.seed is not None
    use_workers = args.n_parallel > 0

    if has_seed:
        deterministic.set_seed(args.seed)

    if use_workers:
        parallel_sampler.initialize(n_parallel=args.n_parallel)
        if has_seed:
            parallel_sampler.set_seed(args.seed)

    if not args.plot:
        garage.plotter.Plotter.disable()
        garage.tf.plotter.Plotter.disable()

    # Without an explicit --log_dir, fall back to <cwd>/data/<exp_name>.
    log_dir = args.log_dir
    if log_dir is None:
        log_dir = os.path.join(os.getcwd(), 'data', args.exp_name)

    tabular_log_file = os.path.join(log_dir, args.tabular_log_file)
    text_log_file = os.path.join(log_dir, args.text_log_file)
    params_log_file = os.path.join(log_dir, args.params_log_file)

    # NOTE(review): --variant_data and --args_data are unpickled below, so
    # they must only ever come from a trusted launcher.
    variant_data = None
    if args.variant_data is not None:
        variant_data = pickle.loads(base64.b64decode(args.variant_data))
        dump_variant(os.path.join(log_dir, args.variant_log_file),
                     variant_data)

    log_parameters(params_log_file, args)

    for make_output, target in ((dowel.TextOutput, text_log_file),
                                (dowel.CsvOutput, tabular_log_file),
                                (dowel.TensorBoardOutput, log_dir)):
        logger.add_output(make_output(target))
    logger.add_output(dowel.StdOutput())

    logger.push_prefix('[%s] ' % args.exp_name)

    snapshot_config = SnapshotConfig(snapshot_dir=log_dir,
                                     snapshot_mode=args.snapshot_mode,
                                     snapshot_gap=args.snapshot_gap)

    method_call = cloudpickle.loads(base64.b64decode(args.args_data))
    try:
        method_call(snapshot_config, variant_data, args.resume_from_dir,
                    args.resume_from_epoch)
    except BaseException:
        # Tear down plotter (and sampler) children before propagating.
        children = garage.plotter.Plotter.get_plotters()
        children += garage.tf.plotter.Plotter.get_plotters()
        if use_workers:
            children += [parallel_sampler]
        child_proc_shutdown(children)
        raise

    logger.remove_all()
    logger.pop_prefix()
Ejemplo n.º 14
0
def run_metarl(env, test_env, seed, log_dir):
    """Assemble a PEARL-SAC learner and train it, logging into ``log_dir``.

    Hyperparameters are read from the module-level ``params`` mapping.

    Args:
        env: Indexable collection of environment constructors; ``env[0]()``
            is called to read the observation and action space shapes.
        test_env: Passed through to ``PEARLSAC`` as ``test_env``.
        seed: Random seed for the trial.
        log_dir: Directory for snapshots, ``progress.csv`` and the
            TensorBoard output.

    Returns:
        Path of the tabular (csv) log file.
    """

    def unit_box(size):
        # Box bounded to [-1, 1] with a flat float32 shape of ``size``.
        return akro.Box(low=-1, high=1, shape=(size, ), dtype=np.float32)

    deterministic.set_seed(seed)
    runner = LocalRunner(
        SnapshotConfig(snapshot_dir=log_dir,
                       snapshot_mode='gap',
                       snapshot_gap=10))

    # The first env constructor is invoked once per space lookup.
    obs_dim = int(np.prod(env[0]().observation_space.shape))
    action_dim = int(np.prod(env[0]().action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = params['latent_size']
    net_size = params['net_size']

    context_encoder = MLPEncoder(
        input_dim=obs_dim + action_dim + reward_dim,
        output_dim=latent_dim * 2,
        hidden_sizes=[200, 200, 200])

    # Spec whose observation is the raw observation joined with the latent.
    augmented_env = EnvSpec(unit_box(obs_dim + latent_dim),
                            unit_box(action_dim))

    qf1 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size] * 3)
    qf2 = ContinuousMLPQFunction(env_spec=augmented_env,
                                 hidden_sizes=[net_size] * 3)

    # The value function's spec uses the latent as its "action" space.
    vf_env = EnvSpec(unit_box(obs_dim), unit_box(latent_dim))
    vf = ContinuousMLPQFunction(env_spec=vf_env,
                                hidden_sizes=[net_size] * 3)

    inner_policy = TanhGaussianMLPPolicy2(env_spec=augmented_env,
                                          hidden_sizes=[net_size] * 3)

    context_conditioned_policy = ContextConditionedPolicy(
        latent_dim=latent_dim,
        context_encoder=context_encoder,
        policy=inner_policy,
        use_ib=params['use_information_bottleneck'],
        use_next_obs=params['use_next_obs_in_context'],
    )

    train_task_names = ML10.get_train_tasks()._task_names
    test_task_names = ML10.get_test_tasks()._task_names

    # Keyword arguments whose name equals their key in ``params``.
    passthrough_keys = (
        'num_train_tasks',
        'num_test_tasks',
        'meta_batch_size',
        'num_steps_per_epoch',
        'num_initial_steps',
        'num_tasks_sample',
        'num_steps_prior',
        'num_extra_rl_steps_posterior',
        'num_evals',
        'num_steps_per_eval',
        'batch_size',
        'embedding_batch_size',
        'embedding_mini_batch_size',
        'max_path_length',
        'reward_scale',
    )
    pearl_kwargs = {key: params[key] for key in passthrough_keys}

    pearlsac = PEARLSAC(
        env=env,
        test_env=test_env,
        policy=context_conditioned_policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        latent_dim=latent_dim,
        train_task_names=train_task_names,
        test_task_names=test_task_names,
        **pearl_kwargs,
    )

    use_gpu = params['use_gpu']
    tu.set_gpu_mode(use_gpu, gpu_id=0)
    if use_gpu:
        pearlsac.to()

    csv_path = osp.join(log_dir, 'progress.csv')
    tensorboard_log_dir = osp.join(log_dir)
    dowel_logger.add_output(dowel.StdOutput())
    dowel_logger.add_output(dowel.CsvOutput(csv_path))
    dowel_logger.add_output(dowel.TensorBoardOutput(tensorboard_log_dir))

    runner.setup(algo=pearlsac,
                 env=env,
                 sampler_cls=PEARLSampler,
                 sampler_args=dict(max_path_length=params['max_path_length']))
    runner.train(n_epochs=params['num_epochs'],
                 batch_size=params['batch_size'])

    dowel_logger.remove_all()

    return csv_path
Ejemplo n.º 15
0
def run_garage(env, seed, log_dir):
    """Train PPO with a Gaussian MLP policy and baseline on ``env``.

    :param env: Environment of the task; wrapped as
        ``TfEnv(normalize(env))`` inside the runner context.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path; receives ``progress.csv`` and the
        TensorBoard output.
    :return: Path of the tabular (csv) log file.
    """
    deterministic.set_seed(seed)

    with LocalRunner() as runner:
        env = TfEnv(normalize(env))

        actor = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        # The baseline regressor is fitted with a first-order optimizer
        # rather than a trust-region step.
        regressor_args = dict(
            hidden_sizes=(64, 64),
            use_trust_region=False,
            optimizer=FirstOrderOptimizer,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )
        value_baseline = GaussianMLPBaseline(env_spec=env.spec,
                                             regressor_args=regressor_args)

        ppo_kwargs = dict(
            env_spec=env.spec,
            policy=actor,
            baseline=value_baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )
        learner = PPO(**ppo_kwargs)

        # run_experiment is not used here, so the logger outputs are
        # attached by hand: stdout, csv, then TensorBoard.
        csv_path = osp.join(log_dir, 'progress.csv')
        for make_output, ctor_args in ((dowel.StdOutput, ()),
                                       (dowel.CsvOutput, (csv_path, )),
                                       (dowel.TensorBoardOutput,
                                        (log_dir, ))):
            dowel_logger.add_output(make_output(*ctor_args))

        runner.setup(learner, env)
        runner.train(n_epochs=488, batch_size=2048)

        dowel_logger.remove_all()

        return csv_path
def run_garage(env, seed, log_dir):
    """Train PPO with a categorical CNN policy and a CNN baseline.

    Network shapes and training lengths come from the module-level
    ``params`` mapping; snapshot settings come from the module-level
    ``snapshot_config``.

    :param env: Environment of the task; wrapped as
        ``TfEnv(normalize(env))`` inside the runner context.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path; receives ``progress.csv`` and the
        TensorBoard output.
    :return: Path of the tabular (csv) log file.
    """
    deterministic.set_seed(seed)

    # One thread/CPU budget shared by the TF session and the runner.
    n_threads = 12
    session = tf.Session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              intra_op_parallelism_threads=n_threads,
                              inter_op_parallelism_threads=n_threads))

    with LocalTFRunner(snapshot_config, sess=session,
                       max_cpus=n_threads) as runner:
        env = TfEnv(normalize(env))

        actor = CategoricalCNNPolicy(
            env_spec=env.spec,
            conv_filters=params['conv_filters'],
            conv_filter_sizes=params['conv_filter_sizes'],
            conv_strides=params['conv_strides'],
            conv_pad=params['conv_pad'],
            hidden_sizes=params['hidden_sizes'])

        regressor_args = dict(
            num_filters=params['conv_filters'],
            filter_dims=params['conv_filter_sizes'],
            strides=params['conv_strides'],
            padding=params['conv_pad'],
            hidden_sizes=params['hidden_sizes'],
            use_trust_region=params['use_trust_region'],
        )
        value_baseline = GaussianCNNBaseline(env_spec=env.spec,
                                             regressor_args=regressor_args)

        ppo_kwargs = dict(
            env_spec=env.spec,
            policy=actor,
            baseline=value_baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
            flatten_input=False,
        )
        learner = PPO(**ppo_kwargs)

        # run_experiment is not used here, so the logger outputs are
        # attached by hand: stdout, csv, then TensorBoard.
        csv_path = osp.join(log_dir, 'progress.csv')
        for make_output, ctor_args in ((dowel.StdOutput, ()),
                                       (dowel.CsvOutput, (csv_path, )),
                                       (dowel.TensorBoardOutput,
                                        (log_dir, ))):
            dowel_logger.add_output(make_output(*ctor_args))

        runner.setup(learner, env)
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['batch_size'])

        dowel_logger.remove_all()

        return csv_path
Ejemplo n.º 17
0
def run_garage_pytorch(env, seed, log_dir):
    """Build and train a PyTorch PPO learner, logging into ``log_dir``.

    Path length, epoch count and batch size are read from the module-level
    ``hyper_parameters`` mapping; snapshot settings come from the
    module-level ``snapshot_config``.

    Args:
        env: Environment of the task; wrapped as ``TfEnv(normalize(env))``.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    env = TfEnv(normalize(env))

    deterministic.set_seed(seed)

    runner = LocalRunner(snapshot_config)

    # Policy and value function share the same MLP layout.
    mlp_hidden = (32, 32)
    actor = PyTorch_GMP(env.spec,
                        hidden_sizes=mlp_hidden,
                        hidden_nonlinearity=torch.tanh,
                        output_nonlinearity=None)
    critic = GaussianMLPValueFunction(env_spec=env.spec,
                                      hidden_sizes=mlp_hidden,
                                      hidden_nonlinearity=torch.tanh,
                                      output_nonlinearity=None)

    # Each network gets its own Adam wrapper with identical settings.
    actor_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                       actor,
                                       max_optimization_epochs=10,
                                       minibatch_size=64)
    critic_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                        critic,
                                        max_optimization_epochs=10,
                                        minibatch_size=64)

    ppo_kwargs = dict(
        env_spec=env.spec,
        policy=actor,
        value_function=critic,
        policy_optimizer=actor_optimizer,
        vf_optimizer=critic_optimizer,
        max_path_length=hyper_parameters['max_path_length'],
        discount=0.99,
        gae_lambda=0.95,
        center_adv=True,
        lr_clip_range=0.2,
    )
    learner = PyTorch_PPO(**ppo_kwargs)

    # run_experiment is not used here, so the logger outputs are attached
    # by hand: stdout, csv, then TensorBoard.
    csv_path = osp.join(log_dir, 'progress.csv')
    for make_output, ctor_args in ((dowel.StdOutput, ()),
                                   (dowel.CsvOutput, (csv_path, )),
                                   (dowel.TensorBoardOutput, (log_dir, ))):
        dowel_logger.add_output(make_output(*ctor_args))

    runner.setup(learner, env)
    runner.train(n_epochs=hyper_parameters['n_epochs'],
                 batch_size=hyper_parameters['batch_size'])

    dowel_logger.remove_all()

    return csv_path
Ejemplo n.º 18
0
def run_metarl(env, seed, log_dir):
    """Build a DDPG learner and train it, logging into ``log_dir``.

    Hyperparameters are read from the module-level ``params`` mapping and
    snapshot settings from the module-level ``snapshot_config``.

    Args:
        env (gym.Env): Environment of the task; wrapped as
            ``TfEnv(normalize(env))`` inside the runner context.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Log file path.

    """
    deterministic.set_seed(seed)

    # One budget shared by the TF thread pools, the runner's CPU limit and
    # the sampler's environment count.
    n_parallel = 12
    session = tf.Session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              intra_op_parallelism_threads=n_parallel,
                              inter_op_parallelism_threads=n_parallel))

    with LocalTFRunner(snapshot_config, sess=session,
                       max_cpus=n_parallel) as runner:
        env = TfEnv(normalize(env))

        # Set up params for ddpg
        ou_noise = OUStrategy(env.spec, sigma=params['sigma'])

        actor = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='ContinuousMLPPolicy',
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        critic = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            name='ContinuousMLPQFunction')

        transitions = SimpleReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'])

        ddpg_kwargs = dict(
            env_spec=env.spec,
            policy=actor,
            qf=critic,
            replay_buffer=transitions,
            steps_per_epoch=params['steps_per_epoch'],
            policy_lr=params['policy_lr'],
            qf_lr=params['qf_lr'],
            target_update_tau=params['tau'],
            n_train_steps=params['n_train_steps'],
            discount=params['discount'],
            min_buffer_size=int(1e4),
            exploration_strategy=ou_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
        )
        learner = DDPG(**ddpg_kwargs)

        # run_experiment is not used here, so the logger outputs are
        # attached by hand: stdout, csv, then TensorBoard.
        csv_path = osp.join(log_dir, 'progress.csv')
        for make_output, ctor_args in ((dowel.StdOutput, ()),
                                       (dowel.CsvOutput, (csv_path, )),
                                       (dowel.TensorBoardOutput,
                                        (log_dir, ))):
            dowel_logger.add_output(make_output(*ctor_args))

        runner.setup(learner, env, sampler_args=dict(n_envs=n_parallel))
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['n_rollout_steps'])

        dowel_logger.remove_all()

        return csv_path
Ejemplo n.º 19
0
        help='glob for tags to save')
    parser.add_argument(
        '--outdir', default='events',
        help='output directory to store values')
    parser.add_argument(
        '--force', type=boolean, default=False,
        help='overwrite existing files')
    parser.add_argument(
        '--workers', type=int, default=10,
        help='number of worker threads')
    args_, remaining = parser.parse_known_args()

    args_.outdir = os.path.expanduser(args_.outdir)
    remaining.insert(0, sys.argv[0])
    tags = ['*general/objectives/reward']
    logger.add_output(dowel.StdOutput())
    logger.add_output(dowel.CsvOutput('loss.csv'))

    base_dir = 'benchmark'
    phases = ['train', 'test']
    envs = ['cartpole_swingup']
    # methods = ['aug2', 'aug3']
    # methods = ['baseline3', 'resample_traj4', 'resample_traj6', 'aug2', 'aug3', 'aug4', 'aug5']
    methods = ['aug6']
    # envs = ['finger_spin', 'cartpole_swingup', 'reacher_easy', 'cheetah_run', 'walker_walk', 'cup_catch']

    for tag in tags:
        for env in envs:
            for meth in methods:
                tabular.record('Method', meth)
                for phase in phases: