Example #1
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=args.batch_size,
            max_path_length=env.horizon,
            n_itr=args.num_epochs,
            discount=0.99,
            step_size=args.step_size,
            optimizer=ConjugateGradientOptimizer(
                reg_coeff=args.reg_coeff,
                hvp_approach=FiniteDifferenceHvp(base_eps=args.reg_coeff)))

        custom_train(algo, sess=sess)

        rollouts = algo.obtain_samples(args.num_epochs + 1)

        logger.log("Average reward for training rollouts on (%s): %f +- %f " %
                   (env_name, np.mean([np.sum(p['rewards'])
                                       for p in rollouts]),
                    np.std([np.sum(p['rewards']) for p in rollouts])))

    # Final evaluation on all environments using the learned policy

    total_rollouts = []
    for env_name, env in envs:
        rollouts = []
        for i in range(args.num_final_rollouts):
            rollout = rollout_policy(policy,
                                     env,
                                     max_path_length=env.horizon,
                                     speedup=1,
                                     get_image_observations=True,
                                     animated=True)
            rollouts.append(rollout)
            total_rollouts.append(rollout)

        logger.log("Average reward for eval rollouts on (%s): %f +- %f " %
                   (env_name, np.mean([np.sum(p['rewards']) for p in rollouts]),
                    np.std([np.sum(p['rewards']) for p in rollouts])))
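
custom_train is not shown in either snippet; a minimal sketch of what it might do, assuming it simply mirrors rllab's BatchPolopt.train loop while reusing the caller's already-open TensorFlow session (the sampler and optimizer calls below are the standard rllab BatchPolopt methods):

import tensorflow as tf

def custom_train(algo, sess):
    # Assumption: behaves like rllab's BatchPolopt.train, but reuses the caller's
    # already-entered session instead of creating (and later closing) its own.
    sess.run(tf.global_variables_initializer())
    algo.start_worker()                                  # spin up the sampler workers
    for itr in range(algo.n_itr):
        paths = algo.obtain_samples(itr)                 # collect ~batch_size steps of rollouts
        samples_data = algo.process_samples(itr, paths)  # fit baseline, compute advantages
        algo.optimize_policy(itr, samples_data)          # one constrained TRPO update
    algo.shutdown_worker()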
Example #2
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=50000,  # Mujoco tasks typically need a batch size of 20000-50000
    max_path_length=env.horizon,  # and a max path length of around 500
    n_itr=iters,
    discount=0.99,
    step_size=0.01,
    optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
)

# run_experiment_lite(
#     ,
#     n_parallel=1,
#     snapshot_mode="last",
#     seed=1
# )
with tf.Session() as sess:

    algo.train(sess=sess)

    rollouts = algo.obtain_samples(iters+1)
    print("Average reward for expert rollouts: %f" % np.mean([np.sum(p['rewards']) for p in rollouts]))

# import pdb; pdb.set_trace()

with open(args.expert_rollout_pickle_path, "wb") as output_file:
    pickle.dump(rollouts, output_file)
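
The rollouts pickled above are what load_expert_rollouts reads back in the run_experiment function of Example #3. Its implementation is not shown here, so the following is a minimal sketch assuming it simply unpickles the file:

import pickle

def load_expert_rollouts(expert_rollout_pickle_path):
    # Counterpart to the pickle.dump above: returns either a single path dict or a
    # list of path dicts, which run_experiment then normalizes to a list.
    with open(expert_rollout_pickle_path, "rb") as input_file:
        return pickle.load(input_file)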
Example #3
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=args.batch_size,
            max_path_length=env.horizon,
            n_itr=args.num_epochs,
            discount=0.99,
            step_size=args.step_size,
            optimizer=ConjugateGradientOptimizer(reg_coeff=args.reg_coeff, hvp_approach=FiniteDifferenceHvp(base_eps=args.reg_coeff))
        )

        custom_train(algo, sess=sess)

        rollouts = algo.obtain_samples(args.num_epochs + 1)

        logger.log("Average reward for training rollouts on (%s): %f +- %f " % (env_name, np.mean([np.sum(p['rewards']) for p in rollouts]),  np.std([np.sum(p['rewards']) for p in rollouts])))

    # Final evaluation on all environments using the learned policy

    total_rollouts = []
    for env_name, env in envs:
        rollouts = []
        for i in range(args.num_final_rollouts):
            rollout = rollout_policy(policy, env, max_path_length=env.horizon, speedup=1, get_image_observations=True, animated=True)
            rollouts.append(rollout)
            total_rollouts.append(rollout)

        logger.log("Average reward for eval rollouts on (%s): %f +- %f " % (env_name, np.mean([np.sum(p['rewards']) for p in rollouts]),  np.std([np.sum(p['rewards']) for p in rollouts])))
def run_experiment(expert_rollout_pickle_path,
                   trained_policy_pickle_path,
                   env,
                   cost_trainer_type,
                   iterations=30,
                   num_frames=1,
                   traj_len=200,
                   config={}):

    # Load the expert rollouts into memory
    expert_rollouts = load_expert_rollouts(expert_rollout_pickle_path)

    # In the case that we only have one expert rollout in the file
    if type(expert_rollouts) is dict:
        expert_rollouts = [expert_rollouts]

    #TODO: make this configurable
    expert_rollouts = [
        shorten_tensor_dict(x, traj_len) for x in expert_rollouts
    ]

    # import pdb; pdb.set_trace()

    # Sanity check, TODO: should prune any "expert" rollouts with suboptimal reward?
    print("Average reward for expert rollouts: %f" %
          np.mean([np.sum(p['rewards']) for p in expert_rollouts]))

    if "transformers" in config and len(config["transformers"]) > 0:
        print("Transforming expert rollouts...")
        for rollout in tqdm(expert_rollouts):
            transformed_observations = []
            for ob in tqdm(rollout["observations"]):
                for transformer in config["transformers"]:
                    ob = transformer.transform(ob)
                transformed_observations.append(ob)
            rollout["observations"] = np.array(transformed_observations)

    # Handle both flattened state input and image input
    # TODO: this could be done better by looking at just the shape and determining from that
    if config["img_input"]:
        obs_dims = expert_rollouts[0]['observations'][0].shape
    else:
        # import pdb; pdb.set_trace()
        obs_dims = len(expert_rollouts[0]['observations'][0])

    if "num_novice_rollouts" in config:
        number_of_sample_trajectories = config["num_novice_rollouts"]
    else:
        number_of_sample_trajectories = len(expert_rollouts)

    print(number_of_sample_trajectories)

    # Choose a policy (Conv based on images, mlp based on states)
    # TODO: may also have to switch out categorical for something else in continuous state spaces??
    # Let's just avoid that for now?
    if config["img_input"]:  # TODO: unclear right now if this even works ok. get poor results early on.
        policy = CategoricalConvPolicy(
            name="policy",
            env_spec=env.spec,
            conv_filters=[32, 64, 64],
            conv_filter_sizes=[3, 3, 3],
            conv_strides=[1, 1, 1],
            conv_pads=['SAME', 'SAME', 'SAME'],
            # Two hidden layers; the RLGAN paper suggests 100 hidden units each, but larger layers are used here
            hidden_sizes=[200, 200])
    elif type(env.spec.action_space) == Discrete:
        policy = CategoricalMLPPolicy(
            name="policy",
            env_spec=env.spec,
            # Two hidden layers; the RLGAN paper suggests 100 hidden units each, but larger layers are used here
            hidden_sizes=(400, 300))
    else:
        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(100, 50, 25))

    if config["img_input"]:
        # TODO: right now the linear feature baseline is too computationally expensive to actually use
        # with full image inputs, so for now just use the zero baseline
        baseline = ZeroBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        # batch_size and max_path_length are used internally by rllab's sampler; we rely
        # on that sampler to generate our samples, hence we pass them here. A cleaner way
        # would be to write our own sampler, but for now this works.
        batch_size=number_of_sample_trajectories * traj_len,
        max_path_length=traj_len,
        n_itr=40,
        discount=0.995,
        step_size=0.01,
        optimizer=ConjugateGradientOptimizer(
            hvp_approach=FiniteDifferenceHvp(base_eps=1e-5),
            max_backtracks=40))

    # Prune the number of rollouts if that option is enabled
    if "num_expert_rollouts" in config:
        rollouts_to_use = min(config["num_expert_rollouts"],
                              len(expert_rollouts))
        expert_rollouts = expert_rollouts[:rollouts_to_use]
        print("Only using %d expert rollouts" % rollouts_to_use)

    true_rewards = []
    actual_rewards = []

    # Extract observations to a tensor
    expert_rollouts_tensor = tensor_utils.stack_tensor_list(
        [path["observations"] for path in expert_rollouts])

    if "oversample" in config and config["oversample"]:
        oversample_rate = max(
            int(number_of_sample_trajectories / len(expert_rollouts_tensor)), 1)
        expert_rollouts_tensor = expert_rollouts_tensor.repeat(oversample_rate,
                                                               axis=0)
        print("oversampling %d times to %d" %
              (oversample_rate, len(expert_rollouts_tensor)))

    with tf.Session() as sess:
        algo.start_worker()

        cost_trainer = cost_trainer_type([num_frames, obs_dims], config=config)

        trainer = Trainer(env=env,
                          sess=sess,
                          cost_approximator=cost_trainer,
                          cost_trainer=cost_trainer,
                          novice_policy=policy,
                          novice_policy_optimizer=algo,
                          num_frames=num_frames)
        sess.run(tf.global_variables_initializer())

        for iter_step in range(0, iterations):
            dump_data = (iter_step == iterations - 1) and config["generate_option_graphs"]  # is last iteration
            true_reward, actual_reward = trainer.step(
                dump_datapoints=dump_data,
                config=config,
                expert_horizon=traj_len,
                number_of_sample_trajectories=number_of_sample_trajectories)
            true_rewards.append(true_reward)
            actual_rewards.append(actual_reward)

            # run a rollout for the video
            if "recording_env" in config:
                novice_rollouts = rollout_policy(policy,
                                                 config["recording_env"],
                                                 get_image_observations=False,
                                                 max_path_length=200)

        novice_rollouts = algo.obtain_samples(iter_step)

        rollout_rewards = [np.sum(x['rewards']) for x in novice_rollouts]

        print("Reward stats for final policy: %f +/- %f " %
              (np.mean(rollout_rewards), np.std(rollout_rewards)))
        # save the novice policy learned
        with open(trained_policy_pickle_path, "wb") as output_file:
            pickle.dump(policy, output_file)
        # TODO: also save the reward function?

        algo.shutdown_worker()

        second_true_rewards = []
        second_actual_rewards = []
        # Do our transfer learning task here:
        # TODO: move this to a separate script and save the learned weights
        if config['second_env'] is not None:
            with tf.variable_scope("second_policy"):
                #TODO: remove gross copypasta
                if not config["reset_second_policy"]:
                    second_policy = Serializable.clone(policy)  # TODO: start with a fresh policy
                else:
                    if config["img_input"]:  # TODO: unclear right now if this even works ok. get poor results early on.
                        second_policy = CategoricalConvPolicy(
                            name="policy",
                            env_spec=config["second_env"].spec,
                            conv_filters=[32, 64, 64],
                            conv_filter_sizes=[3, 3, 3],
                            conv_strides=[1, 1, 1],
                            conv_pads=['SAME', 'SAME', 'SAME'],
                            # Two hidden layers; the RLGAN paper suggests 100 hidden units each, but larger layers are used here
                            hidden_sizes=[200, 200])
                    elif type(env.spec.action_space) == Discrete:
                        second_policy = CategoricalMLPPolicy(
                            name="policy",
                            env_spec=config["second_env"].spec,
                            # Two hidden layers; the RLGAN paper suggests 100 hidden units each, but larger layers are used here
                            hidden_sizes=(400, 300))
                    else:
                        second_policy = GaussianMLPPolicy(
                            name="policy",
                            env_spec=config["second_env"].spec,
                            hidden_sizes=(100, 50, 25))

                if config["img_input"]:
                    # TODO: right now the linear feature baseline is too computationally expensive to actually use
                    # with full image inputs, so for now just use the zero baseline
                    baseline = ZeroBaseline(env_spec=config["second_env"].spec)
                else:
                    baseline = LinearFeatureBaseline(
                        env_spec=config["second_env"].spec)

                algo = TRPO(
                    env=config["second_env"],
                    policy=second_policy,
                    baseline=baseline,
                    # As above, batch_size and max_path_length are used internally by rllab's
                    # sampler, which we rely on to generate our samples.
                    batch_size=number_of_sample_trajectories * traj_len,
                    max_path_length=traj_len,
                    n_itr=40,
                    discount=0.995,
                    step_size=0.01,
                    optimizer=ConjugateGradientOptimizer(
                        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5),
                        max_backtracks=40))

            if (not config["stop_disc_training_on_second_run"]
                    and config["use_prev_options_relearn_mixing_func"]):
                # If we're not retraining the discriminator at all in the transfer learning step,
                # just keep the old network
                options = cost_trainer.disc.discriminator_options
                cost_trainer.disc._remake_network_from_disc_options(
                    options,
                    stop_gradients=(not config["retrain_options"]),
                    num_extra_options=config["num_extra_options_on_transfer"])

            trainer = Trainer(
                env=config['second_env'],
                sess=sess,
                cost_approximator=cost_trainer,
                cost_trainer=cost_trainer,
                novice_policy=second_policy,
                novice_policy_optimizer=algo,
                num_frames=num_frames,
                train_disc=(not config["stop_disc_training_on_second_run"]))
            algo.start_worker()

            initialize_uninitialized(sess)
            for iter_step in range(0, iterations):
                # import pdb; pdb.set_trace()
                dump_data = (iter_step == iterations - 1) and config["generate_option_graphs"]  # is last iteration
                true_reward, actual_reward = trainer.step(
                    expert_rollouts_tensor=expert_rollouts_tensor,
                    dump_datapoints=dump_data,
                    config=config,
                    expert_horizon=traj_len,
                    number_of_sample_trajectories=number_of_sample_trajectories
                )
                second_true_rewards.append(true_reward)
                second_actual_rewards.append(actual_reward)

                # run a rollout for the video
                if "recording_env" in config:
                    novice_rollouts = rollout_policy(
                        second_policy,
                        config["recording_env"],
                        get_image_observations=False,
                        max_path_length=traj_len)

            novice_rollouts = algo.obtain_samples(iter_step)

            rollout_rewards = [np.sum(x['rewards']) for x in novice_rollouts]
            print("Reward stats for final policy: %f +/- %f " %
                  (np.mean(rollout_rewards), np.std(rollout_rewards)))
            # save the novice policy learned
            with open(trained_policy_pickle_path, "wb") as output_file:
                pickle.dump(second_policy, output_file)

            algo.shutdown_worker()

    return true_rewards, actual_rewards, second_true_rewards, second_actual_rewards
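
For context, a hypothetical call to run_experiment might look like the following. The config keys are the ones read by the function above; the environment, the cost trainer class, and the concrete values are illustrative assumptions, not the authors' settings:

from rllab.envs.gym_env import GymEnv
from sandbox.rocky.tf.envs.base import TfEnv

env = TfEnv(GymEnv("CartPole-v0"))      # any rllab-wrapped env exposing env.spec

config = {
    "img_input": False,                 # use the MLP policy / linear-feature baseline branch
    "transformers": [],                 # no observation preprocessing
    "num_expert_rollouts": 10,          # prune the expert set to at most 10 trajectories
    "num_novice_rollouts": 20,          # how many novice trajectories to sample per step
    "oversample": False,
    "generate_option_graphs": False,
    "second_env": None,                 # skip the transfer-learning phase entirely
}

true_r, actual_r, second_true_r, second_actual_r = run_experiment(
    "expert_rollouts.pkl",              # e.g. the file written by Example #2
    "trained_policy.pkl",
    env,
    cost_trainer_type=MyCostTrainer,    # hypothetical: must accept ([num_frames, obs_dims], config=config)
    iterations=30,
    num_frames=1,
    traj_len=200,
    config=config)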