def rollout_row(train_config_num, env_ind, env, q):

    mean_rollouts = np.zeros(len(phi_configs))
    std_rollouts = np.zeros(len(phi_configs))

    # iterate over test configurations
    for test_config_num, test_config in enumerate(phi_configs):
        print("train config num : {}".format(train_config_num))
        print("test config num : {}".format(test_config_num))

        rollouts = []

        # iterate over agents
        for agent_num in range(num_agents):

            real_config_num = train_config_num - 1
            if train_config_num == 0:
                real_config_num = "nominal"

            file_str = '../policies_curriculum/{}/policy_{}_config_{}_agent_{}'.format(
                dynamic_environments[env_ind], dynamic_environments[env_ind],
                real_config_num, agent_num)

            # read in the agent's policy
            policy = loadModel(file_str)

            if train_config_num == 0:
                # set configuration for nominal policy
                policy.set_config(test_config)
                curriculum = None
            else:
                # the policy config is set through the curriculum; a
                # single-element curriculum guarantees that this is the config
                # used during the rollouts
                assert isinstance(policy, CurriculumPolicy)
                curriculum = [test_config]

            cum_rewards = []
            for i in range(num_rollouts):
                rollout_dict = rollout(env=env,
                                       agent=policy,
                                       max_path_length=env.horizon,
                                       curriculum=curriculum)
                cum_rewards.append(np.sum(rollout_dict["rewards"]))
            rollouts.append(cum_rewards)

        mean_rollouts[test_config_num] = np.mean(rollouts)
        std_rollouts[test_config_num] = np.std(rollouts)
        q.put((train_config_num, test_config_num,
               mean_rollouts[test_config_num], std_rollouts[test_config_num]))

    # also write the results to disk in case something goes wrong with the multiprocessing queue
    saveModel([mean_rollouts, std_rollouts],
              'rollouts_{}_config_{}'.format(dynamic_environments[env_ind],
                                             train_config_num))

    print("GOT HERE {}".format(train_config_num))
    return

    # NOTE: everything below this return statement is unreachable; it is
    # leftover training/saving code that relies on module-level globals such
    # as baseline, trial, ENV_NAME, and NUM_ITERS.
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=5000,
        max_path_length=env.horizon,
        n_itr=NUM_ITERS,
        discount=0.995,
        step_size=0.01,
        gae_lambda=0.97,
        sampler_args={'n_workers': 2},
        plot_learning_curve=GENERATE_PLOTS,
        trial=trial,
    )
    avg_rewards, std_rewards = algo.train()

    print('trial {}'.format(trial))
    saveModel(
        algo.policy,
        'policy_{}_{}_{}_{}_{}_{}_{}_{}'.format(ENV_NAME, TRAIN_ADVERSARIAL,
                                                NUM_ITERS, PROBABILITY, EPS,
                                                MAX_NORM, USE_DYNAMICS, trial))

    # save rewards per model over the iterations
    if GENERATE_PLOTS:
        saveModel([range(NUM_ITERS), avg_rewards, std_rewards],
                  'rewards_{}_{}_{}_{}_{}_{}_{}_{}'.format(
                      ENV_NAME, TRAIN_ADVERSARIAL, NUM_ITERS, PROBABILITY, EPS,
                      MAX_NORM, USE_DYNAMICS, trial))
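
# A minimal sketch of how rollout_row might be driven in parallel (the `q`
# argument suggests each worker reports results through a multiprocessing
# queue).  This hypothetical helper assumes a fork-based start method and that
# GymEnv, dynamic_environments, phi_configs, and rollout_row are available in
# this module; num_train_configs is the number of trained policy
# configurations (nominal + curriculum).
import multiprocessing as mp

def evaluate_all_train_configs(env_ind, num_train_configs):
    env = GymEnv(dynamic_environments[env_ind])
    q = mp.Queue()
    workers = []
    for train_config_num in range(num_train_configs):
        p = mp.Process(target=rollout_row,
                       args=(train_config_num, env_ind, env, q))
        p.start()
        workers.append(p)

    # each worker puts one (train, test, mean, std) tuple per test config
    results = [q.get()
               for _ in range(num_train_configs * len(phi_configs))]
    for p in workers:
        p.join()
    return results
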
def train(env_ind, config_num, num_agents):

    # get the original state space size first
    org_env = GymEnv(original_environments[env_ind])
    org_env_size = org_env.observation_space.shape[0]
    org_env.terminate()

    # the environment
    env = GymEnv(dynamic_environments[env_ind])

    # the configuration settings
    curriculum_config = curriculum_configs[config_num]

    if env_ind == 0:
        # batch size for Inverted Pendulum
        curriculum_config.set_batch_size(5000)
    else:
        # batch size for all other environments
        curriculum_config.set_batch_size(25000)

    # the nominal config
    config = curriculum_config.curriculum_list[0]

    for agent_num in range(num_agents):

        # define policy by reading from config class
        policy = CurriculumPolicy(
            env_spec=env.spec,
            hidden_sizes=config.hidden_sizes,
            adaptive_std=config.adaptive_std,
            adversarial=config.adversarial,
            eps=config.eps,
            probability=config.probability,
            use_dynamics=config.use_dynamics,
            random=config.random,
            observable_noise=config.observable_noise,
            zero_gradient_cutoff=org_env_size,
            use_max_norm=config.use_max_norm,
            curriculum_list=list(curriculum_config.curriculum_list),
            update_freq=curriculum_config.update_freq,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=config.batch_size,
            max_path_length=env.horizon,
            n_itr=config.num_iter,
            discount=config.discount,
            step_size=config.step_size,
            gae_lambda=config.gae_lambda,
            num_workers=config.num_workers,
            plot_learning_curve=config.plot_learning_curve,
            trial=agent_num,
        )
        avg_rewards, std_rewards = algo.train()

        print("training completed!")
        saveModel(
            algo.policy, 'policy_{}_config_{}_agent_{}'.format(
                dynamic_environments[env_ind], config_num, agent_num))

        # save rewards per model over the iterations
        if config.plot_learning_curve:
            saveModel([range(config.num_iter), avg_rewards, std_rewards],
                      'rewards_{}_config_{}_agent_{}'.format(
                          dynamic_environments[env_ind], config_num,
                          agent_num))
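
# A minimal sketch of a command-line entry point for train(); the flag names
# and defaults are assumptions, only argparse itself is standard library.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--env_ind", type=int, default=0,
                        help="index into dynamic_environments")
    parser.add_argument("--config_num", type=int, default=0,
                        help="index into curriculum_configs")
    parser.add_argument("--num_agents", type=int, default=5,
                        help="number of agents to train per configuration")
    args = parser.parse_args()

    train(args.env_ind, args.config_num, args.num_agents)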
Example #4
        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=config.batch_size,
            max_path_length=env.horizon,
            n_itr=config.num_iter,
            discount=config.discount,
            step_size=config.step_size,
            gae_lambda=config.gae_lambda,
            num_workers=config.num_workers,
            plot_learning_curve=config.plot_learning_curve,
            trial=agent_num,
        )
        avg_rewards, std_rewards = algo.train()

        print("training completed!")
        saveModel(
            algo.policy, 'policy_{}_config_{}_agent_{}'.format(
                dynamic_environments[args.env_ind], args.config_num,
                agent_num))

        # save rewards per model over the iterations
        if config.plot_learning_curve:
            saveModel([range(config.num_iter), avg_rewards, std_rewards],
                      'rewards_{}_config_{}_agent_{}'.format(
                          dynamic_environments[args.env_ind], args.config_num,
                          agent_num))
Example #5
            # read in the agent's policy
            policy = loadModel(file_str)

            # set policy parameters to ensure we test correctly (these are used by the rollout function internally)
            policy.adversarial = test_config.adversarial
            policy.eps = test_config.eps
            policy.probability = test_config.probability
            policy.use_dynamics = test_config.use_dynamics
            policy.random = test_config.random
            policy.observable_noise = test_config.observable_noise
            policy.use_max_norm = test_config.use_max_norm

            cum_rewards = []
            for i in range(num_rollouts):
                rollout_dict = rollout(env=env,
                                       agent=policy,
                                       max_path_length=env.horizon)
                cum_rewards.append(np.sum(rollout_dict["rewards"]))
            rollouts.append(cum_rewards)

        mean_rewards[test_config_num] = np.mean(rollouts)
        std_rewards[test_config_num] = np.std(rollouts)

    print("mean_rewards")
    print(mean_rewards)
    print("std_rewards")
    print(std_rewards)

    saveModel([mean_rewards, std_rewards], 'rollouts_{}_config_{}'.format(
        dynamic_environments[args.env_ind], args.config_num))
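
# A minimal sketch of reading the saved statistics back; it assumes loadModel
# returns exactly what saveModel stored above ([mean_rewards, std_rewards])
# and that phi_configs lists the test configurations.
def summarize_rollouts(env_name, config_num):
    mean_rewards, std_rewards = loadModel(
        'rollouts_{}_config_{}'.format(env_name, config_num))
    for test_config_num in range(len(phi_configs)):
        print("test config {}: {:.1f} +/- {:.1f}".format(
            test_config_num, mean_rewards[test_config_num],
            std_rewards[test_config_num]))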
Example #6
        # read in the agent's policy
        policy = loadModel(fname)

        o = env.reset()
        original_dynamics = o[org_env_size:]
        assert len(original_dynamics) == 2

        for i in range(num_param_evals):
            for j in range(num_param_evals):
                new_dynamics = original_dynamics.copy()
                new_dynamics[0] = percentages[i] * original_dynamics[0]
                new_dynamics[1] = percentages[j] * original_dynamics[1]
                policy.set_dynamics(new_dynamics)
                policy.adversarial = False

                # curriculum is just nominal config, no adversarial
                curriculum = [phi_configs[0]]

                # average over several rollouts
                cum_rewards = np.zeros(num_rollouts)
                for k in range(num_rollouts):
                    rollout_dict = rollout(env=env,
                                           agent=policy,
                                           max_path_length=env.horizon,
                                           curriculum=curriculum)
                    cum_rewards[k] = np.sum(rollout_dict["rewards"])
                results[i, j] = np.mean(cum_rewards)

        saveModel(results, "epopt_{}".format(f_suffix))
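
# A minimal sketch of visualizing the grid of mean returns produced above as a
# heatmap; the function name is hypothetical, and `results` and `percentages`
# are assumed to be the num_param_evals x num_param_evals array and the list
# of scaling factors used in the sweep.
import matplotlib.pyplot as plt

def plot_param_sweep(results, percentages, f_suffix):
    plt.figure()
    plt.imshow(results, origin='lower', aspect='auto',
               extent=[percentages[0], percentages[-1],
                       percentages[0], percentages[-1]])
    plt.colorbar(label='mean return over rollouts')
    plt.xlabel('scale applied to dynamics parameter 1')
    plt.ylabel('scale applied to dynamics parameter 0')
    plt.savefig('epopt_{}.png'.format(f_suffix))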
Example #7
        baseline=baseline,
        batch_size=config.batch_size,
        max_path_length=env.horizon,
        n_itr=n_itr,
        discount=config.discount,
        step_size=config.step_size,
        gae_lambda=config.gae_lambda,
        num_workers=config.num_workers,
        plot_learning_curve=config.plot_learning_curve,
        trial=agent_num,
    )
    avg_rewards, std_rewards = algo.train()

    print("training completed!")
    saveModel(
        algo.policy, 'policy_{}_config_{}_agent_{}'.format(
            dynamic_environments[args.env_ind], args.config_num, agent_num))

    # save rewards per model over the iterations
    # also plot the rewards
    if config.plot_learning_curve:
        saveModel([range(n_itr), avg_rewards, std_rewards],
                  'rewards_{}_config_{}_agent_{}'.format(
                      dynamic_environments[args.env_ind], args.config_num,
                      agent_num))

        plt.figure()
        plt.plot(range(n_itr), avg_rewards)
        plt.title('Learning Curve')
        plt.savefig('mr_{}_config_{}_agent_{}.png'.format(
            dynamic_environments[args.env_ind], args.config_num, agent_num))
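
# A minimal sketch of drawing the same learning curve with a one-standard-
# deviation band, since std_rewards is saved next to avg_rewards above; the
# function name is hypothetical.
import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve_with_band(avg_rewards, std_rewards, fname):
    avg = np.asarray(avg_rewards)
    std = np.asarray(std_rewards)
    iters = np.arange(len(avg))
    plt.figure()
    plt.plot(iters, avg)
    plt.fill_between(iters, avg - std, avg + std, alpha=0.3)
    plt.xlabel('iteration')
    plt.ylabel('average return')
    plt.title('Learning Curve')
    plt.savefig(fname)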