Example #1
def get_robustness(policy,
                   env_name,
                   fric_fractions=[1.0],
                   fric_bodies=[b'foot'],
                   mass_fractions=[1.0],
                   mass_bodies=[b'torso'],
                   num_evals=5):
    """Evaluate a policy over a grid of friction and mass multipliers.

    Returns per-cell mean and std of returns, plus the friction and mass
    values actually used. Assumes numpy (as np), normalize, GymEnv and
    test_const_adv are imported from the surrounding project.
    """
    P = policy
    M = np.zeros((len(fric_fractions), len(mass_fractions)))
    V = np.zeros((len(fric_fractions), len(mass_fractions)))
    fis = np.zeros((len(fric_fractions), len(mass_fractions)))
    mis = np.zeros((len(fric_fractions), len(mass_fractions)))
    for fi, f in enumerate(fric_fractions):
        for mi, m in enumerate(mass_fractions):
            print('{}/{}'.format((fi * len(mass_fractions)) + mi,
                                 len(mass_fractions) * len(fric_fractions)))
            env = normalize(GymEnv(env_name, 1.0))
            e = np.array(env.wrapped_env.env.model.geom_friction)
            fric_ind = env.wrapped_env.env.model.body_names.index(
                fric_bodies[0])
            e[fric_ind, 0] = e[fric_ind, 0] * f
            env.wrapped_env.env.model.geom_friction = e
            me = np.array(env.wrapped_env.env.model.body_mass)
            mass_ind = env.wrapped_env.env.model.body_names.index(
                mass_bodies[0])
            me[mass_ind, 0] = me[mass_ind, 0] * m
            env.wrapped_env.env.model.body_mass = me
            t = []
            for _ in range(num_evals):
                t.append(test_const_adv(env, P, 1000, 1))
            t = np.array(t)
            M[fi, mi] = t.mean()
            V[fi, mi] = t.std()
            fis[fi, mi] = e[fric_ind, 0]
            mis[fi, mi] = me[mass_ind, 0]
    return M, V, fis, mis
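
A minimal usage sketch for get_robustness, assuming a protagonist policy restored from one of the pickled result files produced by the training code below (the checkpoint path and dictionary key are illustrative):

import pickle
import numpy as np

# Hypothetical checkpoint produced by a training loop like the one in Example #4.
with open('./results/some_checkpoint.p', 'rb') as f:
    res = pickle.load(f)
policy = res['pro_policy']

# Sweep friction and mass multipliers around their nominal values.
M, V, fis, mis = get_robustness(policy,
                                'HopperAdv-v1',
                                fric_fractions=list(np.linspace(0.5, 1.5, 5)),
                                mass_fractions=list(np.linspace(0.5, 1.5, 5)),
                                num_evals=5)
print('Mean return per (friction, mass) cell:')
print(M)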
Example #2
    ## Optimizer for the Adversary ##
    adv_algo = TRPO(
        env=env,
        pro_policy=pro_policy,
        adv_policy=adv_policy,
        pro_baseline=pro_baseline,
        adv_baseline=adv_baseline,
        batch_size=batch_size,
        max_path_length=path_length,
        n_itr=n_adv_itr,
        discount=0.995,
        gae_lambda=gae_lambda,
        step_size=step_size,
        is_protagonist=False,
        scope='adversary_optim'
    )

    ## Setting up summaries for testing for a specific training instance ##
    pro_rews = []
    adv_rews = []
    all_rews = []
    const_testing_rews = []
    const_testing_rews.append(test_const_adv(env_orig, pro_policy, path_length=path_length))
    rand_testing_rews = []
    rand_testing_rews.append(test_rand_adv(env_orig, pro_policy, path_length=path_length))
    step_testing_rews = []
    step_testing_rews.append(test_step_adv(env_orig, pro_policy, path_length=path_length))
    rand_step_testing_rews = []
    rand_step_testing_rews.append(test_rand_step_adv(env_orig, pro_policy, path_length=path_length))
    adv_testing_rews = []
    adv_testing_rews.append(test_learnt_adv(env, pro_policy, adv_policy, path_length=path_length))

    ## Beginning alternating optimization ##
    for ni in range(n_itr):
        logger.log('\n\nExperiment: {} Iteration: {}\n'.format(ne, ni))
        
        ## Train Protagonist
        pro_algo.train()
Example #3
    ## Optimizer for the Protagonist against the zero adversary ##
    pro_algo = TRPO(env=env,
                    pro_policy=pro_policy,
                    adv_policy=zero_adv_policy,
                    pro_baseline=pro_baseline,
                    adv_baseline=pro_baseline,
                    batch_size=batch_size,
                    max_path_length=path_length,
                    n_itr=n_pro_itr,
                    discount=0.995,
                    gae_lambda=gae_lambda,
                    step_size=step_size,
                    is_protagonist=True)

    ## Joint optimization ##
    if ifRender:
        test_const_adv(env,
                       pro_policy,
                       path_length=path_length,
                       n_traj=1,
                       render=True)
    pro_rews = []
    adv_rews = []
    all_rews = []
    const_testing_rews = []
    const_testing_rews.append(
        test_const_adv(env, pro_policy, path_length=path_length))
    rand_testing_rews = []
    rand_testing_rews.append(
        test_rand_adv(env, pro_policy, path_length=path_length))
    step_testing_rews = []
    step_testing_rews.append(
        test_step_adv(env, pro_policy, path_length=path_length))
    rand_step_testing_rews = []
Example #4
def train(num_experiments, thread_id, queue):
    # Assumes numpy (as np), pickle and random are imported, along with the
    # project's GymEnv, normalize, policy/baseline classes, adversarial TRPO,
    # parallel_sampler, logger and the test_* evaluation helpers.

    ############ DEFAULT PARAMETERS ############

    env_name = None  # Name of adversarial environment
    path_length = 1000  # Maximum episode length
    layer_size = tuple([100, 100, 100])  # Hidden layer sizes
    ifRender = False  # Should we render?
    afterRender = 100  # Render a rollout every afterRender iterations
    n_exps = 1  # Number of training instances to run
    n_itr = 25  # Number of iterations of the alternating optimization
    n_pro_itr = 1  # Number of iterations for the protagonist
    n_adv_itr = 1  # Number of iterations for the adversary
    batch_size = 4000  # Number of training samples per iteration
    ifSave = True  # Should we save?
    save_every = 100  # Save a checkpoint every save_every iterations
    n_process = 1  # Number of parallel threads for sampling the environment
    adv_fraction = 0.25  # Fraction of maximum adversarial force to be applied
    step_size = 0.01  # KL step size for TRPO
    gae_lambda = 0.97  # GAE lambda for the learner
    save_dir = './results'  # Folder to save results in

    ############ ENV SPECIFIC PARAMETERS ############

    env_name = 'HopperAdv-v1'

    layer_size = tuple([64, 64])
    step_size = 0.01
    gae_lambda = 1.0
    batch_size = 25000

    n_exps = num_experiments
    n_itr = 500
    ifSave = False
    n_process = 4

    adv_fraction = 3.0

    save_dir = './../results/StaticHopper'

    args = [
        env_name, path_length, layer_size, ifRender, afterRender, n_exps,
        n_itr, n_pro_itr, n_adv_itr, batch_size, save_every, n_process,
        adv_fraction, step_size, gae_lambda, save_dir
    ]

    ############ ADVERSARIAL POLICY LOAD ############

    filepath = './../initial_results/Hopper/env-HopperAdv-v1_Exp1_Itr500_BS25000_Adv0.25_stp0.01_lam1.0_369983.p'
    res_D = pickle.load(open(filepath, 'rb'))
    pretrained_adv_policy = res_D['adv_policy']

    ############ MAIN LOOP ############

    ## Initializing summaries for the tests ##
    const_test_rew_summary = []
    rand_test_rew_summary = []
    step_test_rew_summary = []
    rand_step_test_rew_summary = []
    adv_test_rew_summary = []

    ## Preparing file to save results in ##
    save_prefix = 'static_env-{}_Exp{}_Itr{}_BS{}_Adv{}_stp{}_lam{}_{}'.format(
        env_name, n_exps, n_itr, batch_size, adv_fraction, step_size,
        gae_lambda, random.randint(0, 1000000))
    save_name = save_dir + '/' + save_prefix

    ## Looping over experiments to carry out ##
    for ne in range(n_exps):
        ## Environment definition ##
        ## The second argument to GymEnv sets the relative magnitude of the adversary; env_orig uses 1.0 for the held-out tests.
        env = normalize(GymEnv(env_name, adv_fraction))
        env_orig = normalize(GymEnv(env_name, 1.0))

        ## Protagonist policy definition ##
        pro_policy = GaussianMLPPolicy(env_spec=env.spec,
                                       hidden_sizes=layer_size,
                                       is_protagonist=True)
        pro_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Zero Adversary for the protagonist training ##
        zero_adv_policy = ConstantControlPolicy(env_spec=env.spec,
                                                is_protagonist=False,
                                                constant_val=0.0)

        ## Adversary policy definition ##
        adv_policy = pretrained_adv_policy
        adv_baseline = LinearFeatureBaseline(env_spec=env.spec)

        ## Initializing the parallel sampler ##
        parallel_sampler.initialize(n_process)

        ## Optimizer for the Protagonist ##
        pro_algo = TRPO(env=env,
                        pro_policy=pro_policy,
                        adv_policy=adv_policy,
                        pro_baseline=pro_baseline,
                        adv_baseline=adv_baseline,
                        batch_size=batch_size,
                        max_path_length=path_length,
                        n_itr=n_pro_itr,
                        discount=0.995,
                        gae_lambda=gae_lambda,
                        step_size=step_size,
                        is_protagonist=True)

        ## Setting up summaries for testing for a specific training instance ##
        pro_rews = []
        adv_rews = []
        all_rews = []
        const_testing_rews = []
        const_testing_rews.append(
            test_const_adv(env_orig, pro_policy, path_length=path_length))
        rand_testing_rews = []
        rand_testing_rews.append(
            test_rand_adv(env_orig, pro_policy, path_length=path_length))
        step_testing_rews = []
        step_testing_rews.append(
            test_step_adv(env_orig, pro_policy, path_length=path_length))
        rand_step_testing_rews = []
        rand_step_testing_rews.append(
            test_rand_step_adv(env_orig, pro_policy, path_length=path_length))
        adv_testing_rews = []
        adv_testing_rews.append(
            test_learnt_adv(env,
                            pro_policy,
                            adv_policy,
                            path_length=path_length))

        ## Beginning alternating optimization ##
        for ni in range(n_itr):
            logger.log('\n\nThread: {} Experiment: {} Iteration: {}\n'.format(
                thread_id,
                ne,
                ni,
            ))

            ## Train Protagonist
            pro_algo.train()
            pro_rews += pro_algo.rews
            all_rews += pro_algo.rews
            logger.log('Protag Reward: {}'.format(
                np.array(pro_algo.rews).mean()))

            ## Test the learnt policies
            const_testing_rews.append(
                test_const_adv(env, pro_policy, path_length=path_length))
            rand_testing_rews.append(
                test_rand_adv(env, pro_policy, path_length=path_length))
            step_testing_rews.append(
                test_step_adv(env, pro_policy, path_length=path_length))
            rand_step_testing_rews.append(
                test_rand_step_adv(env, pro_policy, path_length=path_length))
            adv_testing_rews.append(
                test_learnt_adv(env,
                                pro_policy,
                                adv_policy,
                                path_length=path_length))

            if ni % afterRender == 0 and ifRender:
                test_const_adv(env,
                               pro_policy,
                               path_length=path_length,
                               n_traj=1,
                               render=True)

            if ni != 0 and ni % save_every == 0 and ifSave:
                ## SAVING CHECKPOINT INFO ##
                pickle.dump(
                    {
                        'args': args,
                        'pro_policy': pro_policy,
                        'adv_policy': adv_policy,
                        'zero_test': [const_testing_rews],
                        'rand_test': [rand_testing_rews],
                        'step_test': [step_testing_rews],
                        'rand_step_test': [rand_step_testing_rews],
                        'iter_save': ni,
                        'exp_save': ne,
                        'adv_test': [adv_testing_rews]
                    }, open(save_name + '_' + str(ni) + '.p', 'wb'))

        ## Shutting down the optimizer ##
        pro_algo.shutdown_worker()

        ## Updating the test summaries over all training instances
        const_test_rew_summary.append(const_testing_rews)
        rand_test_rew_summary.append(rand_testing_rews)
        step_test_rew_summary.append(step_testing_rews)
        rand_step_test_rew_summary.append(rand_step_testing_rews)
        adv_test_rew_summary.append(adv_testing_rews)

    queue.put([
        const_test_rew_summary, rand_test_rew_summary, step_test_rew_summary,
        rand_step_test_rew_summary, adv_test_rew_summary
    ])

    ############ SAVING MODEL ############
    '''
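
A minimal sketch of how this train function could be launched, assuming standard multiprocessing primitives; the experiment count and thread id used here are illustrative:

import multiprocessing as mp

if __name__ == '__main__':
    queue = mp.Queue()
    worker = mp.Process(target=train, args=(2, 0, queue))
    worker.start()
    # train() puts [const, rand, step, rand_step, adv] test summaries on the queue.
    summaries = queue.get()
    worker.join()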
Example #5
if __name__ == "__main__":
    # Assumes argparse, joblib and tensorflow (as tf) are imported, along with
    # the project's GymEnv, normalize, test_const_adv and query_yes_no helpers.

    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to the snapshot file')
    parser.add_argument('--max_path_length',
                        type=int,
                        default=1000,
                        help='Max length of rollout')
    parser.add_argument('--speedup', type=float, default=1, help='Speedup')
    args = parser.parse_args()

    # If the snapshot file uses tensorflow, do:
    # import tensorflow as tf
    # with tf.Session():
    #     [rest of the code]
    with tf.Session() as sess:
        data = joblib.load(args.file)
        pro_policy = data['pro_policy']
        args_pickle = data['args']
        env = normalize(GymEnv(args_pickle.env, 0))
        while True:
            print(
                test_const_adv(env,
                               pro_policy,
                               path_length=args_pickle.path_length,
                               n_traj=5,
                               render=False,
                               speedup=10000))
            if not query_yes_no('Continue simulation?'):
                break

    ## Optimizer for the Adversary ##
    adv_algo = TRPO(
        env=env,
        pro_policy=pro_policy,
        adv_policy=adv_policy,
        pro_baseline=pro_baseline,
        adv_baseline=adv_baseline,
        batch_size=batch_size,
        max_path_length=path_length,
        n_itr=n_adv_itr,
        discount=0.995,
        gae_lambda=gae_lambda,
        step_size=step_size,
        is_protagonist=False,
        scope='adversary_optim'
    )

    ## Joint optimization ##
    if ifRender:
        test_const_adv(env, pro_policy, path_length=path_length, n_traj=1, render=True)
    pro_rews = []
    adv_rews = []
    all_rews = []
    const_testing_rews = []
    const_testing_rews.append(test_const_adv(env_orig, pro_policy, path_length=path_length))
    adv_testing_rews = []
    adv_testing_rews.append(test_learnt_adv(env, pro_policy, adv_policy, path_length=path_length))
    for ni in range(n_itr):
        logger.log('\n\n\n####expNO{}_{} global itr# {}####\n\n\n'.format(ne, adv_name, ni))
        adv_algo.train()
        adv_rews += adv_algo.rews
        all_rews += adv_algo.rews
        logger.log('Advers Reward: {}'.format(np.array(adv_algo.rews).mean()))
        pro_res_temp = test_const_adv(env, pro_policy, path_length=path_length)
        logger.log('Protag Reward No Adversary: {}'.format(pro_res_temp))
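
Example #5 also calls a query_yes_no helper that is not included in these excerpts; a minimal sketch of such a prompt, with its name and behavior assumed from the call site above, might look like:

def query_yes_no(question):
    # Keep asking until the user types something that parses as yes or no.
    while True:
        answer = input('{} [y/n] '.format(question)).strip().lower()
        if answer in ('y', 'yes'):
            return True
        if answer in ('n', 'no'):
            return False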