Example no. 1
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    joblib.dump(str(env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True)
    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=3,
            gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array([
                0.0001, -1, 2, -3, -4, -5, -6, 7, 14, -15, -16, 17, 18, -19, 8,
                -9, -10, 11, 12, -13, 20, 21, -22, 23, -24, -25, -26, -27, 28,
                35, -36, -37, 38, 39, -40, 29, -30, -31, 32, 33, -34, 42, 41,
                43
            ]),
            action_permutation=np.array([
                -0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8
            ]))
    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)
    pposgd_mirror.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_batch=int(2500),
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95, schedule='linear',
            callback=callback,
            sym_loss_weight=0.0,
            positive_rew_enforce=False,
            #init_policy_params = joblib.load('data/ppo_DartWalker3d-v119_energy03_vel4_3s_mirror4_velrew3_damping5_anklesprint100_5_rotpen0_rew01xinit_stagedcurriculum4s75s34ratio/policy_params.pkl'),
            reward_drop_bound=True,
        )
    env.close()
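
All of these training snippets assume a common module-level preamble from the OpenAI baselines fork they were written against. A minimal sketch of those imports (exact module paths may differ between forks; a module-level callback is also assumed, see the sketch after Example no. 3):

import logging
import os.path as osp
import pprint
import shutil

import gym
import joblib
import numpy as np
import tensorflow as tf
from mpi4py import MPI

from baselines import bench, logger
from baselines.common import set_global_seeds
import baselines.common.tf_util as U
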
Example no. 2
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    joblib.dump(str(env.env.__dict__),
                logger.get_dir() + '/env_specs.pkl',
                compress=True)

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=3,
            gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array([
                0.0001, -1, 2, -3, -4, -11, 12, -13, 14, 15, 16, -5, 6, -7, 8,
                9, 10, -17, 18, -19, -24, 25, -26, 27, -20, 21, -22, 23, 28,
                29, -30, 31, -32, -33, -40, 41, -42, 43, 44, 45, -34, 35, -36,
                37, 38, 39, -46, 47, -48, -53, 54, -55, 56, -49, 50, -51, 52,
                58, 57, 59
            ]),
            action_permutation=np.array([
                -6, 7, -8, 9, 10, 11, -0.001, 1, -2, 3, 4, 5, -12, 13, -14,
                -19, 20, -21, 22, -15, 16, -17, 18
            ]))

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    ref_policy_params = None
    '''init_q, init_dq = joblib.load('data/skel_data/init_states.pkl')
    env.env.env.init_qs = init_q
    env.env.env.init_dqs = init_dq'''

    if env.env.env.use_ref_policy:
        ref_policy_params = joblib.load(
            'data/ppo_DartHumanWalker-v1210_energy015_vel65_6s_mirror_up01fwd01ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_velrew3_avg_dcon1_asinput_damping2kneethigh_thigh150knee100_curriculum_1xjoint_shoulder90_dqpen00001/policy_params.pkl'
        )
    gym.logger.setLevel(logging.WARN)
    pposgd_mirror.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=int(2500),
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        callback=callback,
        sym_loss_weight=0.0,
        positive_rew_enforce=False,
        #init_policy_params = joblib.load('data/ppo_DartHumanWalker-v1333_energy03_vel2_2s_mirror4_velrew3_adaptalivebonus_rew1xinit_thigh160_100springankle_stagedcurriculum_05reduce_07rewthres/policy_params.pkl'),
        reward_drop_bound=True,
        ref_policy_params=ref_policy_params,
        #init_policy_params = joblib.load('data/ppo_DartHumanWalker-v1124_energy25_vel3_kd1000_mirror_up1fwd01ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_runningavg3_dcontrolconstraint1_asinput_damping2_fromvel3_kd500/policy_params.pkl')
    )
    env.close()
Example no. 3
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=3,
            gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array([
                0.0001, -1, 2, -3, -4, 9, 10, 11, 12, 5, 6, 7, 8, 17, 18, 19,
                20, 13, 14, 15, 16, 21, 22, -23, 24, -25, -26, 31, 32, 33, 34,
                27, 28, 29, 30, 39, 40, 41, 42, 35, 36, 37, 38, 44, 43, 46, 45,
                47
            ]),
            action_permutation=np.array(
                [4, 5, 6, 7, 0.0001, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11]),
        )

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)
    pposgd_mirror.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=int(2500),
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        callback=callback,
        sym_loss_weight=4.0,
        positive_rew_enforce=False,
        init_policy_params=joblib.load(
            'data/ppo_DartDogRobot-v1101_energy035_vel7_3s_mirror4_velrew3_ab11_rotqpen1105_08xtorque_12termination_hindleg0314limit_zrotterm12_rew05xinit_stagedcurriculum_075reduce_07rewthres/policy_params.pkl'
        ),
        reward_drop_bound=True,
        #init_policy_params = joblib.load('data/ppo_DartHumanWalker-v1124_energy25_vel3_kd1000_mirror_up1fwd01ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_runningavg3_dcontrolconstraint1_asinput_damping2_fromvel3_kd500/policy_params.pkl')
    )
    env.close()
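
Every learn() call above passes a module-level callback that is not defined in these snippets. pposgd-style learners invoke it once per iteration as callback(locals(), globals()); a hypothetical checkpointing version is sketched below (the local names 'pi' and 'iters_so_far' are assumptions about the learner's internals, not confirmed by the source):

def callback(localv, globalv):
    # Hypothetical checkpoint callback: on the MPI root process, dump the
    # current policy variables every 10 iterations.
    if MPI.COMM_WORLD.Get_rank() != 0:
        return
    iters = localv['iters_so_far']
    if iters % 10 == 0:
        params = {v.name: v.eval() for v in localv['pi'].get_variables()}
        joblib.dump(params, osp.join(logger.get_dir(), 'policy_params_' + str(iters) + '.pkl'), compress=True)
        joblib.dump(params, osp.join(logger.get_dir(), 'policy_params.pkl'), compress=True)
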
Example no. 4
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=3,
            gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array([
                0.0001, -1, 2, -3, -4, -5, -6, 7, 14, -15, -16, 17, 18, -19, 8,
                -9, -10, 11, 12, -13, 20, 21, -22, 23, -24, -25, -26, -27, 28,
                35, -36, -37, 38, 39, -40, 29, -30, -31, 32, 33, -34, 42, 41,
                43
            ]),
            action_permutation=np.array([
                -0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8
            ]))

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)
    pposgd_mirror.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=int(2500),
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        callback=callback,
        sym_loss_weight=4.0,
        positive_rew_enforce=False,
        #init_policy_params = joblib.load('data/ppo_DartHumanWalker-v1156_energy1_vel55_mirror_up1fwd01ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_runningavg4_dcontrolconstraint1_asinput_damping2kneethigh_thigh250knee60/policy_params.pkl'),
        reward_drop_bound=True,
        #init_policy_params = joblib.load('data/ppo_DartHumanWalker-v1124_energy25_vel3_kd1000_mirror_up1fwd01ltl15_spinepen1yaw001_thighyawpen005_initbentelbow_runningavg3_dcontrolconstraint1_asinput_damping2_fromvel3_kd500/policy_params.pkl')
    )
    env.close()
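
# The observation_permutation / action_permutation vectors above encode a signed
# index map: abs(entry) selects the source component, the sign flips it, and a
# small sentinel such as 0.0001 / -0.0001 (or 0.001) stands in for index 0,
# since -0 cannot carry a sign. A minimal sketch of how such a vector can be
# turned into a mirroring matrix (this reflects the apparent convention, not
# necessarily MlpMirrorPolicy's actual implementation):
def mirror_matrix(perm):
    perm = np.asarray(perm, dtype=np.float64)
    n = len(perm)
    M = np.zeros((n, n))
    for i, p in enumerate(perm):
        # row i reads from component round(abs(p)), negated when p is negative
        M[i, int(round(abs(p)))] = 1.0 if p > 0 else -1.0
    return M
# e.g. mirrored_obs = mirror_matrix(observation_permutation).dot(obs)
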
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartDogRobot-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--init_policy', help='Initial Policy',
                        default='data/ppo_DartDogRobot-v156_energy2_vel8_8s_mirror4_velrew3_dcon1_asinput_damping5_velspd1000_feetcover3off_newstrength13x_dqpen0001_shoulder170_-102range_thigh200_-0.21.4range_lighterhead_2kassist/policy_params.pkl')
    parser.add_argument('--init_curriculum', help='Initial Curriculum', nargs='+', default=[2000.0, 2000])
    parser.add_argument('--ref_policy', help='Reference Policy',
                        default='data/ppo_DartDogRobot-v156_energy2_vel8_8s_mirror4_velrew3_dcon1_asinput_damping5_velspd1000_feetcover3off_newstrength13x_dqpen0001_shoulder170_-102range_thigh200_-0.21.4range_lighterhead_2kassist/policy_params.pkl')
    parser.add_argument('--ref_curriculum', help='Reference Curriculum', nargs='+', default=[2000.0, 2000])
    parser.add_argument('--anc_thres', help='Anchor Threshold', type=float, default=0.85)
    parser.add_argument('--prog_thres', help='Progress Threshold', type=float, default=0.7)
    parser.add_argument('--batch_size', help='Batch Size', type=int, default=2500)
    parser.add_argument('--max_iter', help='Maximum Iteration', type=int, default=2000)
    parser.add_argument('--use_reftraj', help='Use reference trajectory', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure(
        'data/ppo_curriculum_150eachit_vel8_mirror4_runningavg3_2s_torque13x_e1' + args.env + '_' + str(
            args.seed) + '_' + str(args.anc_thres) + '_' + str(args.prog_thres) + '_' + str(args.batch_size))
    sess = U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env)

    ob_space = env.observation_space
    ac_space = env.action_space

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=3,
            gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array([
                0.0001, -1, 2, -3, -4, 9, 10, 11, 12, 5, 6, 7, 8, 17, 18, 19,
                20, 13, 14, 15, 16, 21, 22, -23, 24, -25, -26, 31, 32, 33, 34,
                27, 28, 29, 30, 39, 40, 41, 42, 35, 36, 37, 38, 44, 43, 46, 45,
                47
            ]),
            action_permutation=np.array(
                [4, 5, 6, 7, 0.0001, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11]))

    policy = policy_fn('policy', ob_space, ac_space)
    init_curriculum = np.array(args.init_curriculum)
    ref_policy = policy_fn('ref_policy', ob_space, ac_space)
    ref_curriculum = np.array(args.ref_curriculum)

    policy_params = joblib.load(args.init_policy)
    ref_policy_params = joblib.load(args.ref_policy)
    U.initialize()
    cur_scope = policy.get_variables()[0].name[0:policy.get_variables()[0].name.find('/')]
    orig_scope = list(policy_params.keys())[0][0:list(policy_params.keys())[0].find('/')]
    ref_scope = list(ref_policy_params.keys())[0][0:list(ref_policy_params.keys())[0].find('/')]
    for i in range(len(policy.get_variables())):
        assign_op = policy.get_variables()[i].assign(
            policy_params[policy.get_variables()[i].name.replace(cur_scope, orig_scope, 1)])
        sess.run(assign_op)
        assign_op = ref_policy.get_variables()[i].assign(
            ref_policy_params[ref_policy.get_variables()[i].name.replace('ref_' + cur_scope, ref_scope, 1)])
        sess.run(assign_op)

    anchor_threshold = args.anc_thres
    progress_threshold = args.prog_thres

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True)
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    curriculum_evolution = []

    env.env.env.anchor_kp = ref_curriculum
    ref_score = None
    ref_max_score = None
    reference_trajectory = None
    # if MPI.COMM_WORLD.Get_rank() == 0:
    if args.use_reftraj == 1:
        reference_trajectory = gen_reftraj(env, ref_policy, 299)
        env.env.reference_trajectory = reference_trajectory
    ref_score, ref_max_score = evaluate_policy(env, ref_policy, 24)
    ref_score = MPI.COMM_WORLD.bcast(ref_score, root=0)
    ref_max_score = MPI.COMM_WORLD.bcast(ref_max_score, root=0)
    reference_score = ref_score * progress_threshold
    reference_anchor_score = ref_score * anchor_threshold
    reference_max_score = ref_max_score * 0.9
    env.env.env.anchor_kp = init_curriculum
    reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0)
    env.env.reference_trajectory = reference_trajectory

    current_curriculum = np.copy(init_curriculum)
    print('reference scores: ', reference_score, reference_anchor_score, reference_max_score)
    #env.env.env.energy_weight *= 0.5
    # env.env.env.final_tv -= 0.5
    previous_params = policy_params
    for iter in range(args.max_iter):
        print('curriculum iter ', iter)
        print('ref score: ', reference_anchor_score)

        opt_pi, final_rew = pposgd_mirror.learn(env, policy_fn,
                                                max_timesteps=args.batch_size * MPI.COMM_WORLD.Get_size() * 150,
                                                timesteps_per_batch=int(args.batch_size),
                                                clip_param=0.2, entcoeff=0.0,
                                                optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                                                gamma=0.99, lam=0.95, schedule='linear',
                                                callback=callback,
                                                sym_loss_weight=4.0,
                                                return_threshold=reference_anchor_score,
                                                init_policy_params=previous_params,
                                                policy_scope='pi' + str(iter),
                                                min_iters=0,
                                                reward_drop_bound=True,
                                                # max_threshold = reference_max_score,
                                                )
        print('one learning iteration done')
        if np.linalg.norm(current_curriculum) >= 0.0001:
            # re-compute reference trajectory
            if MPI.COMM_WORLD.Get_rank() == 0 and args.use_reftraj == 1:
                print('recompute reference traj')
                reference_trajectory = gen_reftraj(env, opt_pi, 299)
            reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0)
            env.env.reference_trajectory = reference_trajectory

            if final_rew < reference_anchor_score * 0.95:
                print('update reference scores')
                reference_score = reference_score / reference_anchor_score * final_rew
                reference_anchor_score = final_rew

            closest_candidate = None
            # if MPI.COMM_WORLD.Get_rank() == 0:
            directions = [np.array([-1, 0]), np.array([0, -1]),
                          -current_curriculum / np.linalg.norm(current_curriculum)]
            int_d1 = directions[0] + directions[2]
            int_d2 = directions[1] + directions[2]
            directions.append(int_d1 / np.linalg.norm(int_d1))
            directions.append(int_d2 / np.linalg.norm(int_d2))

            # directions = [np.array([0.0, -1.0])] # only search in one direction
            candidate_next_anchors = []
            for direction in directions:
                found_point, perf = binary_search_curriculum(env, opt_pi, current_curriculum, direction,
                                                             reference_score, reference_max_score, 6)
                print(direction, found_point, perf)
                candidate_next_anchors.append(found_point)
                if closest_candidate is None:
                    closest_candidate = np.copy(found_point)
                elif np.linalg.norm(closest_candidate) > np.linalg.norm(found_point):
                    closest_candidate = np.copy(found_point)
            if np.linalg.norm(closest_candidate) < 0.5:
                closest_candidate = np.array([0, 0])
            if np.abs(closest_candidate[0]) < 0.1:
                closest_candidate[0] = 0.0
            if np.abs(closest_candidate[1]) < 0.1:
                closest_candidate[1] = 0.0
            # closest_candidate = MPI.COMM_WORLD.bcast(closest_candidate, root=0)

            current_curriculum = np.copy(closest_candidate)
        env.env.env.anchor_kp = current_curriculum

        '''print('Update Init Pose Distributions')
        update_init_poses(env, opt_pi)
        if MPI.COMM_WORLD.Get_rank() == 0:
            joblib.dump([env.env.env.init_qs, env.env.env.init_dqs], logger.get_dir()+'/init_poses_'+np.array2string(current_curriculum, separator=',')+'.pkl', compress=True)
            joblib.dump([env.env.env.init_qs, env.env.env.init_dqs], logger.get_dir() + '/init_poses.pkl', compress=True)'''

        curriculum_evolution.append(current_curriculum)
        print('Current curriculum: ', current_curriculum)
        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val
        if np.linalg.norm(current_curriculum) < 0.0001:
            if reference_anchor_score < ref_score:
                reference_anchor_score = ref_score
            else:
                break

    env.close()
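
# main() also relies on gen_reftraj, evaluate_policy and binary_search_curriculum,
# none of which are defined here. A rough sketch of the binary search, under the
# assumptions that evaluate_policy(env, pi, n_rollouts) returns (mean_return,
# max_return) and that returns degrade as the assist curriculum shrinks toward
# zero (max_threshold is accepted only for signature compatibility):
def binary_search_curriculum(env, pi, anchor, direction, threshold, max_threshold, steps):
    low, high = 0.0, np.linalg.norm(anchor)      # never step further than the distance to the origin
    best_point, best_perf = np.copy(anchor), 0.0
    for _ in range(steps):
        mid = 0.5 * (low + high)
        candidate = np.clip(anchor + direction * mid, 0.0, None)  # keep assist gains non-negative
        env.env.env.anchor_kp = candidate
        perf, _ = evaluate_policy(env, pi, 4)    # 4 rollouts is an arbitrary choice for the sketch
        if perf >= threshold:
            best_point, best_perf = np.copy(candidate), perf
            low = mid                            # policy still above threshold: push further toward zero assist
        else:
            high = mid
    env.env.env.anchor_kp = anchor               # restore the current curriculum
    return best_point, best_perf
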
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                                 hid_size=64, num_hid_layers=3, gmm_comp=1,
                                                 mirror_loss=True,
                                                 observation_permutation=np.array(
                                                     [0.0001, -1, 2, -3, -4, -11, 12, -13, 14, 15, 16, -5, 6, -7, 8, 9,
                                                      10, -17, 18, -19, -24, 25, -26, 27, -20, 21, -22, 23, \
                                                      28, 29, -30, 31, -32, -33, -40, 41, -42, 43, 44, 45, -34, 35, -36,
                                                      37, 38, 39, -46, 47, -48, -53, 54, -55, 56, -49, 50, -51, 52, 58,
                                                      57, 59]),
                                                 action_permutation=np.array(
                                                     [-6, 7, -8, 9, 10, 11, -0.001, 1, -2, 3, 4, 5, -12, 13, -14, -19,
                                                      20, -21, 22, -15, 16, -17, 18]))

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    previous_params = None
    iter_num = 0
    last_iter = False

    # if initialize from previous runs
    #previous_params = joblib.load('')
    #env.env.env.assist_schedule = []

    joblib.dump(str(env.env.env.__dict__),
                logger.get_dir() + '/env_specs.pkl',
                compress=True)

    reward_threshold = None
    while True:
        if not last_iter:
            rollout_length_threshold = env.env.env.assist_schedule[2][0] / env.env.env.dt
        else:
            rollout_length_threshold = None
        opt_pi, rew = pposgd_mirror.learn(
            env,
            policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_batch=int(2500),
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
            callback=callback,
            sym_loss_weight=4.0,
            positive_rew_enforce=False,
            init_policy_params=previous_params,
            reward_drop_bound=True,
            rollout_length_thershold=rollout_length_threshold,
            policy_scope='pi' + str(iter_num),
            return_threshold=reward_threshold,
        )
        if iter_num == 0:
            reward_threshold = 0.7 * rew
        if last_iter:
            reward_threshold = None
        iter_num += 1

        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val
        # update the assist schedule
        for s in range(len(env.env.env.assist_schedule) - 1):
            env.env.env.assist_schedule[s][1] = np.copy(
                env.env.env.assist_schedule[s + 1][1])
        env.env.env.assist_schedule[-1][1][0] *= 0.75
        env.env.env.assist_schedule[-1][1][1] *= 0.75
        if env.env.env.assist_schedule[-1][1][0] < 5.0:
            env.env.env.assist_schedule[-1][1][0] = 0.0
        if env.env.env.assist_schedule[-1][1][1] < 5.0:
            env.env.env.assist_schedule[-1][1][1] = 0.0
        zero_assist = True
        for s in range(len(env.env.env.assist_schedule) - 1):
            for v in env.env.env.assist_schedule[s][1]:
                if v != 0.0:
                    zero_assist = False
        print('Current Schedule: ', env.env.env.assist_schedule)
        if zero_assist:
            last_iter = True
            print('Entering Last Iteration!')

    env.close()
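
At the end of each outer iteration the assist schedule is shifted one slot earlier and the last entry is decayed by 0.75, with components below 5.0 clamped to zero, so the assist eventually vanishes and last_iter is triggered. A small standalone sketch of that decay (hypothetical starting values):

schedule = [[0.0, np.array([250.0, 125.0])],
            [3.0, np.array([125.0, 62.5])],
            [6.0, np.array([62.5, 31.25])]]
while any(v != 0.0 for _, gains in schedule[:-1] for v in gains):
    for s in range(len(schedule) - 1):
        schedule[s][1] = np.copy(schedule[s + 1][1])   # shift each entry one slot earlier
    schedule[-1][1] = schedule[-1][1] * 0.75           # decay the last entry
    schedule[-1][1][schedule[-1][1] < 5.0] = 0.0       # clamp small gains to zero
    print(schedule)
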
Example no. 7
def train_mirror(args, num_timesteps):
    from baselines.ppo1 import mlp_mirror_policy, mlp_mirror_norms_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env)

    env.env._seed(args.seed + MPI.COMM_WORLD.Get_rank())
    env.env.init_params(args)

    U.ALREADY_INITIALIZED = set()
    U.ALREADY_INITIALIZED.update(set(tf.global_variables()))

    obs_per = np.array([0.0001, -1, 2, -3, -4,
                        11, 12, 13, 14, 15, 16, 5, 6, 7, 8, 9, 10, -17, 18, -19,
                        24, 25, 26, 27, 20, 21, 22, 23,
                        28, 29, -30, 31, -32, -33,
                        40, 41, 42, 43, 44, 45, 34, 35, 36, 37, 38, 39, -46, 47, -48,
                        53, 54, 55, 56, 49, 50, 51, 52])

    if env.env.include_additional_info:
        obs_per = np.concatenate((obs_per, np.array([58, 57])))
        obs_per = np.concatenate((obs_per, np.array([59])))
        obs_per = np.concatenate((obs_per, np.array([63, 64, -65, 60, 61, -62])))
        obs_per = np.concatenate((obs_per, np.array([66, 67, -68])))
        obs_per = np.concatenate((obs_per, np.array([72, 73, -74, 69, 70, -71])))
        obs_per = np.concatenate((obs_per, np.array([75, 76, -77])))
        obs_per = np.concatenate((obs_per, np.array([78, 79, -80])))
        assert env.env.obs_dim == (57 + 3 + 3 * 6 + 3)
        assert env.env.act_dim == 97            # change action/state permutation if change action/state in env

    def policy_fn(name, ob_space, ac_space):
        old_act_permute = [-86, 87, -88, 93, 94, 95, 96, 89, 90, 91, 92]
        mus_act_l = np.arange(43).astype(np.float64)  # float, so the 0.001 sentinel below is not silently truncated to 0
        mus_act_r = mus_act_l + 43
        mus_act_l[0] = 0.001
        act_permute = np.concatenate([mus_act_r, mus_act_l, old_act_permute])
        if env.env.env.state_self_standardize:
            return mlp_mirror_norms_policy.MlpMirrorNormsPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                                                hid_size=args.hsize, num_hid_layers=args.layers,
                                                                gmm_comp=1,
                                                                mirror_loss=True,
                                                                observation_permutation=obs_per,
                                                                action_permutation=act_permute)
        else:
            return mlp_mirror_policy.MlpMirrorPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                                     hid_size=args.hsize, num_hid_layers=args.layers, gmm_comp=1,
                                                     mirror_loss=True,
                                                     observation_permutation=obs_per,
                                                     action_permutation=act_permute)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True)
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    joblib.dump(str(env.env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True)
    with open(logger.get_dir() + '/env_specs.txt', 'w') as f:
        pprint.pprint(env.env.env.__dict__, f)
    shutil.copyfile(env.env.env.model_file_name, logger.get_dir() + '/using_model.skel')

    cur_sym_loss = 3.0
    iter_num = 0
    previous_params = None
    # previous_params = joblib.load('')
    reward_threshold = None
    rollout_length_threshold = None
    pposgd_mirror.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=int(2000),
                        clip_param=args.clip, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        callback=callback,
                        sym_loss_weight=cur_sym_loss,
                        init_policy_params=previous_params,
                        reward_drop_bound=None,
                        rollout_length_threshold=rollout_length_threshold,
                        policy_scope='pi' + str(iter_num),
                        return_threshold=reward_threshold,
                        )

    env.close()
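
# Because these permutation vectors are written by hand, a quick sanity check
# (hypothetical helper) catches most transcription errors: the vector must have
# one entry per dimension and reference every source index exactly once, with
# the 0.0001 / 0.001 sentinel counting as index 0.
def check_permutation(perm, dim):
    assert len(perm) == dim, 'permutation has %d entries for %d dims' % (len(perm), dim)
    used = sorted(int(round(abs(p))) for p in perm)
    assert used == list(range(dim)), 'indices used are not a bijection over 0..%d' % (dim - 1)
# e.g. check_permutation(obs_per, env.env.obs_dim) before building the policy
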
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    env.env.assist_timeout = 100.0

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=3,
            gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array([
                0.0001, -1, 2, -3, -4, -5, -6, 7, 14, -15, -16, 17, 18, -19, 8,
                -9, -10, 11, 12, -13, 20, 21, -22, 23, -24, -25, -26, -27, 28,
                35, -36, -37, 38, 39, -40, 29, -30, -31, 32, 33, -34, 42, 41,
                43
            ]),
            action_permutation=np.array([
                -0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8
            ]))

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    previous_params = None
    iter_num = 0
    last_iter = False

    # if initialize from previous runs
    '''previous_params = joblib.load('data/ppo_DartWalker3d-v111_energy04_vel1_1s_mirror4_velrew3_damping5_anklesprint100_5_rotpen1_rew01xinit_stagedcurriculum/policy_params.pkl')
    env.env.env.assist_schedule = [[0.0,np.array([250.,125.])],[3.0,np.array([125.,62.5])],[6.0,[62.5,31.25]]]'''

    joblib.dump(str(env.env.env.__dict__),
                logger.get_dir() + '/env_specs.pkl',
                compress=True)

    reward_threshold = None
    while True:
        if not last_iter:
            rollout_length_threshold = env.env.env.assist_schedule[2][
                0] / env.env.env.dt
        else:
            rollout_length_threshold = None
        opt_pi, rew = pposgd_mirror.learn(
            env,
            policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_batch=int(2500),
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
            callback=callback,
            sym_loss_weight=4.0,
            positive_rew_enforce=False,
            init_policy_params=previous_params,
            reward_drop_bound=500,
            rollout_length_thershold=rollout_length_threshold,
            policy_scope='pi' + str(iter_num),
            return_threshold=reward_threshold,
        )
        if iter_num == 0:
            reward_threshold = 0.7 * rew
        if last_iter:
            reward_threshold = None
        iter_num += 1

        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val
        # update the assist schedule
        for s in range(len(env.env.env.assist_schedule) - 1):
            env.env.env.assist_schedule[s][1] = np.copy(
                env.env.env.assist_schedule[s + 1][1])
        env.env.env.assist_schedule[-1][1][0] *= 0.75
        env.env.env.assist_schedule[-1][1][1] *= 0.75
        if env.env.env.assist_schedule[-1][1][0] < 5.0:
            env.env.env.assist_schedule[-1][1][0] = 0.0
        if env.env.env.assist_schedule[-1][1][1] < 5.0:
            env.env.env.assist_schedule[-1][1][1] = 0.0
        zero_assist = True
        for s in range(len(env.env.env.assist_schedule) - 1):
            for v in env.env.env.assist_schedule[s][1]:
                if v != 0.0:
                    zero_assist = False
        print('Current Schedule: ', env.env.env.assist_schedule)
        if zero_assist:
            last_iter = True
            print('Entering Last Iteration!')

    env.close()
def train_mirror_sig(env, num_timesteps, seed, obs_perm, act_perm):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)


    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                                 hid_size=64, num_hid_layers=3, gmm_comp=1,
                                                 mirror_loss=True,
                                                 observation_permutation=obs_perm,
                                                 action_permutation=act_perm)
    env = bench.Monitor(env, logger.get_dir() and
        osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True)
    env.seed(seed+MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    previous_params = None
    iter_num = 0
    last_iter = False

    # if initialize from previous runs
    #previous_params = joblib.load('')
    #env.env.env.assist_schedule = []

    joblib.dump(str(env.env.env.__dict__), logger.get_dir() + '/env_specs.pkl', compress=True)

    reward_threshold = None
    while True:
        if not last_iter:
            rollout_length_threshold = env.env.env.assist_schedule[2][0] / env.env.env.dt
        else:
            rollout_length_threshold = None
            reward_threshold *= 1.2
        opt_pi, rew = pposgd_mirror.learn(env, policy_fn,
                max_timesteps=num_timesteps,
                timesteps_per_batch=int(2500),
                clip_param=0.2, entcoeff=0.0,
                optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                gamma=0.99, lam=0.95, schedule='linear',
                callback=callback,
                sym_loss_weight=4.0,
                positive_rew_enforce=False,
                init_policy_params=previous_params,
                reward_drop_bound=500,
                rollout_length_thershold=rollout_length_threshold,
                policy_scope='pi' + str(iter_num),
                return_threshold=reward_threshold,
            )
        if iter_num == 0:
            reward_threshold = 0.7 * rew
        if last_iter:
            break
        iter_num += 1

        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val
        # update the assist schedule
        for s in range(len(env.env.env.assist_schedule)-1):
            env.env.env.assist_schedule[s][1] = np.copy(env.env.env.assist_schedule[s+1][1])
        env.env.env.assist_schedule[-1][1][0] *= 0.75
        env.env.env.assist_schedule[-1][1][1] *= 0.75
        if env.env.env.assist_schedule[-1][1][0] < 5.0:
            env.env.env.assist_schedule[-1][1][0] = 0.0
        if env.env.env.assist_schedule[-1][1][1] < 5.0:
            env.env.env.assist_schedule[-1][1][1] = 0.0
        zero_assist = True
        for s in range(len(env.env.env.assist_schedule)-1):
            for v in env.env.env.assist_schedule[s][1]:
                if v != 0.0:
                    zero_assist = False
        print('Current Schedule: ', env.env.env.assist_schedule)
        if zero_assist:
            last_iter = True
            print('Entering Last Iteration!')

    env.close()
def train_mirror(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_mirror_policy, pposgd_mirror
    U.make_session(num_cpu=2).__enter__()  # originally 1 CPU -- creates a session with the given number of CPUs
    set_global_seeds(seed)
    env = gym.make(env_id)  #creates gym env -- Here it is walker3d.py
    env.env.assist_timeout = 100.0

    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=3,
            gmm_comp=1,
            mirror_loss=True,
            observation_permutation=np.array([
                0.0001, -1, 2, -3, -4, -5, -6, 7, 14, -15, -16, 17, 18, -19, 8,
                -9, -10, 11, 12, -13, 20, 21, -22, 23, -24, -25, -26, -27, 28,
                35, -36, -37, 38, 39, -40, 29, -30, -31, 32, 33, -34, 42, 41,
                43
            ]),
            action_permutation=np.array([
                -0.0001, -1, 2, 9, -10, -11, 12, 13, -14, 3, -4, -5, 6, 7, -8
            ]))  # initializes a random MLP Mirror Policy aka the good stuff

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    previous_params = None  # can be set to a previous run's parameters to warm-start training (done below)
    iter_num = 0
    last_iter = False

    # if initialize from previous runs
    '''previous_params = joblib.load('data/ppo_DartWalker3d-v111_energy04_vel1_1s_mirror4_velrew3_damping5_anklesprint100_5_rotpen1_rew01xinit_stagedcurriculum/policy_params.pkl')  
    env.env.env.assist_schedule = [[0.0,np.array([250.,125.])],[3.0,np.array([125.,62.5])],[6.0,[62.5,31.25]]]''' #sets params from a previous run -- choose starting schedule too!

    previous_params = joblib.load(
        'data/Walker_0_to_1/[[0.0,array([0.,0.])],[3.0,array([0.,0.])],[6.0,[0.0,0.0]]]/policy_params_6400.pkl'
    )
    env.env.env.assist_schedule = [[0.0, np.array([2000, 2000])],
                                   [3.0, np.array([1500, 1500])],
                                   [6.0, [1125.0, 1125.0]]]

    joblib.dump(str(env.env.env.__dict__),
                logger.get_dir() + '/env_specs.pkl',
                compress=True)

    reward_threshold = None
    while True:  #do the actual training
        if not last_iter:
            # if not the last iteration, update the rollout length threshold
            rollout_length_threshold = env.env.env.assist_schedule[2][0] / env.env.env.dt
        else:
            rollout_length_threshold = None
        opt_pi, rew = pposgd_mirror.learn(
            env,
            policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_batch=int(2500),
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
            callback=callback,
            sym_loss_weight=4.0,
            positive_rew_enforce=False,
            init_policy_params=previous_params,
            reward_drop_bound=500,
            rollout_length_thershold=rollout_length_threshold,
            policy_scope='pi' + str(iter_num),
            return_threshold=reward_threshold,
        )  # run PPO to obtain the new policy and its reward -- note the callback used to save the policy
        if iter_num == 0:
            reward_threshold = 0.7 * rew  # on the first iteration, the reward to attain is 70% of the achieved reward
        if last_iter:  #if last iter, no threshold
            reward_threshold = None
        iter_num += 1

        opt_variable = opt_pi.get_variables()  #get policy's GLOBAL variables
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()  # current value of the variable
            previous_params[opt_variable[i].name] = cur_val  # previous_params: dictionary of var name -> var value
        # update the assist schedule
        for s in range(len(env.env.env.assist_schedule) - 1):
            env.env.env.assist_schedule[s][1] = np.copy(
                env.env.env.assist_schedule[s + 1][1])
        env.env.env.assist_schedule[-1][1][0] *= 0.75
        env.env.env.assist_schedule[-1][1][1] *= 0.75
        if env.env.env.assist_schedule[-1][1][0] < 5.0:
            env.env.env.assist_schedule[-1][1][0] = 0.0
        if env.env.env.assist_schedule[-1][1][1] < 5.0:
            env.env.env.assist_schedule[-1][1][1] = 0.0
        zero_assist = True
        # check whether we have reached the last iteration (all-zero schedules);
        # if so, set last_iter to True and notify the user
        for s in range(len(env.env.env.assist_schedule) - 1):
            for v in env.env.env.assist_schedule[s][1]:
                if v != 0.0:
                    zero_assist = False
        print('Current Schedule: ', env.env.env.assist_schedule)
        logger.log("Current Schedule: %s" % env.env.env.assist_schedule
                   )  #added by me. keep track of schedule in log file
        if zero_assist:
            last_iter = True
            print('Entering Last Iteration!')
            # added by me: makes it easy to see in the log when the last schedule is reached
            logger.log("Entering Last Iteration")

    env.close()  # when done, close the env
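
These entry points are normally wired up behind an argparse main (as in Example no. 4). For the variants that take (env_id, num_timesteps, seed), a minimal hypothetical invocation would look like the following; the run directory, env id, and timestep budget are placeholders:

if __name__ == '__main__':
    logger.reset()
    logger.configure('data/example_run')  # learn() and the checkpoint callback write here
    train_mirror('DartWalker3d-v1', num_timesteps=int(2500 * 500), seed=0)
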