def main(env_name, seed, run_num, data_saving_path, batch_size_per_process, num_iterations,
         autoencoder_base="./novelty_data/local/autoencoders/"):
    num_processes = MPI.COMM_WORLD.Get_size()
    num_timesteps_per_process = batch_size_per_process
    num_iterations_enforce = num_iterations

    import baselines.common.tf_util as U

    comm = MPI.COMM_WORLD
    mpi_rank = comm.Get_rank()

    tf.reset_default_graph()

    with U.single_threaded_session() as sess:
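        # Load one pre-trained autoencoder per previous run; each .h5 file is keyed by env name, seed, and run index.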
        autoencoder_list = []
        for i in range(run_num):
            autoencoder_model = load_model(
                autoencoder_base + env_name + '_autoencoder_seed_' + str(seed) + '_run_' + str(i) + '.h5')
            autoencoder_list.append(autoencoder_model)

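        # Mark the loaded Keras variables as already initialized so baselines' U.initialize() will not re-initialize them.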
        U.ALREADY_INITIALIZED.update(set(tf.global_variables()))

        logger.reset()
        # logger.configure(
        #     '../data/ppo_' + enforce_env_name + '_autoencoder_' + str(len(autoencoder_list)) + '_seed=' + str(
        #         seed) + '/' + str(st))

        logger.configure(data_saving_path)

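        # Total training budget: iterations * MPI workers * timesteps collected per worker per iteration.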
        model = train(sess, env_name,
                      num_timesteps=num_iterations_enforce * num_processes * num_timesteps_per_process,
                      timesteps_per_actor=num_timesteps_per_process,
                      autoencoders=autoencoder_list,
                      seed=seed)
        #
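        # Only the MPI rank-0 worker renders and records the trained policy.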
        if mpi_rank == 0:
            env = gym.make(env_name)
            env.env.novel_autoencoders = autoencoder_list

            if hasattr(env.env, 'disableViewer'):
                env.env.disableViewer = False

            env = wrappers.Monitor(env, logger.get_dir() + '/results', force=True)

            obs = env.reset()

            step = 0
            while True:
                env.render()
                actions = model._act(False, obs)
                obs, _, done, _ = env.step(actions[0][0])
                env.render()
                if done:
                    obs = env.reset()
                    print("Visualization is Done")
                    break
                step += 1
Example #2
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartHumanWalkerMD-v2')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--hsize', type=int, default=80)
    parser.add_argument('--layers', type=int, default=2)
    parser.add_argument('--clip', type=float, default=0.2)

    parser.add_argument('--HW_final_tar_v', help='final target velocity', type=float, default=1.7)
    parser.add_argument('--HW_tar_acc_time', help='time to acc to final target velocity', type=float, default=1.1)
    parser.add_argument('--HW_energy_weight', help='energy pen weight', type=float, default=0.5)
    parser.add_argument('--HW_alive_bonus_rew', help='alive bonus weight', type=float, default=7.0)
    parser.add_argument('--HW_vel_reward_weight', help='velocity pen weight', type=float, default=9.0)
    parser.add_argument('--HW_side_devia_weight', help='side deviation pen weight', type=float, default=1.5)
    parser.add_argument('--HW_jl_pen_weight', help='joint limit pen weight', type=float, default=0.7)
    parser.add_argument('--HW_alive_pen', help='alive pen weight', type=float, default=0.0)

    args = parser.parse_args()
    logger.reset()

    import datetime
    now = datetime.datetime.now()
    stampstring = now.isoformat()

    logdir = 'data/wtoe_MD_20080_ppo_noAssist_adds' + stampstring[:16] + args.env + '_' + str(
        args.seed) + '_' + str(args.hsize) + '-' + str(args.layers) + '_' + str(args.clip)
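    # Append abbreviated names and values of the HW_* arguments to the log directory name.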
    for arg in vars(args):
        if arg[:3] == 'HW_':
            logdir += arg[2:6]
            logdir += '_'
            logdir += arg[-3:]
            logdir += str(getattr(args, arg))
    logger.configure(logdir)
    train_mirror(args, num_timesteps=int(2000 * 8 * 1600))
Example #3
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartHexapod-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure('data/ppo_'+args.env+str(args.seed)+'_energy03_vel15_15s_mirror4_velrew3_rew01xinit_thigh200_100springankle_stagedcurriculum')
    train_mirror(args.env, num_timesteps=int(5000*4*800), seed=args.seed)
Example #4
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartWalker3d-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure('data/ppo_'+args.env+str(args.seed)+'_energy03_vel5_3s_mirror0_velrew3_asinput_damping5_ab7_torque1x_anklesprint100_5_rotpen01_rew01xinit')
    train_mirror(args.env, num_timesteps=int(5000*4*2500), seed=args.seed)
Example #5
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartHexapod-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure('data/ppo_' + args.env + str(args.seed) +
                     '_energy005_vel8_mirror_velrew3_asinput')
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 800), seed=args.seed)
Example #6
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartHopper-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure('data/ppo_' + args.env + str(args.seed) +
                     '_using_disc_ref_policy_iter_2')
    train(args.env, num_timesteps=int(5000 * 40), seed=args.seed)
Example #7
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartHumanWalker-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure('data/ppo_' + args.env + str(args.seed) +
                     '_vf_vanilla_weak_2k')
    #logger.configure('data/ppo_'+args.env+str(args.seed)+'_energy05_bal_vel4smooth_mirror_up1fwd01ltl1_spinepen1yaw001_thighyawpen005_initbentelbow_velrew3_dcontrolconstraint1_strongerarm_asinput_treadmill')
    train(args.env, num_timesteps=int(500 * 4 * 100), seed=args.seed)
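Example #8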
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartHumanWalker-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure(
        'data/ppo_' + args.env + str(args.seed) +
        '_energy03_vel15_1s_mirror4_velrew3_ab6_norotpen_dofpen11508_rew05xinit_thigh160_50springankle_1p2term_stagedcurriculum_075reduce_07rewthres_2kassist'
    )
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 800), seed=args.seed)
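Example #9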
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartDogRobot-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure(
        'data/ppo_' + args.env + str(args.seed) +
        '_energy02_vel2_2s_mirror4_velrew3_ab4_norotpen_rew01xinit_stagedcurriculum_075reduce_07rewthres'
    )
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 800), seed=args.seed)
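Example #10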
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartWalker3d-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure(
        'data/ppo_' + args.env + str(args.seed) +
        '_energy04_vel1_1s_mirror4_velrew3_ab4_anklesprint100_5_rotpen0_rew05xinit_stagedcurriculum4s75s34ratio'
    )
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 800), seed=args.seed)
Example #11
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartWalker3dSPD-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure(
        'data/ppo_' + args.env + str(args.seed) +
        '_energy003_vel15_mirror4_velrew3_spd1k300_kd001_nocurriculum_frameskip5'
    )
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 1500), seed=args.seed)
Example #12
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartHumanWalker-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure(
        'data/ppo_' + args.env + str(args.seed) +
        '_energy03_vel15_1s_mirror0_up03fwd03ltl15_spinepen1yaw001_ab3_thighyawpen005_velrewavg3_2s_dcon1_damping2kneethigh_thigh160knee100waist150_shoulder100_velrew15xinit_baseline'
    )
    #logger.configure('data/ppo_'+args.env+str(args.seed)+'_energy05_bal_vel4smooth_mirror_up1fwd01ltl1_spinepen1yaw001_thighyawpen005_initbentelbow_velrew3_dcontrolconstraint1_strongerarm_asinput_treadmill')
    train_mirror(args.env, num_timesteps=int(5000 * 4 * 2500), seed=args.seed)
Example #13
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--policy',
                        help='Policy architecture',
                        choices=['cnn', 'lstm', 'lnlstm'],
                        default='cnn')
    parser.add_argument('--lrschedule',
                        help='Learning rate schedule',
                        choices=['constant', 'linear'],
                        default='constant')
    parser.add_argument('--million_frames',
                        help='How many frames to train (/ 1e6). '
                        'This number gets divided by 4 due to frameskip',
                        type=int,
                        default=200)
    parser.add_argument('--logdir',
                        help='Log directory',
                        type=str,
                        default="logs")
    parser.add_argument('--note',
                        help='A short note to add to the log file',
                        type=str,
                        default="")
    parser.add_argument('--model_path',
                        help='Path to pre-trained model',
                        type=str,
                        default="")
    parser.add_argument(
        '--num_cpus',
        help='Number of CPUs (i.e. number of parallel environments)',
        type=int,
        default=16)
    parser.add_argument('--nsteps',
                        help='Number of steps for each rollout',
                        type=int,
                        default=1)
    parser.add_argument('--lr',
                        help='Learning rate',
                        type=float,
                        default=1.5e-3)
    parser.add_argument('--pg_coef',
                        help='Coefficient for policy gradient loss',
                        type=float,
                        default=0.1)
    parser.add_argument('--ent_coef',
                        help='Coefficient for policy entropy loss',
                        type=float,
                        default=0.001)
    parser.add_argument('--vf_coef',
                        help='Coefficient for value function loss',
                        type=float,
                        default=0.5)
    args = parser.parse_args()

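    # Build a per-run log directory (env name + timestamp) and record every parsed argument.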
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")
    logdir = os.path.join(args.logdir, args.env, timestamp)
    logger.reset()
    logger.configure(logdir)
    logger.log("")
    for arg in sorted(vars(args)):
        logger.log("{}: {}".format(arg, getattr(args, arg)))
    logger.log("")
    train(args.env,
          num_frames=1e6 * args.million_frames,
          seed=args.seed,
          nsteps=args.nsteps,
          policy=args.policy,
          lrschedule=args.lrschedule,
          num_cpu=args.num_cpus,
          model_path=args.model_path,
          lr=args.lr,
          pg_coef=args.pg_coef,
          ent_coef=args.ent_coef,
          vf_coef=args.vf_coef)
Example #14
def main():

    path = 'data/value_iter_cartpole_discrete_v4'
    logger.reset()
    logger.configure(path)

    env = gym.make('DartCartPoleSwingUp-v1')
    env.seed(0)
    #obs_disc = bin_disc([[50, 0, -0.01], [50, 0.0, -0.01]])
    #act_disc = bin_disc([[10, 1.01, -1.01]])
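    # Discretize each of the four observation dimensions into 50 bins and the single action dimension into 50 bins.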
    obs_disc = bin_disc([[50, 0, -0.01], [50, 0.0, -0.01], [50, 0.0, -0.01], [50, 0.0, -0.01]])
    act_disc = bin_disc([[50, 1.01, -1.01]])

    '''s_disc = []
    for i in range(11):
        s_disc.append([30, 0.0, -0.0])
    obs_disc = bin_disc(s_disc)
    act_disc = bin_disc([[10, 1.01, -1.01], [10, 1.01, -1.01], [10, 1.01, -1.01]])
    #obs_disc = bin_disc([[5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1], [5, -1, 1]])
    #act_disc = bin_disc([[4, 1.0, -1.0], [4, 1.0, -1.0], [4, 1.0, -1.0], [4, 1.0, -1.0], [4, 1.0, -1.0]])'''

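    # The flattened discrete state/action space sizes are the products of the per-dimension bin counts.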
    obs_disc_dim = 1
    act_disc_dim = 1
    for s in obs_disc.disc_scheme:
        obs_disc_dim *= s[0]
    for s in act_disc.disc_scheme:
        act_disc_dim *= s[0]

    state_filter_fn = state_filter_cartpole
    state_unfilter_fn = state_unfilter_cartpole

    policy = None
    '''sess = tf.InteractiveSession()
    policy_params = joblib.load(
        'data/ppo_DartCartPoleSwingUp-v11_vanilla/policy_params.pkl')
    ob_space = env.observation_space
    ac_space = env.action_space
    policy = policy_fn("pi", ob_space, ac_space)
    U.initialize()
    cur_scope = policy.get_variables()[0].name[0:policy.get_variables()[0].name.find('/')]
    orig_scope = list(policy_params.keys())[0][0:list(policy_params.keys())[0].find('/')]
    vars = policy.get_variables()
    for i in range(len(policy.get_variables())):
        assign_op = policy.get_variables()[i].assign(
            policy_params[policy.get_variables()[i].name.replace(cur_scope, orig_scope, 1)])
        sess.run(assign_op)
    env.env.use_disc_ref_policy = None'''

    dyn_model, col_data, obs_disc = learn_model(env, obs_disc, obs_disc_dim, act_disc, act_disc_dim, state_filter_fn, state_unfilter_fn, policy=policy, disc_policy = False)
    Vfunc, policy = optimize_policy(dyn_model, 0.99)

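    # Alternate between refitting the discretized dynamics model and re-optimizing the policy via value iteration, checkpointing after every iteration.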
    for iter in range(50):
        print('--------------- Iteration ', str(iter), ' -------------------')
        dyn_model, col_data, obs_disc = learn_model(env, obs_disc, obs_disc_dim, act_disc, act_disc_dim, state_filter_fn, state_unfilter_fn, policy = policy, collected_data=col_data)
        Vfunc, policy = optimize_policy(dyn_model, 0.99, Vfunc = Vfunc)
        joblib.dump(dyn_model, path+'/dyn_model_'+str(iter)+'.pkl', compress=True)
        joblib.dump(policy, path + '/policy_'+str(iter)+'.pkl', compress=True)
        joblib.dump([Vfunc, obs_disc, act_disc, state_filter_fn, state_unfilter_fn], path + '/ref_policy_funcs_'+str(iter)+'.pkl', compress=True)

        joblib.dump(dyn_model, path + '/dyn_model.pkl', compress=True)
        joblib.dump(policy, path + '/policy.pkl', compress=True)
        joblib.dump([Vfunc, obs_disc, act_disc, state_filter_fn, state_unfilter_fn], path + '/ref_policy_funcs.pkl', compress=True)
    joblib.dump(dyn_model, path + '/dyn_model.pkl', compress=True)
    joblib.dump(policy, path + '/policy.pkl', compress=True)
    joblib.dump([Vfunc, obs_disc, act_disc, state_filter_fn, state_unfilter_fn], path + '/ref_policy_funcs.pkl', compress=True)
Example #15
def learn(env, policy_func, *,
        timesteps_per_actorbatch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant', # annealing for stepsize parameters (epsilon and adam)
        load_saved_model_dir=None
        ):

    # Logger setup
    logger.reset()
    log_dir = os.path.join(str(Path.home()), "Desktop", "Darksouls" + "ppo",datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    logger.configure(log_dir, ["tensorboard", "stdout"])

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = spaces.Discrete(9)
    pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy
    atarg = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    lrmult = tf.compat.v1.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
    surr1 = ratio * atarg # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
    pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([],[], updates=[tf.compat.v1.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    if load_saved_model_dir is not None:
        U.load_state(load_saved_model_dir+"/saved_model")
        print("Loaded saved model at: ",load_saved_model_dir)
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%iters_so_far)
       
        seg = seg_gen.__next__()
        print("Training")
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"] # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        assign_old_eq_new() # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [] # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult) 
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            losses.append(newlosses)            
        meanlosses,_,_ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_"+name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("LastEpRew", rews[-1])
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        iters_so_far += 1
        if MPI.COMM_WORLD.Get_rank()==0:
            logger.dump_tabular()
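
# A minimal, hypothetical usage sketch for the learn() variant above (not part of the
# original examples). It assumes baselines' ppo1 MlpPolicy is available and that the
# supplied env accepts the 9 discrete actions that learn() hardcodes via spaces.Discrete(9);
# all hyperparameter values below are illustrative, not taken from the examples.
def _run_learn_sketch(env):
    from baselines.ppo1 import mlp_policy

    def policy_fn(name, ob_space, ac_space):
        # Two hidden layers of 64 units, the common baselines default.
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    learn(env, policy_fn,
          timesteps_per_actorbatch=2048,
          clip_param=0.2, entcoeff=0.0,
          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
          gamma=0.99, lam=0.95,
          max_timesteps=int(1e6),
          schedule='linear')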
Example #16
def main():

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='SunblazeCartPoleRandomNormal-v0')
    parser.add_argument('--seed',
                        type=int,
                        help='RNG seed, defaults to random')
    parser.add_argument('--output', type=str)
    parser.add_argument('--processes', default=1, help='int or "max" for all')

    # EPOpt specific
    parser.add_argument('--epsilon', type=float, default=1.0)
    # The EPOpt paper kept epsilon=1 until iters>100 (max 200 iters)
    parser.add_argument('--activate',
                        type=int,
                        default=100,
                        help='How long to fix epsilon to 1.0 before activating it')
    parser.add_argument(
        '--paths',
        type=int,
        default=100,
        help='number of trajectories to sample from each iteration')
    parser.add_argument('--algorithm',
                        type=str,
                        choices=['ppo2', 'a2c'],
                        default='ppo2',
                        help='Inner batch policy optimization algorithm')
    parser.add_argument('--policy',
                        choices=['mlp', 'lstm'],
                        default='mlp',
                        help='Policy architecture')

    # Episode-modification specific:
    # parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--total-episodes', type=int, default=int(5e4))

    # RL algorithm hyperparameters
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--nsteps', type=int, default=2048)
    parser.add_argument('--ent-coef',
                        type=float,
                        default=1e-2,
                        help='Only relevant for A2C')
    parser.add_argument('--nminibatches',
                        type=int,
                        default=32,
                        help='Only relevant for PPO2')

    args = parser.parse_args()

    # Configure logger
    if args.output:
        try:
            os.makedirs(args.output)
        except OSError:
            pass
        logger.reset()
        logger.configure(dir=args.output)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed

    # Log it for reference
    if args.output:
        with open(os.path.join(args.output, 'seed.txt'), 'w') as fout:
            fout.write("%d\n" % seed)

    if args.processes == 'max':
        ncpu = multiprocessing.cpu_count()
        # from: https://github.com/openai/baselines/blob/1f8a03f3a62367526f20215188fb5ea4b9ec27e0/baselines/ppo2/run_atari.py#L15
        if sys.platform == 'darwin': ncpu //= 2
    else:
        try:
            ncpu = int(args.processes)
        except ValueError:
            raise argparse.ArgumentTypeError("Invalid number of processes")

    train_epopt(
        args.env,
        total_episodes=args.total_episodes,
        seed=seed,
        lr=args.lr,
        epsilon=args.epsilon,
        activate_at=args.activate,
        paths=args.paths,
        algorithm=args.algorithm,
        policy=args.policy,
        ncpu=ncpu,
        nsteps=args.nsteps,
        nminibatches=args.nminibatches,
        ent_coef=args.ent_coef,  # default 0.01 in baselines, 0.0001 in chainer A3C
    )
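Example #17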
from baselines.siggraph_script.training_utils import *
import gym
from baselines import logger
import numpy as np

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartHumanWalker-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure('data/ppo_' + args.env + str(args.seed) + '_run')

    env = gym.make(args.env)
    env.env.assist_timeout = 100.0
    env.env.target_vel = 5.0
    env.env.init_tv = 0.0
    env.env.final_tv = 5.0
    env.env.tv_endtime = 3.0
    env.env.energy_weight = 0.15
    env.env.alive_bonus_rew = 9.0
    train_mirror_sig(env, num_timesteps=int(5000000), seed=args.seed, obs_perm=np.array(
                                                     [0.0001, -1, 2, -3, -4, -11, 12, -13, 14, 15, 16, -5, 6, -7, 8, 9,
                                                      10, -17, 18, -19, -24, 25, -26, 27, -20, 21, -22, 23, \
                                                      28, 29, -30, 31, -32, -33, -40, 41, -42, 43, 44, 45, -34, 35, -36,
                                                      37, 38, 39, -46, 47, -48, -53, 54, -55, 56, -49, 50, -51, 52, 58,
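Example #18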
def main():
    # enforce_env_name = 'SimplerPathFinding-v0'

    enforce_env_name = 'DartEel-v0'

    num_processes = MPI.COMM_WORLD.Get_size()
    num_timesteps_per_process = 1000

    num_iterations_enforce = 1000
    num_iterations_release = 100

    release_env_name = 'SimplerPathFinding-v1'

    # for i in range(1):
    i = 0
    seed = i * 13 + 7 * (i**2)
    # seed = 128
    import baselines.common.tf_util as U
    import datetime
    import time
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')

    tf.reset_default_graph()

    with U.single_threaded_session() as sess:
        logger.reset()
        logger.configure('../data/ppo_' + enforce_env_name +
                         '_baseline_seed=' + str(seed) + '/' + str(st))

        model = train(sess,
                      enforce_env_name,
                      num_timesteps=num_iterations_enforce * num_processes *
                      num_timesteps_per_process,
                      timesteps_per_actor=num_timesteps_per_process,
                      seed=seed)

        # logger.reset()
        # logger.configure('data/ppo_PathFinding-v0_release/baseline_seed=' + str(seed))
        # with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        #     model = train(sess, release_env_name,
        #                   num_timesteps=num_iterations_release * num_processes * num_timesteps_per_process,
        #                   timesteps_per_actor=num_timesteps_per_process, seed=seed)

        comm = MPI.COMM_WORLD
        mpi_rank = comm.Get_rank()

        if mpi_rank == 0:
            env = gym.make(enforce_env_name)

            env = wrappers.Monitor(env,
                                   logger.get_dir() + '/results',
                                   force=True)

            obs = env.reset()

            step = 0
            while True:
                env.render()
                actions = model._act(True, obs)
                obs, _, done, _ = env.step(actions[0][0])
                env.render()
                if done:
                    obs = env.reset()
                    print("Visualization is Done")
                    break
                step += 1
Example #19
def main():

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='SunblazeBreakout-v0')
    parser.add_argument('--seed',
                        type=int,
                        help='RNG seed, defaults to random')
    parser.add_argument('--output', type=str)
    parser.add_argument('--processes', default=1, help='int or "max" for all')
    # parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--total-episodes', type=int, default=int(5e4))
    parser.add_argument('--policy',
                        help='Policy architecture',
                        choices=['mlp', 'lstm'],
                        default='mlp')

    # Hyperparameters
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--nsteps', type=int, default=2048)
    parser.add_argument('--nminibatches', type=int, default=32)

    args = parser.parse_args()

    # Configure logger
    if args.output:
        try:
            os.makedirs(args.output)
        except OSError:
            pass
        logger.reset()
        logger.configure(dir=args.output)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed

    # Log it for reference
    if args.output:
        with open(os.path.join(args.output, 'seed.txt'), 'w') as fout:
            fout.write("%d\n" % seed)

    if args.processes == 'max':
        ncpu = multiprocessing.cpu_count()
        # from: https://github.com/openai/baselines/blob/1f8a03f3a62367526f20215188fb5ea4b9ec27e0/baselines/ppo2/run_atari.py#L15
        if sys.platform == 'darwin': ncpu //= 2
    else:
        try:
            ncpu = int(args.processes)
        except ValueError:
            raise argparse.ArgumentTypeError("Invalid number of processes")

    train(
        args.env,
        total_episodes=args.total_episodes,
        seed=seed,
        ncpu=ncpu,
        policy=args.policy,
        lr=args.lr,
        nsteps=args.nsteps,
        nminibatches=args.nminibatches,
    )
Example #20
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartHopperPT-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--name',
                        help='name of experiments',
                        type=str,
                        default="")
    parser.add_argument('--max_step',
                        help='maximum step size',
                        type=int,
                        default=1000000)
    parser.add_argument('--batch_size',
                        help='batch size',
                        type=int,
                        default=4000)
    parser.add_argument('--clip', help='clip', type=float, default=0.2)
    parser.add_argument('--schedule', help='schedule', default='constant')
    parser.add_argument('--train_up', help='whether to train UP', default='True')
    parser.add_argument('--dyn_params', action='append', type=int)
    parser.add_argument('--output_interval',
                        help='interval of outputting policies',
                        type=int,
                        default=10)
    parser.add_argument(
        '--mirror',
        help='whether to use mirror (0: no mirror, 1: hard mirror, 2: soft mirror)',
        type=int,
        default=0)
    parser.add_argument('--warmstart',
                        help='path to warmstart policies',
                        type=str,
                        default="")

    args = parser.parse_args()
    global output_interval
    output_interval = args.output_interval
    logger.reset()
    config_name = 'data/ppo_' + args.env + str(args.seed) + '_' + args.name

    if args.mirror == 1:
        config_name += '_mirror'
    elif args.mirror == 2:
        config_name += '_softmirror'

    if len(args.warmstart) > 0:
        config_name += '_warmstart'

    if args.train_up == 'True':
        config_name += '_UP'

    logger.configure(config_name, ['json', 'stdout'])
    train(args.env,
          num_timesteps=int(args.max_step),
          seed=args.seed,
          batch_size=args.batch_size,
          clip=args.clip,
          schedule=args.schedule,
          mirror=args.mirror,
          warmstart=args.warmstart,
          train_up=args.train_up == 'True',
          dyn_params=args.dyn_params)
Example #21
def main():

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='SunblazeCartPole-v0')
    parser.add_argument('--seed',
                        type=int,
                        help='RNG seed, defaults to random')
    parser.add_argument('--output', type=str)

    # parser.add_argument('--episodes-per-trial', type=int, default=5)
    # parser.add_argument('--trials', type=int, default=10 ** 4)
    # The total number of episodes is now trials*episodes_per_trial
    parser.add_argument('--total-episodes', type=int, default=int(5e4))

    parser.add_argument('--policy',
                        help='Policy architecture',
                        choices=['mlp', 'lstm'],
                        default='mlp')
    parser.add_argument('--processes', default=1, help='int or "max" for all')
    parser.add_argument('--reward-scale', type=float, default=1.0)

    # Hyperparameters
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--nsteps', type=int, default=5)
    parser.add_argument('--ent-coef', type=float, default=1e-2)

    args = parser.parse_args()
    #total_episodes = args.trials * args.episodes_per_trial

    # Configure logger
    if args.output:
        try:
            os.makedirs(args.output)
        except OSError:
            pass

        logger.reset()
        logger.configure(dir=args.output)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed

    # Log it for reference
    if args.output:
        with open(os.path.join(args.output, 'seed.txt'), 'w') as fout:
            fout.write("%d\n" % seed)

    if args.processes == 'max':
        ncpu = multiprocessing.cpu_count()
        # from: https://github.com/openai/baselines/blob/1f8a03f3a62367526f20215188fb5ea4b9ec27e0/baselines/ppo2/run_atari.py#L15
        if sys.platform == 'darwin': ncpu //= 2
    else:
        try:
            ncpu = int(args.processes)
        except ValueError:
            raise argparse.ArgumentTypeError("Invalid number of processes")

    train(
        args.env,
        total_episodes=args.total_episodes,
        policy=args.policy,
        lr=args.lr,
        num_processes=ncpu,
        rew_scale=args.reward_scale,
        seed=seed,
        nsteps=args.nsteps,
        ent_coef=args.ent_coef,  # default 0.01 in baselines, 0.0001 in chainer A3C
    )
Example #22
def main():
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='DartDogRobot-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--init_policy', help='Initial Policy',
                        default='data/ppo_DartDogRobot-v156_energy2_vel8_8s_mirror4_velrew3_dcon1_asinput_damping5_velspd1000_feetcover3off_newstrength13x_dqpen0001_shoulder170_-102range_thigh200_-0.21.4range_lighterhead_2kassist/policy_params.pkl')
    parser.add_argument('--init_curriculum', help='Initial Curriculum', nargs='+', default=[2000.0, 2000])
    parser.add_argument('--ref_policy', help='Reference Policy',
                        default='data/ppo_DartDogRobot-v156_energy2_vel8_8s_mirror4_velrew3_dcon1_asinput_damping5_velspd1000_feetcover3off_newstrength13x_dqpen0001_shoulder170_-102range_thigh200_-0.21.4range_lighterhead_2kassist/policy_params.pkl')
    parser.add_argument('--ref_curriculum', help='Reference Curriculum', nargs='+', default=[2000.0, 2000])
    parser.add_argument('--anc_thres', help='Anchor Threshold', type=float, default=0.85)
    parser.add_argument('--prog_thres', help='Progress Threshold', type=float, default=0.7)
    parser.add_argument('--batch_size', help='Batch Size', type=int, default=2500)
    parser.add_argument('--max_iter', help='Maximum Iteration', type=int, default=2000)
    parser.add_argument('--use_reftraj', help='Use reference trajectory', type=int, default=0)
    args = parser.parse_args()
    logger.reset()
    logger.configure(
        'data/ppo_curriculum_150eachit_vel8_mirror4_runningavg3_2s_torque13x_e1' + args.env + '_' + str(
            args.seed) + '_' + str(args.anc_thres) + '_' + str(args.prog_thres) + '_' + str(args.batch_size))
    sess = U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env)

    ob_space = env.observation_space
    ac_space = env.action_space

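    # The permutation arrays describe the robot's left/right symmetry: entry i gives the observation/action
    # index that component i maps to under mirroring, with a negative sign meaning the value is negated
    # (0.0001 stands in for index 0, which cannot carry a sign).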
    def policy_fn(name, ob_space, ac_space):
        return mlp_mirror_policy.MlpMirrorPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                                 hid_size=64, num_hid_layers=3, gmm_comp=1,
                                                 mirror_loss=True,
                                                 observation_permutation=np.array(
                                                     [0.0001, -1, 2, -3, -4, 9, 10, 11, 12, 5, 6, 7, 8, 17, 18, 19, 20,
                                                      13,
                                                      14, 15, 16,
                                                      21, 22, -23, 24, -25, -26, 31, 32, 33, 34, 27, 28, 29, 30, 39, 40,
                                                      41,
                                                      42, 35, 36, 37, 38, 44, 43, 46, 45, 47]),
                                                 action_permutation=np.array(
                                                     [4, 5, 6, 7, 0.0001, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11]))

    policy = policy_fn('policy', ob_space, ac_space)
    init_curriculum = np.array(args.init_curriculum)
    ref_policy = policy_fn('ref_policy', ob_space, ac_space)
    ref_curriculum = np.array(args.ref_curriculum)

    policy_params = joblib.load(args.init_policy)
    ref_policy_params = joblib.load(args.ref_policy)
    U.initialize()
    cur_scope = policy.get_variables()[0].name[0:policy.get_variables()[0].name.find('/')]
    orig_scope = list(policy_params.keys())[0][0:list(policy_params.keys())[0].find('/')]
    ref_scope = list(ref_policy_params.keys())[0][0:list(ref_policy_params.keys())[0].find('/')]
    for i in range(len(policy.get_variables())):
        assign_op = policy.get_variables()[i].assign(
            policy_params[policy.get_variables()[i].name.replace(cur_scope, orig_scope, 1)])
        sess.run(assign_op)
        assign_op = ref_policy.get_variables()[i].assign(
            ref_policy_params[ref_policy.get_variables()[i].name.replace('ref_' + cur_scope, ref_scope, 1)])
        sess.run(assign_op)

    anchor_threshold = args.anc_thres
    progress_threshold = args.prog_thres

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True)
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    curriculum_evolution = []

    env.env.env.anchor_kp = ref_curriculum
    ref_score = None
    ref_max_score = None
    reference_trajectory = None
    # if MPI.COMM_WORLD.Get_rank() == 0:
    if args.use_reftraj == 1:
        reference_trajectory = gen_reftraj(env, ref_policy, 299)
        env.env.reference_trajectory = reference_trajectory
    ref_score, ref_max_score = evaluate_policy(env, ref_policy, 24)
    ref_score = MPI.COMM_WORLD.bcast(ref_score, root=0)
    ref_max_score = MPI.COMM_WORLD.bcast(ref_max_score, root=0)
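    # The progress and anchor thresholds are fixed fractions of the reference policy's average and best scores.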
    reference_score = ref_score * progress_threshold
    reference_anchor_score = ref_score * anchor_threshold
    reference_max_score = ref_max_score * 0.9
    env.env.env.anchor_kp = init_curriculum
    reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0)
    env.env.reference_trajectory = reference_trajectory

    current_curriculum = np.copy(init_curriculum)
    print('reference scores: ', reference_score, reference_anchor_score, reference_max_score)
    #env.env.env.energy_weight *= 0.5
    # env.env.env.final_tv -= 0.5
    previous_params = policy_params
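    # Outer curriculum loop: train with PPO at the current assist gains, then search for weaker gains
    # that still satisfy the reference score thresholds.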
    for iter in range(args.max_iter):
        print('curriculum iter ', iter)
        print('ref score: ', reference_anchor_score)

        opt_pi, final_rew = pposgd_mirror.learn(env, policy_fn,
                                                max_timesteps=args.batch_size * MPI.COMM_WORLD.Get_size() * 150,
                                                timesteps_per_batch=int(args.batch_size),
                                                clip_param=0.2, entcoeff=0.0,
                                                optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                                                gamma=0.99, lam=0.95, schedule='linear',
                                                callback=callback,
                                                sym_loss_weight=4.0,
                                                return_threshold=reference_anchor_score,
                                                init_policy_params=previous_params,
                                                policy_scope='pi' + str(iter),
                                                min_iters=0,
                                                reward_drop_bound=True,
                                                # max_threshold = reference_max_score,
                                                )
        print('one learning iteration done')
        if np.linalg.norm(current_curriculum) >= 0.0001:
            # re-compute reference trajectory
            if MPI.COMM_WORLD.Get_rank() == 0 and args.use_reftraj == 1:
                print('recompute reference traj')
                reference_trajectory = gen_reftraj(env, opt_pi, 299)
            reference_trajectory = MPI.COMM_WORLD.bcast(reference_trajectory, root=0)
            env.env.reference_trajectory = reference_trajectory

            if final_rew < reference_anchor_score * 0.95:
                print('update reference scores')
                reference_score = reference_score / reference_anchor_score * final_rew
                reference_anchor_score = final_rew

            closest_candidate = None
            # if MPI.COMM_WORLD.Get_rank() == 0:
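            # Candidate search directions: decrease each curriculum dimension on its own, move straight
            # toward zero, and two normalized interpolations of these.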
            directions = [np.array([-1, 0]), np.array([0, -1]),
                          -current_curriculum / np.linalg.norm(current_curriculum)]
            int_d1 = directions[0] + directions[2]
            int_d2 = directions[1] + directions[2]
            directions.append(int_d1 / np.linalg.norm(int_d1))
            directions.append(int_d2 / np.linalg.norm(int_d2))

            # directions = [np.array([0.0, -1.0])] # only search in one direction
            candidate_next_anchors = []
            for direction in directions:
                found_point, perf = binary_search_curriculum(env, opt_pi, current_curriculum, direction,
                                                             reference_score, reference_max_score, 6)
                print(direction, found_point, perf)
                candidate_next_anchors.append(found_point)
                if closest_candidate is None:
                    closest_candidate = np.copy(found_point)
                elif np.linalg.norm(closest_candidate) > np.linalg.norm(found_point):
                    closest_candidate = np.copy(found_point)
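            # Snap near-zero candidates to exactly zero so the curriculum can terminate.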
            if np.linalg.norm(closest_candidate) < 0.5:
                closest_candidate = np.array([0, 0])
            if np.abs(closest_candidate[0]) < 0.1:
                closest_candidate[0] = 0.0
            if np.abs(closest_candidate[1]) < 0.1:
                closest_candidate[1] = 0.0
            # closest_candidate = MPI.COMM_WORLD.bcast(closest_candidate, root=0)

            current_curriculum = np.copy(closest_candidate)
        env.env.env.anchor_kp = current_curriculum

        '''print('Update Init Pose Distributions')
        update_init_poses(env, opt_pi)
        if MPI.COMM_WORLD.Get_rank() == 0:
            joblib.dump([env.env.env.init_qs, env.env.env.init_dqs], logger.get_dir()+'/init_poses_'+np.array2string(current_curriculum, separator=',')+'.pkl', compress=True)
            joblib.dump([env.env.env.init_qs, env.env.env.init_dqs], logger.get_dir() + '/init_poses.pkl', compress=True)'''

        curriculum_evolution.append(current_curriculum)
        print('Current curriculum: ', current_curriculum)
        opt_variable = opt_pi.get_variables()
        previous_params = {}
        for i in range(len(opt_variable)):
            cur_val = opt_variable[i].eval()
            previous_params[opt_variable[i].name] = cur_val
        if np.linalg.norm(current_curriculum) < 0.0001:
            if reference_anchor_score < ref_score:
                reference_anchor_score = ref_score
            else:
                break

    env.close()
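Example #23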
def main():
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='DartHopper-v1')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--name',
                        help='name of experiments',
                        type=str,
                        default="")
    parser.add_argument('--max_iter',
                        help='maximum iteration number',
                        type=int,
                        default=1000)
    parser.add_argument('--inner_iter',
                        help='inner iteration number',
                        type=int,
                        default=30)
    parser.add_argument('--output_interval',
                        help='interval of outputting policies',
                        type=int,
                        default=100)
    parser.add_argument('--warmstart',
                        help='warmstart of experiments',
                        type=str,
                        default="")
    parser.add_argument('--skilldim',
                        help='dimension of latent variable',
                        type=int,
                        default=2)
    parser.add_argument('--task_number',
                        help='number of tasks to sample per iteration',
                        type=int,
                        default=5)
    parser.add_argument('--mirror', help='use mirror policy', default="False")
    parser.add_argument('--dyn_params', action='append', type=int)

    args = parser.parse_args()
    global config_name, output_interval
    output_interval = args.output_interval
    logger.reset()
    config_name = 'data/mso_ars_' + args.env + str(args.seed) + '_' + args.name

    config_name += '_skilldim' + str(args.skilldim)
    config_name += '_maxiter' + str(args.max_iter)
    config_name += '_tasknum' + str(args.task_number)
    config_name += '_inneriter' + str(args.inner_iter)
    if len(args.warmstart) > 0:
        config_name += '_warmstart'

    if args.mirror == 'True':
        config_name += '_mirror'

    logger.configure(config_name, ['json', 'stdout'])
    train(args.env,
          skilldim=args.skilldim,
          max_iter=int(args.max_iter),
          inner_iter=int(args.inner_iter),
          seed=args.seed,
          tasknum=int(args.task_number),
          warmstart=args.warmstart,
          mirror=args.mirror == 'True',
          dyn_params=args.dyn_params)