def train(env_id, num_timesteps, seed, num_cpu):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)


    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json"%rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps //= 4  # because we're wrapping the envs to do frame skip (integer division keeps this an int)
    env.seed(workerseed)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
        max_timesteps=num_timesteps, gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()
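
# The functions in this listing rely on module-level imports that are not shown.
# A minimal sketch of what the TRPO Atari train() above assumes (exact import paths
# may differ; mpi_fork lived in baselines.common.mpi_fork in baselines of this vintage):
#
#     import logging
#     import os.path as osp
#     import gym
#     from mpi4py import MPI
#     from baselines import bench, logger
#     from baselines.common import set_global_seeds
#     from baselines.common.mpi_fork import mpi_fork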
Example #2
def train(env_id, num_timesteps, seed, num_cpu):
    from baselines.pposgd import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    whoami = mpi_fork(num_cpu)
    if whoami == "parent": return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps //= 4  # because we're wrapping the envs to do frame skip (integer division keeps this an int)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
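
# wrap_train is not shown in this listing; it was defined in the original run
# scripts. A plausible minimal sketch, assuming DeepMind-style wrappers from
# baselines.common.atari_wrappers; whichever wrapper performs the 4-step frame skip
# is what motivates the num_timesteps //= 4 above:
def wrap_train(env):
    from baselines.common.atari_wrappers import wrap_deepmind, FrameStack
    env = wrap_deepmind(env, clip_rewards=True)   # DeepMind-style preprocessing and reward clipping
    env = FrameStack(env, 4)                      # stack the last 4 frames as the observation
    return env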
Example #3
def train(env_id, num_timesteps, timesteps_per_batch, seed, num_cpu, resume,
          agentName, logdir, hid_size, num_hid_layers, clip_param, entcoeff,
          optim_epochs, optim_stepsize, optim_batchsize, gamma, lam, portnum,
          max_to_keep):
    from baselines.ppo1 import mlp_policy, pposgd_simple

    whoami = mpi_fork(num_cpu)
    if whoami == "parent": return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()

    if rank != 0: logger.set_level(logger.DISABLED)
    utils.portnum = portnum + rank

    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env.seed(seed)

    if logger.get_dir():
        env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=hid_size,
                                    num_hid_layers=num_hid_layers)

    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=timesteps_per_batch,
                        clip_param=clip_param,
                        entcoeff=entcoeff,
                        optim_epochs=optim_epochs,
                        optim_stepsize=optim_stepsize,
                        optim_batchsize=optim_batchsize,
                        gamma=gamma,
                        lam=lam,
                        resume=resume,
                        agentName=agentName,
                        logdir=logdir,
                        max_to_keep=max_to_keep)
    env.close()
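
# mpi_fork re-launches the current script under mpirun and returns 'parent' in the
# launching process, which is why every train() above returns early when
# whoami == "parent". A rough illustration of the idea (not the library code):
def mpi_fork_sketch(n):
    import os
    import subprocess
    import sys
    if n <= 1:
        return "child"
    if os.environ.get("IN_MPI") is None:          # we are the launching process
        child_env = os.environ.copy()
        child_env.update(MKL_NUM_THREADS="1", OMP_NUM_THREADS="1", IN_MPI="1")
        subprocess.check_call(["mpirun", "-np", str(n), sys.executable] + sys.argv,
                              env=child_env)
        return "parent"
    return "child"                                # we are one of the MPI workers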
Example #4
def train(env_id, num_timesteps, seed, model_path, load_model,
          timesteps_per_batch, hidden_units, hidden_layers, trainmodel, ACTION,
          EMBEDDING, MODEL, LOGGING):
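    # num_cpu, as well as the max_kl / cg_iters / cd_damping / gamma / lam / vf_iters /
    # vf_stepsize hyperparameters used further down, are not parameters of this
    # train(); they are presumably module-level constants in the original script.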
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    workerseed = 2221438774
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env = wrappers.ConfigWrapper(env, ACTION, EMBEDDING, MODEL, LOGGING)

    def policy_fn(name, ob_space, ac_space):
        return LSTMPolicy(name=name,
                          ob_space=ob_space,
                          ac_space=ac_space,
                          hid_size=hidden_units,
                          num_hid_layers=hidden_layers)

    env.seed(workerseed)

    trpo_indi.learn(env,
                    policy_fn,
                    timesteps_per_batch=timesteps_per_batch,
                    max_kl=max_kl,
                    cg_iters=cg_iters,
                    cg_damping=cd_damping,
                    max_episodes=num_timesteps,
                    gamma=gamma,
                    lam=lam,
                    vf_iters=vf_iters,
                    vf_stepsize=vf_stepsize,
                    load_model=load_model,
                    model_path=model_path,
                    trainmodel=trainmodel)
    env.close()
Example #5
def train(env_id, num_timesteps, seed):
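    # num_cpu is presumably a module-level constant; it is not a parameter here.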
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=32,
                         num_hid_layers=2)

    logger.configure()
    env = bench.Monitor(env,
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi_modified.learn(env,
                            policy_fn,
                            timesteps_per_batch=1024,
                            max_kl=0.01,
                            cg_iters=10,
                            cg_damping=0.1,
                            max_timesteps=num_timesteps,
                            gamma=0.99,
                            lam=0.98,
                            vf_iters=5,
                            vf_stepsize=1e-3,
                            alpha=100)
    env.close()
Example #6
def train(env_id, num_timesteps, seed, num_cpu):
    from baselines.pposgd import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    whoami = mpi_fork(num_cpu)
    if whoami == "parent": return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space)

    env = bench.Monitor(env,
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps //= 4  # because we're wrapping the envs to do frame skip (integer division keeps this an int)
    env.seed(workerseed)

    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2,
                        entcoeff=0.01,
                        optim_epochs=4,
                        optim_stepsize=1e-3,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear')
    env.close()
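
# A hypothetical entry point for the Atari-style train() variants above. The script
# name, flag names and defaults are illustrative assumptions, not taken from the
# original run scripts:
def main_sketch():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='PongNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--num-cpu', type=int, default=8)
    args = parser.parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed,
          num_cpu=args.num_cpu)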
def train(env_id, num_timesteps, seed, model_name, model_path, para, load_model,
          timesteps_per_batch, hidden_units, hidden_layers):
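    # num_cpu and the TRPO hyperparameters used below (max_kl, cg_iters, cd_damping,
    # gamma, lam, vf_iters, vf_stepsize) are presumably module-level constants.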
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    import baselines.common.tf_util as U
    logger.session().__enter__()
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env = SubsetWrapper(env, para)
    #env = gym_kidney.LogWrapper(env, NN, EXP, OUT, FREQ, PARAM)
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=hidden_units,
                         num_hid_layers=hidden_layers)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    # env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_indi.learn(env, policy_fn,
                    timesteps_per_batch=timesteps_per_batch,
                    max_kl=max_kl, cg_iters=cg_iters,
                    cg_damping=cd_damping,
                    max_episodes=num_timesteps,
                    gamma=gamma, lam=lam,
                    vf_iters=vf_iters,
                    vf_stepsize=vf_stepsize,
                    load_model=load_model,
                    model_path=model_path)
    env.close()
def main():
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('-n', '--exp_name', dest='exp_name', default='tmp')
    parser.add_argument('-r', '--render', dest='render', action='store_true')
    parser.add_argument('-c', '--num_cpu', dest='num_cpu', default=1, type=int)
    parser.add_argument('--resdir', dest='resdir', default='results')
    parser.add_argument('--max_timesteps',
                        dest='max_timesteps',
                        default=int(1e9),  # argparse does not pass defaults through type=
                        type=int)
    parser.add_argument('--seed', dest='seed', default=123, type=int)
    parser.add_argument('--force_override',
                        dest='force_override',
                        action='store_true')
    parser.add_argument('--timesteps_per_batch',
                        dest='timesteps_per_batch',
                        default=2048,
                        type=int)
    parser.add_argument('--clip_param',
                        dest='clip_param',
                        default=0.2,
                        type=float)
    parser.add_argument('--optim_epochs',
                        dest='optim_epochs',
                        default=10,
                        type=int)
    parser.add_argument('--optim_stepsize',
                        dest='optim_stepsize',
                        default=3e-4,
                        type=float)
    parser.add_argument('--optim_batchsize',
                        dest='optim_batchsize',
                        default=64,
                        type=int)
    parser.add_argument('--entcoeff', dest='entcoeff', default=0., type=float)
    parser.add_argument('--gamma', dest='gamma', default=0.99, type=float)
    parser.add_argument('--lam', dest='lam', default=0.95, type=float)
    parser.add_argument('--hid_size', dest='hid_size', default=64, type=int)
    parser.add_argument('--num_hid_layers',
                        dest='num_hid_layers',
                        default=2,
                        type=int)
    parser.add_argument('--shaping', dest='shaping', default=None, type=str)
    parser.add_argument('--save_every',
                        dest='save_every',
                        default=20,
                        type=int)
    parser.add_argument('--diff', dest='diff', default=0, type=int)
    parser.add_argument('--relative_x',
                        dest='relative_x',
                        action='store_true',
                        help='DEPRECATED')
    parser.add_argument('--transform_inputs',
                        dest='transform_inputs',
                        type=str,
                        default=None)
    parser.add_argument('--bound_by_sigmoid',
                        dest='bound_by_sigmoid',
                        action='store_true')
    parser.add_argument('--sigmoid_coef',
                        dest='sigmoid_coef',
                        default=1.,
                        type=float)
    parser.add_argument('--noobsthack', dest='noobsthack', action='store_true')
    parser.add_argument('--nogaussian_fixed_var',
                        dest='nogaussian_fixed_var',
                        action='store_true')

    parser.add_argument('--activation',
                        dest='activation',
                        default='tanh',
                        type=str)
    parser.add_argument('--nonormalize_obs',
                        dest='nonormalize_obs',
                        action='store_true')

    parser.add_argument('--nostochastic',
                        dest='nostochastic',
                        action='store_true')
    parser.add_argument('--nostochastic2',
                        dest='nostochastic2',
                        action='store_true')
    parser.add_argument('--load_model',
                        dest='load_model',
                        default=None,
                        type=str)
    parser.add_argument('--test_only', dest='test_only', action='store_true')
    parser.add_argument('--evaluate', dest='evaluate', action='store_true')
    parser.add_argument('--n_eval_episodes',
                        dest='n_eval_episodes',
                        default=10000,
                        type=int)
    parser.add_argument('--submit', dest='submit', action='store_true')
    parser.add_argument('--max_env_steps',
                        dest='max_env_steps',
                        default=1000,
                        type=int)
    parser.add_argument('--run_logs_dir',
                        dest='run_logs_dir',
                        default=None,
                        type=str)
    parser.add_argument('--avg_norm_symmetry',
                        dest='avg_norm_symmetry',
                        action='store_true')
    parser.add_argument('--symmetric_interpretation',
                        dest='symmetric_interpretation',
                        action='store_true')
    parser.add_argument('--stdclip', dest='stdclip', default=5.0, type=float)
    parser.add_argument('--memory_size',
                        dest='memory_size',
                        default=1,
                        type=int)
    parser.add_argument('--swap_legs_mode',
                        dest='swap_legs_mode',
                        default=None,
                        type=str)
    parser.add_argument('--filter_obs', dest='filter_obs', action='store_true')
    parser.add_argument('--actions',
                        dest='actions',
                        default='gaussian',
                        type=str)

    parser.add_argument('--binary_actions',
                        dest='binary_actions',
                        action='store_true',
                        help='deprecated')
    parser.add_argument('--beta_dist',
                        dest='beta_dist',
                        action='store_true',
                        help='deprecated')
    parser.add_argument('--gaussian_bias',
                        dest='gaussian_bias',
                        action='store_true')
    parser.add_argument('--muscles', dest='muscles', action='store_true')
    parser.add_argument('--repeats', dest='repeats', default=1, type=int)
    parser.add_argument('--add_time', dest='add_time', action='store_true')
    parser.add_argument('--simwalker', dest='simwalker', action='store_true')
    parser.add_argument('--log_walker', dest='log_walker', action='store_true')
    parser.add_argument('--log_simwalker',
                        dest='log_simwalker',
                        action='store_true')
    parser.add_argument('--symmetric_training',
                        dest='symmetric_training',
                        action='store_true')
    parser.add_argument('--step_timeout',
                        dest='step_timeout',
                        default=None,
                        type=float)
    parser.add_argument('--gaussian_from_binary',
                        dest='gaussian_from_binary',
                        action='store_true')
    parser.add_argument('--pv', dest='parallel_value', action='store_true')
    parser.add_argument('--pv_layers', dest='pv_layers', default=2, type=int)
    parser.add_argument('--pv_hid_size',
                        dest='pv_hid_size',
                        default=512,
                        type=int)
    parser.add_argument('--horizon_hack',
                        dest='horizon_hack',
                        action='store_true')
    parser.add_argument('--single_episode',
                        dest='single_episode',
                        action='store_true')
    parser.add_argument('--n_obstacles',
                        dest='n_obstacles',
                        default=3,
                        type=int)
    parser.add_argument('--nologs', dest='nologs', action='store_true')
    parser.add_argument('--init_three', dest='init_three', action='store_true')
    parser.add_argument('--three', dest='three', action='store_true')
    parser.add_argument('--pause', dest='pause', action='store_true')
    parser.add_argument('--nobind', dest='nobind', action='store_true')
    parser.add_argument('--running_avg_len',
                        dest='running_avg_len',
                        default=100,
                        type=int)
    parser.add_argument('--submit_token',
                        dest='submit_token',
                        default=None,
                        type=str)
    parser.add_argument('--fall_penalty',
                        dest='fall_penalty',
                        action='store_true')
    parser.add_argument('--fall_penalty_val',
                        dest='fall_penalty_val',
                        default=2.,
                        type=float)
    parser.add_argument('--higher_pelvis',
                        dest='higher_pelvis',
                        default=0.65,
                        type=float)
    parser.add_argument('--print_action',
                        dest='print_action',
                        action='store_true')
    parser.add_argument('--new8_fix', dest='new8_fix', action='store_true')
    parser.add_argument('--symmetric_training_trick',
                        dest='symmetric_training_trick',
                        action='store_true')
    parser.add_argument('--submit_round2',
                        dest='submit_round2',
                        action='store_true')
    parser.add_argument('--noisy_obstacles',
                        dest='noisy_obstacles',
                        action='store_true')
    parser.add_argument('--noisy_obstacles2',
                        dest='noisy_obstacles2',
                        action='store_true')
    parser.add_argument('--execute_just',
                        dest='execute_just',
                        default=None,
                        type=int)
    parser.add_argument('--seeds_fn', dest='seeds_fn', default=None, type=str)
    parser.add_argument('--bootstrap_seeds',
                        dest='bootstrap_seeds',
                        action='store_true')
    parser.add_argument('--noisy_fix', dest='noisy_fix', action='store_true')

    args = parser.parse_args()
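    # A hypothetical invocation of this script (file name and values are
    # illustrative only):
    #   python run_walker.py -n my_exp -c 8 --seed 123 --diff 2 \
    #       --timesteps_per_batch 2048 --hid_size 64 --num_hid_layers 2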

    if args.transform_inputs in [
            'new_5', 'new_6', 'new_7', 'new_8', 'new_9', 'new_a', 'new_8b'
    ]:
        args.filter_obs = True

    if args.binary_actions:
        logger.warn('Deprecated option')
        args.actions = 'binary'
    if args.beta_dist:
        logger.warn('Deprecated option')
        args.actions = 'beta'

    if args.relative_x:
        assert args.transform_inputs is None
        args.transform_inputs = 'relative_x'

    if args.transform_inputs == 'new_4':
        logger.warn("Overriding the memory size to 3")
        args.memory_size = 3

    if args.submit:
        assert args.load_model
        args.evaluate = True

    if args.submit_round2:
        assert args.load_model
        args.evaluate = True
        args.n_eval_episodes = 100000
        args.log_simwalker = False
        args.log_walker = False
        args.nobind = True
        args.num_cpu = 1
        args.nologs = True

    if args.render:
        args.num_cpu = 1

    # Create exp dir
    env_name = f'Walker_d{args.diff}'
    if args.max_env_steps is not None and args.max_env_steps != 1000:
        env_name += f'_{args.max_env_steps:03d}'
    if args.n_obstacles != 3:
        env_name += f'_o{args.n_obstacles:02d}'
    env_name += '-v0'

    args.exp_path = path.join(args.resdir, env_name, 'PPOOAI', args.exp_name,
                              str(args.seed))
    if args.run_logs_dir is None and not args.test_only and not args.evaluate:
        args.run_logs_dir = path.join(args.exp_path, 'run_logs')
    if args.nologs:
        args.run_logs_dir = None

    whoami = mpi_fork(args.num_cpu, not args.nobind)
    if whoami == 'parent': return
    if MPI.COMM_WORLD.Get_rank() == 0:
        if not args.test_only and not args.evaluate:
            prepare_env(args)
    else:
        time.sleep(0.5)  # Just in case

    train(args)
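    # Note: train() here takes the whole argparse namespace; it is presumably a
    # train(args) defined in the same script, not one of the train(env_id, ...)
    # variants shown earlier in this listing.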
Example #9
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, evaluation, bind_to_core, **kwargs):
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        sys.exit(0)

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Write to temp directory for all non-master workers.
        actual_dir = None
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)
    
    # Create envs.
    if rank == 0:
        env = gym.make(env_id)
        if gym_monitor and logdir:
            env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'), force=True)
        env = SimpleMonitor(env)

        if evaluation:
            eval_env = gym.make(env_id)
            if gym_monitor and logdir:
                eval_env = gym.wrappers.Monitor(eval_env, os.path.join(logdir, 'gym_eval'), force=True)
            eval_env = SimpleMonitor(eval_env)
        else:
            eval_env = None
    else:
        env = gym.make(env_id)
        if evaluation:
            eval_env = gym.make(env_id)
        else:
            eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    if rank == 0:
        start_time = time.time()
    training.train(env=env, eval_env=eval_env, param_noise=param_noise,
        action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
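
# The noise_type argument parsed above is a comma-separated list of specs, e.g.
# (illustrative values) 'none', 'adaptive-param_0.2', 'normal_0.1' or 'ou_0.2';
# 'adaptive-param_0.2,ou_0.2' would enable both adaptive parameter noise and
# Ornstein-Uhlenbeck action noise with stddev 0.2.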
Example #10
def main(args):
    # print("\n\n\n\n\nXXX")
    # print(sys.path)
    # import baselines
    # print(baselines.__file__())
    # for varname in ['PMI_RANK', 'OMPI_COMM_WORLD_RANK']:
    #     if varname in os.environ:
    #         print(varname, int(os.environ[varname]))
    # print("parsing args...")

    arg_parser = init_arg_parser()
    args, unknown_args = arg_parser.parse_known_args(args)

    # if args.num_cpu > 1:
    if args.allow_run_as_root:
        whoami = mpi_fork_run_as_root(args.num_cpu,
                                      bind_to_core=args.bind_to_core)
    else:
        whoami = mpi_fork(args.num_cpu, bind_to_core=args.bind_to_core)
    if whoami == 'parent':
        print('parent exiting with code 0...')
        sys.exit(0)

    U.single_threaded_session().__enter__()

    rank = MPI.COMM_WORLD.Get_rank()

    # assert MPI.COMM_WORLD.Get_size() == args.num_cpu, MPI.COMM_WORLD.Get_size()

    # configure logger
    # rank = MPI.COMM_WORLD.Get_rank()  # FIXME: how to log when rank != 0??
    # if rank == 0:
    configure_logger(args.log_path, format_strs=[])
    logger.info(f"main: {rank} / {MPI.COMM_WORLD.Get_size()}")
    logger.info(f"logger dir: {logger.get_dir()}")

    extra_args = parse_cmdline_kwargs(unknown_args)
    logger.info(args, extra_args)

    # else:
    #     configure_logger(log_path=None)  # or still args.log_path?

    # raise RuntimeError(f"tf session: {tf.get_default_session()}, {MPI.COMM_WORLD.Get_rank()} / {MPI.COMM_WORLD.Get_size()}")

    def make_wrapped_env():
        env = gym.make(args.env)
        if args.env_type == 'maze':
            pass
        elif args.env_type == 'robotics':
            from baselines.envs.goal_sampler_env_wrapper import GoalSamplerEnvWrapper
            env = GoalSamplerEnvWrapper(env)
        elif args.env_type == 'ant':
            env = GoalExplorationEnv(env=env,
                                     only_feasible=True,
                                     extend_dist_rew=0,
                                     inner_weight=0,
                                     goal_weight=1)
        else:
            raise NotImplementedError(args.env_type)
        # FIXME: if resample space is feasible, can set only_feasible = False to avoid unnecessary computation
        return env

    venv_kwargs = dict(
        make_wrapped_env=make_wrapped_env,
        seed=args.seed,
        reward_scale=args.reward_scale,
        flatten_dict_observations=False,
        mpi_rank=rank,
        monitor_log_dir=args.log_path,  # FIXME
    )
    venv = make_vec_env(num_env=args.num_env, **venv_kwargs)
    eval_venv = make_vec_env(num_env=args.num_env, **venv_kwargs)
    if args.debug:
        plotter_venv = make_vec_env(num_env=1, **venv_kwargs)
    else:
        plotter_venv = None

    # Seed everything.
    rank_seed = args.seed + 1000000 * rank if args.seed is not None else None
    set_global_seeds(rank_seed)
    logger.info(f'setting global seed: {rank_seed}')

    # Prepare params.
    params = dict()
    params.update(config.DEFAULT_PARAMS)
    params.update(config.DEFAULT_ENV_PARAMS[args.env])
    params.update(**extra_args)  # makes it possible to override any parameter

    # if args.debug:
    #     params['n_cycles'] = 2
    #     params['n_batches'] = 2
    #     params['ve_n_batches'] = 2
    #     params['size_ensemble'] = 2

    # env settings
    params['env_name'] = args.env
    params['num_cpu'] = args.num_cpu
    params['rollout_batch_size'] = args.num_env
    params['timesteps_per_cpu'] = int(args.num_timesteps)

    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)

    params['make_env'] = make_wrapped_env

    learn_fun_return = learn(
        venv=venv,
        eval_venv=eval_venv,
        plotter_venv=plotter_venv,
        params=params,
        save_path=args.log_path,
        save_interval=args.save_interval,
    )

    if rank == 0:
        save_path = os.path.expanduser(logger.get_dir())
        for k, v in learn_fun_return.items():
            v.save(os.path.join(save_path, f"final-{k}.joblib"))

    venv.close()
    eval_venv.close()
    if plotter_venv is not None:
        plotter_venv.close()
Example #11
def learn(*,
          network,
          env,
          total_timesteps,
          num_cpu,
          allow_run_as_root,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()
    logger.info('before mpi_fork: rank', rank, 'num_cpu',
                MPI.COMM_WORLD.Get_size())

    if num_cpu > 1:
        if allow_run_as_root:
            whoami = mpi_fork_run_as_root(num_cpu)
        else:
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            logger.info('parent exiting with code 0...')
            sys.exit(0)

        U.single_threaded_session().__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    num_cpu = MPI.COMM_WORLD.Get_size()
    logger.info('after mpi_fork: rank', rank, 'num_cpu', num_cpu)

    override_params = override_params or {}

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.spec.id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    params['rollout_batch_size'] = env.num_envs
    params['num_cpu'] = num_cpu
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            'are looking to reproduce those results, be aware of this. Please also refer to '
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    rollout_worker = RolloutWorker(env,
                                   policy,
                                   dims,
                                   logger,
                                   monitor=True,
                                   **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size
    logger.info("actual total timesteps : {}".format(
        n_epochs * n_cycles * rollout_worker.T *
        rollout_worker.rollout_batch_size))
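    # Worked example with hypothetical numbers: for total_timesteps=1_000_000,
    # n_cycles=50, T=50 and rollout_batch_size=2, n_epochs = 1_000_000 // 50 // 50 // 2
    # = 200, and the "actual total timesteps" logged above is 200 * 50 * 50 * 2 = 1_000_000.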

    return train(save_path=save_path,
                 policy=policy,
                 rollout_worker=rollout_worker,
                 evaluator=evaluator,
                 n_epochs=n_epochs,
                 n_test_rollouts=params['n_test_rollouts'],
                 n_cycles=params['n_cycles'],
                 n_batches=params['n_batches'],
                 save_interval=save_interval,
                 demo_file=demo_file)
Example #12
def train(env_id, num_timesteps, timesteps_per_batch, seed, num_cpu, resume,
          agentName, logdir, hid_size, num_hid_layers, noisy_nets, clip_param,
          entcoeff, optim_epochs, optim_batchsize, optim_stepsize,
          optim_schedule, desired_kl, gamma, lam, portnum, num_parallel):
    from baselines.ppo1 import mlp_policy, pposgd_parallel
    print("num cpu = " + str(num_cpu))
    if (num_cpu > 1) and (num_parallel > 1):
        print("num_cpu > 1 and num_parallel > 1 can't be used together at the moment!")
        exit(0)

    whoami = mpi_fork(num_cpu)
    if whoami == "parent": return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()

    if rank != 0: logger.set_level(logger.DISABLED)
    utils.portnum = portnum + rank
    workerseed = seed + 10000 * rank
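
    # utils.server_list / utils.num_thread_list map MPI ranks onto servers. With a
    # hypothetical server_list="10.0.0.1,10.0.0.2" and num_thread_list="4,4"
    # (num_cpu=8), ranks 0-3 connect to the first server and ranks 4-7 to the
    # second, each on port portnum + rank.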

    if utils.server_list != "":
        servers = utils.server_list.split(",")
        num_thread = utils.num_thread_list.split(",")
        tmp = 0
        a = 0
        snum = -1
        num_total = 0
        for t in num_thread:
            num_total += int(t)

        for t in num_thread:
            if rank < tmp + int(t):
                snum = a
                break
            tmp += int(t)
            a += 1
        if num_total != num_cpu:
            print("Sum of num_thread_list must be equal to num_cpu")
            quit()
        print("Connect to tcp://" + servers[snum] + ":" + str(utils.portnum))
        utils.server_ip = servers[snum]

    set_global_seeds(workerseed)
    if num_parallel > 1:
        env = CustomParallelEnv(num_parallel)
    else:
        env = gym.make(env_id)
        env.seed(seed)

    if logger.get_dir():
        if num_parallel <= 1:
            env = bench.Monitor(env, osp.join(logger.get_dir(),
                                              "monitor.json"))

    def policy_fn(name, ob_space, ac_space, noisy_nets=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=hid_size,
                                    num_hid_layers=num_hid_layers,
                                    noisy_nets=noisy_nets)

    gym.logger.setLevel(logging.WARN)
    pposgd_parallel.learn(env,
                          policy_fn,
                          max_timesteps=num_timesteps,
                          timesteps_per_batch=timesteps_per_batch,
                          clip_param=clip_param,
                          entcoeff=entcoeff,
                          optim_epochs=optim_epochs,
                          optim_stepsize=optim_stepsize,
                          optim_batchsize=optim_batchsize,
                          schedule=optim_schedule,
                          desired_kl=desired_kl,
                          gamma=gamma,
                          lam=lam,
                          resume=resume,
                          noisy_nets=noisy_nets,
                          agentName=agentName,
                          logdir=logdir,
                          num_parallel=num_parallel,
                          num_cpu=num_cpu)
    if num_parallel <= 1:
        env.close()
Example #13
def learn(
    *,
    env_type,
    env,
    eval_env,
    plotter_env,
    total_timesteps,
    num_cpu,
    allow_run_as_root,
    bind_to_core,
    seed=None,
    save_interval=5,
    clip_return=True,
    override_params=None,
    load_path=None,
    save_path=None,
    policy_pkl=None,
):

    rank = MPI.COMM_WORLD.Get_rank()
    logger.info('before mpi_fork: rank', rank, 'num_cpu',
                MPI.COMM_WORLD.Get_size())

    if num_cpu > 1:
        if allow_run_as_root:
            whoami = mpi_fork_run_as_root(num_cpu, bind_to_core=bind_to_core)
        else:
            whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
        if whoami == 'parent':
            logger.info('parent exiting with code 0...')
            sys.exit(0)

        U.single_threaded_session().__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    num_cpu = MPI.COMM_WORLD.Get_size()
    logger.info('after mpi_fork: rank', rank, 'num_cpu', num_cpu)

    override_params = override_params or {}

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.spec.id
    params['env_name'] = env_name
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    params['rollout_batch_size'] = env.num_envs
    params['num_cpu'] = num_cpu
    params['env_type'] = env_type
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_ve_params(params)

    dims = config.configure_dims(params)
    policy, value_ensemble, sample_disagreement_goals_fun, sample_uniform_goals_fun = \
        config.configure_ve_ddpg(dims=dims, params=params, clip_return=clip_return, policy_pkl=policy_pkl)

    if policy_pkl is not None:
        env.set_sample_goals_fun(sample_dummy_goals_fun)
    else:
        env.envs_op("update_goal_sampler",
                    goal_sampler=sample_disagreement_goals_fun)
        eval_env.envs_op("update_goal_sampler",
                         goal_sampler=sample_uniform_goals_fun)
        if plotter_env is not None:
            plotter_env.envs_op("update_goal_sampler",
                                goal_sampler=sample_uniform_goals_fun)

    if load_path is not None:
        tf_util.load_variables(
            os.path.join(load_path, 'final_policy_params.joblib'))
        return play(env=env, policy=policy)

    rollout_params, eval_params, plotter_params = config.configure_rollout_worker_params(
        params)

    rollout_worker = RolloutWorker(env,
                                   policy,
                                   dims,
                                   logger,
                                   monitor=True,
                                   **rollout_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size
    params['n_epochs'] = n_epochs
    params['total_timesteps'] = n_epochs * n_cycles * rollout_worker.T * rollout_worker.rollout_batch_size

    config.log_params(params, logger=logger)

    if policy_pkl is not None:
        train_fun = train_ve
        evaluator = None
        plotter = None  # ensure plotter is defined for the train_fun call below
    else:
        train_fun = train
        # construct evaluator
        # assert eval_env.sample_goals_fun is None
        # eval_env.set_sample_goals_fun(sample_dummy_goals_fun)
        evaluator = RolloutWorker(eval_env, policy, dims, logger,
                                  **eval_params)
        if plotter_env is not None:
            raise NotImplementedError
            # from baselines.misc.html_report import HTMLReport
            # plotter_worker = RolloutWorker(plotter_env, policy, dims, logger, **plotter_params)
            # rank = MPI.COMM_WORLD.Get_rank()
            # report = HTMLReport(os.path.join(save_path, f'report-{rank}.html'), images_per_row=8)
            #
            # # report.add_header("{}".format(EXPERIMENT_TYPE))
            # # report.add_text(format_dict(v))
            # plotter = config.configure_plotter(policy, value_ensemble, plotter_worker, params, report)
        else:
            plotter = None

    return train_fun(save_path=save_path,
                     policy=policy,
                     value_ensemble=value_ensemble,
                     rollout_worker=rollout_worker,
                     evaluator=evaluator,
                     n_epochs=n_epochs,
                     n_test_rollouts=params['n_test_rollouts'],
                     n_cycles=params['n_cycles'],
                     n_batches=params['n_batches'],
                     ve_n_batches=params['ve_n_batches'],
                     save_interval=save_interval,
                     plotter=plotter)


# NOTE: only the final lines of this helper appear in the original snippet; the
# function header here is a guess inferred from the save_model(...) call in
# on_iteration_start below (the iteration argument is unused in the surviving lines).
def save_model(iteration):
    print('Saving model ' + args.model)
    saver = tf.train.Saver()
    saver.save(session, args.model)
    print('Saved model ' + args.model + ' at ' + str(time()))


def on_iteration_start(local_vars, global_vars):
    on_iteration_start.iteration += 1
    load_model(on_iteration_start.iteration)
    plot_history(local_vars['history'], on_iteration_start.iteration)
    save_model(on_iteration_start.iteration)


on_iteration_start.iteration = 0
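# on_iteration_start stores its call count as a function attribute, so the training
# loop can invoke it as a plain callback without passing any extra state around.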

whoami = mpi_fork(args.cores)
if whoami == 'parent':
    exit(0)

session = U.single_threaded_session()
session.__enter__()
logger.session().__enter__()

env = RunEnv(args.visualize, max_obstacles=args.obstacles, original_reward=args.original)
env.spec.timestep_limit = args.max_steps
if args.visualize:
    vis = env.osim_model.model.updVisualizer().updSimbodyVisualizer()
    vis.setBackgroundType(vis.GroundAndSky)
    vis.setShowFrameNumber(True)
    vis.zoomCameraToShowAllGeometry()
    vis.setCameraFieldOfView(1)