Example #1
def train(args, extra_args):
    env_type, env_id = run.get_env_type(args.env)

    if args.alg == 'gail':
        env_type += '_gail'
        args.alg = 'bgail'
    elif args.alg not in ['bgail', 'gail']:
        raise NotImplementedError

    learn = run.get_learn_function(args.alg)
    alg_kwargs = run.get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args)
    logger.configure(os.path.join("log", "GAIL", args.env, "subsample_{}".format(extra_args["data_subsample_freq"]),
                                  "traj_{}".format(extra_args["num_expert_trajs"]), "batch_size_{}".format(extra_args["timesteps_per_batch"]),
                                  "seed_{}".format(args.seed)))

    print('Training {} on {}:{} with arguments \n{}'.format(args.alg, env_type, env_id, alg_kwargs))

    model = learn(env=env,
                  seed=args.seed,
                  save_path=args.save_path,
                  load_path=args.load_path,
                  render=args.render,
                  **alg_kwargs)
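A minimal sketch of how this train() might be invoked; the field names mirror what the function and logger.configure read above, while the concrete values (environment id, trajectory counts) are illustrative assumptions.

from types import SimpleNamespace

args = SimpleNamespace(env='Hopper-v2', alg='gail', seed=0,
                       save_path='checkpoints/bgail', load_path=None,
                       render=False)
extra_args = {'data_subsample_freq': 20, 'num_expert_trajs': 4,
              'timesteps_per_batch': 1000}
train(args, extra_args)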
Example #2
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))
    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False)
    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env,
            osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    elif alg_kwargs.get('network') is None:
        alg_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))
    model = learn(env=env,
                  seed=seed,
                  total_timesteps=total_timesteps,
                  sil_update=args.sil_update,
                  sil_loss=args.sil_loss,
                  **alg_kwargs)

    return model, env
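When neither --network nor the algorithm defaults specify a network, the code falls back to get_default_network. A minimal sketch of that fallback, consistent with how baselines chooses defaults ('cnn' for image-based env types, 'mlp' otherwise):

def get_default_network(env_type):
    # Image-based environments default to a convolutional network,
    # everything else to a multilayer perceptron.
    if env_type in {'atari', 'retro'}:
        return 'cnn'
    return 'mlp'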
Example #3
def main():

    args = parse_args()

    format_strs = ['log', 'csv', 'stdout']

    if args.tensorboard:
        format_strs.append('tensorboard')

    config = parse_config(args.config)

    outdir = os.path.join(args.outdir,
                          os.path.splitext(os.path.basename(args.config))[0])
    logger.configure(dir=outdir, format_strs=format_strs)

    env_type, env_id = get_env_type(GAME_ENVIRONMENT)
    env = make_vec_env(env_id, env_type, 1, args.seed)

    model = trpo_mpi.learn(env=env,
                           network=NETWORK_ARCHITECTURE,
                           total_timesteps=args.total_timesteps,
                           **config)

    env.close()

    if args.save:
        model.save(os.path.join(outdir, 'model'))
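Here parse_config returns a dict of keyword overrides that is splatted into trpo_mpi.learn. A sketch of what such a config could contain, using hyperparameter names from baselines' trpo_mpi with illustrative values:

config = {
    'timesteps_per_batch': 1024,  # rollout length per update
    'max_kl': 0.01,               # trust-region size
    'cg_iters': 10,               # conjugate-gradient iterations
    'gamma': 0.99,                # discount factor
    'lam': 0.98,                  # GAE lambda
}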
Example #4
def build_env(args):
    env_type, env_id = run.get_env_type(args.env)
    if env_type in ['mujoco', 'classic_control']:
        env = gym.make(env_id)
        env.seed(args.seed)
    else:
        raise NotImplementedError

    return env
Example #5
def build_env(env_name, num_env=1, seed=None):
    env_type, env_id = get_env_type(env_name)

    env_ = make_vec_env(env_id, env_type, num_env, seed)

    if env_type == 'mujoco':
        env_ = VecNormalize(env_)

    return env_
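VecNormalize maintains running estimates of the observation mean/variance and of the return scale, which is why the continuous-control (mujoco) environments are wrapped while other env types are returned as-is. A minimal usage sketch; the environment id is illustrative:

env = build_env('Hopper-v2', num_env=4, seed=0)
obs = env.reset()  # observations are normalized by the running statistics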
Example #6
    def __init__(self, agent):
        self.agent = agent
        env_type, env_id = get_env_type(self.agent.env_name)
        self.nenv = self.agent.nenv
        self.env = make_vec_env(env_id,
                                env_type,
                                self.nenv,
                                self.agent.seed,
                                reward_scale=1)
        self.reset()
Example #7
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))
    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)
    # By default build_env would also set normalize_observations=True; it is
    # disabled here. The build_env arguments may need adjusting to match the
    # train/eval setup.
    env = build_env(args, normalize_ob=False)
    eval_env = build_env(args, normalize_ob=False, is_eval=True)
    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env,
            osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    elif alg_kwargs.get('network') is None:
        alg_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))
    iters = 0
    for model in learn(env=env,
                       env_id=env_id,
                       eval_env=eval_env,
                       make_eval_env=lambda: build_env(
                           args, normalize_ob=False, is_eval=True),
                       seed=seed,
                       total_timesteps=total_timesteps,
                       sil_update=args.sil_update,
                       sil_loss=args.sil_loss,
                       **alg_kwargs):
        if args.store_ckpt:
            save_path = osp.join(logger.get_dir(), 'model-{}'.format(iters))
            model.save(save_path)
            if isinstance(env, VecNormalize):
                rms_path = osp.join(logger.get_dir(), 'rms-{}'.format(iters))
                with open(rms_path, 'wb') as f:
                    rms = (env.ob_rms, env.ret_rms)
                    pickle.dump(rms, f)
            logger.log('Saved model {}'.format(iters))
        iters += 1

    return model, env
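The loop above pickles the VecNormalize running statistics alongside each checkpoint. A sketch of how they could be re-attached when loading, assuming the same (ob_rms, ret_rms) tuple layout used at save time; load_rms is a hypothetical helper:

import pickle

def load_rms(env, rms_path):
    # Restore the running observation/return statistics saved above
    # onto a freshly built VecNormalize environment.
    with open(rms_path, 'rb') as f:
        env.ob_rms, env.ret_rms = pickle.load(f)
    return env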
Example #8
File: run.py Project: zwc662/BGAIL
def train(args, extra_args):
    env_type, env_id = run.get_env_type(args.env)

    if args.alg == 'gail':
        env_type += '_gail'
        args.alg = 'bgail'
    elif args.alg not in ['bgail', 'gail']:
        raise NotImplementedError

    learn = run.get_learn_function(args.alg)
    alg_kwargs = run.get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))

    model = learn(env=env,
                  seed=args.seed,
                  save_path=args.save_path,
                  load_path=args.load_path,
                  render=args.render,
                  **alg_kwargs)
Example #9
args_dict = {  # earlier entries (e.g. 'env', 'alg', 'network') are truncated in the source
    'num_env': 1,
    'nsteps': 2048,
    'noptepochs': 10,
    'save_interval': 20,
    'log_interval': 1,
    'save_path': save_path,
    'model_load_path': model_load_path,
    'seed': 0,
    'reward_scale': 1,
    'flatten_dict_observations': True,
    'transfer_weights': False
}
args = SimpleNamespace(**args_dict)

# Prepare the environment and learning algorithm
env_type, env_id = get_env_type(args.env)
learn = get_learn_function(args.alg)
alg_kwargs = get_learn_function_defaults(args.alg, env_type)
env = build_env(args)
alg_kwargs['network'] = args.network

# The path where we will store the results of this experiment
full_path = os.path.join(args.save_path, '{}-{}'.format(args.env, args.alg))

# Make the folders where we will store the checkpoints, models and epoch results
if not os.path.exists(full_path):
    os.makedirs(full_path)
    os.makedirs(os.path.join(full_path, 'checkpoints'))

print("About to start learning model")
Example #10
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))
    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False)
    eval_env = build_env(args, normalize_ob=False, is_eval=True)
    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env,
            osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    elif alg_kwargs.get('network') is None:
        alg_kwargs['network'] = get_default_network(env_type)
    # Compute beta from the policy's initial entropy and the number of
    # iterations; the entropy-based formula is disabled below and beta is
    # fixed to 0.
    beta = -1
    if beta < 0:
        nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch']
        policy = build_policy(
            env,
            alg_kwargs['network'],
            value_network='copy',
            normalize_observations=alg_kwargs['normalize_observations'],
            copos=True)
        ob = observation_placeholder(env.observation_space)

        sess = U.single_threaded_session()
        sess.__enter__()
        with tf.variable_scope("tmp_pi"):
            tmp_pi = policy(observ_placeholder=ob)
        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1, ) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob})
        # beta = 2 * entropy / nr_episodes  # entropy-based schedule (disabled)
        beta = 0
        print("Initial entropy: " + str(entropy) + ", episodes: " +
              str(nr_episodes))
        print("Constantly set beta: " + str(beta))

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))
    iters = 0
    for model in learn(env=env,
                       env_id=env_id,
                       eval_env=eval_env,
                       make_eval_env=lambda: build_env(
                           args, normalize_ob=False, is_eval=True),
                       seed=seed,
                       beta=beta,
                       total_timesteps=total_timesteps,
                       **alg_kwargs):
        if args.store_ckpt:
            save_path = osp.join(logger.get_dir(), 'model-{}'.format(iters))
            model.save(save_path)
            if isinstance(env, VecNormalize):
                rms_path = osp.join(logger.get_dir(), 'rms-{}'.format(iters))
                with open(rms_path, 'wb') as f:
                    rms = (env.ob_rms, env.ret_rms)
                    pickle.dump(rms, f)
            logger.log('Saved model {}'.format(iters))
        iters += 1

    return model, env
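For reference, the arithmetic of the disabled entropy-based schedule, with illustrative numbers:

total_timesteps = 1_000_000
timesteps_per_batch = 1024
nr_episodes = total_timesteps // timesteps_per_batch  # 976
initial_entropy = 5.0                                 # example value
beta = 2 * initial_entropy / nr_episodes              # ~0.0102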
Example #11
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))
    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)
    # For MPI runs, a per-worker seed could be derived instead:
    # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    # set_global_seeds(workerseed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False, normalize_ret=False)
    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env,
            osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    elif alg_kwargs.get('network') is None:
        alg_kwargs['network'] = get_default_network(env_type)

    # Compute beta from the policy's initial entropy and the number of
    # iterations; the entropy-based formula is disabled below and beta is
    # fixed to 0.
    beta = -1
    if beta < 0:
        nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch']
        policy = build_policy(
            env,
            alg_kwargs['network'],
            value_network='copy',
            normalize_observations=alg_kwargs['normalize_observations'],
            copos=True)
        ob = observation_placeholder(env.observation_space)

        sess = U.single_threaded_session()
        sess.__enter__()
        with tf.variable_scope("tmp_pi"):
            tmp_pi = policy(observ_placeholder=ob)
        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1, ) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob})
        # beta = 2 * entropy / nr_episodes  # entropy-based schedule (disabled)
        beta = 0
        print("Initial entropy: " + str(entropy) + ", episodes: " +
              str(nr_episodes))
        print("Constantly set beta: " + str(beta))

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))
    model = learn(env=env,
                  seed=seed,
                  beta=beta,
                  total_timesteps=total_timesteps,
                  sil_update=args.sil_update,
                  sil_loss=args.sil_loss,
                  **alg_kwargs)
    return model, env
Example #12
def make_env(env_name, nenv=1, seed=132, debug=True):
    if debug:
        return GymEnvWrapperDebug(env_name)
    env_type, env_id = get_env_type(env_name)
    env = make_vec_env(env_id, env_type, nenv, seed, reward_scale=1)
    return env
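Note that the debug path returns a single GymEnvWrapperDebug and ignores nenv and seed. A usage sketch; the environment name is illustrative:

vec_env = make_env('Hopper-v2', nenv=4, seed=0, debug=False)  # vectorized env
dbg_env = make_env('Hopper-v2')  # debug=True by default: single wrapped env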