Example #1
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))
    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)
    # NOTE: build_env defaults to observation normalization (normalize_observations=True);
    # it is disabled here, and the build_env arguments may need to be adjusted
    # to match the train/eval setup later.
    env = build_env(args, normalize_ob=False)
    eval_env = build_env(args, normalize_ob=False, is_eval=True)
    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env,
            osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))
    iters = 0
    for model in learn(env=env,
                       env_id=env_id,
                       eval_env=eval_env,
                       make_eval_env=lambda: build_env(
                           args, normalize_ob=False, is_eval=True),
                       seed=seed,
                       total_timesteps=total_timesteps,
                       sil_update=args.sil_update,
                       sil_loss=args.sil_loss,
                       **alg_kwargs):
        if args.store_ckpt:
            save_path = osp.join(logger.get_dir(), 'model-{}'.format(iters))
            model.save(save_path)
            if isinstance(env, VecNormalize):
                rms_path = osp.join(logger.get_dir(), 'rms-{}'.format(iters))
                with open(rms_path, 'wb') as f:
                    rms = (env.ob_rms, env.ret_rms)
                    pickle.dump(rms, f)
            logger.log('Saved model {}'.format(iters))
        iters += 1

    return model, env
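Example #1 pickles the VecNormalize running statistics (ob_rms, ret_rms) next to each checkpoint, but no example here shows the restore side. Below is a minimal sketch of the counterpart, assuming the same build_env helper and that the algorithm's learn function accepts a load_path keyword as in stock Baselines; load_checkpoint and its arguments are hypothetical names introduced for illustration.

def load_checkpoint(args, alg_kwargs, iters):
    # Hypothetical helper (not from the source): rebuild the env, reload the
    # model weights saved by the training loop above, and restore the pickled
    # VecNormalize statistics so normalized observations match training.
    env = build_env(args, normalize_ob=False)
    learn = get_learn_function(args.alg)
    model = learn(env=env,
                  total_timesteps=0,
                  load_path=osp.join(logger.get_dir(), 'model-{}'.format(iters)),
                  **alg_kwargs)
    if isinstance(env, VecNormalize):
        with open(osp.join(logger.get_dir(), 'rms-{}'.format(iters)), 'rb') as f:
            env.ob_rms, env.ret_rms = pickle.load(f)
    return model, env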
Example #2
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))
    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False)
    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env,
            osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))
    model = learn(env=env,
                  seed=seed,
                  total_timesteps=total_timesteps,
                  sil_update=args.sil_update,
                  sil_loss=args.sil_loss,
                  **alg_kwargs)

    return model, env
Example #3
def main():
    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args()
    extra_args = parse_cmdline_kwargs(unknown_args)

    args.num_timesteps = 0
    args.play = True
    args.env = 'YamaXRealForwardWalk-v0'

    model, env = train(args, extra_args)
    env.close()

    env = build_env(args)
    obs = env.reset()

    def initialize_placeholders(nlstm=128, **kwargs):
        # Zero initial LSTM state (cell and hidden state, hence 2 * nlstm per
        # env) and a zero "done" mask, as expected by recurrent policies.
        return np.zeros((args.num_env or 1, 2 * nlstm)), np.zeros((1,))

    state, dones = initialize_placeholders(**extra_args)
    while True:
        actions, _, state, _ = model.step(obs, S=state, M=dones)
        obs, _, done, _ = env.step(actions)
        env.render()
        done = done.any() if isinstance(done, np.ndarray) else done

        if done:
            obs = env.reset()

    env.close()
Example #4
def train_copos(args):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(args.log_path)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(args.log_path, format_strs=[])

    workerseed = args.seed + 10000 * rank

    def policy_fn(name, ob_space, ac_space):
        return CompatibleMlpPolicy(name=name,
                                   ob_space=ob_space,
                                   ac_space=ac_space,
                                   hid_size=32,
                                   num_hid_layers=2)

    set_global_seeds(workerseed)
    env = build_env(args, normalize_ob=True)
    #env = gym.make(args.env)
    #env.seed(workerseed)

    timesteps_per_batch = 10000
    #timesteps_per_batch=2048
    beta = -1
    if beta < 0:
        nr_episodes = int(args.num_timesteps) // timesteps_per_batch
        # Automatically compute beta based on initial entropy and number of iterations
        tmp_pi = policy_fn("tmp_pi", env.observation_space, env.action_space)

        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1, ) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.ob: tmp_ob})
        beta = 2 * entropy / nr_episodes
        print("Initial entropy: " + str(entropy) + ", episodes: " +
              str(nr_episodes))
        print("Automatically set beta: " + str(beta))
    copos_mpi.learn(env,
                    policy_fn,
                    timesteps_per_batch=timesteps_per_batch,
                    epsilon=0.01,
                    beta=beta,
                    cg_iters=10,
                    cg_damping=0.1,
                    max_timesteps=int(args.num_timesteps),
                    gamma=0.99,
                    lam=0.98,
                    vf_iters=5,
                    vf_stepsize=1e-3)
    env.close()
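To make the automatic beta rule above concrete: with num_timesteps = 1e6 and timesteps_per_batch = 10000 there are nr_episodes = 100 batches, so an illustrative initial policy entropy of 5.0 (a made-up number, not taken from the example) gives beta = 2 * 5.0 / 100 = 0.1.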
Example #5
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))
    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)
    #workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    #set_global_seeds(workerseed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)
    
    env = build_env(args, normalize_ob=False, normalize_ret=False)
    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env,
            osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)
   
    #timesteps_per_batch=1024
    #timesteps_per_batch=2048
    beta = -1
    if beta < 0:
        #print(alg_kwargs)
        nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch']
        # Automatically compute beta based on initial entropy and number of iterations
        policy = build_policy(
            env,
            alg_kwargs['network'],
            value_network='copy',
            normalize_observations=alg_kwargs['normalize_observations'],
            copos=True)
        ob = observation_placeholder(env.observation_space)
        
        sess = U.single_threaded_session()
        sess.__enter__()
        with tf.variable_scope("tmp_pi"):
            tmp_pi = policy(observ_placeholder=ob)
        sess.run(tf.global_variables_initializer())
        
        tmp_ob = np.zeros((1,) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob})
        #beta = 2 * entropy / nr_episodes
        beta = 0
        print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes))
        print("Constantly set beta: " + str(beta))
    
    print('Training {} on {}:{} with arguments \n{}'.format(args.alg, env_type, env_id, alg_kwargs))
    model = learn(env=env, seed=seed, beta=beta,
                  total_timesteps=total_timesteps,
                  **alg_kwargs)
    return model, env
Example #6
def setUp(env, alg, load_path):
    args = Bunch({
        'env': env,
        'alg': alg,
        'num_timesteps': 0,
        'seed': None,
        'num_env': 1,
        'network': None
    })
    extra_args = {'load_path': load_path}

    model, env = train(args, extra_args)
    env.close()
    env = build_env(args, extra_args)

    return env, model
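A hedged usage sketch of the setUp helper above; the environment id, algorithm name, and checkpoint path are illustrative placeholders rather than values from the source, and the rollout assumes a non-recurrent policy:

env, model = setUp('Hopper-v2', 'ppo2', './checkpoints/model-100')
obs = env.reset()
done = False
while not done:
    # In Baselines, model.step returns (actions, values, states, neglogpacs).
    actions, _, _, _ = model.step(obs)
    obs, _, done, _ = env.step(actions)
    done = done.any() if isinstance(done, np.ndarray) else done
env.close()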
Example #7
def main():
    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args()
    args.num_env = 1
    extra_args = parse_cmdline_kwargs(unknown_args)

    model, env = train(args, extra_args)
    env.close()
    logger.log("Running trained model")
    env = build_env(args)
    if not args.play:
        ts = time.gmtime()
        directory = time.strftime("./render/%s", ts)
        logger.log("Output video to directory:", directory)
        env.envs = [gym.wrappers.Monitor(env.envs[0], directory=directory)]
    obs = env.reset()

    def initialize_placeholders(nlstm=128, **kwargs):
        return np.zeros((args.num_env, 2 * nlstm)), np.zeros((1))

    state, dones = initialize_placeholders(**extra_args)
    NUM_VIDEO = 1
    while True:
        actions, _, state, _ = model.step(obs, S=state, M=dones)
        obs, _, done, _ = env.step(actions)
        if args.play:
            env.render()
        done = done.any() if isinstance(done, np.ndarray) else done

        if done:
            NUM_VIDEO -= 1
            if NUM_VIDEO <= 0:
                break
            obs = env.reset()

    env.close()
Example #8
    'save_interval': 20,
    'log_interval': 1,
    'save_path': save_path,
    'model_load_path': model_load_path,
    'seed': 0,
    'reward_scale': 1,
    'flatten_dict_observations': True,
    'transfer_weights': False
}
args = SimpleNamespace(**args_dict)

# Prepare the environment and learning algorithm
env_type, env_id = get_env_type(args.env)
learn = get_learn_function(args.alg)
alg_kwargs = get_learn_function_defaults(args.alg, env_type)
env = build_env(args)
alg_kwargs['network'] = args.network

# The path where we will store the results of this experiment
full_path = args.save_path + '/' + args.env + '-' + args.alg

# Make the folders where we will store the checkpoints, models, and epoch results
if not os.path.exists(full_path):
    os.makedirs(full_path)
    os.makedirs(full_path + '/checkpoints')

print("About to start learning model")

model = learn(env=env,
              seed=args.seed,
              total_timesteps=args.total_timesteps,
Example #9
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))
    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False)
    eval_env = build_env(args, normalize_ob=False, is_eval=True)
    if args.save_video_interval != 0:
        env = VecVideoRecorder(
            env,
            osp.join(logger.get_dir(), "videos"),
            record_video_trigger=lambda x: x % args.save_video_interval == 0,
            video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)
    beta = -1
    if beta < 0:
        #print(alg_kwargs)
        nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch']
        # Automatically compute beta based on initial entropy and number of iterations
        policy = build_policy(
            env,
            alg_kwargs['network'],
            value_network='copy',
            normalize_observations=alg_kwargs['normalize_observations'],
            copos=True)
        ob = observation_placeholder(env.observation_space)

        sess = U.single_threaded_session()
        sess.__enter__()
        with tf.variable_scope("tmp_pi"):
            tmp_pi = policy(observ_placeholder=ob)
        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1, ) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob})
        #beta = 2 * entropy / nr_episodes
        beta = 0
        print("Initial entropy: " + str(entropy) + ", episodes: " +
              str(nr_episodes))
        print("Constantly set beta: " + str(beta))

    print('Training {} on {}:{} with arguments \n{}'.format(
        args.alg, env_type, env_id, alg_kwargs))
    iters = 0
    for model in learn(env=env,
                       env_id=env_id,
                       eval_env=eval_env,
                       make_eval_env=lambda: build_env(
                           args, normalize_ob=False, is_eval=True),
                       seed=seed,
                       beta=beta,
                       total_timesteps=total_timesteps,
                       **alg_kwargs):
        if args.store_ckpt:
            save_path = osp.join(logger.get_dir(), 'model-{}'.format(iters))
            model.save(save_path)
            if isinstance(env, VecNormalize):
                rms_path = osp.join(logger.get_dir(), 'rms-{}'.format(iters))
                with open(rms_path, 'wb') as f:
                    rms = (env.ob_rms, env.ret_rms)
                    pickle.dump(rms, f)
            logger.log('Saved model {}'.format(iters))
        iters += 1

    return model, env
Example #10
    'noptepochs': 10,
    'save_interval': 20,
    'log_interval': 1,
    'save_path': save_path,
    'model_load_path': model_load_path,
    'seed': 0,
    'reward_scale': 1,
    'flatten_dict_observations': True
}
second_env_args = SimpleNamespace(**second_env_args_dict)

# Prepare the environment and learning algorithm
env_type, env_id = get_env_type(args.env)
learn = get_learn_function(args.alg)
alg_kwargs = get_learn_function_defaults(args.alg, env_type)
env = build_env(args)

# Prepare the second environment if needed
second_env = build_env(second_env_args)

alg_kwargs['network'] = args.network

# The path where we will store the results of this experiment
full_path = args.save_path + '/' + args.env + '-' + args.alg

# Make the folders where we will store the checkpoints, models, and epoch results
if not os.path.exists(full_path):
    os.makedirs(full_path)
    os.makedirs(full_path + '/checkpoints')

print("About to start learning model")