def main(gpu, envid, seed, lam_fraction, final_lam, path):
    with tf.device('/device:GPU:%s' % gpu):

        logger.configure(dir=dirs + '%s/%s/' % (path, seed))

        env = gym.make(envid)
        env = bench.Monitor(env, logger.get_dir())
        env = DummyVecEnv([lambda: env])

        kwargs = dict(network='mlp',
                      env=env,
                      total_timesteps=1000000,
                      seed=seed,
                      nsteps=2048,
                      nminibatches=32,
                      lam=0.1,
                      lam_fraction=lam_fraction,
                      final_lam=final_lam,
                      gamma=0.99,
                      noptepochs=10,
                      log_interval=1,
                      ent_coef=0.0,
                      lr=lambda f: 3e-4 * f,
                      cliprange=0.2,
                      value_network='copy')

        f = open(dirs + '%s/%s/params.txt' % (path, seed), 'w')
        f.write(str(kwargs))
        f.close()

        model = ppo2.learn(**kwargs)
        model.save(dirs + '%s/%s/model.pkl' % (path, seed))

        env.close()
Example No. 2
def make_vec_envs(env_name,
                  seed,
                  num_processes,
                  num_frame_stack=1,
                  downsample=True,
                  color=False,
                  gamma=0.99,
                  log_dir='./tmp/',
                  device=torch.device('cpu')):
    Path(log_dir).mkdir(parents=True, exist_ok=True)
    envs = [
        make_env(env_name, seed, i, log_dir, downsample, color)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if num_frame_stack > 1:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)

    return envs
Example No. 3
def ppo2(task, config, eval=False):
    assert not config["act_discrete"]
    dummy_env = DummyVecEnv([lambda: task])
    learn_args = {
        "env": dummy_env,
        "network": config["model_network"],
        "total_timesteps": config["train_steps"],
        "nsteps": config["train_steps_update"],
        "nminibatches": config["train_minibatches_update"],
        "noptepochs": config["train_epoch_update"],
        "vf_coef": config["train_value_fn_coeff"],
        "ent_coef": config["train_entropy_coeff"],
        "gamma": config["train_gamma"],
        "lam": config["train_lambda"],
        "lr": float(config["train_learning_rate"]),
        "save_interval": config["ckp_model_save_interval"],
        # Network parameters.
        "nactions": task.action_space.shape[0]
    }
    if eval:
        learn_args["total_timesteps"] = config["eval_steps"]
        learn_args["nsteps"] = config["eval_steps"]
        learn_args["nminibatches"] = config["eval_steps"]
        learn_args["lr"] = 0.0
        models_path = config["working_dir"] + "/checkpoints/"
        models_path = models_path + str(
            max([f for f in os.listdir(models_path)]))
        learn_args["load_path"] = models_path
    return ppo2_learn(**learn_args)
Example No. 4
    def make_vec_env(env_id,
                     env_type,
                     num_env,
                     seed,
                     wrapper_kwargs=None,
                     start_index=0,
                     reward_scale=1.0,
                     flatten_dict_observations=True,
                     gamestate=None):
        """
        Create a wrapped, monitored vectorized environment (a DummyVecEnv here) for Atari and MuJoCo.
        """
        wrapper_kwargs = wrapper_kwargs or {}
        mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
        seed = seed + 10000 * mpi_rank if seed is not None else None
        logger_dir = logger.get_dir()

        def make_thunk(rank):
            return lambda: make_env(env_id=env_id,
                                    env_type=env_type,
                                    mpi_rank=mpi_rank,
                                    subrank=rank,
                                    seed=seed,
                                    reward_scale=reward_scale,
                                    gamestate=gamestate,
                                    flatten_dict_observations=flatten_dict_observations,
                                    wrapper_kwargs=wrapper_kwargs,
                                    logger_dir=logger_dir)

        set_global_seeds(seed)
        return DummyVecEnv(
            [make_thunk(i + start_index) for i in range(num_env)])
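
A minimal usage sketch for the make_vec_env helper above; the environment id, env_type and argument values are illustrative, and it assumes the helper is reachable at module level with baselines' make_env importable:

venv = make_vec_env(env_id='CartPole-v1',        # illustrative environment
                    env_type='classic_control',  # assumed env_type for CartPole
                    num_env=4,
                    seed=0)
obs = venv.reset()                                # batched observations, shape (num_env, ...)
obs, rews, dones, infos = venv.step([venv.action_space.sample() for _ in range(4)])
venv.close()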
Example No. 5
def deepq(task, config, eval=False):
    assert config["act_discrete"]
    dummy_env = DummyVecEnv([lambda: task])
    learn_args = {
        "env": dummy_env,
        "network": config["model_network"],
        "total_timesteps": config["train_steps"],
        "gamma": config["train_gamma"],
        "buffer_size": config["train_buffer_size"],
        "exploration_fraction": config["train_exploration_fraction"],
        "exploration_final_eps": config["train_exploration_final_eps"],
        "prioritized_replay": config["train_prioritized_replay"],
        "lr": float(config["train_learning_rate"]),
        "print_freq": config["ckp_model_save_interval"],
        "checkpoint_freq": config["ckp_model_save_interval"],
        "checkpoint_path": config["working_dir"],
        "learning_starts": -1,
        # Network parameters.
        "nactions": 1
    }
    if eval:
        learn_args["total_timesteps"] = config["eval_steps"]
        learn_args["learning_starts"] = config["eval_steps"]
        learn_args["exploration_fraction"] = 1.0 / config["eval_steps"]
        learn_args["exploration_final_eps"] = 0.0
        learn_args["lr"] = 0.0
        models_path = config["working_dir"] + "/checkpoints/model"
        learn_args["load_path"] = models_path
    return deepq_learn(**learn_args)
Example No. 6
def make_vec_envs(env_name,
                  seed,
                  num_processes,
                  num_frame_stack=1,
                  downsample=True,
                  color=False,
                  gamma=0.99,
                  log_dir='./tmp/',
                  device=torch.device('cpu'),
                  use_extended_wrapper=False,
                  train_mode="train_encoder"):
    try:
        Path(log_dir).mkdir(parents=True, exist_ok=True)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
        pass
    envs = [
        make_env(env_name,
                 seed,
                 i,
                 log_dir,
                 downsample,
                 color,
                 frame_stack=num_frame_stack,
                 use_extended_wrapper=use_extended_wrapper,
                 train_mode=train_mode) for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    # if num_frame_stack > 1:
    #     envs = VecPyTorchFrameStack(envs, num_frame_stack, device)

    return envs
Example No. 7
def make_envs(env_id, device, seed=0, num_envs=1, frame_stack=1, **kwargs):
    envs = [
        env_generator(env_id, seed=seed + 1000 * i, **kwargs)
        for i in range(num_envs)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    envs = VecPyTorch(envs, device)

    if frame_stack > 1:
        envs = VecPyTorchFrameStack(envs, frame_stack, device)

    return envs
Example No. 8
def main():
    steering_angles = np.array([-0.7, -0.5, -0.25, 0.0, 0.25, 0.5, 0.7])
    env = AirSimGym(continuous=False, off_road_dist=2.9, max_speed=4.5, scale_reward=True, steering_angles=steering_angles)
    env = DummyVecEnv([lambda: env])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    model = DDPG(MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=40000)

    # Save, delete and reload the model to demonstrate saving and loading
    # (the save path "ddpg_airsim" is illustrative).
    model.save("ddpg_airsim")
    del model
    model = DDPG.load("ddpg_airsim")

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
Example No. 9
def run_baselines(env, seed, log_dir):
    """Create baselines model and training.

    Replace the ppo and its training with the algorithm you want to run.

    Args:
        env (gym.Env): Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: The log file path.

    """
    seed = seed + 1000000
    set_global_seeds(seed)
    env.seed(seed)

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('seed={}, logdir={}'.format(
        seed, baselines_logger.get_dir()))

    env = DummyVecEnv([
        lambda: bench.Monitor(
            env, baselines_logger.get_dir(), allow_early_resets=True)
    ])

    ddpg.learn(network='mlp',
               env=env,
               nb_epochs=params['n_epochs'],
               nb_epoch_cycles=params['steps_per_epoch'],
               normalize_observations=False,
               critic_l2_reg=0,
               actor_lr=params['policy_lr'],
               critic_lr=params['qf_lr'],
               gamma=params['discount'],
               nb_train_steps=params['n_train_steps'],
               nb_rollout_steps=params['n_rollout_steps'],
               nb_eval_steps=100)

    return osp.join(log_dir, 'progress.csv')
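
A minimal sketch of how run_baselines might be called; the environment id, seed and log directory are illustrative, and params is assumed to be defined elsewhere in the experiment script, as in the function body above:

import gym

env = gym.make('HalfCheetah-v2')                 # any continuous-control task
progress_csv = run_baselines(env, seed=1, log_dir='data/local/ddpg_trial_1')
print('Training progress written to', progress_csv)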
Example No. 10
def main():
    vecEnv = DummyVecEnv([makeEnv])
    env = vecEnv.envs[0]
    model = ppo2.learn(
        network=models.mlp(num_hidden=20, num_layers=1),
        env=vecEnv,
        total_timesteps=1000000
    )

    # play model using shorter change rate
    env.change_rate = 100

    play(env, model, 1000)

    # play sine curve
    env.change_rate = 1
    env.scaling_env_options['input'] = INPUTS['SINE_CURVE']

    play(env, model, 1000)

    env.close()
Example No. 11
def make_rl_envs(env_id, seed, n_envs, device, frame_stack=4, add_video=False, add_frames=False, vid_path=None, **kwargs):
    envs = [env_generator(env_id, seed=seed+1000*i) for i in range(n_envs)]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if add_video:
        assert vid_path is not None
        envs = VecConcatVideo(envs, vid_path, ordered=True)
    elif add_frames:
        assert vid_path is not None
        envs = VecConcatVideo(envs, vid_path, ordered=False)

    envs = VecPyTorch(envs, device)

    if frame_stack > 1:
        envs = VecPyTorchFrameStack(envs, frame_stack, device)

    return envs
Example No. 12
def get_envs(factory, dummy=False):
    # assign an environment for each core
    num_envs = len(os.sched_getaffinity(0))
    # initialize (1) singular environment for metadata fetching and (2) vector of environments
    env = factory.make_env()
    env.seed(1)

    def make_env():
        def _thunk():
            env = factory.make_env()
            env.seed(1)

            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    if dummy:
        envs = DummyVecEnv(envs)
    else:
        envs = ShmemVecEnv(envs)
    return env, envs
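
A usage sketch for get_envs, assuming factory is any object exposing a make_env() method as used above:

env, venv = get_envs(factory, dummy=False)
print(env.observation_space, env.action_space)   # metadata from the single env
obs = venv.reset()                               # batched reset across all workers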
Example No. 13
def make_env(device, camera, policies, pose_estimator):
    env_fn = [make_env_fn()]
    vec_env = DummyVecEnv(env_fn)

    base_env = vec_env.envs[0]
    low = base_env.normalize_low
    high = base_env.normalize_high
    state_to_est = base_env.state_to_estimate

    vec_env = wrap_initial_policies(vec_env, device, policies)

    vec_env = RealImageObsVecEnvWrapper(vec_env, (128, 128), camera)

    vec_env = VecPyTorch(vec_env, device)

    vec_env = PoseEstimatorVecEnvWrapper(vec_env,
                                         device,
                                         pose_estimator,
                                         state_to_est,
                                         low,
                                         high,
                                         abs_to_rel=True)

    return vec_env
Example No. 14
def test_from_baselines_env(env_id):
    env_fn = lambda: gym.make(env_id)
    e = gym3.FromBaselinesVecEnv(DummyVecEnv([env_fn]))
    gym3_rollout(e)
Example No. 15
def train_save_baseline():
    env = get_env()
    model = learn('cnn_small',
                  DummyVecEnv([lambda: env]),
                  total_timesteps=int(80e6))
    model.save(save_path=cache_path)
Example No. 16
def load_baseline():
    env = get_env()
    model = learn('cnn_small', DummyVecEnv([lambda: env]), total_timesteps=0)
    model.load(load_path=cache_path)
    env.close()
    return model
Example No. 17
    return model


# Avoid a division error when calculating the mean (in our case, if epinfo is empty we return np.nan instead of raising an error)
def safemean(xs):
    return np.nan if len(xs) == 0 else np.mean(xs)
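
As a quick illustration of the helper above (the inputs are arbitrary; np.mean on an empty array would return nan with a RuntimeWarning, so the explicit length check keeps the intent clear and silent):

safemean([])                 # -> nan, no empty-slice warning
safemean([1.0, 2.0, 3.0])    # -> 2.0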


#### RANDOM SEED
print('RANDOM SEED: 666')
env = gym.make("metacar-level3-continuous-v0")
env = TerminateWrapper(env)
env = ClipRewardsWrapper(env)
env = WorldModelWrapper(env)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, nstack=8)

start = time.time()
model_666 = learn(network='world_model_mlp',
                  env=env,
                  total_timesteps=1000,
                  seed=666,
                  nsteps=16,
                  save_model_path='')
print('Time elapsed {}'.format(time.time() - start))

model_666.train_model.value_network.save_weights(
    'models/world_model_value_net_666.h5')
model_666.train_model.policy_network.save_weights(
    'models/world_model_policy_net_666.h5')
Example No. 18
    DummyVecEnv,
)

import Maze

LOG_DIR = './maze_train_folder'
# Goes much faster without GPU for some reason...
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"


def env_fn():
    return gym.make('maze-v0', start_level=0, num_levels=50)


envs = [env_fn for x in range(64)]
venv = DummyVecEnv(envs)

config = tf.ConfigProto()
sess = tf.Session(config=config)
sess.__enter__()

conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

final_model = ppo2.learn(
    env=venv,
    network=conv_fn,
    total_timesteps=0,
    mpi_rank_weight=0,
    update_fn=None,
    init_fn=None,
)
Example No. 19
ppo_epochs = 3
clip_range = .2
use_vf_clipping = True

format_strs = ['csv', 'stdout']
logger.configure(dir=LOG_DIR, format_strs=format_strs)

logger.info("creating environment")


def env_fn():
    return gym.make('maze-v0', start_level=0, num_levels=50)


envs = [env_fn for x in range(64)]
venv = DummyVecEnv(envs)


def eval_env_fn():
    return gym.make('maze-v0', start_level=0, num_levels=0)


eval_envs = [eval_env_fn for x in range(64)]
eval_venv = DummyVecEnv(eval_envs)

venv = VecMonitor(
    venv=venv,
    filename=None,
    keep_buf=100,
)
Example No. 20
ppo_epochs = 3
clip_range = .2
use_vf_clipping = True

format_strs = ['csv', 'stdout']
logger.configure(dir=LOG_DIR, format_strs=format_strs)

logger.info("creating environment")


def env_fn():
    return gym.make('snake-v0')


envs = [env_fn for x in range(64)]
venv = DummyVecEnv(envs)

venv = VecMonitor(
    venv=venv,
    filename=None,
    keep_buf=100,
)

logger.info("creating tf session")
config = tf.ConfigProto()
sess = tf.Session(config=config)
sess.__enter__()

conv_fn = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)

logger.info("training")
Example No. 21
    def __init__(self, env_fns):
        DummyVecEnv.__init__(self, env_fns)
        self.init_storage()
Example No. 22
def train(*, env_id, num_env, hps, num_timesteps, seed, use_reward, ep_path,
          dmlab):
    venv = VecFrameStack(
        DummyVecEnv([
            lambda: CollectGymDataset(
                Minecraft('MineRLTreechop-v0', 'treechop'),
                ep_path,
                atari=False)
        ]), hps.pop('frame_stack'))
    venv.score_multiple = 1
    venv.record_obs = True if env_id == 'SolarisNoFrameskip-v4' else False
    ob_space = venv.observation_space
    ac_space = venv.action_space
    gamma = hps.pop('gamma')
    policy = {'rnn': CnnGruPolicy, 'cnn': CnnPolicy}[hps.pop('policy')]
    print('Running train ==========================================')
    agent = PpoAgent(
        scope='ppo',
        ob_space=ob_space,
        ac_space=ac_space,
        stochpol_fn=functools.partial(
            policy,
            scope='pol',
            ob_space=ob_space,
            ac_space=ac_space,
            update_ob_stats_independently_per_gpu=hps.pop(
                'update_ob_stats_independently_per_gpu'),
            proportion_of_exp_used_for_predictor_update=hps.pop(
                'proportion_of_exp_used_for_predictor_update'),
            dynamics_bonus=hps.pop("dynamics_bonus")),
        gamma=gamma,
        gamma_ext=hps.pop('gamma_ext'),
        lam=hps.pop('lam'),
        nepochs=hps.pop('nepochs'),
        nminibatches=hps.pop('nminibatches'),
        lr=hps.pop('lr'),
        cliprange=0.1,
        nsteps=128,
        ent_coef=0.001,
        max_grad_norm=hps.pop('max_grad_norm'),
        use_news=hps.pop("use_news"),
        comm=MPI.COMM_WORLD if MPI.COMM_WORLD.Get_size() > 1 else None,
        update_ob_stats_every_step=hps.pop('update_ob_stats_every_step'),
        int_coeff=hps.pop('int_coeff'),
        ext_coeff=hps.pop('ext_coeff'))
    agent.start_interaction([venv])
    if hps.pop('update_ob_stats_from_random_agent'):
        agent.collect_random_statistics(num_timesteps=128 * 50)
    assert len(hps) == 0, "Unused hyperparameters: %s" % list(hps.keys())

    counter = 0
    while True:
        info = agent.step()
        if info['update']:
            logger.logkvs(info['update'])
            logger.dumpkvs()
            counter += 1
        if agent.I.stats['tcount'] > num_timesteps:
            break

    agent.stop_interaction()
Example No. 23
def get_env(
    env_name: str,
    num_processes: int = 1,
    seed: int = 1,
    time_limit: int = None,
    normalize_transition: bool = True,
    normalize_first_n: int = None,
    allow_early_resets: bool = False,
    same_np_seed: bool = False,
    **kwargs: Dict[str, Any],
) -> Env:
    """
    Return environment object from environment name, with wrappers for added
    functionality, such as multiprocessing and observation/reward normalization. Extra
    arguments are passed to environment constructor.

    Parameters
    ----------
    env_name : str
        Name of environment to create.
    num_processes: int
        Number of asynchronous copies of the environment to run simultaneously.
    seed : int
        Random seed for environment.
    time_limit : int
        Limit on number of steps for environment.
    normalize_transition : bool
        Whether or not to add an environment wrapper to normalize observations and rewards.
    normalize_first_n: int
        If not equal to None, only normalize the first ``normalize_first_n`` elements of
        the observation. If ``normalize_transition`` is False then this value is
        ignored.
    allow_early_resets: bool
        Whether or not to allow environments to be reset before done=True is returned.
    same_np_seed : bool
        Whether or not to use the same numpy random seed across each process. This
        should really only be used when training on MetaWorld, as it allows for multiple
        processes to generate/act over the same set of goals.

    Returns
    -------
    env : Env
        Environment object.
    """

    # Create vectorized environment.
    seeds = [seed + i for i in range(num_processes)]
    if same_np_seed:
        np_seeds = [seed] * num_processes
    else:
        np_seeds = list(seeds)
    env_creators = [
        get_single_env_creator(env_name, seeds[i], np_seeds[i], time_limit,
                               allow_early_resets, **kwargs)
        for i in range(num_processes)
    ]
    if num_processes > 1:
        env = ShmemVecEnv(env_creators, context="fork")
    elif num_processes == 1:
        # Use DummyVecEnv if num_processes is 1 to avoid multiprocessing overhead.
        env = DummyVecEnv(env_creators)
    else:
        raise ValueError("Invalid num_processes value: %s" % num_processes)

    # Add environment wrappers to normalize observations/rewards and convert between
    # numpy arrays and torch.Tensors.
    if normalize_transition:
        env = VecNormalizeEnv(env, first_n=normalize_first_n)
    env = VecPyTorchEnv(env)

    return env
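
A minimal usage sketch for get_env; the environment name and keyword values are illustrative:

env = get_env('CartPole-v1', num_processes=4, seed=0, normalize_transition=True)
obs = env.reset()            # torch.Tensor with a leading num_processes dimension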