Example #1
def test_lstm_example():
    import tensorflow as tf
    from baselines.common import policies, models, cmd_util
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    # create vectorized environment
    venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])

    with tf.Session() as sess:
        # build policy based on lstm network with 128 units
        policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1)

        # initialize tensorflow variables
        sess.run(tf.global_variables_initializer())

        # prepare environment variables
        ob = venv.reset()
        state = policy.initial_state
        done = [False]
        step_counter = 0

        # run a single episode until the end (i.e. until done)
        while True:
            action, _, state, _ = policy.step(ob, S=state, M=done)
            ob, reward, done, _ = venv.step(action)
            step_counter += 1
            if done:
                break


        assert step_counter > 5
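Every example in this listing relies on the same DummyVecEnv contract: it takes a list of zero-argument environment constructors, steps them sequentially in a single process, and returns observations, rewards and dones batched over the number of environments. A minimal sketch of that interaction (the environment id is illustrative, not taken from the example above):

import gym
import numpy as np
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

# Build a one-environment vectorized env from a list of constructors.
venv = DummyVecEnv([lambda: gym.make('CartPole-v0')])
obs = venv.reset()                                 # shape (num_envs, *obs_shape)
actions = np.array([venv.action_space.sample()])   # one action per environment
obs, rewards, dones, infos = venv.step(actions)    # all batched over num_envs
venv.close()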
Example #2
def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
    def seeded_env_fn():
        env = env_fn()
        env.seed(0)
        return env

    np.random.seed(0)
    env = DummyVecEnv([seeded_env_fn])
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
        tf.set_random_seed(0)
        model = learn_fn(env)
        sum_rew = 0
        done = True
        for i in range(n_trials):
            if done:
                obs = env.reset()
                state = model.initial_state
            if state is not None:
                a, v, state, _ = model.step(obs, S=state, M=[False])
            else:
                a, v, _, _ = model.step(obs)
            obs, rew, done, _ = env.step(a)
            sum_rew += float(rew)
        print("Reward in {} trials is {}".format(n_trials, sum_rew))
        assert sum_rew > min_reward_fraction * n_trials, \
            'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials)
Example #3
def test_serialization(learn_fn, network_fn):
    '''
    Test if the trained model can be serialized
    '''


    if network_fn.endswith('lstm') and learn_fn in ['acer', 'acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    def make_env():
        env = MnistEnv(episode_len=100)
        env.seed(10)
        return env

    env = DummyVecEnv([make_env])
    ob = env.reset().copy()
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])


    learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs)

    with tempfile.TemporaryDirectory() as td:
        model_path = os.path.join(td, 'serialization_test_model')

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=100)
            model.save(model_path)
            mean1, std1 = _get_action_stats(model, ob)
            variables_dict1 = _serialize_variables()

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=0, load_path=model_path)
            mean2, std2 = _get_action_stats(model, ob)
            variables_dict2 = _serialize_variables()

        for k, v in variables_dict1.items():
            np.testing.assert_allclose(v, variables_dict2[k], atol=0.01,
                err_msg='saved and loaded variable {} value mismatch'.format(k))

        np.testing.assert_allclose(mean1, mean2, atol=0.5)
        np.testing.assert_allclose(std1, std2, atol=0.5)
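_get_action_stats and _serialize_variables above are helpers defined elsewhere in the test module and are not shown here. As a hedged sketch of what the variable snapshot plausibly does under a TF1-style default session (the real helper may differ):

import tensorflow as tf

def _serialize_variables_sketch():
    # Snapshot every trainable variable in the default session, keyed by name,
    # so that the saved and re-loaded models can be compared with assert_allclose.
    sess = tf.get_default_session()
    variables = tf.trainable_variables()
    values = sess.run(variables)
    return {var.name: value for var, value in zip(variables, values)}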
Example #4
def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES):
    env = DummyVecEnv([env_fn])
    with tf.Graph().as_default(), tf.compat.v1.Session(config=_sess_config).as_default():
        model = learn_fn(env)
        N_TRIALS = 100
        observations, actions, rewards = rollout(env, model, N_TRIALS)
        rewards = [sum(r) for r in rewards]
        avg_rew = sum(rewards) / N_TRIALS
        print("Average reward in {} episodes is {}".format(n_trials, avg_rew))
        assert avg_rew > min_avg_reward, \
            'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward)
Example #5
def test_microbatches():
    def env_fn():
        env = gym.make('CartPole-v0')
        env.seed(0)
        return env

    learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)

    env_ref = DummyVecEnv([env_fn])
    sess_ref = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_ref)
    vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}

    env_test = DummyVecEnv([env_fn])
    sess_test = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
    vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}

    for v in vars_ref:
        np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=1e-3)
Example #6
def evaluation(policy, num_times):
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    def make_env():
        df = pd.read_csv('dataset/btc_indexed2_test.csv')
        env = trading_env.make(
            env_id='training_v1',
            obs_data_len=1,
            step_len=1,
            df=df,
            fee=0.003,
            max_position=5,
            deal_col_name='close',
            return_transaction=True,
            sample_days=30,
            normalize_reward=False,
            feature_names=['open', 'high', 'low', 'close', 'volume'])
        env = wrapper.LogPriceFilterWrapper(env)
        return env

    env = DummyVecEnv([make_env] * 8)

    rewards = []
    for i in range(num_times):
        episode_reward = np.zeros(8)
        ob = env.reset()
        lstm_state = policy.initial_state
        not_done = [1] * 8
        while True:
            action, _, lstm_state, _ = policy.step(ob, lstm_state, not_done)
            ob, reward, done, _ = env.step(action)
            #print(reward)
            #print(reward.shape)
            episode_reward += reward
            print(done)
            if done.all():
                break
            not_done = np.invert(done).astype(int)
        print("evaluation ", i, ": ", episode_reward)
        rewards.append(list(episode_reward))
    print("evaluation: mu:", np.mean(rewards), "std:", np.std(rewards))
Example #7
def main():
    config = tf.ConfigProto()

    # Avoid warning message errors
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # Allowing GPU memory growth
    config.gpu_options.allow_growth = True

    with tf.Session(config=config):
        model.play(policy=policies.A2CPolicy,
                   env=DummyVecEnv([env.make_train_0]))
Example #8
def create_environment(self):
    envs = [
        make_env(i, args, True, self.gan_file)
        for i in range(self.num_processes)
    ]
    envs = DummyVecEnv(envs)
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    return envs, obs_shape
Example #9
def train(env_id, num_timesteps, seed, pol, cur, vis, model):
    from baselines.common import set_global_seeds
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import HierPolicy, HierPolicy2, MlpPolicy, RandomWalkPolicy
    import gym
    import gym_program
    import tensorflow as tf
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    hier = True if pol == 'hier1' or pol == 'hier2' else False

    def make_env():
        set_global_seeds(seed)
        env = gym.make(env_id)
        env.set_curiosity(cur, model)
        env.set_hier(hier)
        env.set_visualize(vis)
        env = bench.Monitor(env, logger.get_dir())
        env.seed(seed)
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)

    if pol == 'hier1': policy = HierPolicy
    elif pol == 'hier2': policy = HierPolicy2
    elif pol == 'mlp': policy = MlpPolicy
    elif pol == 'random_walk':
        pol = RandomWalkPolicy
        pol(env)
        return

    ppo2.learn(policy=policy,
               env=env,
               pol=pol,
               nsteps=2048,
               nminibatches=32,
               lam=0.95,
               gamma=0.99,
               noptepochs=10,
               log_interval=1,
               ent_coef=0.0,
               lr=1e-4,
               cliprange=0.2,
               total_timesteps=num_timesteps)
Example #10
def play():
    env_args = dict()
    network_kwargs = dict(nlstm=512)

    # create vectorized environment
    pysc2_env_vec = DummyVecEnv([partial(make_sc2env, id=i, **env_args) for i in range(1)])

    policy = policies.build_policy(pysc2_env_vec, "cnn_lstm", **network_kwargs)
    nenvs = pysc2_env_vec.num_envs
    # Calculate the batch_size
    nsteps=256
    nminibatches=1
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    ent_coef=0.0
    vf_coef=0.5
    max_grad_norm=0.5

    make_model = lambda : ppo_model(policy=policy, ob_space=(64, 64, 3), ac_space=65, nbatch_act=nenvs, nbatch_train=nbatch_train,
                    nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                    max_grad_norm=max_grad_norm)
    model = make_model()
    model.load("4860_ppo_cnn_lstm_512_medium")

    images = []
    ob = pysc2_env_vec.reset()
    state = model.initial_state
    done = [False]
    step_counter = 0

    # run a single episode until the end (i.e. until done)
    while True:
        #print(step_counter)
        images.append(ob)
        action, _, state, _ = model.step(ob, S=state, M=done)
        ob, _, done, stats = pysc2_env_vec.step(action)
        step_counter += 1
        if done[0]:
            imageio.mimsave(str(stats[0]["final_reward"]) + "_" + str(difficulty) + '.gif', [np.array(img[0]) for i, img in enumerate(images) if i%2 == 0], fps=4)
            images = []
Example #11
def run_baselines(env, seed, log_dir):
    """Create baselines model and training.

    Args:
        env (dict): Environment of the task.
        seed (int): Random positive integer for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.compat.v1.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    env = DummyVecEnv([
        lambda: bench.Monitor(
            env, baselines_logger.get_dir(), allow_early_resets=True)
    ])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy

    nbatch = env.num_envs * hyper_parameters['batch_size']
    training_batch_number = nbatch // hyper_parameters['training_batch_size']

    # import pdb; pdb.set_trace()

    # use AdamOptimizer as optimizer and choose value function same with policy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=hyper_parameters['batch_size'],
               lam=hyper_parameters['gae_lambda'],
               gamma=hyper_parameters['discount'],
               ent_coef=hyper_parameters['policy_ent_coeff'],
               nminibatches=training_batch_number,
               noptepochs=hyper_parameters['training_epochs'],
               max_grad_norm=None,
               lr=hyper_parameters['learning_rate'],
               cliprange=hyper_parameters['lr_clip_range'],
               total_timesteps=hyper_parameters['batch_size'] * hyper_parameters['n_epochs'])  # yapf: disable  # noqa: E501

    return osp.join(log_dir, 'progress.csv')
Example #12
def ppo():
    def make_env():
        env = SawyerEnvWrapper(DownEnv(for_her=False))
        return env

    tf.Session().__enter__()
    env = VecNormalize(DummyVecEnv([make_env]))
    policy = MlpPolicy
    model = ppo2.learn(policy=policy, env=env, nsteps=4000, nminibatches=1,
                       lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
                       ent_coef=0.0, lr=3e-4, cliprange=0.2, total_timesteps=1e8)

    return model
Example #13
def make_env(env, seed, device):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    def _thunk():
        env_ = gym.make(env)
        env_.seed(seed)
        return env_

    envs = DummyVecEnv([_thunk])
    envs = VecNormalize(envs, ret=False)
    envs = VecPyTorch(envs, device)
    return envs
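VecPyTorch here comes from the surrounding repository (the pytorch-a2c-ppo-acktr family of codebases) and is not shown. A minimal sketch of what such a wrapper typically does, assuming the baselines VecEnvWrapper interface, is to move numpy arrays from the vec env into torch tensors on the target device (an illustrative stand-in, not the real class):

import torch
from baselines.common.vec_env import VecEnvWrapper

class VecPyTorchSketch(VecEnvWrapper):
    def __init__(self, venv, device):
        super().__init__(venv)
        self.device = device

    def reset(self):
        obs = self.venv.reset()
        return torch.from_numpy(obs).float().to(self.device)

    def step_async(self, actions):
        # Torch actions go back to numpy before reaching the wrapped vec env.
        self.venv.step_async(actions.cpu().numpy())

    def step_wait(self):
        obs, reward, done, info = self.venv.step_wait()
        obs = torch.from_numpy(obs).float().to(self.device)
        reward = torch.from_numpy(reward).float().unsqueeze(1)
        return obs, reward, done, info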
Example #14
def PrepareParallelEnv(env_id, seed, num_processes, gamma, log_dir, device,
                       allow_early_resets):
    envs = [
        PrepareMujocoEnv(env_id, seed, i, log_dir, allow_early_resets)
        for i in range(num_processes)
    ]
    if len(envs) > 1:
        envs = ShmemVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)
    envs = VecNormalize(envs, gamma=gamma)
    envs = VecPyTorch(envs, device)
    return envs
Example #15
def make_vec_env(env_id,
                 env_type,
                 num_env,
                 seed,
                 wrapper_kwargs=None,
                 env_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None,
                 initializer=None,
                 force_dummy=False,
                 obs_type='original',
                 fixed_num_of_contact=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank, initializer=None):
        return lambda: make_env(env_id=env_id,
                                env_type=env_type,
                                mpi_rank=mpi_rank,
                                subrank=rank,
                                seed=seed,
                                reward_scale=reward_scale,
                                gamestate=gamestate,
                                flatten_dict_observations=
                                flatten_dict_observations,
                                wrapper_kwargs=wrapper_kwargs,
                                env_kwargs=env_kwargs,
                                logger_dir=logger_dir,
                                initializer=initializer,
                                obs_type=obs_type,
                                fixed_num_of_contact=fixed_num_of_contact)

    set_global_seeds(seed)
    if not force_dummy and num_env > 1:
        return SubprocVecEnv([
            make_thunk(i + start_index, initializer=initializer)
            for i in range(num_env)
        ])
    else:
        return DummyVecEnv([
            make_thunk(i + start_index, initializer=None)
            for i in range(num_env)
        ])
Example #16
def make_vec_envs(env_name,
                  scene_path,
                  seed,
                  num_processes,
                  gamma,
                  log_dir,
                  device,
                  allow_early_resets,
                  initial_policies,
                  num_frame_stack=None,
                  show=False,
                  no_norm=False,
                  pose_estimator=None,
                  image_ips=None,
                  init_control=True):
    envs = [
        make_env(env_name, scene_path, seed, i, log_dir, allow_early_resets,
                 show, init_control) for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    envs = wrap_initial_policies(envs, device, initial_policies)

    if pose_estimator is not None:
        envs = SimImageObsVecEnvWrapper(envs)

    if len(envs.observation_space.shape) == 1 and not no_norm:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if pose_estimator is not None:
        envs = wrap_initial_policies(envs, device, image_ips)
        envs = PoseEstimatorVecEnvWrapper(envs,
                                          device,
                                          *pose_estimator,
                                          abs_to_rel=True)

    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif not pose_estimator and len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
Example #17
def make_vec_envs(env_name, seed, num_processes, gamma, log_dir, device,
                  allow_early_resets, num_frame_stack=None, rank=0,
                  signature='', max_steps=None, env_group_spec=None):
    """ Make vectorised environments for parallelized experience sampling. """
    # Should environments all be the same for each learner, or differ across processes for the
    # same learner?
    heterogeneous_envs = not (env_group_spec is not None and env_group_spec[1] == num_processes)
    if env_group_spec is None or env_group_spec[0] == 1:
        # No grouping of environment processes for each agent.
        envs = [
            make_env(env_name, seed + num_processes * rank, (rank * num_processes) + i,
                     log_dir, allow_early_resets, signature, max_steps,
                     heterogeneous=heterogeneous_envs)
            for i in range(num_processes)
        ]
    else:
        # We have environments grouped such that environments differ even for the same learner.
        envs = []
        counter = 0
        for i in range(env_group_spec[0]):
            envs += [
                make_env(env_name, seed + num_processes * rank,
                         (rank * num_processes) + counter + i, log_dir,
                         allow_early_resets, signature, max_steps, heterogeneous=False)
                for i in range(env_group_spec[1])
            ]
            seed += env_group_spec[1]
            counter += env_group_spec[1]

    # Allow dummy environment wrapper if no parallelisation required.
    if len(envs) > 1:
        envs = ShmemVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    # Ensure environments are compatible with the PyTorch agents.
    envs = VecPyTorch(envs, device)

    # Frame stacking for visual environments.
    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
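As a worked illustration of the grouped branch above (all numbers hypothetical): with env_group_spec = (2, 3), num_processes = 6, rank = 0 and seed = 100, the three environments in a group share a seed argument but get distinct subranks, and the seed shifts by the group size between groups.

def grouped_assignments(seed, rank, num_processes, env_group_spec):
    # Reproduces only the (seed, subrank) bookkeeping of the grouped branch.
    n_groups, group_size = env_group_spec
    assignments = []
    counter = 0
    for _ in range(n_groups):
        assignments += [(seed + num_processes * rank, rank * num_processes + counter + i)
                        for i in range(group_size)]
        seed += group_size
        counter += group_size
    return assignments

print(grouped_assignments(100, 0, 6, (2, 3)))
# [(100, 0), (100, 1), (100, 2), (103, 3), (103, 4), (103, 5)]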
Example #18
def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        if env_id == 'toy':
            #env = continuous_gridworld.ContinuousGridworld('', max_steps=1000,
            #                                           obstacle_mode=continuous_gridworld.NO_OBJECTS)
            from toy_environment import room_obstacle_list
            env = gridworld.Gridworld(
                obstacle_list_generator=room_obstacle_list.obstacle_list)
        elif env_id == 'navigate':
            env = NavigateEnv(use_camera=False,
                              continuous_actions=True,
                              neg_reward=True,
                              max_steps=500)
        elif env_id == 'arm2pos':
            #env = Arm2PosEnv(continuous=False, max_steps=500)
            pass
        else:
            env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=2048,
               nminibatches=32,
               lam=0.95,
               gamma=0.99,
               noptepochs=10,
               log_interval=1,
               ent_coef=0.0,
               lr=3e-4,
               cliprange=0.2,
               total_timesteps=num_timesteps)
Example #19
def train(model_name, num_processes, max_grad_norm, num_env_steps, log_dir, epoch, env_name, save_dir, use_linear_clip_decay):
  records = []
  envs = [make_env(rank = i) for i in range(num_processes)]
  replaybuffer = Buffer()
  if len(envs) > 1:
    envs = SubprocVecEnv(envs)
  else:
    envs = DummyVecEnv(envs)
  try:
    state_shape = envs.observation_space.shape[0]
    action_shape = envs.action_space.shape[0]
    model = model_dict[model_name](state_shape, action_shape)
    compute_loss = loss_dict[model_name]
    optimizer = torch.optim.Adam(model.parameters())
    state = envs.reset()
    returns = 0
    for t in range(num_env_steps//num_processes):
      action, log_prob = model.act(state)
      next_state, reward, done, info = envs.step(to_np(action))
      returns += reward
      replaybuffer.store(zip(state, to_np(action), to_np(log_prob), reward, next_state, 1 - done))
      for i, d in enumerate(done):
        if d:
          records.append((t * num_processes + i, returns[i]))
          if i==0:
            print(returns[0])
          returns[i] = 0
      state = next_state

      if t % (500 // num_processes) == (500 // num_processes - 1):  # roughly every 500 env steps
        for _ in range(epoch):
          optimizer.zero_grad()
          loss = compute_loss(replaybuffer.sample(), model)
          loss.backward()
          nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
          optimizer.step()
        if model_name == 'PPO' or model_name == 'DPPO':
          replaybuffer.clear()

      if t % (num_env_steps//num_processes//10) == 0:
        i = t//(num_env_steps//num_processes//10)
        torch.save(model.state_dict(), os.path.join(save_dir, model_name,env_name, model_name+str(i)+'.pt'))
      if use_linear_clip_decay:
        update_linear_schedule(optimizer, t * num_processes)
    torch.save(model.state_dict(), os.path.join(save_dir, model_name,env_name, model_name+'_Final.pt'))
    timesteps , sumofrewards = zip(*records)
    savemat(os.path.join(save_dir, model_name,env_name,'returns.mat'),{'timesteps':timesteps, 'returns':sumofrewards})
  except Exception as e:
    traceback.print_exc()
  finally:
    envs.close()
Example #20
def testing(model):
    """
    We'll use this function to calculate the score on test levels for each saved model,
    to generate the video version,
    and to generate the map version.
    """

    test_env = DummyVecEnv([sonic_env.make_test])

    # Get state_space and action_space
    ob_space = test_env.observation_space
    ac_space = test_env.action_space
 
    # Play
    total_score = 0
    trial = 0
    
    # We make 3 trials
    for trial in range(3):
        obs = test_env.reset()
        done = False
        score = 0

        while done == False:
            # Get the action
            action, value, _ = model.step(obs)
            
            # Take action in env and look the results
            obs, reward, done, info = test_env.step(action)

            score += reward[0]
        total_score += score
        trial += 1
    test_env.close()

    # Divide the score by the number of trials
    total_test_score = total_score / 3
    return total_test_score
Example #21
0
def make_vec_envs(env_name,
                  rep_type,
                  resolution,
                  seed,
                  scenario,
                  num_processes,
                  gamma,
                  log_dir,
                  device,
                  allow_early_resets,
                  num_frame_stack=None,
                  patch_count=3,
                  reverse_green=False):
    envs = [
        make_env(env_name,rep_type,resolution,seed,scenario ,i, log_dir, allow_early_resets,patch_count=patch_count,reverse_green=reverse_green)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        #envs = ShmemVecEnv(envs, context='fork')
        envs = DummyVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif len(envs.observation_space.shape) == -1: #problematic, original 3
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
Example #22
def train(env_id, num_timesteps, seed, lr, lr_q, cliprangeq):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2_implicit
    from baselines.ppo2.policies import ImplicitMLPPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = ImplicitMLPPolicy

    # build call back
    arg = {}
    arg['seed'] = seed
    arg['env'] = env_id
    arg['lr'] = lr
    arg['lrq'] = lr_q
    arg['cliprangeq'] = cliprangeq
    callback = CALLBACK(arg)

    cliprangeq = linear_schedule(maxrate=cliprangeq, minrate=0.001)

    ppo2_implicit.learn(policy=policy,
                        env=env,
                        nsteps=2048,
                        nminibatches=32,
                        lam=0.95,
                        gamma=0.99,
                        noptepochs=10,
                        log_interval=1,
                        ent_coef=0.0,
                        lr=lr,
                        lr_q=lr_q,
                        cliprangeq=cliprangeq,
                        total_timesteps=num_timesteps,
                        callback=callback)
Example #23
def test_identity(learn_func):
    '''
    Test if the algorithm (with a given policy) 
    can learn an identity transformation (i.e. return observation as an action)
    '''
    np.random.seed(0)
    np_random.seed(0)
    random.seed(0)

    env = DummyVecEnv([lambda: IdentityEnv(10)])

    with tf.Graph().as_default(), tf.Session().as_default():
        tf.set_random_seed(0)
        model = learn_func(env)

        N_TRIALS = 1000
        sum_rew = 0
        obs = env.reset()
        for i in range(N_TRIALS):
            obs, rew, done, _ = env.step(model.step(obs)[0])
            sum_rew += rew

        assert sum_rew > 0.9 * N_TRIALS
Example #24
def _make_env(env_fn, nenv):
    def _env(rank):
        def _thunk():
            return env_fn(rank=rank)

        return _thunk

    if nenv > 1:
        env = SubprocVecEnv([_env(i) for i in range(nenv)])
    else:
        env = DummyVecEnv([_env(0)])
    env = SuccessWrapper(env)
    tstart = 0
    return VecMonitor(env, max_history=100, tstart=tstart, tbX=True)
Example #25
def test_identity(learn_func):
    '''
    Test if the algorithm (with a given policy) 
    can learn an identity transformation (i.e. return observation as an action)
    '''
    np.random.seed(0)
    np_random.seed(0)
    random.seed(0)

    env = DummyVecEnv([lambda: IdentityEnv(10)])

    with tf.Graph().as_default(), tf.Session().as_default():
        tf.set_random_seed(0)
        model = learn_func(env)

        N_TRIALS = 1000
        sum_rew = 0
        obs = env.reset()
        for i in range(N_TRIALS):
            obs, rew, done, _ = env.step(model.step(obs)[0])
            sum_rew += rew

        assert sum_rew > 0.9 * N_TRIALS
Example #26
def make_vec_envs(args, device="cpu"):
    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir, args.default_ind, \
            args.num_envs, args.basepath) for i in range(args.num_processes)
    ]

    if len(envs) > 1:
        envs = ShmemVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    envs = VecPyTorch(envs, device)

    return envs
Example #27
def run_baselines(env, seed, log_dir):
    '''
    Create baselines model and training.

    Replace the ppo and its training with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return
    '''
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=ncpu,
        inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    def make_env():
        monitor = bench.Monitor(
            env, baselines_logger.get_dir(), allow_early_resets=True)
        return monitor

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(
        policy=policy,
        env=env,
        nsteps=2048,
        nminibatches=32,
        lam=0.95,
        gamma=0.99,
        noptepochs=10,
        log_interval=1,
        ent_coef=0.0,
        lr=1e-3,
        vf_coef=0.5,
        max_grad_norm=None,
        cliprange=0.2,
        total_timesteps=int(1e6))

    return osp.join(log_dir, 'progress.csv')
Example #28
def train(args):

    # exploration score type
    if 'MiniGrid' in args.env:
        args.score_type = 'discrete'
        args.train_rl = True
        policy_fn = MlpPolicy
    elif args.env == 'MiniWorld-MazeS5-v0':
        args.score_type = 'continious'
        args.train_rl = True
        policy_fn = CnnPolicy
    else:  # MuJoCo
        args.score_type = 'continious'
        if args.disable_rapid:
            args.train_rl = True
        else:
            args.train_rl = False
        policy_fn = MlpPolicy

    # Make the environment
    def _make_env():
        env = make_env(args.env)
        env.seed(args.seed)
        return env

    env = DummyVecEnv([_make_env])
    if 'MiniGrid' not in args.env and args.env != 'MiniWorld-MazeS5-v0':  # Mujoco
        env = VecNormalize(env)

    # Initialize the buffer
    ranking_buffer = RankingBuffer(ob_space=env.observation_space,
                                   ac_space=env.action_space,
                                   args=args)

    # Start training
    learn(policy_fn, env, ranking_buffer, args)
    env.close()
Example #29
def make_vec_envs_domains(env_name,
                          seed,
                          num_processes,
                          gamma,
                          log_dir,
                          device,
                          allow_early_resets,
                          num_envs1,
                          num_envs2,
                          num_frame_stack=None,
                          env_kwargs1=None,
                          env_kwargs2=None):

    # Environments from domain 1
    num_envs_domain1 = num_envs1  # int(num_processes/2)
    num_envs_domain2 = num_envs2  # int(num_processes/2)
    envs1 = [
        make_env(env_name, seed, i, log_dir, allow_early_resets, env_kwargs1)
        for i in range(num_envs_domain1)
    ]

    # Environments from domain 2
    envs2 = [
        make_env(env_name, seed, i, log_dir, allow_early_resets, env_kwargs2)
        for i in range(num_envs_domain2)
    ]

    # Concatenate envs
    envs = envs1 + envs2

    if len(envs) > 1:
        envs = ShmemVecEnv_DR(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
Example #30
def make_vec_env(env_id, seed):
    """
    Create environment
    """
    env = gym.make(env_id)
    env.seed(seed)

    def make_env(env):
        return lambda: env

    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), '0'),
                  allow_early_resets=True)
    set_global_seeds(seed)
    return DummyVecEnv([make_env(env)])
Example #31
def make_vec_envs_custom(constants, device, env_lambda):
    
    # Construct envs
    envs = [
        env_lambda for i in range(constants["num_processes"])
    ]
    # Multiple processes
    if len(envs) > 1:
        envs = ShmemVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)
    # Put on gpu whatever can be
    envs = VecPyTorch(envs, device)

    return envs
Example #32
def make_vec_env(env_id, seed):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    env = gym.make(env_id)
    env.seed(seed)

    def make_thunk(env):
        return lambda: env

    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), '0'),
                  allow_early_resets=True)
    set_global_seeds(seed)
    return DummyVecEnv([make_thunk(env)])
Example #33
def make_vec_random_env(num_envs: int, mk_config: Union[MkConfig,
                                                        Dict]) -> VectorEnv:
    # Move import here in case we don't have `baselines` installed:
    # TODO: Use the "native" vectorized envs from gym rather than those from baselines.
    # The only thing we'd lose is the ability to render the envs, which isn't part of
    # gym at the time of writing. One potential solution would be to use a fork of gym
    # which adds this support for rendering the envs.
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.common.vec_env.shmem_vec_env import ShmemVecEnv

    env_func = partial(make_env, mk_config=mk_config)

    if num_envs == 1:
        return DummyVecEnv([env_func for _ in range(num_envs)])
    return ShmemVecEnv([env_func for _ in range(num_envs)])
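Following the TODO in the comments above, a rough sketch of the same helper on top of gym's native vectorized envs (assuming a gym version that ships gym.vector; as the comment notes, this loses the rendering support of the baselines wrappers, and env_id here is illustrative since the original builds its envs via make_env with mk_config):

import gym
from gym.vector import AsyncVectorEnv, SyncVectorEnv

def make_native_vec_env(num_envs: int, env_id: str = 'CartPole-v0'):
    env_fns = [lambda: gym.make(env_id) for _ in range(num_envs)]
    if num_envs == 1:
        return SyncVectorEnv(env_fns)
    return AsyncVectorEnv(env_fns)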
Example #34
def make_vec_envs(args,
                  seed,
                  num_processes,
                  gamma,
                  device):
    envs = [make_env(args, seed, i) for i in range(num_processes)]

    if len(envs) > 1:
        envs = ShmemVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    envs = VecPyTorch(envs, device)

    return envs
Example #35
def build_env(args):
    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    return envs