Example #1
def run_agent(envs, parameters):
    '''Run a trained agent, yielding the per-step 'weights' entry from the env info.'''
    path = Path(parameters['path'])
    dummy_env = OptVecEnv(envs)
    set_global_seeds(parameters.setdefault('seed'))
    save_path = str(path / 'model.pkl')
    alg = parameters['alg']
    if alg == 'PPO':
        with open(save_path, 'rb') as pkl:
            model = PPO2.load(pkl, env=dummy_env)
    elif alg == 'A2C':
        with open(save_path, 'rb') as pkl:
            model = A2C.load(pkl, env=dummy_env)
    else:
        raise ValueError('Unsupported algorithm: {}'.format(alg))
    try:
        done = False
        observations = dummy_env.reset()
        while not done:
            action = model.predict(observations)
            print(action[0].ravel().tolist())
            observations, rewards, dones, infos = dummy_env.step(action[0])
            done = any(dones)
            info = infos[0]
            yield info['weights']
    finally:
        dummy_env.close()
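A minimal consumption sketch for the generator above; `make_custom_env` stands in for whatever project-specific constructors `OptVecEnv` expects, and the results directory is assumed to already contain a trained `model.pkl`.

# Hypothetical usage, not part of the original code.
parameters = {'path': 'results/ppo_run', 'alg': 'PPO', 'seed': 0}
for weights in run_agent([make_custom_env], parameters):
    print(weights)  # one 'weights' entry per environment step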
Example #2
def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for the gym robotics tasks (MuJoCo-based).

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) the rank of the environment (for logging)
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The robotics environment
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    keys = ['observation', 'desired_goal']
    # TODO: remove try-except once most users are running modern Gym
    try:  # for modern Gym (>=0.15.4)
        from gym.wrappers import FilterObservation, FlattenObservation
        env = FlattenObservation(FilterObservation(env, keys))
    except ImportError:  # for older gym (<=0.15.3)
        from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
        env = FlattenDictWrapper(env, keys)
    env = Monitor(env,
                  logger.get_dir()
                  and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success', ),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env
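As a quick sanity check, the helper above can be exercised with one of the standard gym robotics tasks. This assumes gym's robotics extras and mujoco_py are installed; if the stable-baselines logger is not configured, Monitor simply skips writing a log file.

env = make_robotics_env('FetchReach-v1', seed=0)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
env.close()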
Example #3
def run_agent(envs, parameters):
    '''Train an agent.'''
    alg = parameters['alg']
    learning_rate = parameters['learning_rate']
    gamma = parameters['gamma']
    model_path = parameters['model_path']
    set_global_seeds(parameters.get('seed'))
    dummy_env = OptVecEnv(envs)
    if alg == 'PPO':
        model = PPO2(MlpPolicy,
                     dummy_env,
                     gamma=gamma,
                     learning_rate=learning_rate,
                     verbose=1,
                     nminibatches=dummy_env.num_envs)
    elif alg == 'A2C':
        model = A2C(MlpPolicy,
                    dummy_env,
                    gamma=gamma,
                    learning_rate=learning_rate,
                    verbose=1)
    else:
        model = DDPG(ddpg.MlpPolicy,
                     dummy_env,
                     gamma=gamma,
                     verbose=1,
                     actor_lr=learning_rate / 10,
                     critic_lr=learning_rate)
    try:
        model.learn(total_timesteps=parameters.get('total_timesteps', 10**6))
    except tf.errors.InvalidArgumentError:
        LOGGER.error('Possible NaN, %s', str((alg, learning_rate, gamma)))
    finally:
        dummy_env.close()
        model.save(str(model_path))
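A sketch of how this trainer might be driven. The dictionary keys match what the function reads above; `make_custom_env` and the paths are placeholders, since `OptVecEnv` and its inputs are project-specific.

# Hypothetical usage, not part of the original code.
parameters = {
    'alg': 'PPO',
    'learning_rate': 2.5e-4,
    'gamma': 0.99,
    'model_path': 'results/ppo_model.pkl',
    'seed': 0,
    'total_timesteps': 100000,
}
run_agent([make_custom_env], parameters)  # make_custom_env is hypothetical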
Example #4
def make_mario_env(env_id,
                   num_env,
                   seed,
                   actions=None,
                   cut_map=False,
                   do_wrap_dm=True,
                   wrapper_kwargs=None,
                   start_index=0,
                   allow_early_resets=True,
                   start_method=None,
                   use_subprocess=False):
    # FIXME: the `actions` argument is not yet applied to the env
    """
    Create a wrapped, monitored VecEnv for Mario.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param wrapper_kwargs: (dict) the parameters for wrap_deepmind function
    :param start_index: (int) start rank index
    :param allow_early_resets: (bool) allows early reset of the environment
    :param start_method: (str) method used to start the subprocesses.
        See SubprocVecEnv doc for more information
    :param use_subprocess: (bool) Whether to use `SubprocVecEnv` or `DummyVecEnv` when
        `num_env` > 1, `DummyVecEnv` is usually faster. Default: False
    :return: (VecEnv) The Mario environment
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):
        def _thunk():
            env = make_mario(env_id)
            env.seed(seed + rank)

            if cut_map:
                env = CutMarioMap(env)

            env = Monitor(env,
                          logger.get_dir()
                          and os.path.join(logger.get_dir(), str(rank)),
                          allow_early_resets=allow_early_resets)

            # FIXME: only apply wrap_deepmind_custom when do_wrap_dm is True;
            # add alternative wrapping methods
            return wrap_deepmind_custom(
                env, **wrapper_kwargs)  # converts to 84*84 bw, keep for now

        return _thunk

    set_global_seeds(seed)

    # Use DummyVecEnv when running a single environment or when subprocesses are disabled
    if num_env == 1 or not use_subprocess:
        return DummyVecEnv([make_env(i + start_index) for i in range(num_env)])

    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)],
                         start_method=start_method)
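One possible way to train on the resulting VecEnv. The environment ID below is only illustrative and assumes `make_mario` resolves a gym-super-mario-bros style ID; `PPO2` accepts the registered 'CnnPolicy' name directly.

from stable_baselines import PPO2

env = make_mario_env('SuperMarioBros-1-1-v0', num_env=4, seed=0,
                     use_subprocess=True)
model = PPO2('CnnPolicy', env, verbose=1)
model.learn(total_timesteps=10000)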
Example #5
def make_env(rank, seed=0, sub_id=6, enable_draw=True):
    def _init():
        env = SimpleHumanoidMimicEnv(sub_id=sub_id, enable_draw=enable_draw)

        # Important: use a different seed for each environment
        env.seed(seed + rank)
        return env

    set_global_seeds(seed)
    return _init
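The thunks returned above are meant to be handed to a vectorized env; `SimpleHumanoidMimicEnv` comes from the surrounding project, so this is only a sketch of how they are typically consumed.

from stable_baselines.common.vec_env import SubprocVecEnv

num_cpu = 4
vec_env = SubprocVecEnv([make_env(rank, seed=0, enable_draw=False)
                         for rank in range(num_cpu)])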
Example #6
def train_ppo(env_id,
              num_timesteps,
              seed,
              policy,
              save_params,
              n_envs=1,
              nminibatches=5,
              n_steps=8000):
    """
     env_id: typr str, identifies each environment uniquely
     num_timesteps: number of timesteps to run the algorithm
     seed: initial random seed
     policy: policy to be followed (mlp, cnn, lstm, etc)
     n_env: number of envs to run in parallel
     nminibatches: number of minibatches of mini batch gradient descent (first-order optimization) to update the policy params
     n_steps: number of steps in each update
    """
    # Train PPO algorithm for num_timesteps
    # stack the frames for the vectorized environment
    # Note: PPO2 works only with vectorized environment

    set_global_seeds(seed)
    env = make_atari(env_id)
    env.seed(seed)
    env = Monitor(env, log_dir, allow_early_resets=True)  # log_dir is assumed to be defined at module level
    env = wrap_deepmind(env, frame_stack=True)
    # define the policy
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    # create model object for class PPO2
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=n_steps,
                 nminibatches=nminibatches,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    # train the model (`callback` is assumed to be defined at module level)
    # trained for 2e7 timesteps with seed = 5
    model.learn(total_timesteps=num_timesteps, callback=callback)
    # save the hyperparameters and weights
    model.save(save_params)
    env.close()
    # free the memory
    del model
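A hypothetical invocation matching the comment above (2e7 timesteps, seed 5); note that `log_dir` and `callback` are expected to exist at module level before this is called.

train_ppo('PongNoFrameskip-v4', num_timesteps=int(2e7), seed=5,
          policy='cnn', save_params='ppo_pong')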
Example #7
def make_mujoco_env(env_id, seed, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The mujoco environment
    """
    set_global_seeds(seed + 10000 * mpi_rank_or_zero())
    env = gym.make(env_id)
    env = Monitor(env, os.path.join(logger.get_dir(), '0'), allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env
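A quick usage sketch, assuming mujoco_py is set up and the stable-baselines logger has been configured (otherwise `logger.get_dir()` is None and the `os.path.join` above fails).

from stable_baselines import PPO2, logger
from stable_baselines.common.vec_env import DummyVecEnv

logger.configure('/tmp/mujoco_log')
env = make_mujoco_env('HalfCheetah-v2', seed=0)
model = PPO2('MlpPolicy', DummyVecEnv([lambda: env]), verbose=1)
model.learn(total_timesteps=10000)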
Example #8
def make_envs(env_id,
              do_eval,
              seed,
              conf,
              normalize_observations=False,
              normalize_returns=False):
    # Create envs.
    env_params = conf.pop('env_params', {})
    env = base_env = gym.make(env_id)
    if hasattr(base_env, 'env'):
        base_env = base_env.env
    for attr in env_params:
        setattr(base_env, attr, env_params[attr])
    env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    if normalize_observations or normalize_returns:
        env = DummyVecEnv([lambda: env])
        env = VecNormalize(env,
                           norm_obs=normalize_observations,
                           norm_reward=normalize_returns)

    if do_eval:
        eval_env = base_eval_env = gym.make(env_id)
        if hasattr(base_eval_env, 'env'):
            base_eval_env = base_eval_env.env
        for attr in env_params:
            setattr(base_eval_env, attr, env_params[attr])
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'),
                                 allow_early_resets=True)
        eval_env.seed(seed)
        eval_env.base_env = base_eval_env
    else:
        base_eval_env = None
        eval_env = None
    env.base_env = base_env

    return base_env, env, base_eval_env, eval_env
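A sketch of how the four return values might be obtained. The shape of `conf` is an assumption beyond the `env_params` key read above, and the logger is configured so the eval Monitor has a directory to write to.

from stable_baselines import logger

logger.configure('/tmp/make_envs_log')  # needed so the eval Monitor has a directory
conf = {'env_params': {}}  # hypothetical config; only 'env_params' is read here
base_env, env, base_eval_env, eval_env = make_envs(
    'Pendulum-v0', do_eval=True, seed=0, conf=conf,
    normalize_observations=True)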
Example #9
def make_atari_env(env_id,
                   num_env,
                   seed,
                   wrapper_kwargs=None,
                   start_index=0,
                   allow_early_resets=True,
                   start_method=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param wrapper_kwargs: (dict) the parameters for wrap_deepmind function
    :param start_index: (int) start rank index
    :param allow_early_resets: (bool) allows early reset of the environment
    :param start_method: (str) method used to start the subprocesses.
        See SubprocVecEnv doc for more information
    :return: (VecEnv) The Atari environment
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = Monitor(env,
                          logger.get_dir()
                          and os.path.join(logger.get_dir(), str(rank)),
                          allow_early_resets=allow_early_resets)
            return wrap_deepmind(env, **wrapper_kwargs)

        return _thunk

    set_global_seeds(seed)

    # When using one environment, no need to start subprocesses
    if num_env == 1:
        return DummyVecEnv([make_env(0)])

    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)],
                         start_method=start_method)
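One way to use the resulting VecEnv, assuming atari-py is installed; the string policy name is resolved by stable-baselines.

from stable_baselines import A2C

env = make_atari_env('BreakoutNoFrameskip-v4', num_env=8, seed=0)
model = A2C('CnnPolicy', env, verbose=1)
model.learn(total_timesteps=25000)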
Example #10
def train_trpo(env_id, num_timesteps, seed, policy):

    # env_id: type str, identifies each environment uniquely
    # num_timesteps: number of timesteps to run the algorithm
    # seed: initial random seed
    # policy: policy to be followed (mlp, cnn, lstm, etc.)

    # set up the environment
    rank = MPI.COMM_WORLD.Get_rank()
    sseed = seed + 10000 * rank
    set_global_seeds(sseed)
    env = make_atari(env_id)
    env = wrap_deepmind(env)
    env.seed(sseed)
    # define policies
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    # define TRPO class object
    model = TRPO(policy=policy,
                 env=env,
                 timesteps_per_batch=1024,
                 max_kl=0.01,
                 cg_iters=10,
                 cg_damping=1e-3,
                 ent_coef=0.0,
                 gamma=0.99,
                 lam=1,
                 vf_iters=3,
                 vf_stepsize=1e-4,
                 verbose=1)
    # Train TRPO for num_timesteps
    model.learn(total_timesteps=num_timesteps)
    # save the hyperparameters and weights
    model.save('trpo' + env_id)
    env.close()
    # free the memory
    del model
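A hypothetical call using the `policy` argument in the signature above; mpi4py initializes MPI at import time, so this can also be launched under `mpirun` for multiple workers.

train_trpo('PongNoFrameskip-v4', num_timesteps=int(1e6), seed=0, policy='cnn')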
Example #11
def set_random_seed(self, seed):
    """
    :param seed: (int) Seed for the pseudo-random generators. If None,
        do not change the seeds.
    """
    # Ignore if the seed is None
    if seed is None:
        return
    # Seed python, numpy and tf random generator
    set_global_seeds(seed)
    if self.env is not None:
        if isinstance(self.env, VecEnv):
            # Use a different seed for each env
            for idx in range(self.env.num_envs):
                self.env.env_method("seed", seed + idx)
        else:
            self.env.seed(seed)
        # Seed the action space
        # useful when selecting random actions
        self.env.action_space.seed(seed)
    self.action_space.seed(seed)
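This reads like the seeding helper of a stable-baselines `BaseRLModel` subclass; a sketch of how such a method is typically called on a built model:

import gym
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

model = PPO2('MlpPolicy', DummyVecEnv([lambda: gym.make('CartPole-v1')]))
model.set_random_seed(42)  # reseeds python/numpy/tf, each sub-env, and the action spaces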
Example #12
def train_dqn_adv(env_id, train_timesteps, seed, policy, save_params, n_envs=1):
    set_global_seeds(seed)
    env = make_atari(env_id)
    env.seed(seed)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = wrap_deepmind(env, frame_stack=True)
    # define the policy
    policy = {'cnn': CnnPolicy, 'mlp': MlpPolicy}[policy]
    # create model object for class DQN
    model = DQN(policy=policy, env=env, gamma=0.99, learning_rate=0.0001,
                buffer_size=10000, exploration_fraction=0.1,
                exploration_final_eps=0.01, exploration_initial_eps=1.0,
                train_freq=4, batch_size=32, double_q=True,
                learning_starts=10000, target_network_update_freq=1000,
                prioritized_replay=True, prioritized_replay_alpha=0.6,
                prioritized_replay_beta0=0.4,
                prioritized_replay_beta_iters=None,
                prioritized_replay_eps=1e-06, param_noise=False,
                n_cpu_tf_sess=None, verbose=1)
    callback = save_best_model_callback(save_freq=100, log_dir=log_dir,
                                        save_params=save_params, verbose=1)
    # train the model
    # trained for 2e7 timesteps with seed = 7
    model.learn(total_timesteps=train_timesteps, callback=callback)
    plot_results([log_dir], train_timesteps, results_plotter.X_TIMESTEPS, "DQNPong_TrainedByAdversary")
    plt.show()
    env.close()
    # free the memory
    del model
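A hypothetical call mirroring the comment above (2e7 timesteps, seed 7); `log_dir` and `save_best_model_callback` are assumed to be defined elsewhere in the module.

train_dqn_adv('PongNoFrameskip-v4', train_timesteps=int(2e7), seed=7,
              policy='cnn', save_params='dqn_pong_adv')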
Example #13
def make_rosetta_env(env_id, num_env, seed, wrapper_kwargs=None,
                     start_index=0, allow_early_resets=True,
                     start_method=None, use_subprocess=False):
    """
    Create a wrapped, monitored VecEnv for Rosetta.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param wrapper_kwargs: (dict) the parameters for wrap_deepmind function
    :param start_index: (int) start rank index
    :param allow_early_resets: (bool) allows early reset of the environment
    :param start_method: (str) method used to start the subprocesses.
        See SubprocVecEnv doc for more information
    :param use_subprocess: (bool) Whether to use `SubprocVecEnv` or `DummyVecEnv` when
        `num_env` > 1, `DummyVecEnv` is usually faster. Default: False
    :return: (VecEnv) The Rosetta environment
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):
        def _thunk():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                          allow_early_resets=allow_early_resets, reset_keywords=())
            # return wrap_deepmind(env, **wrapper_kwargs)
            return env
        return _thunk
    set_global_seeds(seed)

    # Use DummyVecEnv when running a single environment or when subprocesses are disabled
    if num_env == 1 or not use_subprocess:
        return DummyVecEnv([make_env(i + start_index) for i in range(num_env)])

    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)],
                         start_method=start_method)
Example #14
def run_agent(envs, parameters, trial):
    '''Train an agent.'''
    path = Path(parameters['path'])
    dummy_env = OptVecEnv(envs)
    set_global_seeds(parameters.setdefault('seed'))
    if parameters['alg'] == 'PPO':
        model = PPO2(MlpPolicy,
                     dummy_env,
                     gamma=parameters['gamma'],
                     learning_rate=parameters['learning_rate'],
                     verbose=0)
    elif parameters['alg'] == 'A2C':
        model = A2C(MlpPolicy,
                    dummy_env,
                    gamma=parameters['gamma'],
                    learning_rate=parameters['learning_rate'],
                    verbose=0)
    else:
        raise ValueError('Unsupported algorithm: {}'.format(parameters['alg']))
    try:
        timesteps = parameters['total_timesteps'] * dummy_env.agent_no_list[0]
        with tqdm(count(), leave=True) as progress:
            progress = iter(progress)

            def callback(local_vars, global_vars):
                if next(progress) % 100 == 0:  # report every 100 updates
                    callback_env = local_vars['self'].env
                    trial.report(get_total_reward(callback_env),
                                 local_vars['update'])
                    if trial.should_prune():
                        raise optuna.structs.TrialPruned()

            model.learn(total_timesteps=timesteps, callback=callback)
        return get_total_reward(dummy_env)
    finally:
        dummy_env.close()
        model.save(str(path / 'model.pkl'))
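This trainer is written as an Optuna objective; below is a sketch of a study driving it. The search space, `make_custom_env`, and the per-trial paths are illustrative assumptions, not part of the original code.

import optuna

def objective(trial):
    parameters = {
        'path': 'results/trial_{}'.format(trial.number),
        'alg': trial.suggest_categorical('alg', ['PPO', 'A2C']),
        'gamma': trial.suggest_uniform('gamma', 0.9, 0.9999),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3),
        'total_timesteps': 50000,
        'seed': 0,
    }
    return run_agent([make_custom_env], parameters, trial)  # make_custom_env is hypothetical

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)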
Example #15
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the desired noise type(s) ('adaptive-param', 'normal' or 'ou'); multiple noise types can be combined by
        separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only the rank-0 worker tracks the start time for the runtime report
    start_time = 0
    if rank == 0:
        start_time = time.time()
    model = DDPG(policy=MlpPolicy,
                 env=env,
                 memory_policy=Memory,
                 eval_env=eval_env,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 memory_limit=int(1e6),
                 layer_norm=layer_norm,
                 verbose=2,
                 **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
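A hypothetical single-process invocation (mpi4py still needs to be importable for the rank lookup); any extra keyword arguments are forwarded to the DDPG constructor.

run('Pendulum-v0', seed=0, noise_type='ou_0.2', layer_norm=True,
    evaluation=False)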
Example #16
def run(env_id, seed, layer_norm, evaluation, agent, delay_step, gamma=0.99, **kwargs):
    # Create envs.
    env = create_env(env_id, delay_step, str(0))
    print(env.observation_space, env.action_space)
    if evaluation:
        eval_env = create_env(env_id, delay_step, "eval_env")
    else:
        eval_env = None

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Record the start time so total runtime can be reported at the end
    start_time = time.time()

    policy = 'MlpPolicy'
    td3_variants = {
        "TD3": TD3,
        "TD3SIL": TD3SIL,
        "TD3NSTEP": TD3NSTEP,
        "TD3REDQ": TD3REDQ,
        "TD3DoubleTwin": TD3DoubleTwin,
    }
    if agent in td3_variants:
        model_func = td3_variants[agent]
        model = model_func(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=128,
                           tau=0.005, policy_delay=2, learning_starts=25000,
                           action_noise=create_action_noise(env, "normal_0.1"), buffer_size=100000, verbose=2,
                           n_cpu_tf_sess=10,
                           policy_kwargs={"layers": [400, 300]})
    elif agent == "DDPG":
        model = DDPG(policy=policy, env=env, eval_env=eval_env, gamma=gamma, nb_eval_steps=5, batch_size=100,
                     nb_train_steps=100, nb_rollout_steps=100, learning_starts=10000,
                     actor_lr=1e-3, critic_lr=1e-3, critic_l2_reg=0,
                     tau=0.005, normalize_observations=False,
                     action_noise=create_action_noise(env, "normal_0.1"), buffer_size=int(1e6),
                     verbose=2, n_cpu_tf_sess=10,
                     policy_kwargs={"layers": [400, 300]})
    elif agent == "SAC":
        model = SAC(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=256,
                    action_noise=create_action_noise(env, "normal_0.1"), buffer_size=int(1e6), verbose=2,
                    n_cpu_tf_sess=10, learning_starts=10000,
                    policy_kwargs={"layers": [256, 256]})
    elif agent == "GEM":
        policy = 'TD3LnMlpPolicy'
        model = TD3MemGEM(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=128,
                          tau=0.005, policy_delay=2, learning_starts=25000,
                          action_noise=create_action_noise(env, "normal_0.1"), buffer_size=100000, verbose=2,
                          n_cpu_tf_sess=10,
                          alpha=0.5, beta=-1, iterative_q=-1,
                          num_q=4, gradient_steps=200, max_step=kwargs['max_steps'], reward_scale=1., nb_eval_steps=10,
                          policy_kwargs={"layers": [400, 300]})
    elif agent == "BP":
        policy = 'TD3LnMlpPolicy'
        model = TD3MemBackProp(policy=policy, env=env, eval_env=eval_env, gamma=gamma, batch_size=128,
                               tau=0.005, policy_delay=2, learning_starts=25000,
                               action_noise=create_action_noise(env, "normal_0.1"), buffer_size=100000, verbose=2,
                               n_cpu_tf_sess=10,
                               alpha=0.5, beta=-1, gradient_steps=200, max_step=kwargs['max_steps'], reward_scale=1., nb_eval_steps=10,
                               policy_kwargs={"layers": [400, 300]})
    else:
        raise NotImplementedError

    print("model building finished")
    model.learn(total_timesteps=kwargs['num_timesteps'])

    env.close()
    if eval_env is not None:
        eval_env.close()

    logger.info('total runtime: {}s'.format(time.time() - start_time))