def __init__(self,
                 observation_shape=(1, ),
                 normalize_observations=True,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 nb_actions=3,
                 layer_norm=True,
                 skill_name=None,
                 restore_path=None,
                 action_func=None,
                 obs_func=None,
                 num_params=None,
                 termination=None,
                 **kwargs):
        """Build the inference graph for a single pretrained skill.

        Creates the observation placeholder, an actor network scoped under
        ``skill_name``, optional running-mean/std observation normalization,
        and a ``tf.train.Saver`` used to restore pretrained weights.

        Args:
            observation_shape: shape of one observation (without batch dim).
            normalize_observations: whiten observations with running stats.
            observation_range: clip range applied to normalized observations.
            action_range: valid range of the actor's actions (stored only).
            nb_actions: dimensionality of the action space.
            layer_norm: whether the actor uses layer normalization.
            skill_name: variable-scope name identifying this skill.
            restore_path: checkpoint directory; ``~`` is expanded. May be None.
            action_func: optional action adapter; defaults to identity.
            obs_func: optional observation adapter; defaults to identity.
            num_params: number of skill parameters (stored, not used here).
            termination: optional predicate deciding when the skill is done;
                defaults to a "never terminate" predicate.
            **kwargs: ignored; accepted for forward compatibility.
        """
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')

        # Parameters.
        self.skill_name = skill_name
        # BUGFIX: osp.expanduser(None) raises TypeError; keep None when no
        # restore path is supplied (the parameter's documented default).
        self.restore_path = (osp.expanduser(restore_path)
                             if restore_path is not None else None)
        self.normalize_observations = normalize_observations
        self.action_range = action_range
        self.observation_range = observation_range
        self.actor = Actor(nb_actions=nb_actions,
                           name=skill_name,
                           layer_norm=layer_norm)
        self.num_params = num_params
        # Explicit None check so a falsy-but-valid callable is not silently
        # replaced by the default "never terminate" predicate.
        if termination is not None:
            self.termination = termination
        else:
            self.termination = lambda x: False

        # funcs: default both adapters to the identity mapping.
        self.get_action = action_func if action_func is not None else mirror
        self.get_obs = obs_func if obs_func is not None else mirror

        # Observation normalization: running mean/std, clipped to range.
        if self.normalize_observations:
            with tf.variable_scope('%s/obs_rms' % skill_name):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        self.actor_tf = self.actor(normalized_obs0)

        ## loader and saver
        self.loader = tf.train.Saver(self.create_restore_var_dict())
# Example 2
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Configure and launch a DDPG training run (skill-aware variant).

    Builds the training env (and, on rank 0, an optional evaluation env),
    parses the comma-separated ``noise_type`` spec into action/parameter
    noise objects, constructs memory/actor/critic, seeds everything
    per-rank, optionally loads a skill set, and hands off to
    ``training.train``.

    Args:
        env_id: gym environment id used for training.
        seed: base random seed; offset by MPI rank for uniqueness.
        noise_type: comma-separated spec, e.g. ``"ou_0.2"`` or
            ``"adaptive-param_0.2,normal_0.1"``.
        layer_norm: whether actor/critic use layer normalization.
        evaluation: create a separate evaluation env on rank 0.
        **kwargs: forwarded to ``training.train``; may contain
            ``eval_env_id``, ``look_ahead`` and ``skillset``.

    Raises:
        RuntimeError: if ``noise_type`` contains an unknown spec.
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Silence non-root ranks to avoid duplicated log output.
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    logger.debug("Env info")
    logger.debug(env.__doc__)
    logger.debug("-" * 20)
    gym.logger.setLevel(logging.WARN)

    # BUGFIX: pop 'eval_env_id' unconditionally. Previously it was deleted
    # only inside the evaluation branch, so on non-evaluating ranks (or with
    # evaluation disabled) it leaked through **kwargs into training.train().
    eval_env_id = kwargs.pop('eval_env_id', None)
    if evaluation and rank == 0:
        # Fall back to the training env id when no explicit eval id is given.
        eval_env = gym.make(eval_env_id if eval_env_id else env_id)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'epsnorm' in current_noise_type:
            # Spec carries two parameters: "epsnorm_<stddev>_<epsilon>".
            _, stddev, epsilon = current_noise_type.split('_')
            action_noise = EpsilonNormalActionNoise(mu=np.zeros(nb_actions),
                                                    sigma=float(stddev) *
                                                    np.ones(nb_actions),
                                                    epsilon=float(epsilon))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible (unique per rank).
    seed = seed + 1000000 * rank
    tf.reset_default_graph()

    # Importing the current skill configs (only when look-ahead is enabled).
    if kwargs['look_ahead'] and kwargs['skillset']:
        skillset_file = __import__("HER.skills.%s" % kwargs['skillset'],
                                   fromlist=[''])
        my_skill_set = SkillSet(skillset_file.skillset)
    else:
        my_skill_set = None

    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only rank 0 logs and measures total runtime.
    if rank == 0:
        logger.info('rank {}: seed={}, logdir={}'.format(
            rank, seed, logger.get_dir()))
        start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   my_skill_set=my_skill_set,
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
# Example 3
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Set up and launch a DDPG training run on a single gym environment.

    Builds a Monitor-wrapped training env (and, on rank 0, an optional
    evaluation env), translates the comma-separated ``noise_type`` spec
    into noise objects, creates the replay memory and actor/critic
    networks, applies per-rank seeding, and calls ``training.train``.

    Raises:
        RuntimeError: for an unrecognized noise specification.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Only rank 0 keeps logging enabled.
        logger.set_level(logger.DISABLED)

    # Training environment, wrapped in a per-rank Monitor.
    env = bench.Monitor(
        gym.make(env_id),
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    gym.logger.setLevel(logging.WARN)

    # Evaluation environment exists only on rank 0 when requested.
    eval_env = None
    if evaluation and rank == 0:
        eval_env = bench.Monitor(gym.make(env_id),
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        #env = bench.Monitor(env, None)

    # Translate the comma-separated noise specification.
    action_noise, param_noise = None, None
    nb_actions = env.action_space.shape[-1]
    for spec in noise_type.split(','):
        spec = spec.strip()
        if spec == 'none':
            continue
        if 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay memory and the actor/critic networks.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Per-rank seeding keeps runs reproducible while distinct across ranks.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Rank 0 times the full run.
    if rank == 0:
        start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
# Example 4
    def __init__(self,
                 observation_shape=(1, ),
                 normalize_observations=True,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 nb_actions=3,
                 layer_norm=True,
                 skill_name=None,
                 restore_path=None,
                 action_func=None,
                 obs_func=None,
                 num_params=None,
                 termination=None,
                 get_full_state_func=None,
                 next_state_query_idx=None):
        """Build the inference graph for a pretrained skill with critic and
        success/next-state predictors.

        Creates the observation placeholder, an actor and a critic scoped
        under ``skill_name``, a success-probability classifier and a
        next-state regressor, loads the stored transition memory CSV from
        ``restore_path``, and sets up Savers for restoring all components.

        Args:
            observation_shape: shape of one observation (without batch dim).
            normalize_observations: whiten observations with running stats.
            observation_range: clip range applied to normalized observations.
            action_range: valid range of the actor's actions (stored only).
            nb_actions: dimensionality of the action space.
            layer_norm: whether actor/critic use layer normalization.
            skill_name: variable-scope prefix identifying this skill.
            restore_path: checkpoint directory (required); ``~`` expanded.
            action_func: optional action adapter; defaults to identity.
            obs_func: optional observation adapter; defaults to identity.
            num_params: number of skill parameters (stored, not used here).
            termination: optional two-argument predicate deciding when the
                skill is done; defaults to "never terminate".
            get_full_state_func: optional full-state adapter; identity default.
            next_state_query_idx: indices of the state queried for the next
                state prediction; defaults to all indices.

        Raises:
            ValueError: if ``restore_path`` is None.
            IOError: if no memory CSV is found under ``restore_path``.
        """
        # BUGFIX: osp.expanduser(None) raises an opaque TypeError; fail fast
        # with a clear message since this class cannot work without a path.
        if restore_path is None:
            raise ValueError(
                "restore_path is required to load a pretrained skill")

        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')

        # Parameters.
        self.skill_name = skill_name
        self.restore_path = osp.expanduser(restore_path)
        self.normalize_observations = normalize_observations
        self.action_range = action_range
        self.observation_range = observation_range
        self.actor = Actor(nb_actions=nb_actions,
                           name="%s/actor" % skill_name,
                           layer_norm=layer_norm)
        self.critic = Critic(layer_norm=layer_norm,
                             name="%s/critic" % skill_name)

        # Classifier predicting success probability from the observation.
        self.successor_prob_model = classifier(in_shape=observation_shape[0],
                                               out_shape=1,
                                               name="%s/suc_pred_model" %
                                               skill_name,
                                               sess=None,
                                               log_dir=None,
                                               train=False,
                                               in_tensor=self.obs0)

        self.num_params = num_params

        # memory loading for comparison only
        print("searching for memory in %s" %
              osp.join(self.restore_path, 'memory'))
        # BUGFIX: indexing an empty glob result raised a bare IndexError;
        # raise a descriptive error instead.
        memory_files = glob.glob(
            osp.join(self.restore_path, 'memory', '*.csv'))
        if not memory_files:
            raise IOError("no memory csv found in %s" %
                          osp.join(self.restore_path, 'memory'))
        memory_filename = memory_files[0]

        self.memory = np.loadtxt(memory_filename, delimiter=',')
        # Columns: [starting state+goal | ending state].
        self.starting_state_goal = self.memory[:, :observation_shape[0]]
        self.ending_state = self.memory[:, observation_shape[0]:]

        # load successor prediction model (regressor over next state).
        print("searching for successor model in %s" %
              osp.join(self.restore_path, 'succ_model'))
        self.succ_model = regressor(in_shape=observation_shape[0],
                                    out_shape=observation_shape[0] - 3,
                                    name="%s/succmodel" % skill_name,
                                    sess=None,
                                    log_dir=None,
                                    whiten_data=None,
                                    train=False,
                                    in_tensor=self.obs0)

        # Default to querying every state index for next-state prediction.
        if next_state_query_idx is not None:
            self.next_state_query_idx = next_state_query_idx
        else:
            self.next_state_query_idx = list(range(observation_shape[0]))

        # Explicit None check so a falsy-but-valid callable is not silently
        # replaced by the default "never terminate" predicate.
        if termination is not None:
            self.termination = termination
        else:
            self.termination = lambda x, y: False

        # funcs: default all adapters to the identity mapping.
        self.get_action = action_func if action_func is not None else mirror
        self.get_obs = obs_func if obs_func is not None else mirror
        self.get_full_state = get_full_state_func if get_full_state_func is not None else mirror

        # Observation normalization: running mean/std, clipped to range.
        if self.normalize_observations:
            with tf.variable_scope('%s/obs_rms' % skill_name):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        self.actor_tf = self.actor(normalized_obs0)
        self.critic_tf = self.critic(normalized_obs0, self.actor_tf)
        self.success_prob = self.successor_prob_model.prob
        self.next_state_pred = self.succ_model.out_tensor

        ## loader and saver
        self.loader_ddpg = tf.train.Saver(self.create_restore_var_dict())
        self.loader_successor_model = tf.train.Saver(
            self.create_restore_var_dict_successor_model(
                model_name='suc_pred_model'))
        self.loader_successor_prediction_model = tf.train.Saver(
            self.create_restore_var_dict_successor_model(
                model_name="succmodel"))