def __init__(self, config, action_bound):
        """Build the DQN graph over a discretised grid of candidate actions.

        Args:
            config: Configuration object forwarded to the parent constructor.
                ``self.config.config_dict`` is read for ``ACTION_SPLIT_COUNT``,
                ``STATE_SPACE``, ``ACTION_SPACE``, ``NET_CONFIG`` and ``NAME``.
            action_bound: Indexable pair ``(lows, highs)`` holding the
                per-dimension action limits.
        """
        super(DQNModel, self).__init__(config=config)
        self.proposed_action_list = []
        self.action_bound = action_bound

        # One evenly stepped grid per action dimension, from `low` towards
        # `high` in ACTION_SPLIT_COUNT steps.
        split_count = self.config.config_dict['ACTION_SPLIT_COUNT']
        action_list = [
            np.arange(start=low, stop=high, step=(high - low) / split_count)
            for low, high in zip(action_bound[0], action_bound[1])
        ]

        # Every combination of per-dimension grid values is one candidate
        # action, stored as a TF constant.
        self.action_selection_list = [
            tf.constant(action_sample)
            for action_sample in itertools.product(*action_list)
        ]

        self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)

        state_shape = [None] + list(self.config.config_dict['STATE_SPACE'])
        self.state_input = tf.placeholder(shape=state_shape, dtype=tf.float32)
        self.next_state_input = tf.placeholder(shape=state_shape,
                                               dtype=tf.float32)
        self.action_input = tf.placeholder(
            shape=[None] + list(self.config.config_dict['ACTION_SPACE']),
            dtype=tf.float32)
        self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool)
        # NOTE(review): tf.concat is called without an ``axis`` argument here;
        # confirm against the TensorFlow version in use.
        self.input = tf.concat([self.state_input, self.action_input])
        self.done = tf.cast(self.done_input, dtype=tf.float32)

        # Online Q network: one output per candidate action.
        self.q_value_list = []
        var_list = None
        for action_sample in self.action_selection_list:
            # NOTE(review): this passes ``action_sample`` in tf.concat's second
            # positional slot rather than concatenating a list of tensors --
            # looks unintended, confirm before relying on it.
            _, q_output, var_list = NetworkCreator.create_network(
                input=tf.concat(self.state_input, action_sample),
                network_config=self.config.config_dict['NET_CONFIG'],
                net_name=self.config.config_dict['NAME'])
            self.q_value_list.append(q_output)
        self.var_list = var_list

        # Target Q network, evaluated on the next state.
        self.target_q_value_list = []
        for action_sample in self.action_selection_list:
            # NOTE(review): same tf.concat call shape as above -- confirm.
            _, q_output, var_list = NetworkCreator.create_network(
                input=tf.concat(self.next_state_input, action_sample),
                network_config=self.config.config_dict['NET_CONFIG'],
                net_name='TARGET' + self.config.config_dict['NAME'])
            # Fix: the outputs belong in target_q_value_list. They used to be
            # appended to target_var_list, which is overwritten just below.
            self.target_q_value_list.append(q_output)
        self.target_var_list = var_list

        self.loss, self.optimizer, self.optimize = (
            self.create_training_method())
        self.update_target_q_op = self.create_target_q_update()
        self.memory = Memory(
            limit=1e100,
            action_shape=self.config.config_dict['ACTION_SPACE'],
            observation_shape=self.config.config_dict['STATE_SPACE'])
        self.sess = tf.get_default_session()
Esempio n. 2
0
 def __init__(self, limit, env):
     """Store the capacity and environment, then allocate the replay memory.

     Args:
         limit: Capacity handed to ``Memory`` as ``limit``.
         env: Environment whose ``action_space.shape`` and
             ``observation_space.shape`` size the memory.
     """
     self.limit, self.env = limit, env
     shapes = {
         'action_shape': self.env.action_space.shape,
         'observation_shape': self.env.observation_space.shape,
     }
     self.memory = Memory(limit=self.limit, **shapes)
     # Stays None until a path is assigned.
     self.file_dir = None
Esempio n. 3
0
class Expert:
    """Memory of expert transitions plus TF losses computed from them."""

    def __init__(self, limit, env):
        """Store the capacity and environment, then allocate the memory.

        Args:
            limit: Capacity handed to ``Memory`` as ``limit``.
            env: Environment whose ``action_space.shape`` and
                ``observation_space.shape`` size the memory.
        """
        self.limit = limit
        self.env = env
        self.memory = Memory(
            limit=self.limit,
            action_shape=self.env.action_space.shape,
            observation_shape=self.env.observation_space.shape)
        self.file_dir = None

    def load_file(self, file_dir):
        """Load pickled expert episodes from ``file_dir`` into the memory.

        The file must unpickle to an iterable of episodes, each an iterable of
        steps; the first five elements of every step are passed positionally
        to ``self.memory.append``.

        Args:
            file_dir: Path of the pickle file; also stored on ``self.file_dir``.
        """
        self.file_dir = file_dir
        # SECURITY: pickle.load can execute arbitrary code while unpickling;
        # only load expert files that come from a trusted source.
        # The context manager closes the file even if unpickling raises.
        with open(self.file_dir, 'rb') as expert_file:
            expert_data = pickle.load(expert_file)
        for episode_sample in expert_data:
            for step_sample in episode_sample:
                self.memory.append(step_sample[0], step_sample[1],
                                   step_sample[2], step_sample[3],
                                   step_sample[4])

    def sample(self, batch_size):
        """Return ``self.memory.sample(batch_size)``."""
        return self.memory.sample(batch_size)

    def set_tf(self, actor, critic, obs_rms, ret_rms, observation_range,
               return_range):
        """Create the expert placeholders and the expert-based losses.

        Args:
            actor: Callable invoked as ``actor(state, reuse=True)``.
            critic: Callable invoked as ``critic(state, action, reuse=True)``.
            obs_rms: Statistics object passed to ``normalize``.
            ret_rms: Statistics object passed to ``denormalize``.
            observation_range: Indexable ``(low, high)`` clip bounds for the
                normalized expert state.
            return_range: Indexable ``(low, high)`` clip bounds for the
                normalized critic outputs.
        """
        self.expert_state = tf.placeholder(tf.float32,
                                           shape=(None, ) +
                                           self.env.observation_space.shape,
                                           name='expert_state')
        self.expert_action = tf.placeholder(tf.float32,
                                            shape=(None, ) +
                                            self.env.action_space.shape,
                                            name='expert_action')
        normalized_state = tf.clip_by_value(
            normalize(self.expert_state, obs_rms), observation_range[0],
            observation_range[1])
        # Action the current actor proposes on the expert states.
        expert_actor = actor(normalized_state, reuse=True)
        normalized_q_with_expert_data = critic(normalized_state,
                                               self.expert_action,
                                               reuse=True)
        normalized_q_with_expert_actor = critic(normalized_state,
                                                expert_actor,
                                                reuse=True)
        self.Q_with_expert_data = denormalize(
            tf.clip_by_value(normalized_q_with_expert_data, return_range[0],
                             return_range[1]), ret_rms)
        self.Q_with_expert_actor = denormalize(
            tf.clip_by_value(normalized_q_with_expert_actor, return_range[0],
                             return_range[1]), ret_rms)
        # Softplus of (Q for the actor's action - Q for the expert's action),
        # averaged over the batch.
        self.critic_loss = tf.reduce_mean(
            tf.nn.softplus(self.Q_with_expert_actor - self.Q_with_expert_data))
        self.actor_loss = -tf.reduce_mean(self.Q_with_expert_actor)
    def __init__(self, config, action_bound, obs_bound):
        """Set up the scaler, value function, policy and trajectory storage.

        Args:
            config: Configuration object forwarded to the parent constructor;
                ``self.config.config_dict`` supplies every setting read below.
            action_bound: Indexable pair; element 0 is stored as
                ``action_low`` and element 1 as ``action_high``.
            obs_bound: Accepted but not used in this constructor.
        """
        super().__init__(config=config)
        settings = self.config.config_dict
        state_shape = settings['STATE_SPACE']
        action_shape = settings['ACTION_SPACE']

        # The observation dimension is the first state dimension plus one.
        self.obs_dim = state_shape[0] + 1
        self.act_dim = action_shape[0]

        with tf.variable_scope(name_or_scope=settings['NAME']):
            self.scaler = Scaler(self.obs_dim)
            self.val_func = NNValueFunction(
                self.obs_dim,
                hid1_mult=settings['HIDDEN_MULTIPLE'],
                name_scope=settings['NAME'])
            self.policy = Policy(
                self.obs_dim,
                self.act_dim,
                kl_targ=settings['KL_TARG'],
                hid1_mult=settings['HIDDEN_MULTIPLE'],
                policy_logvar=settings['POLICY_LOGVAR'],
                name_scope=settings['NAME'])

        # Separate, initially empty buffers for the "real" and "cyber" sides.
        trajectory_keys = ('observes', 'actions', 'rewards', 'unscaled_obs')
        self._real_trajectories = {key: [] for key in trajectory_keys}
        self._cyber_trajectories = {key: [] for key in trajectory_keys}

        self._real_trajectories_memory = deque(
            maxlen=settings['EPISODE_REAL_MEMORY_SIZE'])
        self._cyber_trajectories_memory = deque(
            maxlen=settings['EPISODE_CYBER_MEMORY_SIZE'])
        self._real_step_count = 0.0
        self._cyber_step_count = 0.0

        self.action_low = action_bound[0]
        self.action_high = action_bound[1]
        self._env_status = None

        # Both transition memories share the same capacity and shapes.
        memory_kwargs = dict(limit=10000,
                             action_shape=action_shape,
                             observation_shape=state_shape)
        self.real_data_memory = Memory(**memory_kwargs)
        self.simulation_data_memory = Memory(**memory_kwargs)
Esempio n. 5
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Create a gym environment and train DDPG on it.

    Args:
        env_id: Identifier passed to ``gym.make``.
        seed: Base random seed; ``1000000 * rank`` is added per MPI rank.
        noise_type: Comma-separated noise specs. Each is ``'none'`` or
            contains ``'adaptive-param'``, ``'normal'`` or ``'ou'`` followed
            by ``_<stddev>``.
        layer_norm: Forwarded to ``Actor`` and ``Critic``.
        evaluation: Accepted but not used in this function.
        **kwargs: Must contain ``'state_shape'`` (removed before training);
            the rest is forwarded to ``training.train``.

    Raises:
        RuntimeError: If a noise spec is not recognised.
    """
    # Only the rank-0 worker keeps logging enabled.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    env = gym.make(env_id)
    eval_env = None  # no separate evaluation environment is created here

    # Translate the noise specification into noise objects.
    nb_actions = env.action_space.shape[-1]
    action_noise = None
    param_noise = None
    for raw_spec in noise_type.split(','):
        spec = raw_spec.strip()
        if spec == 'none':
            continue
        if 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay memory and networks.
    # TODO: Change back to 1e6
    memory = Memory(
        limit=int(1e2),
        state_shape=env.state_space.shape,
        action_shape=env.action_space.shape,
        observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Give every rank its own seed so runs are reproducible.
    seed = seed + 1000000 * rank
    logger.info(
        'rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only rank 0 times the run.
    if rank == 0:
        start_time = time.time()

    # 'state_shape' is dropped so it is not forwarded to training.train.
    kwargs.pop('state_shape')
    training.train(
        env=env,
        eval_env=eval_env,
        param_noise=param_noise,
        action_noise=action_noise,
        actor=actor,
        critic=critic,
        memory=memory,
        **kwargs)

    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Esempio n. 6
0
    def train(self, env, nb_steps):
        """Run DDPG training on ``env`` with fixed hyperparameters.

        Args:
            env: Environment to train on; it is seeded here and its
                action/observation shapes size the replay memory.
            nb_steps: Passed to ``training.train`` as ``nb_train_steps``.
        """
        # Only the rank-0 worker keeps logging enabled.
        rank = MPI.COMM_WORLD.Get_rank()
        if rank != 0:
            logger.set_level(logger.DISABLED)

        # Exploration uses adaptive parameter noise only; no action noise.
        nb_actions = env.action_space.shape[-1]
        action_noise = None
        noise_stddev = float(0.2)
        param_noise = AdaptiveParamNoiseSpec(
            initial_stddev=noise_stddev,
            desired_action_stddev=noise_stddev)

        # Replay memory and networks.
        memory = Memory(
            limit=int(1e6),
            action_shape=env.action_space.shape,
            observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=True)
        actor = Actor(nb_actions, layer_norm=True)

        # Give every rank its own seed so runs are reproducible.
        seed = self.seed + 1000000 * rank
        logger.info(
            'rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
        tf.reset_default_graph()
        set_global_seeds(seed)
        env.seed(seed)

        # Only rank 0 times the run.
        if rank == 0:
            start_time = time.time()

        # Fixed hyperparameters for this training pass.
        hyperparameters = dict(
            nb_epochs=1,
            nb_epoch_cycles=1,
            render_eval=False,
            reward_scale=1.0,
            render=False,
            normalize_returns=False,
            normalize_observations=True,
            critic_l2_reg=1e-2,
            actor_lr=1e-4,
            critic_lr=1e-3,
            popart=False,
            gamma=0.99,
            clip_norm=None,
            nb_train_steps=nb_steps,
            nb_rollout_steps=5,
            nb_eval_steps=5,
            batch_size=64)

        # (disabled) load_state("D:\project\osim-rl-helper\ddpg.pkl")
        training.train(
            env=env,
            param_noise=param_noise,
            restore=True,
            action_noise=action_noise,
            actor=actor,
            critic=critic,
            memory=memory,
            **hyperparameters)
        # (disabled) save_state("D:\project\osim-rl-helper\ddpg.pkl")

        if rank == 0:
            logger.info('total runtime: {}s'.format(time.time() - start_time))
Esempio n. 7
0
def run_baselines(env, seed, log_dir):
    """Train a baselines DDPG agent and return the progress CSV path.

    Replace the DDPG setup and its training call with the algorithm you want
    to run. Hyperparameters are read from a ``params`` mapping that is not
    defined inside this function.

    :param env: Environment of the task.
    :param seed: Random seed for the trial; ``1000000 * rank`` is added.
    :param log_dir: Log dir path.
    :return: ``log_dir`` joined with ``'progress.csv'``.
    """
    # Seed globally and per environment, offset by the MPI rank.
    rank = MPI.COMM_WORLD.Get_rank()
    seed = seed + 1000000 * rank
    set_global_seeds(seed)
    env.seed(seed)

    # Route baselines logging to log_dir in several formats.
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info(
        'rank {}: seed={}, logdir={}'.format(
            rank, seed, baselines_logger.get_dir()))

    # DDPG components.
    nb_actions = env.action_space.shape[-1]
    layer_norm = False

    action_noise = OrnsteinUhlenbeckActionNoise(
        mu=np.zeros(nb_actions),
        sigma=float(params['sigma']) * np.ones(nb_actions))
    memory = Memory(
        limit=params['replay_buffer_size'],
        action_shape=env.action_space.shape,
        observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Training options; several come from ``params``.
    train_options = dict(
        nb_epochs=params['n_epochs'],
        nb_epoch_cycles=params['n_epoch_cycles'],
        render_eval=False,
        reward_scale=1.,
        render=False,
        normalize_returns=False,
        normalize_observations=False,
        critic_l2_reg=0,
        actor_lr=params['policy_lr'],
        critic_lr=params['qf_lr'],
        popart=False,
        gamma=params['discount'],
        clip_norm=None,
        nb_train_steps=params['n_train_steps'],
        nb_rollout_steps=params['n_rollout_steps'],
        nb_eval_steps=100,
        batch_size=64)

    training.train(
        env=env,
        eval_env=None,
        param_noise=None,
        action_noise=action_noise,
        actor=actor,
        critic=critic,
        memory=memory,
        **train_options)

    return osp.join(log_dir, 'progress.csv')
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Train DDPG on a ``ProstheticsEnv`` instance.

    Args:
        env_id: Accepted but not used in this function.
        seed: Base random seed; also passed to ``env.change_model``.
            ``2000000 * rank`` is added before seeding.
        noise_type: Comma-separated noise specs. Each is ``'none'`` or
            contains ``'adaptive-param'``, ``'normal'`` or ``'ou'`` followed
            by ``_<stddev>``.
        layer_norm: Forwarded to ``Actor`` and ``Critic``.
        evaluation: Accepted but not used in this function.
        **kwargs: Forwarded to ``training.train``.

    Raises:
        RuntimeError: If a noise spec is not recognised.
    """
    # Only the rank-0 worker keeps logging enabled.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    logger.configure(dir='/home/vaisakhs_shaj/Desktop/DeepReinforcementLearning/5_Deep_Deterministic_Policy_Gradients/LOGS/OSIM')

    # Environment with visualisation switched on.
    env = ProstheticsEnv(visualize=True)
    env.change_model(model='2D', difficulty=0, prosthetic=True, seed=seed)
    # (disabled) env.seed(seed)
    # (disabled) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    eval_env = None  # no separate evaluation environment is created here

    # Translate the noise specification into noise objects.
    nb_actions = env.action_space.shape[-1]
    action_noise = None
    param_noise = None
    for raw_spec in noise_type.split(','):
        spec = raw_spec.strip()
        if spec == 'none':
            continue
        if 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay memory and networks.
    memory = Memory(
        limit=int(2e6),
        action_shape=env.action_space.shape,
        observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Give every rank its own seed so runs are reproducible.
    seed = seed + 2000000 * rank
    logger.info(
        'rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only rank 0 times the run.
    if rank == 0:
        start_time = time.time()
    training.train(
        env=env,
        eval_env=eval_env,
        param_noise=param_noise,
        action_noise=action_noise,
        actor=actor,
        critic=critic,
        memory=memory,
        **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Esempio n. 9
0
    def setup(self,
              obs_shape,
              nb_actions,
              action_spec,
              noise_type,
              gamma=1.,
              tau=0.01,
              layer_norm=True):
        """Create the replay memory, networks and the ``DDPG`` object.

        Args:
            obs_shape: Observation shape; stored as ``obs_dim`` and passed to
                ``Memory`` and ``DDPG``.
            nb_actions: Number of action dimensions.
            action_spec: Stored as ``action_spec_internal``.
            noise_type: Comma-separated noise specs. Each is ``'none'`` or
                contains ``'adaptive-param'``, ``'normal'`` or ``'ou'``
                followed by ``_<stddev>``.
            gamma: Forwarded to ``DDPG``.
            tau: Forwarded to ``DDPG``.
            layer_norm: Forwarded to ``Actor`` and ``Critic``.

        Raises:
            RuntimeError: If a noise spec is not recognised.
        """
        super(DDPGAgent, self).setup(obs_shape, nb_actions, action_spec,
                                     noise_type, gamma, tau, layer_norm)

        self.action_spec_internal = action_spec
        self.obs_dim = obs_shape

        # Translate the noise specification into noise objects.
        action_noise = None
        param_noise = None
        for raw_spec in noise_type.split(','):
            spec = raw_spec.strip()
            if spec == 'none':
                continue
            if 'adaptive-param' in spec:
                _, stddev = spec.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in spec:
                _, stddev = spec.split('_')
                action_noise = NormalActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in spec:
                _, stddev = spec.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(spec))

        # Replay memory and networks, kept on the instance.
        action_shape = (nb_actions, )
        self.memory = Memory(
            limit=int(500),
            action_shape=action_shape,
            observation_shape=obs_shape)
        self.critic = Critic(layer_norm=layer_norm, hidden_size=128)
        self.actor = Actor(nb_actions, layer_norm=layer_norm, hidden_size=128)

        tf.reset_default_graph()

        # (disabled) max_action = env.action_space.high
        self.ddpg = DDPG(
            actor=self.actor,
            critic=self.critic,
            memory=self.memory,
            observation_shape=obs_shape,
            action_shape=(nb_actions, ),
            gamma=gamma,
            tau=tau,
            action_noise=action_noise,
            param_noise=param_noise)
Esempio n. 10
0
def run(cfg, seed, noise_type, layer_norm, evaluation, architecture, **kwargs):
    """Train DDPG on a monitored ``GRLEnv`` built from ``cfg``.

    Args:
        cfg: Passed to ``GRLEnv``.
        seed: Base random seed; ``1000000 * rank`` is added per MPI rank.
        noise_type: Comma-separated noise specs. Each is ``'none'``, or
            contains ``'adaptive-param'`` or ``'normal'`` followed by
            ``_<stddev>``, or contains ``'ou'`` followed by
            ``_<stddev>_<theta>``.
        layer_norm: Forwarded to ``MyActor`` and ``MyCritic``.
        evaluation: Accepted but not used in this function.
        architecture: Forwarded to ``MyActor`` and ``MyCritic``.
        **kwargs: Must contain ``'output'`` (used for the monitor path); all
            of it is forwarded to ``training.train``.

    Raises:
        RuntimeError: If a noise spec is not recognised.
    """
    # Rank 0 logs to stdout next to this file; other ranks are silenced.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        dir_path = os.path.dirname(os.path.realpath(__file__))
        logger.configure(dir_path, ['stdout'])
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Environment, wrapped in a monitor writing under the logger directory.
    env = GRLEnv(cfg)
    gym.logger.setLevel(logging.WARN)
    monitor_path = os.path.join(logger.get_dir(), kwargs['output'])
    env = MyMonitor(env, monitor_path)

    # Translate the noise specification into noise objects.
    nb_actions = env.action_space.shape[-1]
    action_noise = None
    param_noise = None
    for raw_spec in noise_type.split(','):
        spec = raw_spec.strip()
        if spec == 'none':
            continue
        if 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in spec:
            # The OU spec carries a theta value as a third field.
            _, stddev, theta = spec.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                dt=0.03,
                sigma=float(stddev) * np.ones(nb_actions),
                theta=float(theta) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # Replay memory and networks.
    memory = Memory(
        limit=int(1e6),
        action_shape=env.action_space.shape,
        observation_shape=env.observation_space.shape)
    critic = MyCritic(layer_norm=layer_norm, architecture=architecture)
    actor = MyActor(nb_actions, layer_norm=layer_norm,
                    architecture=architecture)

    # Give every rank its own seed so runs are reproducible.
    seed = seed + 1000000 * rank
    logger.info(
        'rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    # Only rank 0 times the run.
    if rank == 0:
        start_time = time.time()
    training.train(
        env=env,
        param_noise=param_noise,
        action_noise=action_noise,
        actor=actor,
        critic=critic,
        memory=memory,
        **kwargs)
    env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Esempio n. 11
0
def train_ddpg(env, N_episodes):
    """Build the DDPG noise, memory and networks for ``env``.

    The objects are only created here; nothing is returned. ``stddev`` and
    ``layer_norm`` are not defined inside this function and must come from an
    enclosing scope.

    Args:
        env: Environment whose action/observation shapes size the components.
        N_episodes: Accepted but not used in the visible body.
    """
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mu=np.zeros(nb_actions),
        sigma=float(stddev) * np.ones(nb_actions))

    # Replay memory and networks.
    memory = Memory(
        limit=int(1e6),
        action_shape=env.action_space.shape,
        observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)
Esempio n. 12
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Train DDPG on a gym environment with unit-sigma OU action noise.

    Args:
        env_id: Identifier passed to ``gym.make``.
        seed: Base random seed; ``1000000 * rank`` is added per MPI rank.
        noise_type: Accepted but not used; the noise is fixed below.
        layer_norm: Forwarded to ``Actor`` and ``Critic``.
        evaluation: When truthy on rank 0, a monitored evaluation environment
            is created and seeded. It is not passed to ``training.train``.
        **kwargs: Forwarded to ``training.train``.
    """
    param_noise = None

    # Only the rank-0 worker keeps logging enabled.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    env = gym.make(env_id)
    # (disabled) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    # Optional evaluation environment, rank 0 only.
    eval_env = None
    if evaluation and rank == 0:
        eval_env = bench.Monitor(
            gym.make(env_id),
            os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)

    # Fixed exploration noise: OU process with zero mean and unit sigma.
    nb_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mu=np.zeros(nb_actions),
        sigma=np.ones(nb_actions))

    # Replay memory and networks.
    memory = Memory(
        limit=int(1e6),
        action_shape=env.action_space.shape,
        observation_shape=env.observation_space.shape)
    actor = Actor(nb_actions, layer_norm=layer_norm)
    critic = Critic(layer_norm=layer_norm)

    # Give every rank its own seed so runs are reproducible.
    seed = seed + 1000000 * rank
    logger.info(
        'rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only rank 0 times the run.
    if rank == 0:
        start_time = time.time()
    training.train(
        env=env,
        action_noise=action_noise,
        actor=actor,
        critic=critic,
        memory=memory,
        **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Esempio n. 13
0
def main():
    """Create the "kuka-v0" environment and train DDPG on it.

    The result of ``parse_args()`` is unpacked with ``**`` into
    ``training.train``, so it must be a mapping.

    Raises:
        RuntimeError: If a noise spec is not recognised.
    """
    args = parse_args()

    # Environment. Original author's note: "kuka-v0" still needs to be created.
    env = gym.make("kuka-v0")
    env.init_bullet(render=True)

    # Space sizes (not used further below).
    # (disabled) model = deepq.models.mlp([16, 16])
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n

    # Translate the hard-coded noise specification into noise objects.
    nb_actions = env.action_space.shape[-1]
    action_noise = None
    param_noise = None
    noise_type = 'adaptive-param_0.2'
    for raw_spec in noise_type.split(','):
        spec = raw_spec.strip()
        if spec == 'none':
            continue
        if 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in spec:
            _, stddev = spec.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in spec:
            _, stddev = spec.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))

    # An earlier GaussianMlpPolicy / NeuralNetValueFunction setup driven by
    # learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
    # timesteps_per_batch=2500, desired_kl=0.002, num_timesteps=1000,
    # animate=False) was disabled here.

    # Replay memory and networks.
    memory = Memory(
        limit=int(1e6),
        action_shape=env.action_space.shape,
        observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=False)
    actor = Actor(nb_actions, layer_norm=False)

    training.train(
        env=env,
        param_noise=param_noise,
        action_noise=action_noise,
        actor=actor,
        critic=critic,
        memory=memory,
        **args)

    env.close()
Esempio n. 14
0
File: ddpg.py Progetto: DiddiZ/gps
    def __init__(self, hyperparams, dX, dU):
        """Set up the DDPG-backed policy optimizer.

        Args:
            hyperparams: Dictionary of hyperparameters. The keys read here are
                ``epochs``, ``param_noise_adaption_interval``, ``seed``,
                ``network``, ``network_kwargs``, ``memory_limit`` and
                ``ddpg_kwargs``.
            dX: Dimension of state space.
            dU: Dimension of action space.

        """
        PolicyOpt.__init__(self, hyperparams, dX, dU)
        self.dX = dX
        self.dU = dU

        self.epochs = hyperparams['epochs']
        self.param_noise_adaption_interval = hyperparams[
            'param_noise_adaption_interval']
        set_global_seeds(hyperparams['seed'])

        # Components of the DDPG policy, built in the order they are passed.
        actor = Actor(dU,
                      network=hyperparams['network'],
                      **hyperparams['network_kwargs'])
        critic = Critic(network=hyperparams['network'],
                        **hyperparams['network_kwargs'])
        memory = Memory(limit=hyperparams['memory_limit'],
                        action_shape=(dU, ),
                        observation_shape=(dX, ))
        self.pol = DDPG(
            actor,
            critic,
            memory,
            observation_shape=(dX, ),
            action_shape=(dU, ),
            param_noise=AdaptiveParamNoiseSpec(
                initial_stddev=0.2, desired_action_stddev=0.2),
            **hyperparams['ddpg_kwargs'])

        # Initialise the policy in the session, then finalize the graph.
        session = get_session()
        self.pol.initialize(session)
        session.graph.finalize()

        self.policy = self  # Act method is contained in this class
class DQNModel(TensorflowBasedModel):
    """Q-value model that scores a fixed grid of discretised actions.

    Every action dimension is sampled with ``np.arange`` between its lower
    and upper bound using ``ACTION_SPLIT_COUNT`` steps; the Cartesian product
    of those samples is stored in ``action_selection_list``. One Q output and
    one target-Q output are created per candidate action.
    """
    key_list = Config.load_json(file_path=None)

    def __init__(self, config, action_bound):
        """Create placeholders, Q / target-Q networks, training ops and memory.

        Args:
            config: Configuration object. Its ``config_dict`` is read for
                'ACTION_SPLIT_COUNT', 'STATE_SPACE', 'ACTION_SPACE',
                'NET_CONFIG', 'NAME', 'LEARNING_RATE', 'DISCOUNT' and
                'DECAY' during construction.
            action_bound: Pair ``(lows, highs)`` holding the per-dimension
                lower and upper action bounds.
        """
        super(DQNModel, self).__init__(config=config)
        self.proposed_action_list = []
        self.action_bound = action_bound

        # Discretise each action dimension, then enumerate every combination.
        split_count = self.config.config_dict['ACTION_SPLIT_COUNT']
        action_list = [
            np.arange(start=low, stop=high, step=(high - low) / split_count)
            for low, high in zip(action_bound[0], action_bound[1])
        ]
        self.action_selection_list = [
            tf.constant(action_sample)
            for action_sample in itertools.product(*action_list)
        ]

        self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)

        state_shape = [None] + list(self.config.config_dict['STATE_SPACE'])
        self.state_input = tf.placeholder(shape=state_shape, dtype=tf.float32)
        self.next_state_input = tf.placeholder(shape=state_shape,
                                               dtype=tf.float32)
        self.action_input = tf.placeholder(
            shape=[None] + list(self.config.config_dict['ACTION_SPACE']),
            dtype=tf.float32)
        self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool)
        # tf.concat needs a list of tensors and an explicit axis.
        # assumes state and action placeholders are both [batch, dim]
        # -- TODO confirm against STATE_SPACE / ACTION_SPACE.
        self.input = tf.concat([self.state_input, self.action_input], axis=1)
        self.done = tf.cast(self.done_input, dtype=tf.float32)

        # One Q output per candidate action, evaluated on the current state.
        self.q_value_list = []
        var_list = None
        for action_sample in self.action_selection_list:
            _, q_output, var_list = NetworkCreator.create_network(
                input=self._state_action_input(self.state_input,
                                               action_sample),
                network_config=self.config.config_dict['NET_CONFIG'],
                net_name=self.config.config_dict['NAME'])
            self.q_value_list.append(q_output)
        self.var_list = var_list

        # One target-Q output per candidate action, evaluated on next state.
        self.target_q_value_list = []
        for action_sample in self.action_selection_list:
            _, q_output, var_list = NetworkCreator.create_network(
                input=self._state_action_input(self.next_state_input,
                                               action_sample),
                network_config=self.config.config_dict['NET_CONFIG'],
                net_name='TARGET' + self.config.config_dict['NAME'])
            # Fixed: the outputs used to be appended to ``target_var_list``,
            # which does not exist yet at this point.
            self.target_q_value_list.append(q_output)
        self.target_var_list = var_list

        # NOTE(review): ``q_output``, ``target_q_output`` and
        # ``predict_q_value`` were read by the loss / target helpers but never
        # assigned. The wiring below assumes create_network reuses variables
        # when called again with the same net_name (the per-action loops above
        # already rely on that) -- please confirm.
        _, self.q_output, _ = NetworkCreator.create_network(
            input=self.input,
            network_config=self.config.config_dict['NET_CONFIG'],
            net_name=self.config.config_dict['NAME'])
        # Greedy target value: best target-Q over all candidate actions.
        self.target_q_output = tf.reduce_max(
            tf.stack(self.target_q_value_list, axis=0), axis=0)
        self.predict_q_value = self.create_predict_q_value_op()

        self.loss, self.optimizer, self.optimize = self.create_training_method(
        )
        self.update_target_q_op = self.create_target_q_update()
        # NOTE(review): a limit of 1e100 is a float and cannot be
        # pre-allocated by an array-backed buffer -- verify against Memory.
        self.memory = Memory(
            limit=1e100,
            action_shape=self.config.config_dict['ACTION_SPACE'],
            observation_shape=self.config.config_dict['STATE_SPACE'])
        self.sess = tf.get_default_session()

    def _state_action_input(self, state, action_sample):
        """Concatenate a batch of states with one fixed candidate action.

        The action constant is reshaped to a single row, cast to the state
        dtype and tiled along the batch axis so it can be concatenated with
        every state in the batch.

        Args:
            state: State tensor; assumed to be [batch, state_dim]
                -- TODO confirm.
            action_sample: Constant tensor holding one candidate action.

        Returns:
            Tensor with the tiled action appended to ``state`` along axis 1.
        """
        batch_size = tf.shape(state)[0]
        action_row = tf.cast(tf.reshape(action_sample, [1, -1]), state.dtype)
        tiled_action = tf.tile(action_row, [batch_size, 1])
        return tf.concat([state, tiled_action], axis=1)

    def update(self):
        """Run 'ITERATION_EVER_EPOCH' optimisation steps on sampled batches."""
        for _ in range(self.config.config_dict['ITERATION_EVER_EPOCH']):
            batch_data = self.memory.sample(
                batch_size=self.config.config_dict['BATCH_SIZE'])
            self.sess.run(
                fetches=[self.loss, self.optimize],
                feed_dict={
                    self.reward_input: batch_data['rewards'],
                    self.action_input: batch_data['actions'],
                    self.state_input: batch_data['obs0'],
                    # The target-Q heads read next_state_input, so it must be
                    # fed; presumably exposed as 'obs1' next to 'obs0'
                    # -- verify against Memory.sample.
                    self.next_state_input: batch_data['obs1'],
                    self.done_input: batch_data['terminals1']
                })

    def predict(self, obs, q_value):
        """Not implemented; currently a no-op that returns None."""
        pass

    def print_log_queue(self, status):
        """Drain ``log_queue``, print each critic loss and archive the entry.

        Args:
            status: Stored on ``self.status`` before the queue is drained.
        """
        self.status = status
        while self.log_queue.qsize() > 0:
            log = self.log_queue.get()
            print("%s: Critic loss %f: " %
                  (self.name, log[self.name + '_CRITIC']))
            log['INDEX'] = self.log_print_count
            self.log_file_content.append(log)
            self.log_print_count += 1

    def create_training_method(self):
        """Build the loss, the optimizer and the minimisation op.

        The loss is the summed squared difference between
        ``predict_q_value`` and ``q_output`` plus an L1/L2 penalty on
        ``var_list``.

        Returns:
            Tuple ``(loss, optimizer, optimize_op)``.
        """
        l1_l2 = tfcontrib.layers.l1_l2_regularizer()
        loss = tf.reduce_sum((self.predict_q_value - self.q_output) ** 2) + \
               tfcontrib.layers.apply_regularization(l1_l2, weights_list=self.var_list)
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate=self.config.config_dict['LEARNING_RATE'])
        optimize_op = optimizer.minimize(loss=loss, var_list=self.var_list)
        return loss, optimizer, optimize_op

    def create_predict_q_value_op(self):
        """Return ``reward + (1 - done) * DISCOUNT * target_q_output``."""
        predict_q_value = (1. - self.done) * self.config.config_dict['DISCOUNT'] * self.target_q_output \
                          + self.reward_input
        return predict_q_value

    def create_target_q_update(self):
        """Build the ops that blend the online weights into the target net.

        Each target variable is assigned
        ``DECAY * target_var + (1 - DECAY) * var``.

        Returns:
            List of assign ops, one per (variable, target variable) pair.
        """
        decay = self.config.config_dict['DECAY']
        # Fixed: tf.assign takes (ref, value). The target variable is the
        # ref; it used to receive the blended tensor as ref instead.
        return [
            tf.assign(target_var, decay * target_var + (1.0 - decay) * var)
            for var, target_var in zip(self.var_list, self.target_var_list)
        ]

    def store_one_sample(self, state, next_state, action, reward, done, *arg,
                         **kwargs):
        """Append one transition to the memory.

        Extra positional and keyword arguments are accepted and ignored.
        """
        self.memory.append(obs0=state,
                           obs1=next_state,
                           action=action,
                           reward=reward,
                           terminal1=done)
Esempio n. 16
0
        _, stddev = current_noise_type.split('_')
        action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                         sigma=float(stddev) *
                                         np.ones(nb_actions))
    elif 'ou' in current_noise_type:
        _, stddev = current_noise_type.split('_')
        action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                    sigma=float(stddev) *
                                                    np.ones(nb_actions))
    else:
        raise RuntimeError(
            'unknown noise type "{}"'.format(current_noise_type))

# Size of the action vector, read from the last axis of the action space.
nb_actions = env.action_space.shape[-1]
# Memory buffer shaped after the environment's action and observation spaces.
memory = Memory(limit=int(1e6),
                action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
# Critic and actor are both configured with the same layer_norm setting.
critic = Critic(layer_norm=layer_norm)
actor = Actor(nb_actions, layer_norm=layer_norm)

# Reset TensorFlow's default graph before the agent is constructed below.
tf.reset_default_graph()

agent = DDPG(actor,
             critic,
             memory,
             env.observation_space.shape,
             env.action_space.shape,
             gamma=gamma,
             tau=tau,
             normalize_returns=normalize_returns,
             normalize_observations=normalize_observations,
Esempio n. 17
0
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Prepare one DDPG experiment and pass it to ``training.train``.

    Args:
        env_id: Key into ``experiments``; the entry supplies 'env_call'
            (environment factory) and 'normalize_env' (whether to wrap the
            environment with ``normalize``).
        seed: Base random seed; ``1000000 * rank`` is added per MPI rank.
        noise_type: Comma-separated noise specs. Each is 'none' or
            '<kind>_<stddev>' where the kind contains 'adaptive-param',
            'normal' or 'ou'.
        layer_norm: Forwarded to the ``Critic`` and ``Actor`` constructors.
        evaluation: When true, rank 0 also builds a monitored evaluation
            environment.
        **kwargs: Forwarded unchanged to ``training.train``.

    Raises:
        RuntimeError: If a noise spec matches none of the known kinds.
    """
    # Only the rank-0 process keeps its logger enabled.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Training environment, optionally normalized, monitored per rank.
    experiment = experiments[env_id]
    env = experiment['env_call']()
    if experiment['normalize_env']:
        env = normalize(env)
    log_dir = logger.get_dir()
    env = bench.Monitor(env, log_dir and os.path.join(log_dir, str(rank)))
    gym.logger.setLevel(logging.WARN)

    # Evaluation environment exists on rank 0 only, and only on request.
    eval_env = None
    if evaluation and rank == 0:
        eval_env = experiment['env_call']()
        if experiment['normalize_env']:
            eval_env = normalize(eval_env)
        eval_env = bench.Monitor(
            eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)

    # Translate the noise spec string into noise objects; a later spec of
    # the same family replaces an earlier one.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for raw_spec in noise_type.split(','):
        spec = raw_spec.strip()
        if spec == 'none':
            continue
        if 'adaptive-param' in spec:
            _, stddev = spec.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
            continue
        if 'normal' in spec:
            noise_cls = NormalActionNoise
        elif 'ou' in spec:
            noise_cls = OrnsteinUhlenbeckActionNoise
        else:
            raise RuntimeError('unknown noise type "{}"'.format(spec))
        _, stddev = spec.split('_')
        action_noise = noise_cls(mu=np.zeros(nb_actions),
                                 sigma=float(stddev) * np.ones(nb_actions))

    # Memory, critic and actor.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Per-rank seed so every MPI process draws different random numbers.
    # The environments themselves are not seeded here.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)

    # Wall-clock timing is tracked and reported by rank 0 only.
    if rank == 0:
        start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Esempio n. 18
0
# Input tables for the experiment.
dataPrimary = pd.read_csv("data_p/monitorChange.csv")
dataProgress = pd.read_csv("data_p/progress.csv")

# Shapes handed to the memory buffer and the DDPG agent.
action_shape = (1, )
nb_action = 1
observation_shape = (3, )

t_train_time = 10000
t_test_time = 10000
network = 'mlp'
action_noise = None
param_noise = None
# Fixed: a trailing comma used to turn this into the tuple (False,), which
# is truthy wherever the flag is tested.
popart = False
# The script used to assign 'ddpg_model' here and immediately overwrite it
# with None; only the effective value is kept.
load_path = None

# Memory buffer, critic and actor used to build the agent.
memory = Memory(limit=int(1e6),
                action_shape=action_shape,
                observation_shape=observation_shape)
critic = Critic(network=network)
actor = Actor(nb_action, network=network)

agent = DDPG(actor,
             critic,
             memory,
             observation_shape,
             action_shape,
             gamma=0.99,
             tau=0.01,
             normalize_returns=False,
             normalize_observations=True,
             batch_size=32,
             action_noise=action_noise,
Esempio n. 19
0
def run(env_id, seed, noise_type, layer_norm, evaluation, perform, use_expert,
        expert_dir, use_trpo_expert, expert_limit, **kwargs):
    """Configure a DDPG run, optionally with expert data, and start training.

    Args:
        env_id: Gym environment id passed to ``gym.make``.
        seed: Base random seed; ``1000000 * rank`` is added per MPI rank.
        noise_type: Comma-separated noise specs. Each is 'none' or
            '<kind>_<stddev>' where the kind contains 'adaptive-param',
            'normal' or 'ou'.
        layer_norm: Forwarded to the ``Critic`` and ``Actor`` constructors.
        evaluation: Request an evaluation environment (built on rank 0).
        perform: Forwarded to ``training.train``; forced to False when
            ``evaluation`` is also set. When still true it also triggers
            creation of the evaluation environment.
        use_expert: Load expert data with ``Expert.load_file``.
        expert_dir: Path of the expert data. With ``use_expert`` it defaults
            to './expert/<env spec id>/expert.pkl' when None.
        use_trpo_expert: Load expert data with ``Expert.load_file_trpo``
            (only considered when ``use_expert`` is false).
        expert_limit: ``limit`` argument of the ``Expert`` constructor.
        **kwargs: Forwarded unchanged to ``training.train``.

    Raises:
        RuntimeError: If a noise spec matches none of the known kinds.
    """
    # Silence logging everywhere except on rank 0.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Monitored training environment.
    env = bench.Monitor(
        gym.make(env_id),
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    gym.logger.setLevel(logging.WARN)

    # Evaluation wins over perform when both are requested.
    perform = False if (evaluation and perform) else perform

    wants_eval_env = (evaluation and rank == 0) or perform
    eval_env = None
    if wants_eval_env:
        eval_env = bench.Monitor(gym.make(env_id),
                                 os.path.join(logger.get_dir(), 'gym_eval'))

    # Decode the noise specification.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for token in noise_type.split(','):
        token = token.strip()
        if token == 'none':
            continue

        # The kinds are tested in a fixed order; the first match decides.
        is_param = 'adaptive-param' in token
        is_normal = not is_param and 'normal' in token
        is_ou = not (is_param or is_normal) and 'ou' in token
        if not (is_param or is_normal or is_ou):
            raise RuntimeError(
                'unknown noise type "{}"'.format(token))

        _, stddev = token.split('_')
        scale = float(stddev)
        if is_param:
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=scale,
                                                 desired_action_stddev=scale)
        elif is_normal:
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions), sigma=scale * np.ones(nb_actions))
        else:
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions), sigma=scale * np.ones(nb_actions))

    # Core DDPG components.
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Optional expert data; use_expert takes priority over use_trpo_expert.
    expert = None
    if use_expert:
        expert = Expert(limit=expert_limit, env=env)
        if expert_dir is None:
            expert_dir = os.path.join('./expert',
                                      env.env.spec.id) + '/expert.pkl'
        expert.load_file(expert_dir)
    elif use_trpo_expert:
        assert expert_dir is not None
        expert = Expert(limit=expert_limit, env=env)
        expert.load_file_trpo(expert_dir)

    # Give every rank its own seed and seed all random sources with it.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Rank 0 measures and reports the total runtime.
    if rank == 0:
        start_time = time.time()
    training.train(env=env,
                   eval_env=eval_env,
                   param_noise=param_noise,
                   action_noise=action_noise,
                   actor=actor,
                   critic=critic,
                   memory=memory,
                   perform=perform,
                   expert=expert,
                   **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Esempio n. 20
0
    def train(self,
              env_fn,
              num_timesteps,
              noise_type,
              layer_norm,
              folder,
              load_policy,
              video_width,
              video_height,
              plot_rewards,
              save_every=50,
              seed=1234,
              episode_length=1000,
              pi_hid_size=150,
              pi_num_hid_layers=3,
              render_frames=_render_frames,
              **kwargs):
        """Train a policy with the algorithm selected by ``self.method``.

        Supported methods are "ppo", "trpo", "acktr", "ddpg" and "sql"; any
        other value only prints an error to stderr. Every ``save_every``
        iterations rank 0 rolls out one episode, writes it as a video and
        (for all methods except "ddpg") saves a checkpoint.

        Args:
            env_fn: Zero-argument callable that returns the environment.
            num_timesteps: Training budget handed to the chosen algorithm.
            noise_type: Comma-separated noise specs, used by "ddpg" only.
                Each is 'none' or '<kind>_<stddev>' where the kind contains
                'adaptive-param', 'normal' or 'ou'.
            layer_norm: Forwarded to the DDPG ``Critic`` and ``Actor``.
            folder: Output folder for the TensorFlow summary and videos.
            load_policy: Path given to ``utils.load_state`` at iteration 0,
                or None to skip loading (ignored for "ddpg").
            video_width: Width used to scale the reward overlay.
            video_height: Height used to size the reward overlay.
            plot_rewards: Whether rewards are collected for the overlay.
            save_every: Video/checkpoint interval; values below 1 become 1.
            seed: Base seed; ``10000 * rank`` is added per MPI rank.
            episode_length: Maximum number of steps of the recorded rollout.
            pi_hid_size: Hidden layer width of the policy network.
            pi_num_hid_layers: Number of hidden layers of the policy network.
            render_frames: Callable taking the environment and returning the
                images of the current step.
            **kwargs: Forwarded to ``ddpg.train`` (used by "ddpg" only).

        Raises:
            RuntimeError: For an unknown noise spec when method is "ddpg".
        """
        num_cpu = self.workers
        if sys.platform == 'darwin':
            num_cpu //= 2
        config = tf.ConfigProto(
            allow_soft_placement=True,
            intra_op_parallelism_threads=num_cpu,
            inter_op_parallelism_threads=num_cpu)

        if self.gpu_usage is None or self.gpu_usage <= 0.:
            # Hide all GPUs from TensorFlow.
            os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        else:
            config.gpu_options.allow_growth = True  # pylint: disable=E1101
            config.gpu_options.per_process_gpu_memory_fraction = self.gpu_usage / self.workers
        # The session is entered here and is not exited inside this method.
        tf.Session(config=config).__enter__()

        worker_seed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        set_global_seeds(worker_seed)

        tf.set_random_seed(worker_seed)
        np.random.seed(worker_seed)

        save_every = max(1, save_every)

        env = env_fn()
        env.seed(worker_seed)

        rank = MPI.COMM_WORLD.Get_rank()
        logger.info('rank {}: seed={}, logdir={}'.format(rank, worker_seed,
                                                         logger.get_dir()))

        def policy_fn(name, ob_space, ac_space):
            """Build the MLP policy used by the "ppo" and "trpo" methods."""
            return mlp_policy.MlpPolicy(
                name=name,
                ob_space=ob_space,
                ac_space=ac_space,
                hid_size=pi_hid_size,
                num_hid_layers=pi_num_hid_layers)

        env = bench.Monitor(
            env,
            logger.get_dir() and osp.join(logger.get_dir(), str(rank)),
            allow_early_resets=True)
        gym.logger.setLevel(logging.INFO)

        that = self

        # Name of the iteration counter inside the learner's local variables.
        iter_name = 'iters_so_far'
        if self.method == 'sql':
            iter_name = 'epoch'

        # TODO replace with utils.create_callback(...)
        def callback(locals, globals):
            """Per-iteration hook: load weights, write summary, video, checkpoint.

            Args:
                locals: Local variables of the calling learner; read for the
                    iteration counter and the policy/agent object.
                globals: Unused.
            """
            if that.method != "ddpg":
                if load_policy is not None and locals[iter_name] == 0:
                    # Loading is best-effort. Catch Exception rather than
                    # using a bare except so KeyboardInterrupt and SystemExit
                    # still propagate.
                    # noinspection PyBroadException
                    try:
                        utils.load_state(load_policy)
                        if MPI.COMM_WORLD.Get_rank() == 0:
                            logger.info("Loaded policy network weights from %s." % load_policy)
                    except Exception:
                        logger.error("Failed to load policy network weights from %s." % load_policy)
                if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] == 0:
                    # save TensorFlow summary (contains at least the graph definition)
                    _ = tf.summary.FileWriter(folder, tf.get_default_graph())
            if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] % save_every == 0:
                print('Saving video and checkpoint for policy at iteration %i...' %
                      locals[iter_name])
                ob = env.reset()
                images = []
                rewards = []
                max_reward = 1.  # if any reward > 1, we have to rescale
                lower_part = video_height // 5
                for i in range(episode_length):
                    # Each algorithm exposes its policy under a different
                    # name and with a different call signature.
                    if that.method == "ddpg":
                        ac, _ = locals['agent'].pi(ob, apply_noise=False, compute_Q=False)
                    elif that.method == "sql":
                        ac, _ = locals['policy'].get_action(ob)
                    elif isinstance(locals['pi'], GaussianMlpPolicy):
                        ac, _, _ = locals['pi'].act(np.concatenate((ob, ob)))
                    else:
                        ac, _ = locals['pi'].act(False, ob)
                    ob, rew, new, _ = env.step(ac)
                    images.append(render_frames(env))
                    if plot_rewards:
                        rewards.append(rew)
                        max_reward = max(rew, max_reward)
                    if new:
                        break

                # Draw the reward history into the bottom of every frame:
                # red marks negative rewards, orange bars scale with the
                # reward relative to max_reward.
                orange = np.array([255, 163, 0])
                red = np.array([255, 0, 0])
                video = []
                width_factor = 1. / episode_length * video_width
                for i, imgs in enumerate(images):
                    for img in imgs:
                        img[-lower_part, :10] = orange
                        img[-lower_part, -10:] = orange
                        if episode_length < video_width:
                            # More pixels than steps: each reward fills the
                            # span since the previous reward's x position.
                            p_rew_x = 0
                            for j, r in enumerate(rewards[:i]):
                                rew_x = int(j * width_factor)
                                if r < 0:
                                    img[-1:, p_rew_x:rew_x] = red
                                else:
                                    rew_y = int(r / max_reward * lower_part)
                                    img[-rew_y - 1:, p_rew_x:rew_x] = orange
                                p_rew_x = rew_x
                        else:
                            for j, r in enumerate(rewards[:i]):
                                rew_x = int(j * width_factor)
                                if r < 0:
                                    img[-1:, rew_x] = red
                                else:
                                    rew_y = int(r / max_reward * lower_part)
                                    img[-rew_y - 1:, rew_x] = orange
                    video.append(np.hstack(imgs))

                imageio.mimsave(
                    os.path.join(folder, "videos", "%s_%s_iteration_%i.mp4" %
                                 (that.environment, that.method, locals[iter_name])),
                    video,
                    fps=60)
                env.reset()

                if that.method != "ddpg":
                    utils.save_state(os.path.join(that.folder, "checkpoints", "%s_%i" %
                                                 (that.environment, locals[iter_name])))

        if self.method == "ppo":
            pposgd_simple.learn(
                env,
                policy_fn,
                max_timesteps=int(num_timesteps),
                timesteps_per_actorbatch=1024,  # 256
                clip_param=0.2,
                entcoeff=0.01,
                optim_epochs=4,
                optim_stepsize=1e-3,  # 1e-3
                optim_batchsize=64,
                gamma=0.99,
                lam=0.95,
                schedule='linear',  # 'linear'
                callback=callback)
        elif self.method == "trpo":
            trpo_mpi.learn(
                env,
                policy_fn,
                max_timesteps=int(num_timesteps),
                timesteps_per_batch=1024,
                max_kl=0.1,  # 0.01
                cg_iters=10,
                cg_damping=0.1,
                gamma=0.99,
                lam=0.98,
                vf_iters=5,
                vf_stepsize=1e-3,
                callback=callback)
        elif self.method == "acktr":
            from algos.acktr import acktr
            with tf.Session(config=tf.ConfigProto()):
                ob_dim = env.observation_space.shape[0]
                ac_dim = env.action_space.shape[0]
                with tf.variable_scope("vf"):
                    vf = NeuralNetValueFunction(ob_dim, ac_dim)
                with tf.variable_scope("pi"):
                    policy = GaussianMlpPolicy(ob_dim, ac_dim)
                acktr.learn(
                    env,
                    pi=policy,
                    vf=vf,
                    gamma=0.99,
                    lam=0.97,
                    timesteps_per_batch=1024,
                    desired_kl=0.01,  # 0.002
                    num_timesteps=num_timesteps,
                    animate=False,
                    callback=callback)
        elif self.method == "ddpg":
            from algos.ddpg import ddpg
            # Parse noise_type
            action_noise = None
            param_noise = None
            nb_actions = env.action_space.shape[-1]
            for current_noise_type in noise_type.split(','):
                current_noise_type = current_noise_type.strip()
                if current_noise_type == 'none':
                    pass
                elif 'adaptive-param' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    from baselines.ddpg.noise import AdaptiveParamNoiseSpec
                    param_noise = AdaptiveParamNoiseSpec(
                        initial_stddev=float(stddev),
                        desired_action_stddev=float(stddev))
                elif 'normal' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    from baselines.ddpg.noise import NormalActionNoise
                    action_noise = NormalActionNoise(
                        mu=np.zeros(nb_actions),
                        sigma=float(stddev) * np.ones(nb_actions))
                elif 'ou' in current_noise_type:
                    from baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
                    _, stddev = current_noise_type.split('_')
                    action_noise = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(nb_actions),
                        sigma=float(stddev) * np.ones(nb_actions))
                else:
                    raise RuntimeError(
                        'unknown noise type "{}"'.format(current_noise_type))

            # Configure components.
            memory = Memory(
                limit=int(1e6),
                action_shape=env.action_space.shape,
                observation_shape=env.observation_space.shape)
            critic = Critic(layer_norm=layer_norm)
            actor = Actor(nb_actions, layer_norm=layer_norm)

            ddpg.train(
                env=env,
                eval_env=None,
                param_noise=param_noise,
                render=False,
                render_eval=False,
                action_noise=action_noise,
                actor=actor,
                critic=critic,
                memory=memory,
                callback=callback,
                **kwargs)
        elif self.method == "sql":
            from softqlearning.algorithms import SQL
            from softqlearning.misc.kernel import adaptive_isotropic_gaussian_kernel
            from softqlearning.misc.utils import timestamp
            from softqlearning.replay_buffers import SimpleReplayBuffer
            from softqlearning.value_functions import NNQFunction
            from softqlearning.policies import StochasticNNPolicy

            from rllab.envs.gym_env import GymEnv

            env = GymEnv(env)

            variant = {
                'seed': [1, 2, 3],
                'policy_lr': 3E-4,
                'qf_lr': 3E-4,
                'discount': 0.99,
                'layer_size': 128,
                'batch_size': 128,
                'max_pool_size': 1E6,
                'n_train_repeat': 1,
                'epoch_length': 1000,
                'snapshot_mode': 'last',
                'snapshot_gap': 100,
            }

            pool = SimpleReplayBuffer(
                env_spec=env.spec,
                max_replay_buffer_size=variant['max_pool_size'],
            )

            base_kwargs = dict(
                min_pool_size=episode_length,
                epoch_length=episode_length,
                n_epochs=num_timesteps,
                max_path_length=episode_length,
                batch_size=variant['batch_size'],
                n_train_repeat=variant['n_train_repeat'],
                eval_render=False,
                eval_n_episodes=1,
                iter_callback=callback
            )

            qf = NNQFunction(
                env_spec=env.spec,
                hidden_layer_sizes=tuple([pi_hid_size] * pi_num_hid_layers),
            )

            pi_layers = tuple([pi_hid_size] * pi_num_hid_layers)
            policy = StochasticNNPolicy(env_spec=env.spec, hidden_layer_sizes=pi_layers)

            algorithm = SQL(
                base_kwargs=base_kwargs,
                env=env,
                pool=pool,
                qf=qf,
                policy=policy,
                kernel_fn=adaptive_isotropic_gaussian_kernel,
                kernel_n_particles=32,
                kernel_update_ratio=0.5,
                value_n_particles=16,
                td_target_update_interval=1000,
                qf_lr=variant['qf_lr'],
                policy_lr=variant['policy_lr'],
                discount=variant['discount'],
                reward_scale=1,
                save_full_state=False,
            )

            algorithm.train()
        else:
            print('ERROR: Invalid "method" argument provided.', file=sys.stderr)
        env.close()
    def run(self):
        """Worker loop (overrides ``Process.run``).

        Builds an environment and a DDPG agent, then serves messages from
        ``inputQ`` until 'exit' arrives. A 'test' message loads the supplied
        actor weights, runs the testing function and posts the results to
        ``outputQ``. Any other message is ignored.
        """
        env = create_environment(
            action_repeat=self.action_repeat,
            full=self.full,
            exclude_centering_frame=self.exclude_centering_frame,
            visualize=self.visualize,
            fail_reward=self.fail_reward,
            integrator_accuracy=self.integrator_accuracy)
        # Not referenced again in this method.
        nb_actions = env.action_space.shape[-1]

        # The process id doubles as the random seed of this worker.
        env.seed(os.getpid())
        set_global_seeds(os.getpid())

        # Trajectories run since the environment was last restored.
        trajectories_done = 0

        obs_shape = env.observation_space.shape
        act_shape = env.action_space.shape
        replay = Memory(limit=int(1e6),
                        action_shape=act_shape,
                        observation_shape=obs_shape)

        # The agent is built without action or parameter noise.
        agent = DDPG(self.actor,
                     self.critic,
                     replay,
                     obs_shape,
                     act_shape,
                     gamma=self.gamma,
                     tau=self.tau,
                     normalize_returns=self.normalize_returns,
                     normalize_observations=self.normalize_observations,
                     batch_size=self.batch_size,
                     action_noise=None,
                     param_noise=None,
                     critic_l2_reg=self.critic_l2_reg,
                     enable_popart=self.popart,
                     clip_norm=self.clip_norm,
                     reward_scale=self.reward_scale)

        testing_fn = make_testing_fn(agent, env, self.episode_length,
                                     self.action_repeat, self.max_action,
                                     self.nb_episodes)

        with U.single_threaded_session() as sess:
            agent.initialize(sess)
            load_actor_weights = U.SetFromFlat(self.actor.trainable_vars)

            while True:
                message, actor_ws, global_step = self.inputQ.get()

                if message == 'exit':
                    print('[Worker {}] Exiting...'.format(os.getpid()))
                    env.close()
                    break

                if message != 'test':
                    continue

                # Evaluate the weights that came with the message.
                load_actor_weights(actor_ws)
                rewards, step_times, distances, episode_lengths = testing_fn()
                self.outputQ.put((rewards, step_times, distances,
                                  episode_lengths, global_step))

                # Restore the environment once enough trajectories have run.
                trajectories_done += self.nb_episodes
                if trajectories_done >= self.max_env_traj:
                    env.restore()
                    trajectories_done = 0
Esempio n. 22
0
def main():
    """Train a DDPG agent with adaptive parameter noise on ``Pendulum-v0``.

    The function never returns on its own: every iteration of the outer loop
    plays one rendered episode (storing each transition in the replay memory),
    then runs ``nb_train_steps`` training steps, and logs a summary on every
    10th iteration.
    """
    with U.single_threaded_session() as sess:
        batch_size = 64
        current_noise_type = 'adaptive-param_0.2'
        _, stddev = current_noise_type.split('_')
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        param_noise_adaption_interval = 2
        env = gym.make("Pendulum-v0")

        nb_actions = env.action_space.shape[-1]
        layer_norm = True

        # Configure components.
        memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        # Seed everything to make things reproducible.
        seed = int(1000000 * np.random.rand())
        logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
        tf.set_random_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        env.seed(seed)

        max_action = env.action_space.high
        logger.info('scaling actions by {} before executing in env'.format(max_action))
        agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                     batch_size=batch_size, param_noise=param_noise)
        logger.info('Using agent with the following configuration:')
        logger.info(str(agent.__dict__.items()))

        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()
        agent.reset()
        obs = env.reset()
        for t in itertools.count():
            # --- Rollout: play exactly one episode. ---
            episode_rewards = []
            done = False
            while not done:
                env.render()

                # Take action and update exploration to the newest value
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                # The agent's action is scaled by max_action before it is
                # handed to the environment.
                new_obs, rew, done, _ = env.step(max_action * action)

                # Book-keeping.
                agent.store_transition(obs, action, rew, new_obs, done)
                obs = new_obs

                episode_rewards.append(rew)
                if done:
                    agent.reset()
                    obs = env.reset()

            # --- Training phase. ---
            nb_train_steps = 100
            epoch_adaptive_distances = []
            epoch_critic_losses = []
            epoch_actor_losses = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary. The interval is counted in
                # training steps (t_train); keying it on the outer episode
                # counter would adapt on every step of some episodes and on
                # none of the others.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # --- Periodic logging. ---
            if t % 10 == 0:
                logger.record_tabular("steps", t)
                # NOTE(review): episode_rewards holds one entry per step of the
                # last episode, so this value is a step count even though the
                # key says "episodes" - confirm the intended metric.
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", round(np.mean(episode_rewards), 1))
                # NOTE(review): round() without ndigits rounds the losses and
                # the distance to whole numbers - confirm this is intended.
                logger.record_tabular('train/loss_actor', round(np.mean(epoch_actor_losses)))
                logger.record_tabular('train/loss_critic', round(np.mean(epoch_critic_losses)))
                logger.record_tabular('train/param_noise_distance', round(np.mean(epoch_adaptive_distances)))
                logger.dump_tabular()
Esempio n. 23
0
    def __init__(self, network, env, gamma=1, tau=0.01, total_timesteps=1e6,
                 normalize_observations=True, normalize_returns=False, enable_popart=False,
                 noise_type='adaptive-param_0.2', clip_norm=None, reward_scale=1.,
                 batch_size=128, l2_reg_coef=0.2, actor_lr=1e-4, critic_lr=1e-3,
                 observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
                 **network_kwargs):
        """Build the DDPG agent: placeholders, noise, memory, networks and setup steps.

        Args:
            network: forwarded as ``network=`` to both ``Critic`` and ``Actor``.
            env: environment; its ``observation_space`` / ``action_space``
                shapes size the placeholders and the replay memory. The action
                space must satisfy ``abs(low) == high`` (asserted below).
            gamma: discount factor used when forming ``target_Q``.
            tau, total_timesteps, enable_popart, clip_norm, reward_scale,
                batch_size, l2_reg_coef, actor_lr, critic_lr, action_range:
                stored on ``self`` unchanged; how they are consumed is decided
                by the ``setup_*`` methods and other code outside this method.
            normalize_observations: if true, a ``RunningMeanStd`` is created
                under the ``obs_rms`` scope and passed to ``normalize``.
            normalize_returns: if true, a ``RunningMeanStd`` is created under
                the ``ret_rms`` scope and passed to ``denormalize``.
            noise_type: comma-separated noise specs, or None for no noise.
                Each spec is ``none`` or ``<kind>_<stddev>`` where ``<kind>``
                contains ``adaptive-param``, ``normal`` or ``ou``.
            observation_range: (low, high) clip bounds applied to the
                normalized observations.
            return_range: (low, high) clip bounds applied to the critic
                outputs before denormalization.
            **network_kwargs: forwarded to both ``Critic`` and ``Actor``.

        Raises:
            RuntimeError: if a noise spec matches none of the known kinds.
            AssertionError: if the action space is not symmetric.
        """
        # logger.info('Using agent with the following configuration:')
        # logger.info(str(self.__dict__.items()))
        observation_shape = env.observation_space.shape
        action_shape = env.action_space.shape

        # Inputs. All placeholders have a leading dimension of None.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.env = env
        self.gamma = gamma
        self.tau = tau
        self.total_timesteps = total_timesteps
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.enable_popart = enable_popart
        self.clip_norm = clip_norm
        self.reward_scale = reward_scale
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.l2_reg_coef = l2_reg_coef

        self.stats_sample = None

        # Exploration noise: parse the comma-separated spec. When several
        # specs of the same family are given, the last one wins.
        self.action_noise = None
        self.param_noise = None
        nb_actions = self.env.action_space.shape[-1]
        if noise_type is not None:
            for current_noise_type in noise_type.split(','):
                current_noise_type = current_noise_type.strip()
                if current_noise_type == 'none':
                    pass
                elif 'adaptive-param' in current_noise_type:
                    # The text after the underscore is used both as the
                    # initial and as the desired action stddev.
                    _, stddev = current_noise_type.split('_')
                    self.param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                         desired_action_stddev=float(stddev))
                elif 'normal' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
                elif 'ou' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                                sigma=float(stddev) * np.ones(nb_actions))
                else:
                    raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

        assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
        # Replay memory (capacity fixed at 1e6 entries) and the main networks.
        self.memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                             observation_shape=env.observation_space.shape)
        self.critic = Critic(network=network, **network_kwargs)
        self.actor = Actor(nb_actions, network=network, **network_kwargs)

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        # normalize() is called even when obs_rms is None; the result is then
        # clipped to observation_range.
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks: shallow copies of the main actor/critic
        # objects that differ only in their ``name`` attribute.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(self.critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = self.actor(normalized_obs0)
        self.normalized_critic_tf = self.critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        # Second critic call passes reuse=True and is fed the actor's output
        # instead of the actions placeholder.
        self.normalized_critic_with_actor_tf = self.critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
        # Target value: rewards + (1 - terminals1) * gamma * Q_target(obs1, pi_target(obs1)).
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet
        # Path of a 'tmp/' directory next to this source file; presumably the
        # default prefix for saved artifacts - TODO confirm against its users.
        self.def_path_pre = os.path.dirname(os.path.abspath(__file__)) + '/tmp/'
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """Set up logging, environments and DDPG components, then run training.

    Args:
        env_id: id passed to ``gym.make`` for the training (and eval) env.
        seed: base seed; ``1000000 * rank`` is added so each MPI rank differs.
        noise_type: comma-separated noise specs. Each spec is ``none`` or
            ``<kind>_<stddev>`` where ``<kind>`` contains ``adaptive-param``,
            ``normal`` or ``ou``.
        layer_norm: forwarded to ``Critic`` and ``Actor``.
        evaluation: if true, rank 0 additionally creates an evaluation env.
        **kwargs: forwarded unchanged to ``training.train``.

    Returns:
        True once training has finished and the environments are closed.

    Raises:
        RuntimeError: if a noise spec matches none of the known kinds.
    """
    logging.basicConfig(filename='noGazebo_ddpg.log',
                        level=logging.DEBUG,
                        filemode="w")
    logging.getLogger().addHandler(logging.StreamHandler())

    # Configure logger for the process with rank 0 (main-process?)
    # MPI = Message Passing Interface, for parallel computing; rank = process identifier within a group of processes
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Disable logging for rank != 0 to avoid noise.
        logging.debug(
            "I'm MPI worker {} and I guess I just log nothing".format(rank))
        logger.set_level(logger.DISABLED)
        logging.disable(logging.CRITICAL)

    logging.info(
        "********************************************* Starting RL algorithm *********************************************"
    )
    now = datetime.datetime.now()
    logging.info(now.isoformat())

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env,
                        logger.get_dir()
                        and os.path.join(logger.get_dir(), str(rank)),
                        allow_early_resets=True)

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        # Guard against a missing log directory the same way the training
        # monitor does; an unguarded os.path.join(None, ...) raises TypeError.
        eval_env = bench.Monitor(eval_env,
                                 logger.get_dir()
                                 and os.path.join(logger.get_dir(),
                                                  'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # From here on the environments exist, so make sure they are closed even
    # if noise parsing, component construction or training raises.
    try:
        # Parse noise_type
        action_noise = None
        param_noise = None
        nb_actions = env.action_space.shape[0]
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

        # Configure components. (initialize memory, critic & actor objects)
        logging.info("action space of env: {}".format(
            env.action_space))  # Box(2,)
        logging.info("observation space of env: {}".format(
            env.observation_space))  # Box(51200,)
        memory = Memory(limit=int(1e4),
                        action_shape=(env.action_space.shape[0], ),
                        observation_shape=env.observation_space.shape)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm)

        # Seed everything to make things reproducible.
        seed = seed + 1000000 * rank
        logger.info('rank {}: seed={}, logdir={}'.format(
            rank, seed, logger.get_dir()))
        tf.reset_default_graph()
        set_global_seeds(seed)
        env.seed(seed)
        if eval_env is not None:
            eval_env.seed(seed)

        # Train the RL algorithm
        start_time = time.time()
        training.train(env=env,
                       eval_env=eval_env,
                       param_noise=param_noise,
                       action_noise=action_noise,
                       actor=actor,
                       critic=critic,
                       memory=memory,
                       **kwargs)
    finally:
        # Training is done (or failed): release the environments either way.
        env.close()
        if eval_env is not None:
            eval_env.close()

    logger.info('total runtime: {}s'.format(time.time() - start_time))

    now = datetime.datetime.now()
    logging.info(now.isoformat())
    logging.info(
        "********************************************* End of RL algorithm *********************************************"
    )
    return True
    def __init__(self, config, action_bound, obs_bound):
        """Wrap a baselines-style DDPG agent configured from ``config``.

        Args:
            config: configuration object; it is passed to the parent
                constructor and ``self.config.config_dict`` is read afterwards
                for every hyper-parameter used here.
            action_bound: indexable pair; element 0 is given to the actor as
                ``action_low``, element 1 as ``action_high``, and the whole
                object is passed to the agent as ``action_range``.
            obs_bound: accepted but not used by this constructor.

        Raises:
            RuntimeError: if NOISE_FLAG is set and a noise spec in NOISE_TYPE
                matches none of the known kinds.
        """
        super(DDPGModelNew, self).__init__(config=config)
        self.action_noise = None
        self.para_noise = None
        settings = self.config.config_dict

        if settings['NOISE_FLAG']:
            noise_dim = settings['ACTION_SPACE']
            noise_spec = settings['NOISE_TYPE']
            chosen_action_noise = None
            chosen_param_noise = None
            # Specs are comma separated; a later spec of the same family
            # replaces an earlier one.
            for raw_spec in noise_spec.split(','):
                spec = raw_spec.strip()
                if spec == 'none':
                    continue
                if 'adaptive-param' in spec:
                    _, stddev = spec.split('_')
                    chosen_param_noise = AdaptiveParamNoiseSpec(
                        initial_stddev=float(stddev),
                        desired_action_stddev=float(stddev))
                    continue
                if 'normal' in spec:
                    _, stddev = spec.split('_')
                    chosen_action_noise = NormalActionNoise(
                        mu=np.zeros(noise_dim),
                        sigma=float(stddev) * np.ones(noise_dim))
                    continue
                if 'ou' in spec:
                    _, stddev = spec.split('_')
                    chosen_action_noise = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(noise_dim),
                        sigma=float(stddev) * np.ones(noise_dim))
                    continue
                raise RuntimeError('unknown noise type "{}"'.format(spec))
            self.action_noise = chosen_action_noise
            self.para_noise = chosen_param_noise

        # Networks.
        policy_net = Actor(nb_actions=settings['ACTION_SPACE'][0],
                           layer_norm=settings['LAYER_NORM_FLAG'],
                           net_config=settings['ACTOR_LAYER_CONFIG'],
                           action_low=action_bound[0],
                           action_high=action_bound[1])
        value_net = Critic(net_config=settings['CRITIC_LAYER_CONFIG'])

        # Two equally sized replay buffers; only the real-data buffer is
        # handed to the agent below.
        self.real_data_memory = Memory(
            limit=int(1e5),
            action_shape=settings['ACTION_SPACE'],
            observation_shape=settings['STATE_SPACE'])
        self.simulation_data_memory = Memory(
            limit=int(1e5),
            action_shape=settings['ACTION_SPACE'],
            observation_shape=settings['STATE_SPACE'])

        # TODO deal with obs range
        self.ddpg_model = baseline_ddpg(
            actor=policy_net,
            critic=value_net,
            memory=self.real_data_memory,
            observation_shape=settings['STATE_SPACE'],
            action_shape=settings['ACTION_SPACE'],
            param_noise=self.para_noise,
            action_noise=self.action_noise,
            gamma=settings['GAMMA'],
            tau=settings['TAU'],
            action_range=action_bound,
            return_range=(-np.inf, np.inf),
            normalize_observations=False,
            actor_lr=settings['ACTOR_LEARNING_RATE'],
            critic_lr=settings['CRITIC_LEARNING_RATE'],
            critic_l2_reg=settings['CRITIC_L2_REG'],
            batch_size=settings['BATCH_SIZE'],
            observation_range=(-np.inf, np.inf))
        self.ddpg_model.sess = tf.get_default_session()

        # Keep the global variables that belong to this model, selected by a
        # substring match on the variable name.
        # TODO THIS MAY LEAD TO SOME BUGS IN THE FUTURE
        name_markers = ('actor', 'critic', 'obs')
        self.var_list = [
            candidate
            for candidate in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            if any(marker in candidate.name for marker in name_markers)
        ]
        self.variables_initializer = tf.variables_initializer(var_list=self.var_list)
        self._env_status = None
Esempio n. 26
0
    def __init__(
            self,
            env,
            gamma,
            total_timesteps,
            network='mlp',
            nb_rollout_steps=100,
            reward_scale=1.0,
            noise_type='adaptive-param_0.2',
            normalize_returns=False,
            normalize_observations=False,
            critic_l2_reg=1e-2,
            actor_lr=1e-4,
            critic_lr=1e-3,
            popart=False,
            clip_norm=None,
            nb_train_steps=50,  # per epoch cycle and MPI worker,  <- HERE!
            nb_eval_steps=100,
            buffer_size=1000000,
            batch_size=64,  # per MPI worker
            tau=0.01,
            param_noise_adaption_interval=50,
            **network_kwargs):
        """Create and initialize a DDPG agent for an options environment.

        Args:
            env: environment exposing ``get_number_of_options()``,
                ``option_observation_space`` and ``option_action_space``. The
                option action space must satisfy ``abs(low) == high``.
            gamma, tau, normalize_returns, normalize_observations,
                critic_l2_reg, actor_lr, critic_lr, clip_norm, reward_scale:
                forwarded to ``DDPG``.
            popart: forwarded to ``DDPG`` as ``enable_popart``.
            network, **network_kwargs: forwarded to ``Critic`` and ``Actor``.
            noise_type: comma-separated noise specs, or None. Each spec is
                ``none`` or ``<kind>_<stddev>`` where ``<kind>`` contains
                ``adaptive-param``, ``normal`` or ``ou``.
            buffer_size, batch_size: multiplied by the number of options
                before use.
            nb_train_steps, nb_rollout_steps, param_noise_adaption_interval:
                stored on ``self`` for later use.
            total_timesteps, nb_eval_steps: accepted but not used here.

        Raises:
            RuntimeError: if a noise spec matches none of the known kinds.
        """
        # Scale replay capacity and batch size by the number of option
        # policies to learn.
        option_count = env.get_number_of_options()
        replay_capacity = option_count * buffer_size
        joint_batch_size = option_count * batch_size

        obs_space = env.option_observation_space
        act_space = env.option_action_space

        action_dim = act_space.shape[-1]
        # we assume symmetric actions.
        assert (np.abs(act_space.low) == act_space.high).all()

        replay = Memory(limit=replay_capacity,
                        action_shape=act_space.shape,
                        observation_shape=obs_space.shape)
        value_net = Critic(network=network, **network_kwargs)
        policy_net = Actor(action_dim, network=network, **network_kwargs)

        # Parse the noise spec; a later spec of the same family replaces an
        # earlier one.
        act_noise = None
        weight_noise = None
        specs = [] if noise_type is None else noise_type.split(',')
        for raw_spec in specs:
            spec = raw_spec.strip()
            if spec == 'none':
                continue
            if 'adaptive-param' in spec:
                _, stddev = spec.split('_')
                weight_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in spec:
                _, stddev = spec.split('_')
                act_noise = NormalActionNoise(
                    mu=np.zeros(action_dim),
                    sigma=float(stddev) * np.ones(action_dim))
            elif 'ou' in spec:
                _, stddev = spec.split('_')
                act_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(action_dim),
                    sigma=float(stddev) * np.ones(action_dim))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(spec))

        action_scale = act_space.high
        logger.info(
            'scaling actions by {} before executing in env'.format(
                action_scale))

        ddpg_agent = DDPG(policy_net,
                          value_net,
                          replay,
                          obs_space.shape,
                          act_space.shape,
                          gamma=gamma,
                          tau=tau,
                          normalize_returns=normalize_returns,
                          normalize_observations=normalize_observations,
                          batch_size=joint_batch_size,
                          action_noise=act_noise,
                          param_noise=weight_noise,
                          critic_l2_reg=critic_l2_reg,
                          actor_lr=actor_lr,
                          critic_lr=critic_lr,
                          enable_popart=popart,
                          clip_norm=clip_norm,
                          reward_scale=reward_scale)
        logger.info('Using agent with the following configuration:')
        logger.info(str(ddpg_agent.__dict__.items()))

        # Prepare everything: initialize the agent in the current session,
        # then finalize the graph.
        session = U.get_session()
        ddpg_agent.initialize(session)
        session.graph.finalize()

        ddpg_agent.reset()

        # Variables that are used during learning
        self.agent = ddpg_agent
        self.memory = replay
        self.max_action = action_scale
        self.batch_size = joint_batch_size
        self.nb_train_steps = nb_train_steps
        self.nb_rollout_steps = nb_rollout_steps
        self.param_noise_adaption_interval = param_noise_adaption_interval
Esempio n. 27
0
def learn(
        network,
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        nb_eval_steps=100,
        batch_size=64,  # per MPI worker
        tau=0.01,
        eval_env=None,
        param_noise_adaption_interval=50,
        **network_kwargs):
    """Train a DDPG agent on a batched environment and return the agent.

    Each epoch consists of ``nb_epoch_cycles`` cycles; a cycle performs
    ``nb_rollout_steps`` environment steps, ``nb_train_steps`` training steps
    and, when ``eval_env`` is given, ``nb_eval_steps`` evaluation steps.
    Statistics are averaged over MPI workers and logged once per epoch.

    Args:
        network, **network_kwargs: forwarded to ``Critic`` and ``Actor``.
        env: training environment. Observations returned by ``reset``/``step``
            are batched (``obs.shape[0]`` is the number of environments) and
            the action space must satisfy ``abs(low) == high``.
        seed: passed to ``set_global_seeds``.
        total_timesteps: if given, the number of epochs is derived from it and
            ``nb_epochs`` must be None.
        nb_epochs: explicit number of epochs; defaults to 500 when neither it
            nor ``total_timesteps`` is given.
        render, render_eval: render the training env (rank 0 only) / eval env.
        noise_type: comma-separated noise specs, or None. Each spec is
            ``none`` or ``<kind>_<stddev>`` where ``<kind>`` contains
            ``adaptive-param``, ``normal`` or ``ou``.
        eval_env: optional evaluation environment (batched like ``env``).
        param_noise_adaption_interval: adapt parameter noise every this many
            training steps within a cycle.
        Remaining keyword arguments are forwarded to ``DDPG``.

    Returns:
        The trained ``DDPG`` agent.

    Raises:
        RuntimeError: if a noise spec matches none of the known kinds.
    """
    set_global_seeds(seed)

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    elif nb_epochs is None:
        # Apply the default only when the caller did not choose an epoch
        # count; an explicit nb_epochs must not be overwritten.
        nb_epochs = 500

    rank = MPI.COMM_WORLD.Get_rank()
    nb_actions = env.action_space.shape[-1]
    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.

    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(network=network, **network_kwargs)
    actor = Actor(nb_actions, network=network, **network_kwargs)

    # Parse the noise spec; a later spec of the same family replaces an
    # earlier one.
    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mu=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Rolling windows over the last 100 finished episodes.
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()
    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()

    agent.reset()

    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = obs.shape[0]

    episode_reward = np.zeros(nenvs, dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar

    epoch = 0

    start_time = time.time()

    # These accumulators are created once and never cleared, so the values
    # derived from them cover the whole run so far rather than a single epoch.
    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)

                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
                new_obs, r, done, info = env.step(
                    max_action * action
                )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                # note these outputs are batched from vecenv

                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.

                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        max_action * eval_action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        mpi_size = MPI.COMM_WORLD.Get_size()
        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)
        # Evaluation statistics.
        if eval_env is not None:
            # NOTE(review): 'eval/return' and 'eval/Q' are stored as lists;
            # the flatten()[0] below keeps only their first element and raises
            # IndexError when a list is empty - confirm the intended metric.
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        # Sum every stat across MPI workers, then divide by the worker count
        # to obtain the mean.
        combined_stats_sums = MPI.COMM_WORLD.allreduce(
            np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()]))
        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        # Rank 0 snapshots the environment state(s) into the log directory
        # when the environments expose get_state().
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

    return agent
Esempio n. 28
0
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, evaluation, bind_to_core, **kwargs):
    """Fork MPI workers, build environments and agent components, then train.

    Args:
        env_id: identifier passed to ``gym.make``.
        seed: base seed; each worker uses ``seed + 1000000 * rank``.
        noise_type: comma-separated noise specs. Each entry is ``none`` or
            ``<kind>_<stddev>`` where ``<kind>`` contains ``adaptive-param``,
            ``normal`` or ``ou``.
        num_cpu: forwarded to ``mpi_fork``.
        layer_norm: forwarded to ``Critic`` and ``Actor``.
        logdir: stored into ``kwargs['logdir']`` and used as the parent
            directory for the gym monitors.
        gym_monitor: when truthy (and ``logdir`` is set) the rank-0
            environments are wrapped in ``gym.wrappers.Monitor``.
        evaluation: when truthy a second environment is created for
            evaluation.
        bind_to_core: forwarded to ``mpi_fork``.
        **kwargs: forwarded unchanged to ``training.train``.

    Raises:
        RuntimeError: if an entry of ``noise_type`` is not recognised.
    """
    kwargs['logdir'] = logdir
    whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core)
    if whoami == 'parent':
        sys.exit(0)

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Non-master workers log into a throw-away directory with no output
        # formats, and have logging disabled to avoid noise.
        Logger.CURRENT.close()
        Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[])
        logger.set_level(logger.DISABLED)

    def build_env(monitor_subdir):
        """Create one environment; only rank 0 gets the monitor wrappers."""
        new_env = gym.make(env_id)
        if rank == 0:
            if gym_monitor and logdir:
                new_env = gym.wrappers.Monitor(new_env, os.path.join(logdir, monitor_subdir), force=True)
            new_env = SimpleMonitor(new_env)
        return new_env

    # Create envs.
    env = build_env('gym_train')
    eval_env = build_env('gym_eval') if evaluation else None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Configure components.
    memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    # Seed everything to make things reproducible. The per-rank offset gives
    # every worker a distinct seed.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    start_time = time.time()
    try:
        training.train(env=env, eval_env=eval_env, param_noise=param_noise,
            action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs)
    finally:
        # Release the environments and the logger even when training raises,
        # so that monitor/log output is not left unclosed.
        env.close()
        if eval_env is not None:
            eval_env.close()
        Logger.CURRENT.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
Esempio n. 29
0
class Model(object):
    """Actor-critic agent (referred to as DDPG in the in-code comments) built from TF1 graph ops.

    The constructor builds the complete graph: placeholders, actor/critic and
    their target copies, optional parameter-noise actors, optimizers and
    statistics ops. A session has to be attached with ``initialize`` before any
    method that touches ``self.sess`` is used; ``learn`` does that itself.
    """

    def __init__(self, network, env, gamma=1, tau=0.01, total_timesteps=1e6,
                 normalize_observations=True, normalize_returns=False, enable_popart=False,
                 noise_type='adaptive-param_0.2', clip_norm=None, reward_scale=1.,
                 batch_size=128, l2_reg_coef=0.2, actor_lr=1e-4, critic_lr=1e-3,
                 observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
                 **network_kwargs):
        """Build the graph and all helper objects.

        Args:
            network: forwarded as ``network=`` to ``Critic`` and ``Actor``.
            env: environment; its observation/action spaces are read here and
                it is stepped by ``learn``.
            gamma: discount factor used in the target Q expression.
            tau: forwarded to ``get_target_updates`` for the target networks.
            total_timesteps: stored on the instance.
            normalize_observations: create a ``RunningMeanStd`` for
                observations and update it in ``store_transition``.
            normalize_returns: create a ``RunningMeanStd`` for returns.
            enable_popart: with ``normalize_returns``, set up the output
                renormalization ops (see ``setup_popart``).
            noise_type: ``None`` or comma-separated specs; each entry is
                ``none`` or ``<kind>_<stddev>`` where ``<kind>`` contains
                ``adaptive-param``, ``normal`` or ``ou``.
            clip_norm: forwarded to ``U.flatgrad`` for both networks.
            reward_scale: factor applied to rewards before they are stored.
            batch_size: number of transitions sampled from memory.
            l2_reg_coef: critic L2 regularization coefficient; skipped when
                not positive.
            actor_lr: step size of the actor optimizer.
            critic_lr: step size of the critic optimizer.
            observation_range: clip bounds for normalized observations.
            action_range: clip bounds for emitted actions.
            return_range: clip bounds for (normalized) critic values/targets.
            **network_kwargs: forwarded to ``Critic`` and ``Actor``.

        Raises:
            RuntimeError: if an entry of ``noise_type`` is not recognised.
            AssertionError: if the action space bounds are not symmetric.
        """
        observation_shape = env.observation_space.shape
        action_shape = env.action_space.shape

        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.env = env
        self.gamma = gamma
        self.tau = tau
        self.total_timesteps = total_timesteps
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.enable_popart = enable_popart
        self.clip_norm = clip_norm
        self.reward_scale = reward_scale
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.l2_reg_coef = l2_reg_coef

        # Fixed batch used by get_stats(); sampled lazily on first use.
        self.stats_sample = None

        # Parse noise_type.
        self.action_noise = None
        self.param_noise = None
        nb_actions = self.env.action_space.shape[-1]
        if noise_type is not None:
            for current_noise_type in noise_type.split(','):
                current_noise_type = current_noise_type.strip()
                if current_noise_type == 'none':
                    pass
                elif 'adaptive-param' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                              desired_action_stddev=float(stddev))
                elif 'normal' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                                          sigma=float(stddev) * np.ones(nb_actions))
                elif 'ou' in current_noise_type:
                    _, stddev = current_noise_type.split('_')
                    self.action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                                     sigma=float(stddev) * np.ones(nb_actions))
                else:
                    raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

        assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
        self.memory = Memory(limit=int(1e6), action_shape=env.action_space.shape,
                             observation_shape=env.observation_space.shape)
        self.critic = Critic(network=network, **network_kwargs)
        self.actor = Actor(nb_actions, network=network, **network_kwargs)

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks as renamed shallow copies of the online ones.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(self.critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = self.actor(normalized_obs0)
        self.normalized_critic_tf = self.critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = self.critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        # One-step bootstrap target; the (1 - terminal) factor drops the
        # bootstrap term for terminal transitions.
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet
        # Directory (next to this source file) where learn() saves variables
        # and where load_newest()/load_index() look for them.
        self.def_path_pre = os.path.dirname(os.path.abspath(__file__)) + '/tmp/'

    def setup_target_network_updates(self):
        """Create the init and soft-update ops for both target networks."""
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars,
                                                                      self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        """Create the perturbed actor copies used for parameter noise.

        Args:
            normalized_obs0: the normalized, clipped ``obs0`` tensor built in
                ``__init__``.
        """
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaption, so that measuring the
        # perturbation distance does not touch the actor used for acting.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor,
                                                                       self.param_noise_stddev)
        # Root-mean-square difference between unperturbed and perturbed actions.
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        """Define the actor loss, its flattened gradient and the optimizer."""
        logger.info('setting up actor optimizer')
        # Minimizing the negative critic value of the actor's own actions.
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        """Define the critic loss (plus optional L2 term), gradient and optimizer."""
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                       self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.l2_reg_coef > 0.:
            # Only variables named ".../w:0" outside the output layer are regularized.
            critic_reg_vars = [var for var in self.critic.trainable_vars if
                               var.name.endswith('/w:0') and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.l2_reg_coef))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.l2_reg_coef),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        """Create ops that rescale the critic output layers after ``ret_rms`` changes.

        The ops take the previous mean/std through placeholders and rewrite the
        output kernel and bias of the critic and the target critic.
        """
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        """Collect the statistics ops and their names evaluated by ``get_stats``."""
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def train_step(self, obs, apply_noise=True, compute_Q=True):
        """Compute an action for ``obs``, optionally with exploration noise.

        Args:
            obs: observation(s) fed to ``obs0`` via ``U.adjust_shape``.
            apply_noise: use the parameter-perturbed actor (if configured) and
                add action noise (if configured).
            compute_Q: also evaluate the critic on the unperturbed actor output.

        Returns:
            ``(action, q, None, None)`` where ``action`` is clipped to
            ``action_range`` and ``q`` is ``None`` when ``compute_Q`` is false.
        """
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action[0].shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def step(self, obs, compute_Q=True):
        """Compute a noise-free action for ``obs``.

        Equivalent to ``train_step`` with ``apply_noise=False``; see there for
        the arguments and the returned tuple.
        """
        return self.train_step(obs, apply_noise=False, compute_Q=compute_Q)

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        """Append a batch of transitions to memory, one entry per batch row.

        Rewards are multiplied by ``reward_scale`` before being stored. The
        caller's ``reward`` array is left untouched.

        Args:
            obs0: batched observations; ``obs0.shape[0]`` is the batch size.
            action: batched actions.
            reward: batched rewards.
            obs1: batched next observations.
            terminal1: batched terminal flags.
        """
        # Out-of-place multiply: ``reward *= ...`` would modify the array that
        # the caller still owns (and fails for integer arrays with a float scale).
        scaled_reward = np.asarray(reward) * self.reward_scale

        for b in range(obs0.shape[0]):
            self.memory.append(obs0[b], action[b], scaled_reward[b], obs1[b], terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self):
        """Sample a batch and apply one update to the actor and the critic.

        Returns:
            ``(critic_loss, actor_loss)`` as evaluated on the sampled batch.
        """
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)
        target_feed = {
            self.obs1: batch['obs1'],
            self.rewards: batch['rewards'],
            self.terminals1: batch['terminals1'].astype('float32'),
        }

        if self.normalize_returns and self.enable_popart:
            # Read the statistics before updating them so the output layers
            # can be rescaled from the old to the new normalization.
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict=target_feed)
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std: np.array([old_std]),
                self.old_mean: np.array([old_mean]),
            })
            # A sanity check is possible here (re-evaluate target_Q after the
            # renormalization and compare with the value above, tolerance
            # 1e-3); it is left out because it slows things down considerably.
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict=target_feed)

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        """Attach ``sess``, initialize all variables and sync the target networks."""
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        """Run the soft-update ops of both target networks."""
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        """Evaluate the statistics ops on a fixed batch.

        Returns:
            dict mapping statistic name to value, merged with the parameter
            noise statistics when parameter noise is configured.
        """
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        """Adapt the parameter-noise stddev from the measured action distance.

        Returns:
            The distance averaged over MPI workers (or the local distance when
            ``mpi4py`` is unavailable); ``0.`` when parameter noise is off.
        """
        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        if MPI is not None:
            mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        else:
            mean_distance = distance

        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        """Reset the noise processes; called after an episode is complete."""
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            # Draw a fresh perturbation of the acting policy.
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })

    def learn(self,
              total_timesteps=None,
              seed=None,
              nb_epochs=None,  # with default settings, perform 1M steps total
              nb_epoch_cycles=20,
              nb_rollout_steps=100,
              render=False,
              nb_train_steps=50,  # per epoch cycle and MPI worker,
              batch_size=64,  # per MPI worker
              param_noise_adaption_interval=50):
        """Run the rollout/train loop on ``self.env`` and log statistics per epoch.

        Evaluation on a separate environment is not performed by this method.
        After every epoch all variables are saved under ``def_path_pre`` with a
        timestamped file name.

        Args:
            total_timesteps: if given, the number of epochs is derived from it
                and ``nb_epochs`` must be ``None``.
            seed: forwarded to ``set_global_seeds``.
            nb_epochs: number of epochs when ``total_timesteps`` is ``None``;
                defaults to 500 when both are ``None``.
            nb_epoch_cycles: rollout/train cycles per epoch.
            nb_rollout_steps: environment steps per cycle.
            render: call ``env.render()`` on rank 0 around every step.
            nb_train_steps: training updates per cycle.
            batch_size: minimum number of stored transitions before the
                parameter noise is adapted.
            param_noise_adaption_interval: adapt the parameter noise every
                this many training steps.

        Returns:
            ``self``.
        """
        set_global_seeds(seed)

        if total_timesteps is not None:
            assert nb_epochs is None
            nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps)
        elif nb_epochs is None:
            # Fall back to the default only when the caller gave no epoch
            # count; an explicit nb_epochs must not be overwritten.
            nb_epochs = 500

        if MPI is not None:
            rank = MPI.COMM_WORLD.Get_rank()
            mpi_size = MPI.COMM_WORLD.Get_size()
        else:
            rank = 0
            mpi_size = 1

        episode_rewards_history = deque(maxlen=100)
        sess = U.get_session()
        # Prepare everything.
        self.initialize(sess)
        sess.graph.finalize()
        self.reset()

        obs = self.env.reset()
        nenvs = obs.shape[0]

        episode_reward = np.zeros(nenvs, dtype=np.float32)  # vector
        episode_step = np.zeros(nenvs, dtype=int)  # vector
        episodes = 0  # scalar
        t = 0  # scalar

        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                if nenvs > 1:
                    # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                    # of the environments, so resetting here instead
                    self.reset()
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q, _, _ = self.train_step(obs, apply_noise=True, compute_Q=True)

                    # Execute next action.
                    if rank == 0 and render:
                        self.env.render()

                    # The action is passed to the environment as-is (no
                    # rescaling by the action-space maximum is applied here).
                    new_obs, r, done, info = self.env.step(action)
                    # note these outputs are batched from vecenv

                    t += 1
                    if rank == 0 and render:
                        self.env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    # the batched data is unrolled row by row in store_transition.
                    self.store_transition(obs, action, r, new_obs, done)

                    obs = new_obs

                    for d in range(len(done)):
                        if done[d]:
                            # Episode done.
                            epoch_episode_rewards.append(episode_reward[d])
                            episode_rewards_history.append(episode_reward[d])
                            epoch_episode_steps.append(episode_step[d])
                            episode_reward[d] = 0.
                            episode_step[d] = 0
                            epoch_episodes += 1
                            episodes += 1
                            if nenvs == 1:
                                self.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    # NOTE(review): this gate uses the ``batch_size`` argument
                    # while the sampling itself uses ``self.batch_size``.
                    if self.memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                        distance = self.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = self.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    self.update_target_net()

            # save trainable variables
            file_name = time.strftime('Y%YM%mD%d_h%Hm%Ms%S', time.localtime(time.time()))
            model_save_path = self.def_path_pre + file_name
            self.save(model_save_path)

            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = self.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/return_history_std'] = np.std(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)

            # Reduce every statistic to its first element, sum across MPI
            # workers and divide by the worker count to obtain the mean.
            combined_stats_sums = np.array([np.array(x).flatten()[0] for x in combined_stats.values()])
            if MPI is not None:
                combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)

            combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])

            if rank == 0:
                logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(self.env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(self.env.get_state(), f)
        # Undo the finalize() above so the graph can be modified again.
        self.sess.graph._unsafe_unfinalize()
        return self

    def save(self, save_path=None):
        """Save the session's variables to ``save_path`` via ``save_variables``."""
        save_variables(save_path=save_path, sess=self.sess)
        print('save model variables to', save_path)

    def _saved_model_path(self, position, newest_first):
        """Pick a file from ``def_path_pre`` by position in modification-time order.

        Raises:
            IndexError: if the directory is empty or ``position`` is out of range.
        """
        names = sorted(
            os.listdir(self.def_path_pre),
            key=lambda name: os.path.getmtime(os.path.join(self.def_path_pre, name)),
            reverse=newest_first)
        if not names:
            raise IndexError('no saved models found in %s' % self.def_path_pre)
        return os.path.join(self.def_path_pre, names[position])

    def load_newest(self, load_path=None):
        """Load variables from ``load_path`` or, if ``None``, the most recently modified file in ``def_path_pre``.

        The directory is only inspected when no explicit path is given.
        """
        if load_path is None:
            load_path = self._saved_model_path(-1, newest_first=False)
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)

    def load_index(self, index, load_path=None):
        """Load variables from ``load_path`` or, if ``None``, the ``index``-th newest file in ``def_path_pre``.

        ``index`` 0 is the most recently modified file. The directory is only
        inspected when no explicit path is given.
        """
        if load_path is None:
            load_path = self._saved_model_path(index, newest_first=True)
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)
Esempio n. 30
0
def _parse_noise_type(noise_type, nb_actions):
    """Build exploration-noise objects from a comma-separated spec string.

    Every entry is either ``none`` or ``<kind>_<stddev>``, where ``<kind>``
    contains ``adaptive-param``, ``normal`` or ``ou`` (checked in that order).
    If several entries configure the same kind of noise, the last one wins.

    Args:
        noise_type: Spec string, e.g. ``"adaptive-param_0.2,ou_0.1"``.
        nb_actions: Length of the ``mu``/``sigma`` vectors built for action
            noise.

    Returns:
        Tuple ``(action_noise, param_noise)``; an element is ``None`` when the
        spec does not request that kind of noise.

    Raises:
        RuntimeError: If an entry matches none of the known kinds.
        ValueError: If an entry does not split into exactly ``<kind>`` and
            ``<stddev>`` on ``_``, or the stddev is not a valid float.
    """
    action_noise = None
    param_noise = None
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            continue
        if 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))
    return action_noise, param_noise


def run(env_id, seed, noise_type, layer_norm, evaluation, outdir, no_hyp,
        **kwargs):
    """Set up environments and networks, then train or evaluate an agent.

    If ``kwargs`` carries a truthy ``weight_file`` the function calls
    ``evaluate`` with that file; otherwise it calls ``training.train``.
    Environments are closed even when setup, training or evaluation raises.

    Args:
        env_id: Identifier handed to ``make_env`` and used for the experiment
            directory.
        seed: Base seed; offset by ``1000000 * rank`` and applied to the
            evaluation environment (global seeding is currently disabled).
        noise_type: Comma-separated noise spec, see ``_parse_noise_type``.
        layer_norm: Forwarded to ``Critic`` and ``Actor``.
        evaluation: When truthy, a separate monitored evaluation environment
            is created and passed to training.
        outdir: Base output location passed to
            ``exp_utils.prepare_exp_dirs``; the value it returns is used as
            the actual output directory.
        no_hyp: Forwarded to ``Actor`` and ``training.train``.
        **kwargs: ``weight_file`` is removed here (missing is treated as
            "no weight file"). In training mode the remaining entries are
            forwarded to ``training.train``; in evaluation mode ``nb_epochs``,
            ``reward_scale``, ``render`` and ``critic_l2_reg`` are read.

    Raises:
        RuntimeError: If ``noise_type`` contains an unknown entry.
    """
    # Must remain the first statement: captures the call arguments that are
    # handed to exp_utils.prepare_exp_dirs below.
    params = locals()

    # MPI is disabled in this variant, so the process always acts as rank 0.
    # rank = MPI.COMM_WORLD.Get_rank()
    # if rank != 0: logger.set_level(logger.DISABLED)
    rank = 0

    env = make_env(env_id)
    eval_env = None
    try:
        # A weight file switches the run from training to evaluation.
        weight_file = kwargs.pop('weight_file', None)
        if not weight_file:
            outdir = exp_utils.prepare_exp_dirs(params, outdir, env_id)
        else:
            outdir = exp_utils.prepare_exp_dirs(params, outdir, env_id,
                                                'eval')
        logger.configure(outdir)
        os.makedirs(outdir, exist_ok=True)

        env = bench.Monitor(env,
                            os.path.join(outdir, "%i.monitor.json" % rank))
        gym.logger.setLevel(logging.WARN)
        logger.info('Output directory:{}, env:{}, no_hyp:{}'.format(
            outdir, env_id, no_hyp))
        if evaluation:
            eval_env = make_env(env_id)
            # Initial fixed seed; re-seeded with the run seed further down.
            eval_env.seed(42)
            eval_env = bench.Monitor(eval_env,
                                     os.path.join(logger.get_dir(),
                                                  'gym_eval'),
                                     allow_early_resets=True)

        # Parse noise_type.
        nb_actions = env.action_space.shape[-1]
        action_noise, param_noise = _parse_noise_type(noise_type, nb_actions)

        # Configure components.
        memory = Memory(limit=int(1e5),
                        action_shape=env.action_space.shape,
                        observation_shape=env.observation_space.shape)

        # critic = models.ConvCritic(layer_norm=layer_norm)
        # actor = models.ConvActor(nb_actions, layer_norm=layer_norm,
        #                          no_hyp=no_hyp)
        critic = Critic(layer_norm=layer_norm)
        actor = Actor(nb_actions, layer_norm=layer_norm, no_hyp=no_hyp)

        # Per-rank seed. Only the evaluation environment is seeded here; the
        # global and training-env seeding calls are left disabled.
        seed = seed + 1000000 * rank
        logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                         logger.get_dir()))
        tf.reset_default_graph()
        # set_global_seeds(seed)
        # env.seed(seed)
        if eval_env is not None:
            eval_env.seed(seed)

        # Only rank 0 measures and reports the total runtime.
        if rank == 0:
            start_time = time.time()

        if weight_file:
            evaluate(
                env,
                nb_episodes=kwargs.get('nb_epochs', 100),
                reward_scale=kwargs.get('reward_scale'),
                render=kwargs.get('render'),
                param_noise=None,
                action_noise=None,
                actor=actor,
                critic=critic,
                critic_l2_reg=kwargs.get('critic_l2_reg'),
                memory=memory,
                weight_file=weight_file,
            )
        else:
            training.train(env=env,
                           eval_env=eval_env,
                           param_noise=param_noise,
                           action_noise=action_noise,
                           actor=actor,
                           critic=critic,
                           memory=memory,
                           outdir=outdir,
                           no_hyp=no_hyp,
                           **kwargs)
    finally:
        # Release the environments on failure as well as on success.
        env.close()
        if eval_env is not None:
            eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))