Example #1
def injectNoise(action):
    # Assumes `env` and `OrnsteinUhlenbeckProcess` (from rl.random) are available
    # at module level. A fresh process is built on every call, so successive
    # noise samples are not temporally correlated.
    random_process = OrnsteinUhlenbeckProcess(theta=.1,
                                              mu=0.,
                                              sigma=.02,
                                              size=env.get_action_space_size())
    action += random_process.sample()
    return action
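
Example #1 rebuilds the process on every call. Below is a minimal sketch of the more common pattern, creating the process once so the noise stays temporally correlated across steps; it assumes the same env.get_action_space_size() helper and env object as Example #1 and the keras-rl import path rl.random.

from rl.random import OrnsteinUhlenbeckProcess

# One shared process instance, so sample() returns correlated noise over time.
# `env` is assumed to exist, exactly as in Example #1.
random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=.02,
                                          size=env.get_action_space_size())

def inject_noise(action):
    return action + random_process.sample()

# Call random_process.reset_states() at the start of each episode.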
Example #2
def slave_loop(env):
    # MPI worker loop: comm, rank, the REQ_ACTION/RSP_ACTION/OBS_DATA tags and
    # the NOISE_DECAY/NOISE_MIN/MAX_EP_STEPS constants are module-level globals.
    np.random.seed(seed=rank)
    s = env.reset()  # env.reset_with_seed(rank)

    eps_count = 0
    step_count = 0
    ep_reward = 0.
    ep_step = 0

    ou_noise = OrnsteinUhlenbeckProcess(theta=.15,
                                        mu=0.,
                                        sigma=.2,
                                        size=env.e.action_space.shape[0])
    a_high = env.e.action_space.high[0]
    a_low = env.e.action_space.low[0]

    initial_noise_scale = 1.0
    noise_decay = NOISE_DECAY  # 0.99
    global_n_eps = 0
    global_n_step = 0

    while True:
        #noise_scale = max(initial_noise_scale * noise_decay ** (global_n_eps), 0.002)
        noise_scale = max(initial_noise_scale * noise_decay**(global_n_eps),
                          NOISE_MIN)

        if rank == 1:
            # Worker 1 acts without exploration noise.
            noise_scale = 0

        comm.send(np.array(s), dest=0, tag=REQ_ACTION)
        (action, global_n_eps, global_n_step) = comm.recv(source=0,
                                                          tag=RSP_ACTION)

        noise = ou_noise.sample()
        action = np.clip(action + noise * noise_scale, a_low, a_high)

        s_, reward, done, info = env.step(action)
        ep_reward += reward

        step_count += 1
        ep_step += 1

        if ep_step >= MAX_EP_STEPS:
            done = True

        obs_data = (np.array(s), action, reward, np.array(s_), done, ep_reward,
                    ep_step)
        comm.send(obs_data, dest=0, tag=OBS_DATA)

        if done:
            s_ = env.reset()
            ep_reward = 0
            eps_count += 1
            ep_step = 0
            ou_noise.reset_states()
            # print("eps: %d" % (eps_count,))
        s = s_
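
The schedule in Example #2 shrinks the OU noise geometrically per episode down to a floor. A short, self-contained illustration of that schedule, assuming NOISE_DECAY = 0.99 (from the inline comment) and NOISE_MIN = 0.002 (the floor used in the commented-out line):

NOISE_DECAY = 0.99
NOISE_MIN = 0.002
initial_noise_scale = 1.0

for global_n_eps in (0, 100, 500, 1000):
    noise_scale = max(initial_noise_scale * NOISE_DECAY**global_n_eps, NOISE_MIN)
    print(global_n_eps, noise_scale)
# -> 1.0, ~0.37, ~0.0066, then 0.002 once the floor kicks in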
Example #3
def initialSample_action(experiment_act):
    # Start from a zero action vector (a NumPy array, so that adding the noise
    # below is element-wise rather than a list extension) and copy a few joint
    # values from a randomly chosen reference action.
    action = np.zeros(19)
    ind = np.random.choice(len(experiment_act), 1).item()  # np.asscalar is deprecated
    action[9] = experiment_act[ind][9]
    action[13] = experiment_act[ind][13]
    action[15] = experiment_act[ind][15]
    action[16] = experiment_act[ind][16]
    random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2,
                                              size=env.get_action_space_size())
    action += random_process.sample()
    return action
Example #4
RANDOM_PROCESS_THETA = 0.15
RANDOM_PROCESS_MU = 0.
RANDOM_PROCESS_SIGMA = 0.3
from rl.random import OrnsteinUhlenbeckProcess
process = OrnsteinUhlenbeckProcess(size=2,
                                   theta=RANDOM_PROCESS_THETA,
                                   mu=RANDOM_PROCESS_MU,
                                   sigma=RANDOM_PROCESS_SIGMA)
for _ in range(200):
    print(process.sample())
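
Each process.sample() call above advances one step of a discretised Ornstein-Uhlenbeck process. A rough sketch of that recursion, using a hypothetical ou_step() helper (the actual step size dt and any sigma annealing depend on the library's defaults):

import numpy as np

def ou_step(x_prev, theta, mu, sigma, dt=1e-2, size=2):
    # x_t = x_{t-1} + theta * (mu - x_{t-1}) * dt + sigma * sqrt(dt) * N(0, 1)
    return (x_prev
            + theta * (mu - x_prev) * dt
            + sigma * np.sqrt(dt) * np.random.normal(size=size))

x = np.zeros(2)
for _ in range(200):
    x = ou_step(x, theta=RANDOM_PROCESS_THETA, mu=RANDOM_PROCESS_MU,
                sigma=RANDOM_PROCESS_SIGMA)
    print(x)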
Example #5
class DDPGAgent(Agent):
    def __init__(self,
                 action_space,
                 observation_space,
                 gamma=0.99,
                 nb_steps_warmup=2000,
                 training=True,
                 polyak=0.99,
                 memory_size=10000):
        super().__init__()
        self.gamma = gamma
        self.polyak = polyak

        self.action_space = action_space
        self.nb_actions = action_space.shape[0]
        self.observation_shape = observation_space.shape
        self.nb_steps_warmup = nb_steps_warmup
        self.training = training

        self.memory = Memory(capacity=memory_size,
                             observation_shape=self.observation_shape,
                             action_shape=self.action_space.shape)

        self.actor_model, self.critic_model = self._build_network()
        self.target_actor_model, self.target_critic_model = \
            self._build_network()
        self.target_actor_model.set_weights(self.actor_model.get_weights())
        self.target_critic_model.set_weights(self.critic_model.get_weights())

        self.step_count = 0
        self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions,
                                                       theta=0.15,
                                                       mu=0.,
                                                       sigma=0.3)

    def _build_network(self):
        action_tensor = tf.keras.layers.Input(shape=(self.nb_actions, ),
                                              dtype=tf.float64)
        observation_tensor = tf.keras.layers.Input(
            shape=self.observation_shape, dtype=tf.float64)

        # Build the actor model.
        y = tf.keras.layers.Flatten()(observation_tensor)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(self.nb_actions, activation='tanh')(y)

        actor_model = tf.keras.Model(inputs=observation_tensor, outputs=y)
        actor_model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
            loss='mse')

        # Build the critic model.
        y = tf.keras.layers.Concatenate()([observation_tensor, action_tensor])
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(1, activation='linear')(y)

        critic_model = tf.keras.Model(
            inputs=[observation_tensor, action_tensor], outputs=y)
        critic_model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
            loss='mse')

        return actor_model, critic_model

    def forward(self, observation):
        self.step_count += 1

        if self.step_count < self.nb_steps_warmup:
            return self.action_space.sample()
        else:
            observation = np.expand_dims(observation, axis=0)
            action = self.actor_model.predict(observation)
            action = action.reshape(self.nb_actions)
            if self.training:
                action = action + self.random_process.sample()
            return action

    def backward(self, observation, action, reward, terminal,
                 next_observation):
        self.memory.store_transition(observation, action, reward, terminal,
                                     next_observation)

        if self.step_count < self.nb_steps_warmup:
            return
        else:
            self._update()

    def _update(self):
        observations, actions, rewards, terminals, next_observations = \
            self.memory.sample_batch()

        self._update_critic(observations, actions, rewards, terminals,
                            next_observations)
        self._update_actor(observations)

        # Soft-update the critic target network.
        new_target_critic_weights_list = self.polyak_averaging(
            self.critic_model.get_weights(),
            self.target_critic_model.get_weights())
        self.target_critic_model.set_weights(new_target_critic_weights_list)

        # Soft-update the actor target network.
        new_target_actor_weights_list = self.polyak_averaging(
            self.actor_model.get_weights(),
            self.target_actor_model.get_weights())
        self.target_actor_model.set_weights(new_target_actor_weights_list)

    def polyak_averaging(self, weights_list, target_weights_list):
        new_target_weights_list = []
        for weights, target_weights in zip(weights_list, target_weights_list):
            new_target_weights = self.polyak * target_weights + (
                1 - self.polyak) * weights
            new_target_weights_list.append(new_target_weights)
        return new_target_weights_list

    def _update_critic(self, observations, actions, rewards, terminals,
                       next_observations):
        # Bootstrap from the target networks and mask out terminal transitions
        # (terminals are assumed to be stored as 0./1. floats).
        q_values_next = self.target_critic_model(
            [next_observations,
             self.target_actor_model(next_observations)])
        target_q_values = rewards + self.gamma * (1. - terminals) * q_values_next
        self.critic_model.fit([observations, actions],
                              target_q_values,
                              verbose=0)

    @tf.function
    def _update_actor(self, observations):
        with tf.GradientTape() as tape:
            tape.watch(self.actor_model.trainable_weights)
            q_values = self.target_critic_model(
                [observations, self.actor_model(observations)])
            loss = -tf.reduce_mean(q_values)

        actor_grads = tape.gradient(loss, self.actor_model.trainable_weights)
        self.actor_model.optimizer.apply_gradients(
            zip(actor_grads, self.actor_model.trainable_weights))
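
A hypothetical training loop for the DDPGAgent above (not part of the original example), assuming the older Gym API in which reset() returns an observation and step() returns four values; the environment name and step budget are illustrative only.

import gym

# Pendulum-v1 is just an example of a continuous-action environment.
env = gym.make('Pendulum-v1')
agent = DDPGAgent(env.action_space, env.observation_space)

obs = env.reset()
for _ in range(50000):
    action = agent.forward(obs)
    next_obs, reward, done, info = env.step(action)
    agent.backward(obs, action, reward, done, next_obs)
    obs = env.reset() if done else next_obs
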
class MACE(Agent):
    def __init__(self, env: gym.Env, **kwargs):

        super(MACE, self).__init__(**kwargs)
        self.nb_actions = env.action_space.shape[0]

        obs_input_actor = Input(shape=(1, ) + env.observation_space.shape,
                                name='observation_input')
        x_ac = Flatten()(obs_input_actor)
        x_ac = Dense(units=256, activation='relu')(x_ac)

        obs_input_critic = Input(shape=(1, ) + env.observation_space.shape,
                                 name='observation_input')
        x_cr = Flatten()(obs_input_critic)
        x_cr = Dense(units=256, activation='relu')(x_cr)

        x_critic = Dense(units=128, activation='relu')(x_cr)
        value = Dense(units=1)(x_critic)

        x_actor = Dense(units=128, activation='relu')(x_ac)
        action = Dense(units=self.nb_actions, activation='tanh')(x_actor)

        actor = Model(inputs=obs_input_actor, outputs=action)
        critic = Model(inputs=obs_input_critic, outputs=value)

        critic_metrics = [mean_q]

        critic_optimizer = Adam(lr=1e-3)
        actor_optimizer = Adam(lr=1e-3)

        #        critic_optimizer = SGD(lr=1e-4, momentum=0.9)
        #        actor_optimizer = SGD(lr=1e-3, momentum=0.9)

        self.actor = actor
        self.critic = critic

        self.target_actor = clone_model(self.actor)
        self.target_actor.compile(optimizer='sgd', loss='mse')
        self.target_critic = clone_model(self.critic)
        self.target_critic.compile(optimizer='sgd', loss='mse')

        self.target_model_update = 1e-3
        #self.target_model_update=500

        if self.target_model_update < 1.:
            # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
            critic_updates = get_soft_target_model_updates(
                self.target_critic, self.critic, self.target_model_update)
            critic_optimizer = AdditionalUpdatesOptimizer(
                critic_optimizer, critic_updates)
            actor_updates = get_soft_target_model_updates(
                self.target_actor, self.actor, self.target_model_update)
            actor_optimizer = AdditionalUpdatesOptimizer(
                actor_optimizer, actor_updates)

        self.delta_clip = np.inf

        def clipped_error(y_true, y_pred):
            return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

        actor.compile(actor_optimizer, loss='mse')
        critic.compile(critic_optimizer, loss='mse', metrics=critic_metrics)

        self.compiled = True

        self.memory = SequentialMemory(limit=100000, window_length=1)
        self.memory_interval = 1
        self.memory_actor = SequentialMemory(limit=100000, window_length=1)
        self.memory_critic = SequentialMemory(limit=100000, window_length=1)

        self.nb_steps_warmup = 50000

        self.train_interval = 4
        self.batch_size = 64
        self.gamma = 0.99

        self.processor = None
        self.random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                                       mu=0.,
                                                       sigma=.3,
                                                       size=self.nb_actions)
        self.eps = 0.9

    def process_state_batch(self, batch):
        batch = np.array(batch)
        if self.processor is None:
            return batch
        return self.processor.process_state_batch(batch)

    def select_action(self, state):
        batch = [state]
        action = self.actor.predict_on_batch(np.asarray(batch)).flatten()
        # Apply noise, if a random process is set. (A commented-out variant only
        # added noise with probability self.eps, i.e. Bernoulli exploration.)
        if self.training and self.random_process is not None:
            noise = self.random_process.sample()
            assert noise.shape == action.shape
            action += noise
        return action

    def forward(self, observation):
        # Select an action.

        state = self.memory.get_recent_state(observation)
        action = self.select_action(state)  # TODO: move this into policy

        # Book-keeping.
        self.recent_observation = observation
        self.recent_action = action

        return action

    def backward(self, reward, terminal=False):
        # Store most recent experience in memory.
        if self.step % self.memory_interval == 0:
            self.memory.append(self.recent_observation,
                               self.recent_action,
                               reward,
                               terminal,
                               training=self.training)

        metrics = [np.nan for _ in self.metrics_names]
        if not self.training:
            # We're done here. No need to update the experience memory since we only use the working
            # memory to obtain the state over the most recent observations.
            return metrics

        # Train the network on a single stochastic batch.
        can_train_either = self.step > self.nb_steps_warmup
        if can_train_either and self.step % self.train_interval == 0:
            experiences = self.memory.sample(self.batch_size)
            assert len(experiences) == self.batch_size

            # Start by extracting the necessary parameters (we use a vectorized implementation).
            state0_batch = []
            reward_batch = []
            action_batch = []
            terminal1_batch = []
            state1_batch = []
            for e in experiences:
                state0_batch.append(e.state0)
                state1_batch.append(e.state1)
                reward_batch.append(e.reward)
                action_batch.append(e.action)
                terminal1_batch.append(0. if e.terminal1 else 1.)

            # Prepare and validate parameters.
            state0_batch = self.process_state_batch(state0_batch)
            state1_batch = self.process_state_batch(state1_batch)
            terminal1_batch = np.array(terminal1_batch)
            reward_batch = np.array(reward_batch)
            action_batch = np.array(action_batch)
            assert reward_batch.shape == (self.batch_size, )
            assert terminal1_batch.shape == reward_batch.shape
            assert action_batch.shape == (self.batch_size, self.nb_actions)

            # Update actor and critic, if warm up is over.
            if self.step > self.nb_steps_warmup:
                if len(self.critic.inputs) >= 3:
                    state1_batch_with_action = state1_batch[:]
                else:
                    state1_batch_with_action = [state1_batch]
                target_q_values = self.target_critic.predict_on_batch(
                    state1_batch_with_action).flatten()
                assert target_q_values.shape == (self.batch_size, )

                # Compute r_t + gamma * max_a Q(s_t+1, a) and update the target ys accordingly,
                # but only for the affected output units (as given by action_batch).
                discounted_reward_batch = self.gamma * target_q_values
                discounted_reward_batch *= terminal1_batch
                assert discounted_reward_batch.shape == reward_batch.shape
                targets = (reward_batch + discounted_reward_batch).reshape(
                    self.batch_size, 1)

                # Perform a single batch update on the critic network.
                if len(self.critic.inputs) >= 3:
                    state0_batch_with_action = state0_batch[:]
                else:
                    state0_batch_with_action = [state0_batch]
                #state0_batch_with_action.insert(self.critic_action_input_idx, action_batch)
                metrics = self.critic.train_on_batch(state0_batch_with_action,
                                                     targets)
                if self.processor is not None:
                    metrics += self.processor.metrics

            #Actor
            experiences = self.memory.sample(self.batch_size)
            assert len(experiences) == self.batch_size

            # Start by extracting the necessary parameters (we use a vectorized implementation).
            state0_batch = []
            reward_batch = []
            action_batch = []
            terminal1_batch = []
            state1_batch = []
            for e in experiences:
                state0_batch.append(e.state0)
                state1_batch.append(e.state1)
                reward_batch.append(e.reward)
                action_batch.append(e.action)
                terminal1_batch.append(0. if e.terminal1 else 1.)

            # Prepare and validate parameters.
            state0_batch = self.process_state_batch(state0_batch)
            state1_batch = self.process_state_batch(state1_batch)
            terminal1_batch = np.array(terminal1_batch)
            reward_batch = np.array(reward_batch)
            action_batch = np.array(action_batch)
            assert reward_batch.shape == (self.batch_size, )
            assert terminal1_batch.shape == reward_batch.shape
            assert action_batch.shape == (self.batch_size, self.nb_actions)

            if self.step > self.nb_steps_warmup:
                #Actor
                # Rebuild the critic inputs from the freshly sampled batch; the
                # earlier *_with_action variables referred to the critic's batch.
                if len(self.critic.inputs) >= 3:
                    state0_batch_with_action = state0_batch[:]
                    state1_batch_with_action = state1_batch[:]
                else:
                    state0_batch_with_action = [state0_batch]
                    state1_batch_with_action = [state1_batch]
                target_q_values1 = self.target_critic.predict_on_batch(
                    state1_batch_with_action).flatten()
                discounted_reward_batch = self.gamma * target_q_values1
                discounted_reward_batch *= terminal1_batch
                targets = (reward_batch + discounted_reward_batch)
                target_q_values0 = self.target_critic.predict_on_batch(
                    state0_batch_with_action).flatten()
                delta = targets - target_q_values0
                if len(self.actor.inputs) >= 2:
                    inputs = state0_batch[:]
                else:
                    #inputs = [state0_batch]
                    inputs = state0_batch
                pos_dif = delta > 0
                #                if self.step%1000==0:
                #                    print(np.sum(pos_dif))
                inputs = np.asarray(inputs)[pos_dif]
                actions_target = action_batch[pos_dif]
                #state0_batch_with_action.insert(self.critic_action_input_idx, action_batch)
                self.actor.train_on_batch(inputs, actions_target)

        if self.target_model_update >= 1 and self.step % self.target_model_update == 0:
            self.update_target_models_hard()

        return metrics

    def reset_states(self):
        if self.random_process is not None:
            self.random_process.reset_states()
        self.recent_action = None
        self.recent_observation = None
        if self.compiled:
            self.actor.reset_states()
            self.critic.reset_states()
            self.target_actor.reset_states()
            self.target_critic.reset_states()

    def update_target_models_hard(self):
        self.target_critic.set_weights(self.critic.get_weights())
        self.target_actor.set_weights(self.actor.get_weights())

    @property
    def metrics_names(self):
        names = self.critic.metrics_names[:]
        if self.processor is not None:
            names += self.processor.metrics_names[:]
        return names
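
Finally, a hypothetical way to run the MACE agent above, assuming that Agent here is keras-rl's rl.core.Agent (which supplies the fit() training loop); the environment name and step budget are illustrative only.

import gym

env = gym.make('Pendulum-v1')   # any continuous-action Gym environment
agent = MACE(env)
agent.fit(env, nb_steps=200000, visualize=False, verbose=1)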