Example #1
def injectNoise(action):
    # Assumes `env` and `OrnsteinUhlenbeckProcess` (from rl.random) are available
    # at module level. A fresh process is built on every call, so successive
    # noise samples are not temporally correlated.
    random_process = OrnsteinUhlenbeckProcess(theta=.1,
                                              mu=0.,
                                              sigma=.02,
                                              size=env.get_action_space_size())
    action += random_process.sample()
    return action
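
Example #1 rebuilds the process on every call. Below is a minimal sketch of the more common pattern, creating the process once so the noise stays temporally correlated across steps; it assumes the same env.get_action_space_size() helper and env object as Example #1 and the keras-rl import path rl.random.

from rl.random import OrnsteinUhlenbeckProcess

# One shared process instance, so sample() returns correlated noise over time.
# `env` is assumed to exist, exactly as in Example #1.
random_process = OrnsteinUhlenbeckProcess(theta=.1, mu=0., sigma=.02,
                                          size=env.get_action_space_size())

def inject_noise(action):
    return action + random_process.sample()

# Call random_process.reset_states() at the start of each episode.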
Example #2
def slave_loop(env):
    # MPI worker loop: comm, rank, the REQ_ACTION/RSP_ACTION/OBS_DATA tags and
    # the NOISE_DECAY/NOISE_MIN/MAX_EP_STEPS constants are module-level globals.
    np.random.seed(seed=rank)
    s = env.reset()  # env.reset_with_seed(rank)

    eps_count = 0
    step_count = 0
    ep_reward = 0.
    ep_step = 0

    ou_noise = OrnsteinUhlenbeckProcess(theta=.15,
                                        mu=0.,
                                        sigma=.2,
                                        size=env.e.action_space.shape[0])
    a_high = env.e.action_space.high[0]
    a_low = env.e.action_space.low[0]

    initial_noise_scale = 1.0
    noise_decay = NOISE_DECAY  # 0.99
    global_n_eps = 0
    global_n_step = 0

    while True:
        #noise_scale = max(initial_noise_scale * noise_decay ** (global_n_eps), 0.002)
        noise_scale = max(initial_noise_scale * noise_decay**(global_n_eps),
                          NOISE_MIN)

        if rank == 1:
            # Worker 1 acts without exploration noise.
            noise_scale = 0

        comm.send(np.array(s), dest=0, tag=REQ_ACTION)
        (action, global_n_eps, global_n_step) = comm.recv(source=0,
                                                          tag=RSP_ACTION)

        noise = ou_noise.sample()
        action = np.clip(action + noise * noise_scale, a_low, a_high)

        s_, reward, done, info = env.step(action)
        ep_reward += reward

        step_count += 1
        ep_step += 1

        if ep_step >= MAX_EP_STEPS:
            done = True

        obs_data = (np.array(s), action, reward, np.array(s_), done, ep_reward,
                    ep_step)
        comm.send(obs_data, dest=0, tag=OBS_DATA)

        if done:
            s_ = env.reset()
            ep_reward = 0
            eps_count += 1
            ep_step = 0
            ou_noise.reset_states()
            # print("eps: %d" % (eps_count,))
        s = s_
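
The schedule in Example #2 shrinks the OU noise geometrically per episode down to a floor. A short, self-contained illustration of that schedule, assuming NOISE_DECAY = 0.99 (from the inline comment) and NOISE_MIN = 0.002 (the floor used in the commented-out line):

NOISE_DECAY = 0.99
NOISE_MIN = 0.002
initial_noise_scale = 1.0

for global_n_eps in (0, 100, 500, 1000):
    noise_scale = max(initial_noise_scale * NOISE_DECAY**global_n_eps, NOISE_MIN)
    print(global_n_eps, noise_scale)
# -> 1.0, ~0.37, ~0.0066, then 0.002 once the floor kicks in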
Example #3
def initialSample_action(experiment_act):
    # Start from a zero action vector (a NumPy array, so that adding the noise
    # below is element-wise rather than a list extension) and copy a few joint
    # values from a randomly chosen reference action.
    action = np.zeros(19)
    ind = np.random.choice(len(experiment_act), 1).item()  # np.asscalar is deprecated
    action[9] = experiment_act[ind][9]
    action[13] = experiment_act[ind][13]
    action[15] = experiment_act[ind][15]
    action[16] = experiment_act[ind][16]
    random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2,
                                              size=env.get_action_space_size())
    action += random_process.sample()
    return action
Example #4
RANDOM_PROCESS_THETA = 0.15
RANDOM_PROCESS_MU = 0.
RANDOM_PROCESS_SIGMA = 0.3
from rl.random import OrnsteinUhlenbeckProcess
process = OrnsteinUhlenbeckProcess(size=2,
                                   theta=RANDOM_PROCESS_THETA,
                                   mu=RANDOM_PROCESS_MU,
                                   sigma=RANDOM_PROCESS_SIGMA)
for _ in range(200):
    print(process.sample())
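
Each process.sample() call above advances one step of a discretised Ornstein-Uhlenbeck process. A rough sketch of that recursion, using a hypothetical ou_step() helper (the actual step size dt and any sigma annealing depend on the library's defaults):

import numpy as np

def ou_step(x_prev, theta, mu, sigma, dt=1e-2, size=2):
    # x_t = x_{t-1} + theta * (mu - x_{t-1}) * dt + sigma * sqrt(dt) * N(0, 1)
    return (x_prev
            + theta * (mu - x_prev) * dt
            + sigma * np.sqrt(dt) * np.random.normal(size=size))

x = np.zeros(2)
for _ in range(200):
    x = ou_step(x, theta=RANDOM_PROCESS_THETA, mu=RANDOM_PROCESS_MU,
                sigma=RANDOM_PROCESS_SIGMA)
    print(x)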
Example #5
class DDPGAgent(Agent):
    def __init__(self,
                 action_space,
                 observation_space,
                 gamma=0.99,
                 nb_steps_warmup=2000,
                 training=True,
                 polyak=0.99,
                 memory_size=10000):
        super().__init__()
        self.gamma = gamma
        self.polyak = polyak

        self.action_space = action_space
        self.nb_actions = action_space.shape[0]
        self.observation_shape = observation_space.shape
        self.nb_steps_warmup = nb_steps_warmup
        self.training = training

        self.memory = Memory(capacity=memory_size,
                             observation_shape=self.observation_shape,
                             action_shape=self.action_space.shape)

        self.actor_model, self.critic_model = self._build_network()
        self.target_actor_model, self.target_critic_model = \
            self._build_network()
        self.target_actor_model.set_weights(self.actor_model.get_weights())
        self.target_critic_model.set_weights(self.critic_model.get_weights())

        self.step_count = 0
        self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions,
                                                       theta=0.15,
                                                       mu=0.,
                                                       sigma=0.3)

    def _build_network(self):
        action_tensor = tf.keras.layers.Input(shape=(self.nb_actions, ),
                                              dtype=tf.float64)
        observation_tensor = tf.keras.layers.Input(
            shape=self.observation_shape, dtype=tf.float64)

        # Build the actor model.
        y = tf.keras.layers.Flatten()(observation_tensor)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(self.nb_actions, activation='tanh')(y)

        actor_model = tf.keras.Model(inputs=observation_tensor, outputs=y)
        actor_model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
            loss='mse')

        # Build the critic model.
        y = tf.keras.layers.Concatenate()([observation_tensor, action_tensor])
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(32, activation='relu')(y)
        y = tf.keras.layers.Dense(1, activation='linear')(y)

        critic_model = tf.keras.Model(
            inputs=[observation_tensor, action_tensor], outputs=y)
        critic_model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
            loss='mse')

        return actor_model, critic_model

    def forward(self, observation):
        self.step_count += 1

        if self.step_count < self.nb_steps_warmup:
            return self.action_space.sample()
        else:
            observation = np.expand_dims(observation, axis=0)
            action = self.actor_model.predict(observation)
            action = action.reshape(self.nb_actions)
            if self.training:
                action = action + self.random_process.sample()
            return action

    def backward(self, observation, action, reward, terminal,
                 next_observation):
        self.memory.store_transition(observation, action, reward, terminal,
                                     next_observation)

        if self.step_count < self.nb_steps_warmup:
            return
        else:
            self._update()

    def _update(self):
        observations, actions, rewards, terminals, next_observations = \
            self.memory.sample_batch()

        self._update_critic(observations, actions, rewards, terminals,
                            next_observations)
        self._update_actor(observations)

        # Soft-update the critic target network.
        new_target_critic_weights_list = self.polyak_averaging(
            self.critic_model.get_weights(),
            self.target_critic_model.get_weights())
        self.target_critic_model.set_weights(new_target_critic_weights_list)

        # Soft-update the actor target network.
        new_target_actor_weights_list = self.polyak_averaging(
            self.actor_model.get_weights(),
            self.target_actor_model.get_weights())
        self.target_actor_model.set_weights(new_target_actor_weights_list)

    def polyak_averaging(self, weights_list, target_weights_list):
        new_target_weights_list = []
        for weights, target_weights in zip(weights_list, target_weights_list):
            new_target_weights = self.polyak * target_weights + (
                1 - self.polyak) * weights
            new_target_weights_list.append(new_target_weights)
        return new_target_weights_list

    def _update_critic(self, observations, actions, rewards, terminals,
                       next_observations):
        # Bootstrap from the target networks and mask out terminal transitions
        # (terminals are assumed to be stored as 0./1. floats).
        q_values_next = self.target_critic_model(
            [next_observations,
             self.target_actor_model(next_observations)])
        target_q_values = rewards + self.gamma * (1. - terminals) * q_values_next
        self.critic_model.fit([observations, actions],
                              target_q_values,
                              verbose=0)

    @tf.function
    def _update_actor(self, observations):
        with tf.GradientTape() as tape:
            tape.watch(self.actor_model.trainable_weights)
            q_values = self.target_critic_model(
                [observations, self.actor_model(observations)])
            loss = -tf.reduce_mean(q_values)

        actor_grads = tape.gradient(loss, self.actor_model.trainable_weights)
        self.actor_model.optimizer.apply_gradients(
            zip(actor_grads, self.actor_model.trainable_weights))
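
A hypothetical training loop for the DDPGAgent above (not part of the original example), assuming the older Gym API in which reset() returns an observation and step() returns four values; the environment name and step budget are illustrative only.

import gym

# Pendulum-v1 is just an example of a continuous-action environment.
env = gym.make('Pendulum-v1')
agent = DDPGAgent(env.action_space, env.observation_space)

obs = env.reset()
for _ in range(50000):
    action = agent.forward(obs)
    next_obs, reward, done, info = env.step(action)
    agent.backward(obs, action, reward, done, next_obs)
    obs = env.reset() if done else next_obs
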
class MACE(Agent):
    def __init__(self, env: gym.Env, **kwargs):

        super(MACE, self).__init__(**kwargs)
        self.nb_actions = env.action_space.shape[0]

        obs_input_actor = Input(shape=(1, ) + env.observation_space.shape,
                                name='observation_input')
        x_ac = Flatten()(obs_input_actor)
        x_ac = Dense(units=256, activation='relu')(x_ac)

        obs_input_critic = Input(shape=(1, ) + env.observation_space.shape,
                                 name='observation_input')
        x_cr = Flatten()(obs_input_critic)
        x_cr = Dense(units=256, activation='relu')(x_cr)

        x_critic = Dense(units=128, activation='relu')(x_cr)
        value = Dense(units=1)(x_critic)

        x_actor = Dense(units=128, activation='relu')(x_ac)
        action = Dense(units=self.nb_actions, activation='tanh')(x_actor)

        actor = Model(inputs=obs_input_actor, outputs=action)
        critic = Model(inputs=obs_input_critic, outputs=value)

        critic_metrics = [mean_q]

        critic_optimizer = Adam(lr=1e-3)
        actor_optimizer = Adam(lr=1e-3)

        #        critic_optimizer = SGD(lr=1e-4, momentum=0.9)
        #        actor_optimizer = SGD(lr=1e-3, momentum=0.9)

        self.actor = actor
        self.critic = critic

        self.target_actor = clone_model(self.actor)
        self.target_actor.compile(optimizer='sgd', loss='mse')
        self.target_critic = clone_model(self.critic)
        self.target_critic.compile(optimizer='sgd', loss='mse')

        self.target_model_update = 1e-3
        #self.target_model_update=500

        if self.target_model_update < 1.:
            # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
            critic_updates = get_soft_target_model_updates(
                self.target_critic, self.critic, self.target_model_update)
            critic_optimizer = AdditionalUpdatesOptimizer(
                critic_optimizer, critic_updates)
            actor_updates = get_soft_target_model_updates(
                self.target_actor, self.actor, self.target_model_update)
            actor_optimizer = AdditionalUpdatesOptimizer(
                actor_optimizer, actor_updates)

        self.delta_clip = np.inf

        def clipped_error(y_true, y_pred):
            return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

        actor.compile(actor_optimizer, loss='mse')
        critic.compile(critic_optimizer, loss='mse', metrics=critic_metrics)

        self.compiled = True

        self.memory = SequentialMemory(limit=100000, window_length=1)
        self.memory_interval = 1
        self.memory_actor = SequentialMemory(limit=100000, window_length=1)
        self.memory_critic = SequentialMemory(limit=100000, window_length=1)

        self.nb_steps_warmup = 50000

        self.train_interval = 4
        self.batch_size = 64
        self.gamma = 0.99

        self.processor = None
        self.random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                                       mu=0.,
                                                       sigma=.3,
                                                       size=self.nb_actions)
        self.eps = 0.9

    def process_state_batch(self, batch):
        batch = np.array(batch)
        if self.processor is None:
            return batch
        return self.processor.process_state_batch(batch)

    def select_action(self, state):
        batch = [state]
        action = self.actor.predict_on_batch(np.asarray(batch)).flatten()
        # Apply noise, if a random process is set. (A commented-out variant only
        # added noise with probability self.eps, i.e. Bernoulli exploration.)
        if self.training and self.random_process is not None:
            noise = self.random_process.sample()
            assert noise.shape == action.shape
            action += noise
        return action

    def forward(self, observation):
        # Select an action.

        state = self.memory.get_recent_state(observation)
        action = self.select_action(state)  # TODO: move this into policy

        # Book-keeping.
        self.recent_observation = observation
        self.recent_action = action

        return action

    def backward(self, reward, terminal=False):
        # Store most recent experience in memory.
        if self.step % self.memory_interval == 0:
            self.memory.append(self.recent_observation,
                               self.recent_action,
                               reward,
                               terminal,
                               training=self.training)

        metrics = [np.nan for _ in self.metrics_names]
        if not self.training:
            # We're done here. No need to update the experience memory since we only use the working
            # memory to obtain the state over the most recent observations.
            return metrics

        # Train the network on a single stochastic batch.
        can_train_either = self.step > self.nb_steps_warmup
        if can_train_either and self.step % self.train_interval == 0:
            experiences = self.memory.sample(self.batch_size)
            assert len(experiences) == self.batch_size

            # Start by extracting the necessary parameters (we use a vectorized implementation).
            state0_batch = []
            reward_batch = []
            action_batch = []
            terminal1_batch = []
            state1_batch = []
            for e in experiences:
                state0_batch.append(e.state0)
                state1_batch.append(e.state1)
                reward_batch.append(e.reward)
                action_batch.append(e.action)
                terminal1_batch.append(0. if e.terminal1 else 1.)

            # Prepare and validate parameters.
            state0_batch = self.process_state_batch(state0_batch)
            state1_batch = self.process_state_batch(state1_batch)
            terminal1_batch = np.array(terminal1_batch)
            reward_batch = np.array(reward_batch)
            action_batch = np.array(action_batch)
            assert reward_batch.shape == (self.batch_size, )
            assert terminal1_batch.shape == reward_batch.shape
            assert action_batch.shape == (self.batch_size, self.nb_actions)

            # Update actor and critic, if warm up is over.
            if self.step > self.nb_steps_warmup:
                if len(self.critic.inputs) >= 3:
                    state1_batch_with_action = state1_batch[:]
                else:
                    state1_batch_with_action = [state1_batch]
                target_q_values = self.target_critic.predict_on_batch(
                    state1_batch_with_action).flatten()
                assert target_q_values.shape == (self.batch_size, )

                # Compute r_t + gamma * max_a Q(s_t+1, a) and update the target ys accordingly,
                # but only for the affected output units (as given by action_batch).
                discounted_reward_batch = self.gamma * target_q_values
                discounted_reward_batch *= terminal1_batch
                assert discounted_reward_batch.shape == reward_batch.shape
                targets = (reward_batch + discounted_reward_batch).reshape(
                    self.batch_size, 1)

                # Perform a single batch update on the critic network.
                if len(self.critic.inputs) >= 3:
                    state0_batch_with_action = state0_batch[:]
                else:
                    state0_batch_with_action = [state0_batch]
                #state0_batch_with_action.insert(self.critic_action_input_idx, action_batch)
                metrics = self.critic.train_on_batch(state0_batch_with_action,
                                                     targets)
                if self.processor is not None:
                    metrics += self.processor.metrics

            #Actor
            experiences = self.memory.sample(self.batch_size)
            assert len(experiences) == self.batch_size

            # Start by extracting the necessary parameters (we use a vectorized implementation).
            state0_batch = []
            reward_batch = []
            action_batch = []
            terminal1_batch = []
            state1_batch = []
            for e in experiences:
                state0_batch.append(e.state0)
                state1_batch.append(e.state1)
                reward_batch.append(e.reward)
                action_batch.append(e.action)
                terminal1_batch.append(0. if e.terminal1 else 1.)

            # Prepare and validate parameters.
            state0_batch = self.process_state_batch(state0_batch)
            state1_batch = self.process_state_batch(state1_batch)
            terminal1_batch = np.array(terminal1_batch)
            reward_batch = np.array(reward_batch)
            action_batch = np.array(action_batch)
            assert reward_batch.shape == (self.batch_size, )
            assert terminal1_batch.shape == reward_batch.shape
            assert action_batch.shape == (self.batch_size, self.nb_actions)

            if self.step > self.nb_steps_warmup:
                #Actor
                # Rebuild the critic inputs from the freshly sampled batch; the
                # earlier *_with_action variables referred to the critic's batch.
                if len(self.critic.inputs) >= 3:
                    state0_batch_with_action = state0_batch[:]
                    state1_batch_with_action = state1_batch[:]
                else:
                    state0_batch_with_action = [state0_batch]
                    state1_batch_with_action = [state1_batch]
                target_q_values1 = self.target_critic.predict_on_batch(
                    state1_batch_with_action).flatten()
                discounted_reward_batch = self.gamma * target_q_values1
                discounted_reward_batch *= terminal1_batch
                targets = (reward_batch + discounted_reward_batch)
                target_q_values0 = self.target_critic.predict_on_batch(
                    state0_batch_with_action).flatten()
                delta = targets - target_q_values0
                if len(self.actor.inputs) >= 2:
                    inputs = state0_batch[:]
                else:
                    #inputs = [state0_batch]
                    inputs = state0_batch
                pos_dif = delta > 0
                #                if self.step%1000==0:
                #                    print(np.sum(pos_dif))
                inputs = np.asarray(inputs)[pos_dif]
                actions_target = action_batch[pos_dif]
                #state0_batch_with_action.insert(self.critic_action_input_idx, action_batch)
                self.actor.train_on_batch(inputs, actions_target)

        if self.target_model_update >= 1 and self.step % self.target_model_update == 0:
            self.update_target_models_hard()

        return metrics

    def reset_states(self):
        if self.random_process is not None:
            self.random_process.reset_states()
        self.recent_action = None
        self.recent_observation = None
        if self.compiled:
            self.actor.reset_states()
            self.critic.reset_states()
            self.target_actor.reset_states()
            self.target_critic.reset_states()

    def update_target_models_hard(self):
        self.target_critic.set_weights(self.critic.get_weights())
        self.target_actor.set_weights(self.actor.get_weights())

    @property
    def metrics_names(self):
        names = self.critic.metrics_names[:]
        if self.processor is not None:
            names += self.processor.metrics_names[:]
        return names
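
Finally, a hypothetical way to run the MACE agent above, assuming that Agent here is keras-rl's rl.core.Agent (which supplies the fit() training loop); the environment name and step budget are illustrative only.

import gym

env = gym.make('Pendulum-v1')   # any continuous-action Gym environment
agent = MACE(env)
agent.fit(env, nb_steps=200000, visualize=False, verbose=1)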