Example #1
import numpy as np
from rl.memory import SequentialMemory


def test_training_flag():
    obs_size = (3, 4)

    obs0 = np.random.random(obs_size)
    terminal0 = False

    obs1 = np.random.random(obs_size)
    terminal1 = True

    obs2 = np.random.random(obs_size)
    terminal2 = False

    for training in (True, False):
        memory = SequentialMemory(3, window_length=2)

        state = memory.get_recent_state(obs0)
        assert state.shape == (2,) + obs_size
        assert np.allclose(state[0], 0.)
        assert np.all(state[1] == obs0)
        assert memory.nb_entries == 0
        
        memory.append(obs0, 0, 0., terminal1, training=training)
        state = memory.get_recent_state(obs1)
        assert state.shape == (2,) + obs_size
        assert np.all(state[0] == obs0)
        assert np.all(state[1] == obs1)
        if training:
            assert memory.nb_entries == 1
        else:
            assert memory.nb_entries == 0

        memory.append(obs1, 0, 0., terminal2, training=training)
        state = memory.get_recent_state(obs2)
        assert state.shape == (2,) + obs_size
        assert np.allclose(state[0], 0.)
        assert np.all(state[1] == obs2)
        if training:
            assert memory.nb_entries == 2
        else:
            assert memory.nb_entries == 0
Example #2
def test_get_recent_state_with_episode_boundaries():
    memory = SequentialMemory(3, window_length=2, ignore_episode_boundaries=False)
    obs_size = (3, 4)
    
    obs0 = np.random.random(obs_size)
    terminal0 = False

    obs1 = np.random.random(obs_size)
    terminal1 = False

    obs2 = np.random.random(obs_size)
    terminal2 = False

    obs3 = np.random.random(obs_size)
    terminal3 = True

    obs4 = np.random.random(obs_size)
    terminal4 = False

    obs5 = np.random.random(obs_size)
    terminal5 = True

    obs6 = np.random.random(obs_size)
    terminal6 = False

    state = memory.get_recent_state(obs0)
    assert state.shape == (2,) + obs_size
    assert np.allclose(state[0], 0.)
    assert np.all(state[1] == obs0)

    # memory.append takes the current observation, the action, the reward received after taking
    # that action, and whether the *new* observation is terminal; thus `obs0` and `terminal1`
    # are correct here.
    memory.append(obs0, 0, 0., terminal1)
    state = memory.get_recent_state(obs1)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == obs0)
    assert np.all(state[1] == obs1)

    memory.append(obs1, 0, 0., terminal2)
    state = memory.get_recent_state(obs2)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == obs1)
    assert np.all(state[1] == obs2)

    memory.append(obs2, 0, 0., terminal3)
    state = memory.get_recent_state(obs3)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == obs2)
    assert np.all(state[1] == obs3)

    memory.append(obs3, 0, 0., terminal4)
    state = memory.get_recent_state(obs4)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == np.zeros(obs_size))
    assert np.all(state[1] == obs4)

    memory.append(obs4, 0, 0., terminal5)
    state = memory.get_recent_state(obs5)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == obs4)
    assert np.all(state[1] == obs5)

    memory.append(obs5, 0, 0., terminal6)
    state = memory.get_recent_state(obs6)
    assert state.shape == (2,) + obs_size
    assert np.all(state[0] == np.zeros(obs_size))
    assert np.all(state[1] == obs6)
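
The two tests above only exercise `get_recent_state`; the agent examples below also rely on
`SequentialMemory.sample`, which returns `Experience` tuples carrying `state0`, `action`,
`reward`, `state1` and `terminal1`. A minimal sketch of that API, assuming keras-rl's
`rl.memory.SequentialMemory` (the observation shape and loop below are illustrative only):

import numpy as np
from rl.memory import SequentialMemory

memory = SequentialMemory(limit=100, window_length=1)
for _ in range(10):
    # training defaults to True, so these transitions are stored for sampling.
    memory.append(np.random.random((3, 4)), action=0, reward=0., terminal=False)

experiences = memory.sample(batch_size=4)
for e in experiences:
    # state0/state1 are lists of window_length observations.
    print(e.state0[0].shape, e.action, e.reward, e.terminal1)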
Example #3
class MACE(Agent):
    def __init__(self, env: gym.Env, **kwargs):

        super(MACE, self).__init__(**kwargs)
        self.nb_actions = env.action_space.shape[0]

        obs_input_actor = Input(shape=(1, ) + env.observation_space.shape,
                                name='observation_input')
        x_ac = Flatten()(obs_input_actor)
        x_ac = Dense(units=256, activation='relu')(x_ac)

        obs_input_critic = Input(shape=(1, ) + env.observation_space.shape,
                                 name='observation_input')
        x_cr = Flatten()(obs_input_critic)
        x_cr = Dense(units=256, activation='relu')(x_cr)

        x_critic = Dense(units=128, activation='relu')(x_cr)
        value = Dense(units=1)(x_critic)

        x_actor = Dense(units=128, activation='relu')(x_ac)
        action = Dense(units=self.nb_actions, activation='tanh')(x_actor)

        actor = Model(inputs=obs_input_actor, outputs=action)
        critic = Model(inputs=obs_input_critic, outputs=value)

        critic_metrics = [mean_q]

        critic_optimizer = Adam(lr=1e-3)
        actor_optimizer = Adam(lr=1e-3)

        #        critic_optimizer = SGD(lr=1e-4, momentum=0.9)
        #        actor_optimizer = SGD(lr=1e-3, momentum=0.9)

        self.actor = actor
        self.critic = critic

        self.target_actor = clone_model(self.actor)
        self.target_actor.compile(optimizer='sgd', loss='mse')
        self.target_critic = clone_model(self.critic)
        self.target_critic.compile(optimizer='sgd', loss='mse')

        self.target_model_update = 1e-3
        #self.target_model_update=500

        if self.target_model_update < 1.:
            # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
            critic_updates = get_soft_target_model_updates(
                self.target_critic, self.critic, self.target_model_update)
            critic_optimizer = AdditionalUpdatesOptimizer(
                critic_optimizer, critic_updates)
            actor_updates = get_soft_target_model_updates(
                self.target_actor, self.actor, self.target_model_update)
            actor_optimizer = AdditionalUpdatesOptimizer(
                actor_optimizer, actor_updates)

        self.delta_clip = np.inf

        def clipped_error(y_true, y_pred):
            return K.mean(huber_loss(y_true, y_pred, self.delta_clip), axis=-1)

        actor.compile(actor_optimizer, loss='mse')
        critic.compile(critic_optimizer, loss='mse', metrics=critic_metrics)

        self.compiled = True

        self.memory = SequentialMemory(limit=100000, window_length=1)
        self.memory_interval = 1
        self.memory_actor = SequentialMemory(limit=100000, window_length=1)
        self.memory_critic = SequentialMemory(limit=100000, window_length=1)

        self.nb_steps_warmup = 50000

        self.train_interval = 4
        self.batch_size = 64
        self.gamma = 0.99

        self.processor = None
        self.random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                                       mu=0.,
                                                       sigma=.3,
                                                       size=self.nb_actions)
        self.eps = 0.9

    def process_state_batch(self, batch):
        batch = np.array(batch)
        if self.processor is None:
            return batch
        return self.processor.process_state_batch(batch)

    def select_action(self, state):
        batch = [state]
        action = self.actor.predict_on_batch(np.asarray(batch)).flatten()
        # Apply noise, if a random process is set.
        if self.training and self.random_process is not None:
            # Disabled Bernoulli exploration variant:
            #     rd = np.random.rand()
            #     if rd < self.eps:
            #         ... add noise, self.action_exploration = True
            #     else:
            #         self.action_exploration = False
            noise = self.random_process.sample()
            assert noise.shape == action.shape
            action += noise
        return action

    def forward(self, observation):
        # Select an action.

        state = self.memory.get_recent_state(observation)
        action = self.select_action(state)  # TODO: move this into policy

        # Book-keeping.
        self.recent_observation = observation
        self.recent_action = action

        return action

    def backward(self, reward, terminal=False):
        # Store most recent experience in memory.
        if self.step % self.memory_interval == 0:
            self.memory.append(self.recent_observation,
                               self.recent_action,
                               reward,
                               terminal,
                               training=self.training)

        metrics = [np.nan for _ in self.metrics_names]
        if not self.training:
            # We're done here. No need to update the experience memory since we only use the working
            # memory to obtain the state over the most recent observations.
            return metrics

        # Train the network on a single stochastic batch.
        can_train_either = self.step > self.nb_steps_warmup
        if can_train_either and self.step % self.train_interval == 0:
            experiences = self.memory.sample(self.batch_size)
            assert len(experiences) == self.batch_size

            # Start by extracting the necessary parameters (we use a vectorized implementation).
            state0_batch = []
            reward_batch = []
            action_batch = []
            terminal1_batch = []
            state1_batch = []
            for e in experiences:
                state0_batch.append(e.state0)
                state1_batch.append(e.state1)
                reward_batch.append(e.reward)
                action_batch.append(e.action)
                terminal1_batch.append(0. if e.terminal1 else 1.)

            # Prepare and validate parameters.
            state0_batch = self.process_state_batch(state0_batch)
            state1_batch = self.process_state_batch(state1_batch)
            terminal1_batch = np.array(terminal1_batch)
            reward_batch = np.array(reward_batch)
            action_batch = np.array(action_batch)
            assert reward_batch.shape == (self.batch_size, )
            assert terminal1_batch.shape == reward_batch.shape
            assert action_batch.shape == (self.batch_size, self.nb_actions)

            # Update actor and critic, if warm up is over.
            if self.step > self.nb_steps_warmup:
                if len(self.critic.inputs) >= 3:
                    state1_batch_with_action = state1_batch[:]
                else:
                    state1_batch_with_action = [state1_batch]
                target_q_values = self.target_critic.predict_on_batch(
                    state1_batch_with_action).flatten()
                assert target_q_values.shape == (self.batch_size, )

                # Compute r_t + gamma * max_a Q(s_t+1, a) and update the target ys accordingly,
                # but only for the affected output units (as given by action_batch).
                discounted_reward_batch = self.gamma * target_q_values
                discounted_reward_batch *= terminal1_batch
                assert discounted_reward_batch.shape == reward_batch.shape
                targets = (reward_batch + discounted_reward_batch).reshape(
                    self.batch_size, 1)

                # Perform a single batch update on the critic network.
                if len(self.critic.inputs) >= 3:
                    state0_batch_with_action = state0_batch[:]
                else:
                    state0_batch_with_action = [state0_batch]
                #state0_batch_with_action.insert(self.critic_action_input_idx, action_batch)
                metrics = self.critic.train_on_batch(state0_batch_with_action,
                                                     targets)
                if self.processor is not None:
                    metrics += self.processor.metrics

            # Actor: re-sample a batch for the actor update.
            experiences = self.memory.sample(self.batch_size)
            assert len(experiences) == self.batch_size

            # Start by extracting the necessary parameters (we use a vectorized implementation).
            state0_batch = []
            reward_batch = []
            action_batch = []
            terminal1_batch = []
            state1_batch = []
            for e in experiences:
                state0_batch.append(e.state0)
                state1_batch.append(e.state1)
                reward_batch.append(e.reward)
                action_batch.append(e.action)
                terminal1_batch.append(0. if e.terminal1 else 1.)

            # Prepare and validate parameters.
            state0_batch = self.process_state_batch(state0_batch)
            state1_batch = self.process_state_batch(state1_batch)
            terminal1_batch = np.array(terminal1_batch)
            reward_batch = np.array(reward_batch)
            action_batch = np.array(action_batch)
            assert reward_batch.shape == (self.batch_size, )
            assert terminal1_batch.shape == reward_batch.shape
            assert action_batch.shape == (self.batch_size, self.nb_actions)

            if self.step > self.nb_steps_warmup:
                # Actor update: rebuild the critic inputs from the freshly sampled batch so
                # the temporal-difference error below matches these transitions.
                if len(self.critic.inputs) >= 3:
                    state0_batch_with_action = state0_batch[:]
                    state1_batch_with_action = state1_batch[:]
                else:
                    state0_batch_with_action = [state0_batch]
                    state1_batch_with_action = [state1_batch]
                target_q_values1 = self.target_critic.predict_on_batch(
                    state1_batch_with_action).flatten()
                discounted_reward_batch = self.gamma * target_q_values1
                discounted_reward_batch *= terminal1_batch
                targets = reward_batch + discounted_reward_batch
                target_q_values0 = self.target_critic.predict_on_batch(
                    state0_batch_with_action).flatten()
                delta = targets - target_q_values0
                if len(self.actor.inputs) >= 2:
                    inputs = state0_batch[:]
                else:
                    inputs = state0_batch
                # Train the actor only on transitions with a positive TD error.
                pos_dif = delta > 0
                # if self.step % 1000 == 0:
                #     print(np.sum(pos_dif))
                inputs = np.asarray(inputs)[pos_dif]
                actions_target = action_batch[pos_dif]
                self.actor.train_on_batch(inputs, actions_target)

        if self.target_model_update >= 1 and self.step % self.target_model_update == 0:
            self.update_target_models_hard()

        return metrics

    def reset_states(self):
        if self.random_process is not None:
            self.random_process.reset_states()
        self.recent_action = None
        self.recent_observation = None
        if self.compiled:
            self.actor.reset_states()
            self.critic.reset_states()
            self.target_actor.reset_states()
            self.target_critic.reset_states()

    def update_target_models_hard(self):
        self.target_critic.set_weights(self.critic.get_weights())
        self.target_actor.set_weights(self.actor.get_weights())

    @property
    def metrics_names(self):
        names = self.critic.metrics_names[:]
        if self.processor is not None:
            names += self.processor.metrics_names[:]
        return names
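
MACE wires its models, replay memory, and exploration process together in the constructor, so
driving it only needs a continuous-action environment plus keras-rl's usual fit/test loop. A
minimal usage sketch, assuming a Gym task such as Pendulum-v0 (the environment name and step
counts are illustrative, not taken from the original code):

import gym

env = gym.make('Pendulum-v0')
agent = MACE(env)
# Warm-up lasts 50000 steps, so train well past that before testing.
agent.fit(env, nb_steps=200000, visualize=False, verbose=1)
agent.test(env, nb_episodes=5, visualize=True)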
Example #4
class EIIE(ArenaDDPGAgent):
    """
    Modified Ensemble of Identical Independent Evaluators
    As described in:
    https://arxiv.org/pdf/1706.10059.pdf
    SELU activations instead of ReLU.
    """

    def __init__(self,
                 env,
                 vision_neurons=2,
                 pattern_neurons=20,
                 nb_steps_warmup_critic=100,
                 nb_steps_warmup_actor=100,
                 batch_size=32,
                 lr=.01,
                 clipnorm=1.,
                 gamma=.99,
                 target_model_update=1e-2,
                 random_process=None,
                 mem_size=10000,
                 name=None):

        self.env = env
        self.name = name
        self.batch_size = batch_size
        self.n_pairs = (self.env.action_space.shape[0])

        action_input = Input(shape=(self.env.action_space.shape[0],),
                             name='action_input')
        observation_input = Input(shape=(1,
                                         self.env.obs_steps,
                                         (self.env.action_space.shape[0] - 1) * 6 + 1),
                                         name='observation_input')
        processed_obs = ProcessObs(name='processed_observation')(observation_input)
        portifolio_vector = PortifolioVector(name='portifolio_vector')(observation_input)

        reg = l2(1e-5)
        init = lecun_normal(42)
        # init = glorot_normal(42)

        ## Actor / Critic network
        a = BatchNormalization()(processed_obs)
        a = Conv2D(vision_neurons,
                   kernel_size=(1, 3),
                   padding='same',
                   activation='selu',
                   kernel_regularizer=reg,
                   kernel_initializer=init)(a)
        b = Conv2D(vision_neurons,
                   kernel_size=(1, 5),
                   padding='same',
                   activation='selu',
                   kernel_regularizer=reg,
                   kernel_initializer=init)(a)
        c = Conv2D(vision_neurons,
                   kernel_size=(1, 7),
                   padding='same',
                   activation='selu',
                   kernel_regularizer=reg,
                   kernel_initializer=init)(a)

        v = Concatenate(axis=-1)([a, b, c])
        # v = BatchNormalization()(v)
        v = Conv2D(pattern_neurons,
                   kernel_size=(1, self.env.obs_steps),
                   padding='valid',
                   activation='selu',
                   kernel_regularizer=reg,
                   kernel_initializer=init)(v)

        # Concatenate
        ka = Concatenate(axis=-1)([v, portifolio_vector])

        # Portfolio vector
        pv = Conv2D(1,
                    kernel_size=(1, 1),
                    padding='valid',
                    activation='linear',
                    activity_regularizer=reg,
                    kernel_initializer=init)(ka)

        # Shape conforming
        fa = Flatten()(pv)

        kc = Concatenate(axis=-1)([fa, action_input])

        # Add cash bias to output vector
        actor_out = CashBias()(fa)

        critic_out = Dense(1,
                           activation='linear',
                           activity_regularizer=reg,
                           kernel_initializer=init)(kc)

        # Define and compile models
        self.actor = Model(inputs=observation_input, outputs=actor_out)
        self.critic = Model(inputs=[observation_input, action_input], outputs=critic_out)

        self.memory = SequentialMemory(limit=mem_size, window_length=1)
        super().__init__(nb_actions=self.env.action_space.shape[0],
                         actor=self.actor,
                         critic=self.critic,
                         batch_size=batch_size,
                         critic_action_input=action_input,
                         memory=self.memory,
                         nb_steps_warmup_critic=nb_steps_warmup_critic,
                         nb_steps_warmup_actor=nb_steps_warmup_actor,
                         random_process=random_process,
                         gamma=gamma,
                         target_model_update=target_model_update,
                         custom_model_objects={
                             # Keys must match the layer class names used above.
                             'PortifolioVector': PortifolioVector,
                             'ProcessObs': ProcessObs,
                             'CashBias': CashBias
                         }
                         )

        self.compile(Nadam(lr=lr, clipnorm=clipnorm), metrics=['mae'])

    def forward(self, observation):
        # Select an action.
        observation = observation.values
        state = self.memory.get_recent_state(observation)
        action = self.select_action(state)  # TODO: move this into policy
        if self.processor is not None:
            action = self.processor.process_action(action)

        # Book-keeping.
        self.recent_observation = observation
        self.recent_action = action

        return action

    def select_action(self, state):
        batch = self.process_state_batch([state])
        action = self.actor.predict_on_batch(batch).flatten()
        assert action.shape == (self.nb_actions,)

        # Apply noise, if a random process is set.
        if self.training and self.random_process is not None:
            noise = self.random_process.sample()
            assert noise.shape == action.shape
            action += noise

        return array_normalize(action)

    def predict(self, observation):
        return self.forward(observation)

    def load(self, number=-1):
        return self.load_from_db(self.env, number)

    def save(self):
        return self.save_to_db(self.env, self.name)

    def save_memory(self):
        return self.save_memory_to_db(self.env, self.name)

    def load_memory(self, number=-1):
        return self.load_memory_from_db(self.env, number)
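
As with MACE, the EIIE agent is driven through keras-rl's standard training loop (assuming
ArenaDDPGAgent follows the DDPGAgent interface). A hypothetical usage sketch; make_trading_env
is a placeholder for however the arena-style environment is built, and it must expose
action_space and obs_steps as the constructor above expects:

from rl.random import OrnsteinUhlenbeckProcess

env = make_trading_env()  # hypothetical factory, not part of the original code
nb_actions = env.action_space.shape[0]
agent = EIIE(env,
             random_process=OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3,
                                                     size=nb_actions),
             name='eiie-demo')
agent.fit(env, nb_steps=50000, visualize=False, verbose=1)
agent.save()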