Example #1
    def __init__(self, DQN, parameters=DQNParameters()):
        """
        DQN: the network used to estimate the action values (Q-values)
        parameters: a DQNParameters instance holding the training hyperparameters
        """

        self.on_loss_computed = Signal()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.DQN = DQN.to(self.device).train()

        self.frozen_DQN = copy.deepcopy(self.DQN).eval()
        for param in self.frozen_DQN.parameters():
            param.requires_grad = False
        self._update_frozen()

        self.memory = PrioritizedMemory(parameters.capacity)

        self.optimizer = optim.RMSprop(self.DQN.parameters(), lr=parameters.lr)
        self.parameters = parameters

        self.it_s_replay_time = generator_true_every(1)
        self.it_s_update_frozen_time = generator_true_every(
            self.parameters.frozen_steps)

        self.it_s_action_debug_time = generator_true_every(1000)
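
The generator_true_every helper comes from the surrounding project and is not shown here. Judging by how it is consumed with next(), it presumably yields True once every n calls; a minimal sketch under that assumption (not the project's actual implementation):

def generator_true_every(n):
    """Sketch: yield True on every n-th call and False otherwise."""
    count = 0
    while True:
        count += 1
        if count >= n:
            count = 0
            yield True
        else:
            yield False
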
Example #2
def test_memory():

    memory = PrioritizedMemory(10)
    memory.add(15, 1, 2, 3, 4, 5)
    memory.add(10, 4, 5, 6, 5, 2)

    indexes, transitions = zip(*memory.sample(2))

    assert indexes == (9, 10)
    assert transitions == (Transition(state=1,
                                      action=2,
                                      reward=3,
                                      next_state=4,
                                      terminal=5),
                           Transition(state=4,
                                      action=5,
                                      reward=6,
                                      next_state=5,
                                      terminal=2))
    """ Example of batch creation """
    assert Transition(*zip(*transitions)) == Transition(state=(1, 4),
                                                        action=(2, 5),
                                                        reward=(3, 6),
                                                        next_state=(4, 5),
                                                        terminal=(5, 2))
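
Transition itself is not defined in the snippet; given that it is built with keyword fields and transposed with zip in the assertions above, it is presumably a namedtuple along these lines (an assumption, shown for reference only):

from collections import namedtuple

Transition = namedtuple(
    'Transition', ('state', 'action', 'reward', 'next_state', 'terminal'))
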
Example #3
from rl.layers import NoisyNetDense

INPUT_SHAPE = (84, 84)
WINDOW_LENGTH = 4

env = gym.make('MsPacmanDeterministic-v4')
np.random.seed(231)
env.seed(231)
nb_actions = env.action_space.n
input_shape = (WINDOW_LENGTH, INPUT_SHAPE[0], INPUT_SHAPE[1])

agent = NoisyDQN(input_shape, nb_actions)
model = agent.model
memory = PrioritizedMemory(limit=1000000,
                           alpha=.6,
                           start_beta=.4,
                           end_beta=1.,
                           steps_annealed=30000000,
                           window_length=WINDOW_LENGTH)
processor = AtariProcessor()
policy = GreedyQPolicy()

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               processor=processor,
               enable_double_dqn=True,
               enable_dueling_network=True,
               nb_steps_warmup=50000,
               gamma=.99,
               target_model_update=10000)
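
The example stops at the agent construction. With keras-rl the usual next steps are to compile and train the agent; a sketch in the style of Example #6, with the optimizer, learning rate, step count and weights file chosen purely for illustration:

from keras.optimizers import Adam

dqn.compile(Adam(lr=.00025), metrics=['mae'])
dqn.fit(env, nb_steps=10000000, log_interval=10000)
dqn.save_weights('dqn_MsPacmanDeterministic-v4_weights.h5f', overwrite=True)
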
Example #4
import copy

import numpy
import torch
import torch.nn.functional as F
import torch.optim as optim

# Project-specific helpers (Signal, PrioritizedMemory, Transition, DQNParameters,
# generator_true_every) are assumed to be importable from the surrounding project.


class DoubleDQN():
    """
    From Deep Reinforcement Learning with Double Q-learning
    at https://arxiv.org/abs/1509.06461
    """
    def __init__(self, DQN, parameters=DQNParameters()):
        """
        DQN: the network used to estimate the action values (Q-values)
        parameters: a DQNParameters instance holding the training hyperparameters
        """

        self.on_loss_computed = Signal()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.DQN = DQN.to(self.device).train()

        self.frozen_DQN = copy.deepcopy(self.DQN).eval()
        for param in self.frozen_DQN.parameters():
            param.requires_grad = False
        self._update_frozen()

        self.memory = PrioritizedMemory(parameters.capacity)

        self.optimizer = optim.RMSprop(self.DQN.parameters(), lr=parameters.lr)
        self.parameters = parameters

        self.it_s_replay_time = generator_true_every(1)
        self.it_s_update_frozen_time = generator_true_every(
            self.parameters.frozen_steps)

        self.it_s_action_debug_time = generator_true_every(1000)

    def _update_frozen(self):
        """
        Let it go, let it go
        I am one with the wind and sky
        Let it go, let it go
        You'll never see me cry
        Here I stand and here I stay
        Let the storm rage on
        """
        self.frozen_DQN.load_state_dict(self.DQN.state_dict())

    def select_action(self, state):
        """ Return the selected action """
        with torch.no_grad():
            values = self.DQN(torch.FloatTensor([state]).to(
                self.device)).cpu().data.numpy()[0]
            # Act randomly until enough transitions are stored, then act greedily.
            if len(self.memory) > self.parameters.waiting_time:
                selected_action = numpy.argmax(values)
                if next(self.it_s_action_debug_time):
                    print(selected_action, values)
            else:
                selected_action = numpy.random.randint(len(values))

            return selected_action

    def observe(self, state, action, reward, next_state, is_terminal):
        """
        Observe an experience tuple (state, action, reward, next_state, is_terminal)
        """

        if self.parameters.clipping is not None:  # Clip the reward
            reward = numpy.clip(reward, -self.parameters.clipping,
                                self.parameters.clipping)

        # Store the transition with a fixed initial priority of 10.
        self.memory.add(10, state, action, reward, next_state, is_terminal)

        if next(self.it_s_update_frozen_time):
            self._update_frozen()

        if next(self.it_s_replay_time) and len(
                self.memory) > self.parameters.waiting_time:
            self._replay()

    def train(self):
        self.DQN.train()

    def eval(self):
        self.DQN.eval()

    def save(self):
        torch.save(self.DQN.state_dict(), "model.torch")

    def _replay(self):
        """
        Learn things
        """
        indexes, transitions = zip(
            *self.memory.sample(self.parameters.batch_size))
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        state_values = self.DQN(
            torch.FloatTensor(batch.state).to(self.device),
            torch.LongTensor(batch.action).to(self.device).unsqueeze(1))
        with torch.no_grad():
            next_states = torch.FloatTensor(batch.next_state).to(self.device)
            # Double DQN: the online network selects the best next action...
            best_actions = self.DQN(next_states).argmax(1, keepdim=True)
            # ...and the frozen target network evaluates it.
            next_values = self.frozen_DQN(next_states).gather(1, best_actions)
            rewards = torch.FloatTensor(batch.reward).to(self.device).unsqueeze(1)
            not_terminal = 1 - torch.FloatTensor(batch.terminal).to(self.device).unsqueeze(1)
            expected_state_values = (rewards + self.parameters.gamma ** self.memory.n_step
                                     * next_values * not_terminal)

        loss = F.mse_loss(state_values, expected_state_values)  # MSE Loss
        self.on_loss_computed.emit(
            loss.cpu().data.numpy())  # Emit the computed loss

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.DQN.parameters():
            if hasattr(param, "grad") and hasattr(param.grad, "data"):
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
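
DQNParameters is not shown in these snippets; from the attributes DoubleDQN reads, it presumably bundles the hyperparameters below. A minimal sketch, with the field names taken from the code above and default values that are purely illustrative:

from dataclasses import dataclass


@dataclass
class DQNParameters:
    capacity: int = 100000     # replay memory capacity
    lr: float = 1e-4           # RMSprop learning rate
    gamma: float = 0.99        # discount factor
    batch_size: int = 32       # transitions sampled per replay step
    frozen_steps: int = 1000   # steps between target network updates
    waiting_time: int = 1000   # transitions to collect before learning and acting greedily
    clipping: float = 1.0      # reward clipping bound (None disables clipping)
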
Example #5
model.add(Activation('relu'))
model.add(Convolution2D(64, 4, 4, subsample=(2, 2)))
model.add(Activation('relu'))
model.add(Convolution2D(64, 3, 3, subsample=(1, 1)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use any built-in Keras optimizer
# and any of the built-in Keras metrics.
memory = PrioritizedMemory(limit=100000,
                           error=0.01,
                           alfa=0.6,
                           window_length=WINDOW_LENGTH)
processor = SpectrumProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05
# so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1.,
                              value_min=.1,
                              value_test=.05,
                              nb_steps=1000000)
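
The comment above announces the agent configuration and compilation, but the snippet is cut off before that point. A plausible continuation in the style of Examples #3 and #6, with all parameter values chosen for illustration rather than taken from the original project:

from keras.optimizers import Adam

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               processor=processor,
               nb_steps_warmup=50000,
               gamma=.99,
               target_model_update=10000,
               train_interval=4,
               delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])
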
Example #6
    def _build_dqn_agent(self, params):
        NB_ACTIONS = 7

        # ----------------------------------------------------------------------------------------------------------------
        inputShape = (params['width'], params['height'], 3)

        model = Sequential()
        model.add(
            Conv2D(16, (3, 3),
                   input_shape=inputShape,
                   padding='same',
                   activation='relu'))
        model.add(Conv2D(32, (3, 3), padding='same', activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
        model.add(NoisyNetDense(16, activation='linear'))
        model.add(Flatten())
        model.add(NoisyNetDense(NB_ACTIONS, activation='linear'))

        model.summary()
        # ----------------------------------------------------------------------------------------------------------------

        # Memory replay
        if not params['prio_memory']:
            print("Using Sequential memory")
            memory = SequentialMemory(limit=params['mem_size'],
                                      window_length=1)
        else:
            print("Using Prioritized memory")
            # Lower the learning rate when using prioritized replay (a common heuristic).
            params['lr'] = params['lr'] / 4
            memory = PrioritizedMemory(limit=params['mem_size'],
                                       alpha=0.6,
                                       start_beta=0.5,
                                       end_beta=1.0,
                                       steps_annealed=params['annealing'],
                                       window_length=1)

        # Epsilon Greedy policy, linearly decreasing
        if not params['noisy_layer']:
            print("Using Annealed Eps Greedy policy")
            self.policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                               attr='eps',
                                               value_max=params['eps'],
                                               value_min=params['eps_final'],
                                               value_test=0.0,
                                               nb_steps=params['annealing'])

        # Or Greedy policy in case of noisy layers
        else:
            print("Using Q Greedy policy (with noisy layer)")
            self.policy = GreedyQPolicy()

        # Keras DQN agent
        self._dqn = DQNAgent(
            model=model,
            nb_actions=NB_ACTIONS,
            policy=self.policy,
            memory=memory,
            batch_size=params['batch_size'],
            processor=WindowProcessor(),
            enable_double_dqn=True,
            enable_dueling_network=True,
            nb_steps_warmup=params['train_start'],
            gamma=params['discount'],
            target_model_update=1000,
            train_interval=1,
            delta_clip=1.,
            custom_model_objects={"NoisyNetDense": NoisyNetDense})

        self._dqn.compile(Adam(lr=params['lr']), metrics=['mae'])

        if params['load_file']:
            print("file loaded")
            self._dqn.load_weights(params['load_file'])
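
Once built, the agent would typically be trained and saved through the standard keras-rl calls. A hypothetical companion method (the environment argument, step count and weights file name are placeholders, not part of the original class):

    def _train_dqn_agent(self, env, nb_steps=500000):
        """Hypothetical sketch: train the built agent on env and save its weights."""
        self._dqn.fit(env, nb_steps=nb_steps, visualize=False, verbose=2)
        self._dqn.save_weights('dqn_weights.h5f', overwrite=True)
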