Example #1
        _t_train_batch = t_train[index]
        t_train_batch = np.eye(3)[list(map(int, _t_train_batch))]

        trainer.train_minibatch({
            features: x_train_batch,
            label: t_train_batch
        })
        sample_count = trainer.previous_minibatch_sample_count
        aggregate_loss += trainer.previous_minibatch_loss_average * sample_count

    last_avg_error = aggregate_loss / trainer.total_number_of_samples_seen

    avg_error = trainer.test_minibatch({features: x_test, label: t_test})
    print(' error rate on an unseen minibatch: {}'.format(avg_error))

# Save the trained model in ONNX format.
output_file_path = "./cntk_iris_model.onnx"
model.save(output_file_path, format=C.ModelFormat.ONNX)

# Load the ONNX model back and run inference with it.
print("========== START INFERENCE ==========")
reload_model = C.Function.load(output_file_path,
                               device=C.device.gpu(0),
                               format=C.ModelFormat.ONNX)

classifier = C.softmax(reload_model)

for i, train in enumerate(x_train):
    infer = np.argmax(classifier.eval([train]))
    print("[{}] in:{} correct:{} infer:{}".format(i, train, t_train[i], infer))
Example #2
class DeepQAgent(object):
    """
    Implementation of Deep Q Neural Network agent like in:
        Nature 518, "Human-level control through deep reinforcement learning" (Mnih et al., 2015)
    """
    def __init__(self,
                 input_shape,
                 nb_actions,
                 gamma=0.99,
                 explorer=ExpEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.0005,
                 momentum=0.95,
                 minibatch_size=128,
                 memory_size=500000,
                 train_after=256,
                 train_interval=2,
                 target_update_interval=10000,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._memory = ReplayMemory(memory_size, input_shape)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                # Convolution2D((8, 8), 16, strides=4),
                # Convolution2D((4, 4), 32, strides=2),
                # Convolution2D((3, 3), 32, strides=1),
                Dense(128, init=he_uniform()),
                Dense(128, init=he_uniform()),
                Dense(nb_actions, activation=None, init=he_uniform())
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) +
                rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape],
                   actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions,
                                 axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_parameter_schedule(learning_rate)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters,
                     lr_schedule,
                     momentum=m_schedule,
                     variance_momentum=vm_schedule)

        log_dir = 'metrics/' + datetime.now().strftime('%Y%m%d%H%M%S')
        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir=log_dir, model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd,
                                self._metrics_writer)

    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: int >= 0: Index of the next action to take
        """
        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            q_values = self._action_value_net.eval(state)

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of how many actions have been taken
        self._num_actions_taken += 1
        return action

    def observe(self, state, action, reward, new_state, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the state

        Attributes:
            state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward received for taking this action in that state
            new_state (Tensor[input_shape]): New environment state
            done (bool): Indicates whether the action terminated the episode
        """
        self._episode_rewards.append(reward)

        # If done, reset short-term memory (i.e. the episode history)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Append to long term memory
        self._memory.append(state, action, reward, new_state, done)

    def train(self):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, which is a more stable version
        of the Action Value Network for increasing training stability.

        The Target Network is a frozen copy of the Action Value Network updated as regular intervals.
        """

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, rewards, post_states, terminals = self._memory.minibatch(
                    self._minibatch_size)

                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(
                            actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals))

                # Update the Target Network if needed
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(
                        CloneMethod.freeze)

    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning
        """
        if len(self._episode_q_means) > 0:
            mean_q = float(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q,
                                             self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = float(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q,
                                             self._num_actions_taken)

        tot_reward = sum(self._episode_rewards)
        self._metrics_writer.write_value('Sum rewards per ep.', tot_reward,
                                         self._num_actions_taken)
        self._metrics_writer.write_value('Episode length.',
                                         len(self._episode_rewards),
                                         self._num_actions_taken)
        self._metrics_writer.write_value(
            'Sum rewards per step.', tot_reward / len(self._episode_rewards),
            self._num_actions_taken)
        self._metrics_writer.write_value(
            'Exploration rate.',
            self._explorer._epsilon(self._num_actions_taken),
            self._num_actions_taken)

    def save(self, path):
        self._action_value_net.save(path)

    def load(self, path):
        from cntk import load_model
        self._action_value_net = load_model(path)
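
A hedged sketch of how DeepQAgent might be driven from an environment loop is given below; the OpenAI Gym environment, the (4,) observation shape, and the step budget are illustrative assumptions, not part of the snippet above.

# Hypothetical driver loop for DeepQAgent (environment and shapes are assumptions).
import gym
import numpy as np

env = gym.make('CartPole-v0')
agent = DeepQAgent(input_shape=(4,), nb_actions=env.action_space.n, monitor=False)

state = env.reset()
for _ in range(100000):
    action = agent.act(state.astype(np.float32))
    new_state, reward, done, _ = env.step(action)
    agent.observe(state, action, reward, new_state, done)
    agent.train()
    state = env.reset() if done else new_state
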
Example #3
class LearningAgent(object):
    def __init__(self,
                 state_dim,
                 action_dim,
                 gamma=0.99,
                 learning_rate=1e-4,
                 momentum=0.95):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma

        with default_options(activation=relu, init=he_uniform()):
            # Convolution filter counts were halved to save on memory, no gpu :(
            self.model = Sequential([
                Convolution2D((8, 8), 16, strides=4, name='conv1'),
                Convolution2D((4, 4), 32, strides=2, name='conv2'),
                Convolution2D((3, 3), 32, strides=1, name='conv3'),
                Dense(256, init=he_uniform(scale=0.01), name='dense1'),
                Dense(action_dim,
                      activation=None,
                      init=he_uniform(scale=0.01),
                      name='actions')
            ])
            self.model.update_signature(Tensor[state_dim])

        # Create the target model as a copy of the online model
        self.target_model = None
        self.update_target()

        self.pre_states = input_variable(state_dim, name='pre_states')
        self.actions = input_variable(action_dim, name='actions')
        self.post_states = input_variable(state_dim, name='post_states')
        self.rewards = input_variable((), name='rewards')
        self.terminals = input_variable((), name='terminals')
        self.is_weights = input_variable((), name='is_weights')

        predicted_q = reduce_sum(self.model(self.pre_states) * self.actions,
                                 axis=0)

        # DQN - calculate target q values
        # post_q = reduce_max(self.target_model(self.post_states), axis=0)

        # DDQN - calculate target q values
        online_selection = one_hot(
            argmax(self.model(self.post_states), axis=0), self.action_dim)
        post_q = reduce_sum(self.target_model(self.post_states) *
                            online_selection,
                            axis=0)

        post_q = (1.0 - self.terminals) * post_q
        target_q = stop_gradient(self.rewards + self.gamma * post_q)

        # Huber loss
        delta = 1.0
        self.td_error = minus(predicted_q, target_q, name='td_error')
        abs_error = abs(self.td_error)
        errors = element_select(less(abs_error, delta),
                                square(self.td_error) * 0.5,
                                delta * (abs_error - 0.5 * delta))
        loss = errors * self.is_weights

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)

        self._learner = adam(self.model.parameters,
                             lr_schedule,
                             m_schedule,
                             variance_momentum=vm_schedule)
        self.writer = TensorBoardProgressWriter(log_dir='metrics',
                                                model=self.model)
        self.trainer = Trainer(self.model, (loss, None), [self._learner],
                               self.writer)

    def act(self, state, epsilon):
        """
        Selects an action to take based on the epsilon-greedy method
        :param state: The current state
        :param epsilon: Determines the amount of exploration. (1 - full exploration, 0 - no exploration)
        """
        if np.random.rand() < epsilon:
            # Explore (random action)
            return np.random.choice(self.action_dim)
        else:
            # Exploit (greedy action based on knowledge)
            return self.model.eval(state).argmax()

    def train(self, s, a, r, s_, t, w):
        """
        Updates the network weights using the given minibatch data
        :param s: Tensor[state_dim] Current state
        :param a: Tensor[int] Action taken at state s
        :param r: Tensor[float] Reward received for taking action a at state s
        :param s_: Tensor[state_dim] State resulting from taking action a at state s
        :param t: Tensor[boolean] True if s_ was a terminal state and false otherwise
        :param w: Tensor[float] Importance sampling weights
        """
        a = Value.one_hot(a.tolist(), self.action_dim)
        # train_minibatch returns a (updated, output_map) tuple when outputs are
        # requested; the TD errors live in the output map, not in the update flag.
        _, output_map = self.trainer.train_minibatch(
            {
                self.pre_states: s,
                self.actions: a,
                self.rewards: r,
                self.post_states: s_,
                self.terminals: t,
                self.is_weights: w
            },
            outputs=[self.td_error])
        return next(iter(output_map.values()))

    def update_target(self):
        """
        Update the target network using the online network weights
        """
        self.target_model = self.model.clone(CloneMethod.freeze)

    def checkpoint(self, filename):
        self.trainer.save_checkpoint(filename)

    def save_model(self, filename):
        self.model.save(filename)
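
A hedged sketch of one act/train/update cycle with LearningAgent follows; the state shape, batch size, dummy transition data, and uniform importance-sampling weights are illustrative assumptions.

# Hypothetical usage of LearningAgent (shapes and dummy data are assumptions).
import numpy as np

agent = LearningAgent(state_dim=(1, 84, 84), action_dim=4)

state = np.zeros((1, 84, 84), dtype=np.float32)
action = agent.act(state, epsilon=0.1)

# One minibatch of 32 dummy transitions with uniform importance-sampling weights.
s  = np.zeros((32, 1, 84, 84), dtype=np.float32)
a  = np.random.randint(0, 4, size=32)
r  = np.zeros(32, dtype=np.float32)
s_ = np.zeros((32, 1, 84, 84), dtype=np.float32)
t  = np.zeros(32, dtype=np.float32)
w  = np.ones(32, dtype=np.float32)

td_errors = agent.train(s, a, r, s_, t, w)   # per-sample TD errors for priority updates
agent.update_target()                        # refresh the frozen target network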