# (Inside the minibatch training loop: `index` selects the current batch.)
_t_train_batch = t_train[index]
t_train_batch = np.eye(3)[list(map(int, _t_train_batch))]  # one-hot encode the labels

trainer.train_minibatch({features: x_train_batch, label: t_train_batch})
sample_count = trainer.previous_minibatch_sample_count
aggregate_loss += trainer.previous_minibatch_loss_average * sample_count
last_avg_error = aggregate_loss / trainer.total_number_of_samples_seen

# Evaluate on held-out data.
avg_error = trainer.test_minibatch({features: x_test, label: t_test})
print(' error rate on an unseen minibatch: {}'.format(avg_error))

# Save the model in ONNX format.
output_file_path = R"./cntk_iris_model.onnx"
model.save(output_file_path, format=C.ModelFormat.ONNX)

# Reload the ONNX model and run inference with it.
print("========== START INFERENCE ==========")
reload_model = C.Function.load(output_file_path, device=C.device.gpu(0),
                               format=C.ModelFormat.ONNX)
classifier = C.softmax(reload_model)

for i, train in enumerate(x_train):
    infer = np.argmax(classifier.eval([train]))
    print("[{}] in:{} correct:{} infer:{}".format(i, train, t_train[i], infer))
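To confirm that the exported file is also usable outside of CNTK, a minimal check with onnxruntime could look like the sketch below. This is not part of the original script: it assumes onnxruntime is installed, that the exported graph has a single input and a single output, and that a (1, 4) float32 batch matches the input shape of the Iris model.

# Hedged sketch: verifying the exported ONNX model with onnxruntime.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("./cntk_iris_model.onnx")
input_name = session.get_inputs()[0].name

# One Iris sample; shape and dtype are assumptions about the exported graph.
sample = np.array([[5.1, 3.5, 1.4, 0.2]], dtype=np.float32)
logits = session.run(None, {input_name: sample})[0]
print("predicted class:", int(np.argmax(logits)))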
class DeepQAgent(object):
    """
    Implementation of a Deep Q-Network agent as described in:
    Nature 518, "Human-level control through deep reinforcement learning"
    (Mnih et al., 2015)
    """

    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=ExpEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.0005, momentum=0.95, minibatch_size=128,
                 memory_size=500000, train_after=256, train_interval=2,
                 target_update_interval=10000, monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._memory = ReplayMemory(memory_size, input_shape)
        self._num_actions_taken = 0

        # Metrics accumulators
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action-value model (used by the agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                # Convolution2D((8, 8), 16, strides=4),
                # Convolution2D((4, 4), 32, strides=2),
                # Convolution2D((3, 3), 32, strides=1),
                Dense(128, init=he_uniform()),
                Dense(128, init=he_uniform()),
                Dense(nb_actions, activation=None, init=he_uniform())
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values during training,
        # updated less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing the Q-value targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using the Huber loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the Q-value targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # `actions` is a one-hot encoding of the action taken by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define the training criterion as the Huber loss
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam-based SGD
        lr_schedule = learning_parameter_schedule(learning_rate)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        log_dir = 'metrics/' + datetime.now().strftime('%Y%m%d%H%M%S')
        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir=log_dir, model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

    def act(self, state):
        """
        Select the next action to perform given the current state of the
        environment. This follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns:
            int >= 0: The next action to perform
        """
        # If the policy requires the agent to explore, sample a random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            q_values = self._action_value_net.eval(state)
            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the action maximizing the expected reward
            action = q_values.argmax()

        # Keep track of the number of actions taken
        self._num_actions_taken += 1
        return action

    def observe(self, state, action, reward, new_state, done):
        """
        Let the agent observe the outcome of performing, in the given state,
        the action it selected through act().

        Attributes:
            state (Tensor[input_shape]): Previous environment state
            action (int): Action performed by the agent
            reward (float): Reward received for performing this action in that state
            new_state (Tensor[input_shape]): New environment state
            done (bool): Indicates whether the action terminated the episode
        """
        self._episode_rewards.append(reward)

        # If done, reset short-term memory (i.e. the history)
        if done:
            # Plot the metrics through TensorBoard and reset the buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Append to long-term memory
        self._memory.append(state, action, reward, new_state, done)

    def train(self):
        """
        Train the agent to better model the environment dynamics.
        The agent computes the expected reward for state(t+1) and updates the
        expected reward at step t accordingly. The target expectation is
        computed through the target network, a frozen copy of the action-value
        network that is updated at regular intervals to improve training stability.
""" agent_step = self._num_actions_taken if agent_step >= self._train_after: if (agent_step % self._train_interval) == 0: pre_states, actions, rewards, post_states, terminals = self._memory.minibatch( self._minibatch_size) self._trainer.train_minibatch( self._trainer.loss_function.argument_map( pre_states=pre_states, actions=Value.one_hot( actions.reshape(-1, 1).tolist(), self.nb_actions), post_states=post_states, rewards=rewards, terminals=terminals)) # Update the Target Network if needed if (agent_step % self._target_update_interval) == 0: self._target_net = self._action_value_net.clone( CloneMethod.freeze) def _plot_metrics(self): """Plot current buffers accumulated values to visualize agent learning """ if len(self._episode_q_means) > 0: mean_q = np.asscalar(np.mean(self._episode_q_means)) self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken) if len(self._episode_q_stddev) > 0: std_q = np.asscalar(np.mean(self._episode_q_stddev)) self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken) tot_reward = sum(self._episode_rewards) self._metrics_writer.write_value('Sum rewards per ep.', tot_reward, self._num_actions_taken) self._metrics_writer.write_value('Episode length.', len(self._episode_rewards), self._num_actions_taken) self._metrics_writer.write_value( 'Sum rewards per step.', tot_reward / len(self._episode_rewards), self._num_actions_taken) self._metrics_writer.write_value( 'Exporation rate.', self._explorer._epsilon(self._num_actions_taken), self._num_actions_taken) def save(self, path): self._action_value_net.save(path) def load(self, path): from cntk import load_model self._action_value_net = load_model(path)
class LearningAgent(object):
    def __init__(self, state_dim, action_dim, gamma=0.99,
                 learning_rate=1e-4, momentum=0.95):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma

        with default_options(activation=relu, init=he_uniform()):
            # Convolution filter counts were halved to save memory (no GPU available)
            self.model = Sequential([
                Convolution2D((8, 8), 16, strides=4, name='conv1'),
                Convolution2D((4, 4), 32, strides=2, name='conv2'),
                Convolution2D((3, 3), 32, strides=1, name='conv3'),
                Dense(256, init=he_uniform(scale=0.01), name='dense1'),
                Dense(action_dim, activation=None, init=he_uniform(scale=0.01), name='actions')
            ])
        self.model.update_signature(Tensor[state_dim])

        # Create the target model as a copy of the online model
        self.target_model = None
        self.update_target()

        self.pre_states = input_variable(state_dim, name='pre_states')
        self.actions = input_variable(action_dim, name='actions')
        self.post_states = input_variable(state_dim, name='post_states')
        self.rewards = input_variable((), name='rewards')
        self.terminals = input_variable((), name='terminals')
        self.is_weights = input_variable((), name='is_weights')

        predicted_q = reduce_sum(self.model(self.pre_states) * self.actions, axis=0)

        # DQN - calculate target Q-values
        # post_q = reduce_max(self.target_model(self.post_states), axis=0)

        # DDQN - calculate target Q-values: the online network selects the
        # action, the frozen target network evaluates it
        online_selection = one_hot(argmax(self.model(self.post_states), axis=0),
                                   self.action_dim)
        post_q = reduce_sum(self.target_model(self.post_states) * online_selection, axis=0)

        post_q = (1.0 - self.terminals) * post_q
        target_q = stop_gradient(self.rewards + self.gamma * post_q)

        # Huber loss
        delta = 1.0
        self.td_error = minus(predicted_q, target_q, name='td_error')
        abs_error = abs(self.td_error)
        errors = element_select(less(abs_error, delta),
                                square(self.td_error) * 0.5,
                                delta * (abs_error - 0.5 * delta))
        loss = errors * self.is_weights

        # Adam-based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        self._learner = adam(self.model.parameters, lr_schedule, m_schedule,
                             variance_momentum=vm_schedule)

        self.writer = TensorBoardProgressWriter(log_dir='metrics', model=self.model)
        self.trainer = Trainer(self.model, (loss, None), [self._learner], self.writer)

    def act(self, state, epsilon):
        """
        Selects an action to take based on the epsilon-greedy method

        :param state: The current state
        :param epsilon: Determines the amount of exploration
            (1 = full exploration, 0 = no exploration)
        """
        if np.random.rand() < epsilon:
            # Explore (random action)
            return np.random.choice(self.action_dim)
        else:
            # Exploit (greedy action based on current knowledge)
            return self.model.eval(state).argmax()

    def train(self, s, a, r, s_, t, w):
        """
        Updates the network weights using the given minibatch data

        :param s: Tensor[state_dim] Current state
        :param a: Tensor[int] Action taken at state s
        :param r: Tensor[float] Reward received for taking action a at state s
        :param s_: Tensor[state_dim] State resulting from taking action a at state s
        :param t: Tensor[boolean] True if s_ was a terminal state, false otherwise
        :param w: Tensor[float] Importance sampling weights
        """
        a = Value.one_hot(a.tolist(), self.action_dim)
        td_error = self.trainer.train_minibatch(
            {
                self.pre_states: s,
                self.actions: a,
                self.rewards: r,
                self.post_states: s_,
                self.terminals: t,
                self.is_weights: w
            },
            outputs=[self.td_error])
        return td_error[0]

    def update_target(self):
        """
        Update the target network using the online network weights
        """
        self.target_model = self.model.clone(CloneMethod.freeze)

    def checkpoint(self, filename):
        self.trainer.save_checkpoint(filename)

    def save_model(self, filename):
        self.model.save(filename)
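The DDQN branch in __init__ differs from plain DQN in that the online network chooses the next action while the frozen target network evaluates it, which reduces the overestimation bias of max-based targets. A plain-NumPy restatement of that target, with illustrative names that are not part of the class, might look like this:

# Hedged sketch of the Double-DQN target used by LearningAgent above.
import numpy as np

def ddqn_targets(online_post_q, target_post_q, rewards, terminals, gamma=0.99):
    """online_post_q / target_post_q: (batch, action_dim) Q-values of the online
    and target networks for the post-states; rewards and terminals are (batch,) arrays."""
    best_actions = online_post_q.argmax(axis=1)   # the online network picks the action
    chosen_q = target_post_q[np.arange(len(best_actions)), best_actions]  # the target network scores it
    chosen_q = (1.0 - terminals) * chosen_q       # zero out terminal transitions
    return rewards + gamma * chosen_q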