def build_trainer(self):
    # Set the learning rate and the momentum parameters for the Adam optimizer.
    lr = learning_rate_schedule(self.lr, UnitType.minibatch)
    beta1 = momentum_schedule(0.9)
    beta2 = momentum_schedule(0.99)

    # Calculate the losses.
    loss_on_v = cntk.squared_error(self.R, self.v)

    pi_a_s = cntk.log(cntk.times_transpose(self.pi, self.action))
    loss_on_pi = cntk.variables.Constant(-1) * (cntk.plus(
        cntk.times(pi_a_s, cntk.minus(self.R, self.v_calc)),
        0.01 * cntk.times_transpose(self.pi, cntk.log(self.pi))))
    # loss_on_pi = cntk.times(pi_a_s, cntk.minus(self.R, self.v_calc))

    self.tensorboard_v_writer = TensorBoardProgressWriter(
        freq=10, log_dir="tensorboard_v_logs", model=self.v)
    self.tensorboard_pi_writer = TensorBoardProgressWriter(
        freq=10, log_dir="tensorboard_pi_logs", model=self.pi)
    # tensorboard --logdir=tensorboard_pi_logs http://localhost:6006/
    # tensorboard --logdir=tensorboard_v_logs http://localhost:6006/

    # Create the trainers.
    self.trainer_v = cntk.Trainer(self.v, (loss_on_v), [
        adam(self.pms_v, lr, beta1, variance_momentum=beta2,
             gradient_clipping_threshold_per_sample=2,
             l2_regularization_weight=0.01)
    ], self.tensorboard_v_writer)
    self.trainer_pi = cntk.Trainer(self.pi, (loss_on_pi), [
        adam(self.pms_pi, lr, beta1, variance_momentum=beta2,
             gradient_clipping_threshold_per_sample=2,
             l2_regularization_weight=0.01)
    ], self.tensorboard_pi_writer)
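build_trainer above only constructs the critic and actor trainers; a minimal sketch of how they might be fed on each update is shown below. It assumes the class also exposes a self.state input variable for observations, that self.v_calc is fed as an input holding the detached value estimates, and that states, actions, returns, and baselines are NumPy batches prepared by the caller. Only Trainer.train_minibatch is standard CNTK API here; the rest is illustrative.

    # Hypothetical update step for the critic and actor trainers built above.
    def train_minibatch(self, states, actions, returns, baselines):
        # Critic update: regress v(state) onto the observed discounted returns.
        self.trainer_v.train_minibatch({self.state: states, self.R: returns})
        # Actor update: the policy-gradient loss needs the taken actions, the
        # returns, and the detached value estimates used as a baseline.
        self.trainer_pi.train_minibatch({
            self.state: states,
            self.action: actions,
            self.R: returns,
            self.v_calc: baselines,
        })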
def convnet_cifar10(train_source, test_source, epoch_size,
                    num_convolution_layers=2, minibatch_size=64, max_epochs=30,
                    log_file=None, tboard_log_dir='.', results_path=_MODEL_PATH):
    _cntk_py.set_computation_network_trace_level(0)

    logger.info("""Running network with:
                {num_convolution_layers} convolution layers
                {minibatch_size} minibatch size
                for {max_epochs} epochs""".format(
        num_convolution_layers=num_convolution_layers,
        minibatch_size=minibatch_size,
        max_epochs=max_epochs))

    network = create_network(num_convolution_layers)

    progress_printer = ProgressPrinter(
        tag='Training',
        log_to_file=log_file,
        rank=cntk.Communicator.rank(),
        num_epochs=max_epochs)
    tensorboard_writer = TensorBoardProgressWriter(
        freq=10, log_dir=tboard_log_dir, model=network['output'])

    trainer = create_trainer(network, minibatch_size, epoch_size,
                             [progress_printer, tensorboard_writer])
    cv_config = CrossValidationConfig(
        minibatch_source=test_source,
        minibatch_size=16,
        callback=create_results_callback(
            os.path.join(results_path, "model_results.json"),
            num_convolution_layers=num_convolution_layers,
            minibatch_size=minibatch_size,
            max_epochs=max_epochs))
    train_and_test(network, trainer, train_source, test_source, minibatch_size,
                   epoch_size, restore=False, cv_config=cv_config)
    network['output'].save(os.path.join(results_path, _MODEL_NAME))
class DeepQAgent(object): """ Implementation of Deep Q Neural Network agent like in: Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015) """ def __init__(self, input_shape, nb_actions, gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000), learning_rate=0.00025, momentum=0.95, minibatch_size=32, memory_size=500000, train_after=10000, train_interval=4, target_update_interval=10000, monitor=True): self.input_shape = input_shape self.nb_actions = nb_actions self.gamma = gamma self._train_after = train_after self._train_interval = train_interval self._target_update_interval = target_update_interval self._explorer = explorer self._minibatch_size = minibatch_size self._history = History(input_shape) self._memory = ReplayMemory(memory_size, input_shape[1:], 4) self._num_actions_taken = 0 # Metrics accumulator self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], [] # Action Value model (used by agent to interact with the environment) with default_options(activation=relu, init=he_uniform()): self._action_value_net = Sequential([ Convolution2D((8, 8), 16, strides=4), Convolution2D((4, 4), 32, strides=2), Convolution2D((3, 3), 32, strides=1), Dense(256, init=he_uniform(scale=0.01)), Dense(nb_actions, activation=None, init=he_uniform(scale=0.01)) ]) self._action_value_net.update_signature(Tensor[input_shape]) # Target model used to compute the target Q-values in training, updated # less frequently for increased stability. self._target_net = self._action_value_net.clone(CloneMethod.freeze) # Function computing Q-values targets as part of the computation graph @Function @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()]) def compute_q_targets(post_states, rewards, terminals): return element_select( terminals, rewards, gamma * reduce_max(self._target_net(post_states), axis=0) + rewards, ) # Define the loss, using Huber Loss (more robust to outliers) @Function @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions], post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()]) def criterion(pre_states, actions, post_states, rewards, terminals): # Compute the q_targets q_targets = compute_q_targets(post_states, rewards, terminals) # actions is a 1-hot encoding of the action done by the agent q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0) # Define training criterion as the Huber Loss function return huber_loss(q_targets, q_acted, 1.0) # Adam based SGD lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch) m_schedule = momentum_schedule(momentum) vm_schedule = momentum_schedule(0.999) l_sgd = adam(self._action_value_net.parameters, lr_schedule, momentum=m_schedule, variance_momentum=vm_schedule) self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None self._learner = l_sgd self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer) def act(self, state): """ This allows the agent to select the next action to perform in regard of the current state of the environment. It follows the terminology used in the Nature paper. Attributes: state (Tensor[input_shape]): The current environment state Returns: Int >= 0 : Next action to do """ # Append the state to the short term memory (ie. 
History) self._history.append(state) # If policy requires agent to explore, sample random action if self._explorer.is_exploring(self._num_actions_taken): action = self._explorer(self.nb_actions) else: # Use the network to output the best action env_with_history = self._history.value q_values = self._action_value_net.eval( # Append batch axis with only one sample to evaluate env_with_history.reshape((1,) + env_with_history.shape) ) self._episode_q_means.append(np.mean(q_values)) self._episode_q_stddev.append(np.std(q_values)) # Return the value maximizing the expected reward action = q_values.argmax() # Keep track of interval action counter self._num_actions_taken += 1 return action def observe(self, old_state, action, reward, done): """ This allows the agent to observe the output of doing the action it selected through act() on the old_state Attributes: old_state (Tensor[input_shape]): Previous environment state action (int): Action done by the agent reward (float): Reward for doing this action in the old_state environment done (bool): Indicate if the action has terminated the environment """ self._episode_rewards.append(reward) # If done, reset short term memory (ie. History) if done: # Plot the metrics through Tensorboard and reset buffers if self._metrics_writer is not None: self._plot_metrics() self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], [] # Reset the short term memory self._history.reset() # Append to long term memory self._memory.append(old_state, action, reward, done) def train(self): """ This allows the agent to train itself to better understand the environment dynamics. The agent will compute the expected reward for the state(t+1) and update the expected reward at step t according to this. The target expectation is computed through the Target Network, which is a more stable version of the Action Value Network for increasing training stability. The Target Network is a frozen copy of the Action Value Network updated as regular intervals. """ agent_step = self._num_actions_taken if agent_step >= self._train_after: if (agent_step % self._train_interval) == 0: pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size) self._trainer.train_minibatch( self._trainer.loss_function.argument_map( pre_states=pre_states, actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions), post_states=post_states, rewards=rewards, terminals=terminals ) ) # Update the Target Network if needed if (agent_step % self._target_update_interval) == 0: self._target_net = self._action_value_net.clone(CloneMethod.freeze) filename = "models\model%d" % agent_step self._trainer.save_checkpoint(filename) def _plot_metrics(self): """Plot current buffers accumulated values to visualize agent learning """ if len(self._episode_q_means) > 0: mean_q = np.asscalar(np.mean(self._episode_q_means)) self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken) if len(self._episode_q_stddev) > 0: std_q = np.asscalar(np.mean(self._episode_q_stddev)) self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken) self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken)
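A minimal driving loop for the DeepQAgent above, assuming a Gym-style environment whose observations already match the configured input_shape; env, max_episodes, and the (4, 84, 84) stacked-frame shape are illustrative and not part of the original code.

# Illustrative episode loop (not from the original source).
agent = DeepQAgent(input_shape=(4, 84, 84), nb_actions=env.action_space.n)

for episode in range(max_episodes):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state)                      # epsilon-greedy over Q-values
        new_state, reward, done, _ = env.step(action)  # Gym-style step (assumed)
        agent.observe(state, action, reward, done)     # store transition / flush metrics
        agent.train()                                  # trains every train_interval steps
        state = new_state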
def simple_mnist(tensorboard_logdir=None):
    input_dim = 19
    num_output_classes = 2
    num_hidden_layers = 2
    hidden_layers_dim = 1024

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim, np.float32)
    label = C.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    # scaled_input = element_times(constant(0.00390625), feature)
    z = Sequential([For(range(num_hidden_layers),
                        lambda i: Dense(hidden_layers_dim, activation=relu)),
                    Dense(num_output_classes)])(feature)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = r"."
    path = os.path.normpath(os.path.join(data_dir, "train.ctf"))
    check_path(path)

    reader_train = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 512
    num_samples_per_sweep = 1825000
    num_sweeps_to_train_with = 100

    # Instantiate progress writers.
    progress_writers = [ProgressPrinter(
        tag='Training',
        num_epochs=num_sweeps_to_train_with)]

    tensorboard_writer = None
    if tensorboard_logdir is not None:
        tensorboard_writer = TensorBoardProgressWriter(
            freq=10, log_dir=tensorboard_logdir, model=z)
        progress_writers.append(tensorboard_writer)

    # Instantiate the trainer object to drive the model training
    lr = learning_parameter_schedule_per_sample(0.001)
    learner = create_learner(model=z)
    trainer = Trainer(z, (ce, pe), learner, progress_writers)

    num_minibatches_to_train = int(num_samples_per_sweep / minibatch_size * num_sweeps_to_train_with)

    model_dir = "model"
    for i in range(num_minibatches_to_train):
        mb = reader_train.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(mb)

        freq = int(num_samples_per_sweep / minibatch_size)
        if i > 0 and i % freq == 0:
            # Checkpoint once per sweep; tag the file with the sweep (epoch) index.
            timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
            current_trainer_cp = os.path.join(
                model_dir, timestamp + "_epoch_" + str(i // freq) + ".trainer")
            trainer.save_checkpoint(current_trainer_cp)

            train_error = get_error_rate(os.path.join(data_dir, "train_subset.ctf"),
                                         input_map, input_dim, num_output_classes, trainer)
            valid_error = get_error_rate(os.path.join(data_dir, "validation.ctf"),
                                         input_map, input_dim, num_output_classes, trainer)
            if tensorboard_writer is not None:
                if train_error > 0:
                    tensorboard_writer.write_value("train_error", train_error, i)
                if valid_error > 0:
                    tensorboard_writer.write_value("valid_error", valid_error, i)

    feat_path = os.path.normpath(os.path.join(data_dir, "test.ctf"))
    return get_error_rate(feat_path, input_map, input_dim, num_output_classes, trainer)
def simple_mnist(tensorboard_logdir=None): input_dim = 784 num_output_classes = 10 num_hidden_layers = 2 hidden_layers_dim = 200 # Input variables denoting the features and label data feature = C.input_variable(input_dim, np.float32) label = C.input_variable(num_output_classes, np.float32) # Instantiate the feedforward classification model scaled_input = element_times(constant(0.00390625), feature) z = Sequential([ For(range(num_hidden_layers), lambda i: Dense(hidden_layers_dim, activation=relu)), Dense(num_output_classes) ])(scaled_input) ce = cross_entropy_with_softmax(z, label) pe = classification_error(z, label) data_dir = os.path.dirname(os.path.abspath(__file__)) path = os.path.join(data_dir, 'Train-28x28_cntk_text.txt') reader_train = create_reader(path, True, input_dim, num_output_classes) input_map = { feature: reader_train.streams.features, label: reader_train.streams.labels } # Training config minibatch_size = 64 num_samples_per_sweep = 60000 num_sweeps_to_train_with = 10 # Instantiate progress writers. # training_progress_output_freq = 100 progress_writers = [ ProgressPrinter( # freq=training_progress_output_freq, tag='Training', num_epochs=num_sweeps_to_train_with) ] if tensorboard_logdir is not None: progress_writers.append( TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir, model=z)) # Instantiate the trainer object to drive the model training lr = 0.001 trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr), progress_writers) training_session(trainer=trainer, mb_source=reader_train, mb_size=minibatch_size, model_inputs_to_streams=input_map, max_samples=num_samples_per_sweep * num_sweeps_to_train_with, progress_frequency=num_samples_per_sweep).train() # Load test data path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt")) check_path(path) reader_test = create_reader(path, False, input_dim, num_output_classes) input_map = { feature: reader_test.streams.features, label: reader_test.streams.labels } # Test data for trained model C.debugging.start_profiler() C.debugging.enable_profiler() C.debugging.set_node_timing(True) test_minibatch_size = 1024 num_samples = 10000 num_minibatches_to_test = num_samples / test_minibatch_size test_result = 0.0 for i in range(0, int(num_minibatches_to_test)): mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map) eval_error = trainer.test_minibatch(mb) test_result = test_result + eval_error C.debugging.stop_profiler() trainer.print_node_timing() # Average of evaluation errors of all test minibatches return test_result * 100 / num_minibatches_to_test
def train_test(train_reader, test_reader, model_func, x, y, learning_rate,
               minibatch_size, num_sweeps_to_train_with=10, tensorboard_logdir=None):

    # Instantiate the model function; x is the input (feature) variable
    model = model_func(x)

    # Instantiate the TensorBoard writer
    tensorboard_writer = None
    if tensorboard_logdir is not None:
        tensorboard_writer = TensorBoardProgressWriter(
            freq=10, log_dir=tensorboard_logdir, model=model)

    # Instantiate the loss and error function
    loss, label_error = create_criterion_function(model, y)

    # Instantiate the trainer object to drive the model training
    # learning_rate = 0.2
    lr_schedule = C.learning_parameter_schedule(learning_rate)
    learner = C.sgd(model.parameters, lr_schedule)
    trainer = C.Trainer(model, (loss, label_error), [learner],
                        progress_writers=tensorboard_writer)

    # Initialize the parameters for the trainer
    # minibatch_size = 64
    num_samples_per_sweep = 60000
    num_minibatches_to_train = (num_samples_per_sweep * num_sweeps_to_train_with) / minibatch_size

    # Map the data streams to the input and labels.
    input_map = {
        y: train_reader.streams.labels,
        x: train_reader.streams.features
    }

    # Logging frequency (in minibatches)
    training_progress_output_freq = 500

    # Start a timer
    start = time.time()

    for i in range(0, int(num_minibatches_to_train)):
        # Read a minibatch from the training data file
        data = train_reader.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(data)
        print_training_progress(trainer, i, training_progress_output_freq, verbose=1)

    # Print training time
    print("Training took {:.1f} sec".format(time.time() - start))

    # Test the model
    test_input_map = {
        y: test_reader.streams.labels,
        x: test_reader.streams.features
    }

    # Test data for trained model
    test_minibatch_size = 512
    num_samples = 10000
    num_minibatches_to_test = num_samples // test_minibatch_size

    test_result = 0.0
    for i in range(num_minibatches_to_test):
        # Each data point in the minibatch is a 784-dimensional MNIST digit image
        # (one pixel per dimension) evaluated with the trained model.
        data = test_reader.next_minibatch(test_minibatch_size, input_map=test_input_map)
        eval_error = trainer.test_minibatch(data)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    print("Average test error: {0:.2f}%".format(test_result * 100 / num_minibatches_to_test))
def __init__(self, state_dim, action_dim, gamma=0.99, learning_rate=1e-4, momentum=0.95): self.state_dim = state_dim self.action_dim = action_dim self.gamma = gamma with default_options(activation=relu, init=he_uniform()): # Convolution filter counts were halved to save on memory, no gpu :( self.model = Sequential([ Convolution2D((8, 8), 16, strides=4, name='conv1'), Convolution2D((4, 4), 32, strides=2, name='conv2'), Convolution2D((3, 3), 32, strides=1, name='conv3'), Dense(256, init=he_uniform(scale=0.01), name='dense1'), Dense(action_dim, activation=None, init=he_uniform(scale=0.01), name='actions') ]) self.model.update_signature(Tensor[state_dim]) # Create the target model as a copy of the online model self.target_model = None self.update_target() self.pre_states = input_variable(state_dim, name='pre_states') self.actions = input_variable(action_dim, name='actions') self.post_states = input_variable(state_dim, name='post_states') self.rewards = input_variable((), name='rewards') self.terminals = input_variable((), name='terminals') self.is_weights = input_variable((), name='is_weights') predicted_q = reduce_sum(self.model(self.pre_states) * self.actions, axis=0) # DQN - calculate target q values # post_q = reduce_max(self.target_model(self.post_states), axis=0) # DDQN - calculate target q values online_selection = one_hot( argmax(self.model(self.post_states), axis=0), self.action_dim) post_q = reduce_sum(self.target_model(self.post_states) * online_selection, axis=0) post_q = (1.0 - self.terminals) * post_q target_q = stop_gradient(self.rewards + self.gamma * post_q) # Huber loss delta = 1.0 self.td_error = minus(predicted_q, target_q, name='td_error') abs_error = abs(self.td_error) errors = element_select(less(abs_error, delta), square(self.td_error) * 0.5, delta * (abs_error - 0.5 * delta)) loss = errors * self.is_weights # Adam based SGD lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch) m_scheule = momentum_schedule(momentum) vm_schedule = momentum_schedule(0.999) self._learner = adam(self.model.parameters, lr_schedule, m_scheule, variance_momentum=vm_schedule) self.writer = TensorBoardProgressWriter(log_dir='metrics', model=self.model) self.trainer = Trainer(self.model, (loss, None), [self._learner], self.writer)
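The constructor above calls self.update_target() before that method appears anywhere in the snippet. A minimal sketch of what it would look like, assuming it mirrors the frozen-clone pattern used by the other DQN agents in this section (CloneMethod.freeze turns the copied parameters into constants, so no gradients flow through the target network):

    def update_target(self):
        # Replace the target network with a frozen copy of the online model.
        self.target_model = self.model.clone(CloneMethod.freeze)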
class DeepQAgent(object): """ Implementation of Deep Q Neural Network agent like in: Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015) """ def __init__(self, input_shape, nb_actions, gamma=0.95, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 100000), learning_rate=0.01, momentum=0.8, minibatch_size=16, memory_size=15000, train_after=100, train_interval=100, target_update_interval=500, monitor=True): self.input_shape = input_shape self.nb_actions = nb_actions self.gamma = gamma self._train_after = train_after self._train_interval = train_interval self._target_update_interval = target_update_interval self._explorer = explorer self._minibatch_size = minibatch_size self._history = History(input_shape) self._memory = ReplayMemory(memory_size, input_shape[1:], 4) self._num_actions_taken = 0 self._num_trains = 0 # Metrics accumulator self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], [] ''' # Action Value model (used by agent to interact with the environment) with default_options(activation=relu, init=he_uniform()): self._action_value_net = Sequential([ Convolution2D((8, 8), 16, strides=4), Convolution2D((4, 4), 32, strides=2), Convolution2D((3, 3), 32, strides=1), Dense(256, init=he_uniform(scale=0.01)), Dense(nb_actions, activation=None, init=he_uniform(scale=0.01)) ]) ''' with default_options(activation=relu, init=he_uniform()): self._action_value_net = Sequential([ Dense(7, init=he_uniform(scale=0.01)), Dense(8, init=he_uniform(scale=0.01)), #Dense(16, init=he_uniform(scale=0.01)), #Dense(32, init=he_uniform(scale=0.01)), Dense(nb_actions, activation=None, init=he_uniform(scale=0.01)) ]) self._action_value_net.update_signature(Tensor[input_shape]) # Target model used to compute the target Q-values in training, updated # less frequently for increased stability. 
self._target_net = self._action_value_net.clone(CloneMethod.freeze) # Function computing Q-values targets as part of the computation graph @Function @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()]) def compute_q_targets(post_states, rewards, terminals): return element_select( terminals, rewards, gamma * reduce_max(self._target_net(post_states), axis=0) + rewards, ) # Define the loss, using Huber Loss (more robust to outliers) @Function @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions], post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()]) def criterion(pre_states, actions, post_states, rewards, terminals): # Compute the q_targets q_targets = compute_q_targets(post_states, rewards, terminals) # actions is a 1-hot encoding of the action done by the agent q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0) # Define training criterion as the Huber Loss function return huber_loss(q_targets, q_acted, 1.0) # Adam based SGD lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch) m_schedule = momentum_schedule(momentum) vm_schedule = momentum_schedule(0.999) l_sgd = adam(self._action_value_net.parameters, lr_schedule, momentum=m_schedule, variance_momentum=vm_schedule) self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None self._learner = l_sgd self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer) #self._trainer.restore_from_checkpoint('models/oldmodels/model800000') def act(self, state): """ This allows the agent to select the next action to perform in regard of the current state of the environment. It follows the terminology used in the Nature paper. Attributes: state (Tensor[input_shape]): The current environment state Returns: Int >= 0 : Next action to do """ # Append the state to the short term memory (ie. History) self._history.append(state) # If policy requires agent to explore, sample random action if self._explorer.is_exploring(self._num_actions_taken): action = self._explorer(self.nb_actions) else: # Use the network to output the best action env_with_history = self._history.value q_values = self._action_value_net.eval( # Append batch axis with only one sample to evaluate env_with_history.reshape((1,) + env_with_history.shape) ) self._episode_q_means.append(np.mean(q_values)) self._episode_q_stddev.append(np.std(q_values)) # Return the value maximizing the expected reward action = q_values.argmax() # Keep track of interval action counter self._num_actions_taken += 1 #print(self._num_actions_taken) return action def observe(self, old_state, action, reward, done): """ This allows the agent to observe the output of doing the action it selected through act() on the old_state Attributes: old_state (Tensor[input_shape]): Previous environment state action (int): Action done by the agent reward (float): Reward for doing this action in the old_state environment done (bool): Indicate if the action has terminated the environment """ self._episode_rewards.append(reward) # If done, reset short term memory (ie. 
History) if done: # Plot the metrics through Tensorboard and reset buffers if self._metrics_writer is not None: self._plot_metrics() self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], [] # Reset the short term memory self._history.reset() # Append to long term memory self._memory.append(old_state, action, reward, done) def train(self): """ This allows the agent to train itself to better understand the environment dynamics. The agent will compute the expected reward for the state(t+1) and update the expected reward at step t according to this. The target expectation is computed through the Target Network, which is a more stable version of the Action Value Network for increasing training stability. The Target Network is a frozen copy of the Action Value Network updated as regular intervals. """ agent_step = self._num_actions_taken if agent_step >= self._train_after: #if (agent_step % self._train_interval) == 0: print('\nTraining minibatch\n') client.setCarControls(zero_controls) pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size) self._trainer.train_minibatch( self._trainer.loss_function.argument_map( pre_states=pre_states, actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions), post_states=post_states, rewards=rewards, terminals=terminals ) ) self._num_trains += 1 # Update the Target Network if needed if self._num_trains % 20 == 0: print('updating network') self._target_net = self._action_value_net.clone(CloneMethod.freeze) filename = dirname+"\model%d" % agent_step self._trainer.save_checkpoint(filename) def _plot_metrics(self): """Plot current buffers accumulated values to visualize agent learning """ if len(self._episode_q_means) > 0: mean_q = np.asscalar(np.mean(self._episode_q_means)) self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken) if len(self._episode_q_stddev) > 0: std_q = np.asscalar(np.mean(self._episode_q_stddev)) self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken) self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken)
def __init__(self, state_shape, action_count, model_func, vmin, vmax, n, gamma=0.99, lr=0.00025, mm=0.95, use_tensorboard=False): """ Creates a new agent that learns using Categorical DQN as described in "A Distributional Perspective on Reinforcement Learning" :param state_shape: The shape of each input shape e.g. (4 x 84 x 84) for Atari :param action_count: The number of actions e.g. 14 :param model_func: The model to train :param vmin: Minimum value of return distribution :param vmax: Maximum value of return distribution :param n: Number of support atoms :param gamma: Discount factor for Bellman update :param lr: The learning rate for Adam SGD :param mm: The momentum for Adam SGD """ self.state_shape = state_shape self.action_count = action_count self.gamma = gamma self.learning_rate = lr self.momentum = mm # Distribution parameters self.vmin = vmin self.vmax = vmax self.n = n self.dz = (vmax - vmin) / (n - 1) # Support atoms self.z = np.linspace(vmin, vmax, n, dtype=np.float32) # Model input and output self.state_var = C.input_variable(self.state_shape, name='state') self.action_return_dist = C.input_variable((self.action_count, n), name='ar_dist') # Model output assigns a probability to each support atom for each action self.raw = model_func(self.state_var) self.model = C.softmax(self.raw, axis=1) # Adam-based SGD with cross-entropy loss loss = C.cross_entropy_with_softmax(self.raw, self.action_return_dist, axis=1, name='loss') lr_schedule = C.learning_rate_schedule(self.learning_rate, C.UnitType.sample) mom_schedule = C.momentum_schedule(self.momentum) vm_schedule = C.momentum_schedule(0.999) learner = C.adam(self.raw.parameters, lr_schedule, mom_schedule, variance_momentum=vm_schedule) if use_tensorboard: self.writer = TensorBoardProgressWriter(log_dir='metrics', model=self.model) else: self.writer = None self.trainer = C.Trainer(self.raw, (loss, None), [learner], self.writer) # Create target network as copy of online network self.target_model = None self.update_target()
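For completeness, greedy action selection for the categorical (C51-style) agent above reduces each per-action return distribution to its expectation over the support atoms and takes the argmax. The method below is an illustrative sketch, not part of the original snippet; it assumes numpy is available as np, as in the constructor.

    def act(self, state):
        # dist has shape (action_count, n): one probability vector per action.
        dist = self.model.eval({self.state_var: [state]})[0]
        q_values = (dist * self.z).sum(axis=1)   # expected return per action
        return int(np.argmax(q_values))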
class DQAgent(object): """docstring for DQAgent""" ############should modify @!@! def __init__(self, input_shape, nb_actions, gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000), learning_rate=0.00025, momentum=0.95, minibatch_size=32, memory_size=500000, train_after=10000, train_interval=4, target_update_interval=10000, monitor=True): self.input_shape = input_shape self.nb_actions = nb_actions self.gamma = gamma self._train_after = train_after self._train_interval = train_interval self._target_update_interval = target_update_interval self._explorer = explorer self._minibatch_size = minibatch_size self._history = History(input_shape) self._memory = RepMem(memory_size, input_shape[1:], 4) self._num_actions_taken = 0 self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], [] with default_options(activation=relu, init=he_uniform()): self._action_value_net = Sequential([ Dense(input_shape, init=he_uniform(scale=0.01)), Dense(input_shape), Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))]) self._action_value_net.update_signature(Tensor[input_shape]) self._target_net = self._action_value_net.clone(CloneMethod.freeze) @Function @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()]) def compute_q_targets(post_states, rewards, terminals): return element_select( terminals, rewards, gamma * reduce_max(self._target_net(post_states), axis=0) + rewards, ) @Function @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions], post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()]) def criterion(pre_states, actions, post_states, rewards, terminals): q_targets = compute_q_targets(post_states, rewards, terminals) q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0) return huber_loss(q_targets, q_acted, 1.0) lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch) m_schedule = momentum_schedule(momentum) vm_schedule = momentum_schedule(0.999) l_sgd = adam(self._action_value_net.parameters, lr_schedule, momentum=m_schedule, variance_momentum=vm_schedule) self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None self._learner = l_sgd self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer) def act(self, state): self._history.append(state) if self._explorer.is_exploring(self._num_actions_taken): action = self._explorer(self.nb_actions) else: env_with_history = self._history.value q_values = self._action_value_net.eval( env_with_history.reshape((1,) + env_with_history.shape) ) self._episode_q_means.append(np.mean(q_values)) self._episode_q_stddev.append(np.std(q_values)) action = q_values.argmax() self._num_actions_taken += 1 return action def observe(self, old_state, action, reward, done): self._episode_rewards.append(reward) if done: if self._metrics_writer is not None: self._plot_metrics() self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], [] self._history.reset() self._memory.append(old_state, action, reward, done) def train(self): agent_step = self._num_actions_taken if agent_step >= self._train_after: if (agent_step % self._train_interval) == 0: pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size) self._trainer.train_minibatch( self._trainer.loss_function.argument_map( pre_states=pre_states, actions=Value.one_hot(actions.reshape(-1,1).tolist(), self.nb_actions), post_states=post_states, rewards=rewards, 
terminals=terminals ) ) if (agent_step % self._target_update_interval) == 0: self._target_net = self._action_value_net.clone(CloneMethod.freeze) filename = "model\model%d" % agent_step # save ???? not good at using %d self._trainer.save_checkpoint(filename) def _plot_metrics(self): if len(self._episode_q_means) > 0: mean_q = np.asscalar(np.mean(self._episode_q_means)) self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken) if len(self._episode_q_stddev) > 0: std_q = np.asscalar(np.mean(self._episode_q_stddev)) self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken) self._metrics_writer.write_value('Sum rewards per ep', sum(self._episode_rewards), self._num_actions_taken)
class DeepQAgent(object): """ Implementation of Deep Q Neural Network agent like in: Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015) """ def __init__(self, input_shape, nb_actions, gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000), learning_rate=0.00025, momentum=0.95, minibatch_size=32, memory_size=500000, train_after=200000, train_interval=4, target_update_interval=10000, monitor=True): self.input_shape = input_shape self.nb_actions = nb_actions self.gamma = gamma self._train_after = train_after self._train_interval = train_interval self._target_update_interval = target_update_interval self._explorer = explorer self._minibatch_size = minibatch_size self._history = History(input_shape) self._memory = ReplayMemory(memory_size, input_shape[1:], 4) self._num_actions_taken = 0 # Metrics accumulator self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], [] # Action Value model (used by agent to interact with the environment) with default_options(activation=relu, init=he_uniform()): self._action_value_net = Sequential([ Convolution2D((8, 8), 16, strides=4), Convolution2D((4, 4), 32, strides=2), Convolution2D((3, 3), 32, strides=1), Dense(256, init=he_uniform(scale=0.01)), Dense(nb_actions, activation=None, init=he_uniform(scale=0.01)) ]) self._action_value_net.update_signature(Tensor[input_shape]) # Target model used to compute the target Q-values in training, updated # less frequently for increased stability. self._target_net = self._action_value_net.clone(CloneMethod.freeze) # Function computing Q-values targets as part of the computation graph @Function @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()]) def compute_q_targets(post_states, rewards, terminals): return element_select( terminals, rewards, gamma * reduce_max(self._target_net(post_states), axis=0) + rewards, ) # Define the loss, using Huber Loss (more robust to outliers) @Function @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions], post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()]) def criterion(pre_states, actions, post_states, rewards, terminals): # Compute the q_targets q_targets = compute_q_targets(post_states, rewards, terminals) # actions is a 1-hot encoding of the action done by the agent q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0) # Define training criterion as the Huber Loss function return huber_loss(q_targets, q_acted, 1.0) # Adam based SGD lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch) m_schedule = momentum_schedule(momentum) vm_schedule = momentum_schedule(0.999) l_sgd = adam(self._action_value_net.parameters, lr_schedule, momentum=m_schedule, variance_momentum=vm_schedule) self._metrics_writer = TensorBoardProgressWriter( freq=1, log_dir='metrics', model=criterion) if monitor else None self._learner = l_sgd self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer) def load(self, model_path): self._trainer.restore_from_checkpoint(model_path) def act(self, state, eval=False): """ This allows the agent to select the next action to perform in regard of the current state of the environment. It follows the terminology used in the Nature paper. Attributes: state (Tensor[input_shape]): The current environment state Returns: Int >= 0 : Next action to do """ # Append the state to the short term memory (ie. 
History) self._history.append(state) # If policy requires agent to explore, sample random action if self._explorer.is_exploring(self._num_actions_taken) and not eval: action = self._explorer(self.nb_actions) q_values = None else: # Use the network to output the best action env_with_history = self._history.value q_values = self._action_value_net.eval( # Append batch axis with only one sample to evaluate env_with_history.reshape((1, ) + env_with_history.shape)) self._episode_q_means.append(np.mean(q_values)) self._episode_q_stddev.append(np.std(q_values)) # Return the value maximizing the expected reward action = q_values.argmax() # Keep track of interval action counter self._num_actions_taken += 1 return action, q_values def observe(self, old_state, action, reward, done): """ This allows the agent to observe the output of doing the action it selected through act() on the old_state Attributes: old_state (Tensor[input_shape]): Previous environment state action (int): Action done by the agent reward (float): Reward for doing this action in the old_state environment done (bool): Indicate if the action has terminated the environment """ self._episode_rewards.append(reward) # If done, reset short term memory (ie. History) if done: # Plot the metrics through Tensorboard and reset buffers if self._metrics_writer is not None: self._plot_metrics() self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], [] # Reset the short term memory self._history.reset() # Append to long term memory self._memory.append(old_state, action, reward, done) def train(self, checkpoint_dir): """ This allows the agent to train itself to better understand the environment dynamics. The agent will compute the expected reward for the state(t+1) and update the expected reward at step t according to this. The target expectation is computed through the Target Network, which is a more stable version of the Action Value Network for increasing training stability. The Target Network is a frozen copy of the Action Value Network updated as regular intervals. """ agent_step = self._num_actions_taken if agent_step >= self._train_after: if (agent_step % self._train_interval) == 0: #print('training... 
number of steps: {}'.format(agent_step)) pre_states, actions, post_states, rewards, terminals = self._memory.minibatch( self._minibatch_size) self._trainer.train_minibatch( self._trainer.loss_function.argument_map( pre_states=pre_states, actions=Value.one_hot( actions.reshape(-1, 1).tolist(), self.nb_actions), post_states=post_states, rewards=rewards, terminals=terminals)) # Update the Target Network if needed if (agent_step % self._target_update_interval) == 0: self._target_net = self._action_value_net.clone( CloneMethod.freeze) filename = os.path.join(checkpoint_dir, "models\model%d" % agent_step) self._trainer.save_checkpoint(filename) def _plot_metrics(self): """Plot current buffers accumulated values to visualize agent learning """ if len(self._episode_q_means) > 0: mean_q = np.asscalar(np.mean(self._episode_q_means)) self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken) if len(self._episode_q_stddev) > 0: std_q = np.asscalar(np.mean(self._episode_q_stddev)) self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken) self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken) def get_depth_image(self, client): # get depth image from airsim responses = client.simGetImages([ airsim.ImageRequest("RCCamera", airsim.ImageType.DepthPerspective, True, False) ]) img1d = np.array(responses[0].image_data_float, dtype=np.float) img1d = 255 / np.maximum(np.ones(img1d.size), img1d) if img1d.size > 1: img2d = np.reshape(img1d, (responses[0].height, responses[0].width)) image = Image.fromarray(img2d) im_final = np.array(image.resize((84, 84)).convert('L')) im_final = im_final / 255.0 return im_final return np.zeros((84, 84)).astype(float) # Gets a coverage image from AirSim def get_cov_image(self, coverage_map): state, cov_reward = coverage_map.get_state_from_pose() #state = self.coverage_map.get_map_scaled() # debug only #im = Image.fromarray(np.uint8(state)) #im.save("DistributedRL\\debug\\{}.png".format(time.time())) # normalize state state = state / 255.0 return state, cov_reward
def convnet_mnist(debug_output=False, epoch_size=60000, minibatch_size=64, max_epochs=10): image_height = 28 image_width = 28 num_channels = 1 input_dim = image_height * image_width * num_channels num_output_classes = 10 # Input variables denoting the features and label data input_var = C.ops.input_variable((num_channels, image_height, image_width), np.float32) label_var = C.ops.input_variable(num_output_classes, np.float32) # Instantiate the feedforward classification model scaled_input = C.ops.element_times(C.ops.constant(0.00390625), input_var) with C.layers.default_options(activation=C.ops.relu, pad=False): conv1 = C.layers.Convolution2D((5, 5), 32, pad=True)(scaled_input) pool1 = C.layers.MaxPooling((3, 3), (2, 2))(conv1) conv2 = C.layers.Convolution2D((3, 3), 48)(pool1) pool2 = C.layers.MaxPooling((3, 3), (2, 2))(conv2) conv3 = C.layers.Convolution2D((3, 3), 64)(pool2) f4 = C.layers.Dense(96)(conv3) drop4 = C.layers.Dropout(0.5)(f4) z = C.layers.Dense(num_output_classes, activation=C.ops.sigmoid)(drop4) ce = C.losses.cross_entropy_with_softmax(z, label_var) pe = C.metrics.classification_error(z, label_var) reader_train = create_reader( os.path.join(data_dir, 'Train-28x28_cntk_text.txt'), True, input_dim, num_output_classes) # Set learning parameters lr_per_sample = [0.001] * 10 + [0.0005] * 10 + [0.0001] lr_schedule = C.learning_rate_schedule(lr_per_sample, C.learners.UnitType.sample, epoch_size) mm_time_constant = [0] * 5 + [1024] mm_schedule = C.learners.momentum_as_time_constant_schedule( mm_time_constant, epoch_size) # Instantiate the trainer object to drive the model training learner = C.learners.momentum_sgd(z.parameters, lr_schedule, mm_schedule) progress_printers = [ C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs) ] progress_printers.append( TensorBoardProgressWriter(freq=10, log_dir=log_dir, model=z)) trainer = C.Trainer(z, (ce, pe), learner, progress_printers) # define mapping from reader streams to network inputs input_map = { input_var: reader_train.streams.features, label_var: reader_train.streams.labels } C.logging.log_number_of_parameters(z) print() # Get minibatches of images to train with and perform model training for epoch in range(max_epochs): # loop over epochs sample_count = 0 while sample_count < epoch_size: # loop over minibatches in the epoch data = reader_train.next_minibatch( min(minibatch_size, epoch_size - sample_count), input_map=input_map) # fetch minibatch. trainer.train_minibatch(data) # update model with it sample_count += data[ label_var].num_samples # count samples processed so far trainer.summarize_training_progress() z.save( os.path.join(output_dir, "digit_epoch_{}.checkpoint".format(epoch))) # Load test data reader_test = create_reader( os.path.join(data_dir, 'Test-28x28_cntk_text.txt'), False, input_dim, num_output_classes) z.save(os.path.join(output_dir, "digit.model")) input_map = { input_var: reader_test.streams.features, label_var: reader_test.streams.labels } # Test data for trained model epoch_size = 10000 minibatch_size = 1024 # process minibatches and evaluate the model metric_numer = 0 metric_denom = 0 sample_count = 0 minibatch_index = 0 while sample_count < epoch_size: current_minibatch = min(minibatch_size, epoch_size - sample_count) # Fetch next test min batch. 
data = reader_test.next_minibatch(current_minibatch, input_map=input_map) # minibatch data to be trained with metric_numer += trainer.test_minibatch(data) * current_minibatch metric_denom += current_minibatch # Keep track of the number of samples processed so far. sample_count += data[label_var].num_samples minibatch_index += 1 print("") print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format( minibatch_index + 1, (metric_numer * 100.0) / metric_denom, metric_denom)) print("") return metric_numer / metric_denom
def train_model(image_input, roi_input, dims_input, loss, pred_error, lr_per_sample, mm_schedule, l2_reg_weight, epochs_to_train, cfg, rpn_rois_input=None, buffered_rpn_proposals=None): if isinstance(loss, cntk.Variable): loss = combine([loss]) params = loss.parameters biases = [p for p in params if '.b' in p.name or 'b' == p.name] others = [p for p in params if not p in biases] bias_lr_mult = cfg["CNTK"].BIAS_LR_MULT if cfg["CNTK"].DEBUG_OUTPUT: print("biases") for p in biases: print(p) print("others") for p in others: print(p) print("bias_lr_mult: {}".format(bias_lr_mult)) # Instantiate the learners and the trainer object lr_schedule = learning_parameter_schedule_per_sample(lr_per_sample) learner = momentum_sgd(others, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight, unit_gain=False, use_mean_gradient=True) bias_lr_per_sample = [v * bias_lr_mult for v in lr_per_sample] bias_lr_schedule = learning_parameter_schedule_per_sample( bias_lr_per_sample) bias_learner = momentum_sgd(biases, bias_lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight, unit_gain=False, use_mean_gradient=True) trainer = Trainer(None, (loss, pred_error), [learner, bias_learner]) # Get minibatches of images and perform model training print("Training model for %s epochs." % epochs_to_train) log_number_of_parameters(loss) # Create the minibatch source if buffered_rpn_proposals is not None: proposal_provider = ProposalProvider.fromlist(buffered_rpn_proposals, requires_scaling=False) else: proposal_provider = None od_minibatch_source = ObjectDetectionMinibatchSource( cfg["DATA"].TRAIN_MAP_FILE, cfg["DATA"].TRAIN_ROI_FILE, num_classes=cfg["DATA"].NUM_CLASSES, max_annotations_per_image=cfg.INPUT_ROIS_PER_IMAGE, pad_width=cfg.IMAGE_WIDTH, pad_height=cfg.IMAGE_HEIGHT, pad_value=cfg["MODEL"].IMG_PAD_COLOR, randomize=True, use_flipping=cfg["TRAIN"].USE_FLIPPED, max_images=cfg["DATA"].NUM_TRAIN_IMAGES, proposal_provider=proposal_provider) # define mapping from reader streams to network inputs input_map = { od_minibatch_source.image_si: image_input, od_minibatch_source.roi_si: roi_input, } if buffered_rpn_proposals is not None: input_map[od_minibatch_source.proposals_si] = rpn_rois_input else: input_map[od_minibatch_source.dims_si] = dims_input progress_printer = [ ProgressPrinter(tag='Training', num_epochs=epochs_to_train, gen_heartbeat=True) ] tensorboard_logdir = os.path.join( os.path.dirname(os.path.abspath(__file__)), r"./TensorBoard") tensorboard_writer = None if tensorboard_logdir is not None: tensorboard_writer = TensorBoardProgressWriter( freq=10, log_dir=tensorboard_logdir, model=loss) progress_printer.append(tensorboard_writer) for epoch in range(epochs_to_train): # loop over epochs sample_count = 0 while sample_count < cfg[ "DATA"].NUM_TRAIN_IMAGES: # loop over minibatches in the epoch data = od_minibatch_source.next_minibatch(min( cfg.MB_SIZE, cfg["DATA"].NUM_TRAIN_IMAGES - sample_count), input_map=input_map) output = trainer.train_minibatch( data, outputs=[image_input]) # update model with it sample_count += trainer.previous_minibatch_sample_count # count samples processed so far #progress_printer.update_with_trainer(trainer, with_metric=True) # log progress #Write output images to tensorboard tensorboard_writer.write_image('training', output[1], sample_count) if sample_count % 100 == 0: print("Processed {} samples".format(sample_count)) #progress_printer.epoch_summary(with_metric=True) trainer.summarize_training_progress() if tensorboard_writer: for parameter in loss.parameters: 
tensorboard_writer.write_value(parameter.uid + "/mean", reduce_mean(parameter).eval(), epoch)
lr_per_minibatch = learning_rate_schedule(
    [0.01] * 25 + [0.001] * 25 + [0.0001] * 25 + [0.00001] * 25 + [0.000001],
    UnitType.minibatch, epoch_size)
# lr_schedule = learning_parameter_schedule(lr_per_mb, minibatch_size=minibatch_size, epoch_size=epoch_size)
# mm_schedule = momentum_schedule(0.9, minibatch_size=minibatch_size)
momentum_time_constant = momentum_as_time_constant_schedule(-minibatch_size / np.log(0.9))
l2_reg_weight = 0.0005

# trainer object
progress_writers = [ProgressPrinter(tag='Training', num_epochs=max_epochs)]
# progress_writers = [ProgressPrinter(tag='Training', log_to_file=train_log_file, num_epochs=max_epochs, gen_heartbeat=True)]
if cfg.train_log_dir is not None:
    tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir=cfg.train_log_dir, model=z)
    progress_writers.append(tensorboard_writer)

learner = momentum_sgd(z.parameters,
                       lr=lr_per_minibatch,
                       momentum=momentum_time_constant,
                       l2_regularization_weight=l2_reg_weight)

# Restore the trainer from a checkpoint if needed
trainer = Trainer(z, (ce, pe), learner, progress_writers)
# trainer.restore_from_checkpoint(model_temp_file)

# define mapping from reader streams to network inputs
input_map = {
    input_var: reader_train.streams.features,
def simple_mnist(tensorboard_logdir=None): input_dim = 784 num_output_classes = 10 num_hidden_layers = 1 hidden_layers_dim = 200 # Input variables denoting the features and label data feature = input(input_dim, np.float32) label = input(num_output_classes, np.float32) # Instantiate the feedforward classification model scaled_input = element_times(constant(0.00390625), feature) z = fully_connected_classifier_net(scaled_input, num_output_classes, hidden_layers_dim, num_hidden_layers, relu) ce = cross_entropy_with_softmax(z, label) pe = classification_error(z, label) data_dir = os.path.join(abs_path, "..", "..", "..", "DataSets", "MNIST") path = os.path.normpath(os.path.join(data_dir, "Train-28x28_cntk_text.txt")) check_path(path) reader_train = create_reader(path, True, input_dim, num_output_classes) input_map = { feature: reader_train.streams.features, label: reader_train.streams.labels } # Training config minibatch_size = 64 num_samples_per_sweep = 60000 num_sweeps_to_train_with = 10 # Instantiate progress writers. #training_progress_output_freq = 100 progress_writers = [ ProgressPrinter( #freq=training_progress_output_freq, tag='Training', num_epochs=num_sweeps_to_train_with) ] if tensorboard_logdir is not None: progress_writers.append( TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir, model=z)) # Instantiate the trainer object to drive the model training trainer = Trainer(z, (ce, pe), adadelta(z.parameters), progress_writers) training_session(trainer=trainer, mb_source=reader_train, mb_size=minibatch_size, var_to_stream=input_map, max_samples=num_samples_per_sweep * num_sweeps_to_train_with, progress_frequency=num_samples_per_sweep).train() # Load test data path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt")) check_path(path) reader_test = create_reader(path, False, input_dim, num_output_classes) input_map = { feature: reader_test.streams.features, label: reader_test.streams.labels } # Test data for trained model test_minibatch_size = 1024 num_samples = 10000 num_minibatches_to_test = num_samples / test_minibatch_size test_result = 0.0 for i in range(0, int(num_minibatches_to_test)): mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map) eval_error = trainer.test_minibatch(mb) test_result = test_result + eval_error # Average of evaluation errors of all test minibatches return test_result / num_minibatches_to_test
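The create_reader helper used by these MNIST scripts is not shown in this section. In the CNTK examples it is typically a thin wrapper around CTFDeserializer and MinibatchSource, roughly as sketched below; this is an assumption about its implementation, and the stream names must match the |features / |labels fields in the CTF text files.

import cntk.io as C_io

def create_reader(path, is_training, input_dim, num_label_classes):
    # Deserialize the CNTK text-format file into 'features' and 'labels' streams.
    ctf = C_io.CTFDeserializer(path, C_io.StreamDefs(
        features=C_io.StreamDef(field='features', shape=input_dim),
        labels=C_io.StreamDef(field='labels', shape=num_label_classes)))
    # Randomize and repeat indefinitely only for training data.
    return C_io.MinibatchSource(
        ctf, randomize=is_training,
        max_sweeps=C_io.INFINITELY_REPEAT if is_training else 1)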
def init_train_fast_rcnn(image_height, image_width, num_classes, num_rois, mb_size, max_epochs, cntk_lr_per_image, l2_reg_weight, momentum_time_constant, base_path, boSkipTraining=False, debug_output=False, tensorboardLogDir=None): #make sure we use GPU for training if use_default_device().type() == 0: print("WARNING: using CPU for training.") else: print("Using GPU for training.") # Instantiate the Fast R-CNN prediction model image_input = input_variable((3, image_height, image_width)) roi_input = input_variable((num_rois, 4)) label_input = input_variable((num_rois, num_classes)) frcn_output, frcn_penultimateLayer = frcn_predictor( image_input, roi_input, num_classes, base_path) if boSkipTraining: print("Using pre-trained DNN without refinement") return frcn_penultimateLayer # Create the minibatch source and define mapping from reader streams to network inputs minibatch_source, epoch_size = create_mb_source("train", image_height, image_width, num_classes, num_rois, base_path, randomize=True) input_map = { image_input: minibatch_source.streams.features, roi_input: minibatch_source.streams.rois, label_input: minibatch_source.streams.roiLabels } # set loss / error functions ce = cross_entropy_with_softmax(frcn_output, label_input, axis=1) pe = classification_error(frcn_output, label_input, axis=1) if debug_output: plot(frcn_output, "graph_frcn.png") # set the progress printer(s) progress_writers = [ProgressPrinter(tag='Training', num_epochs=max_epochs)] if tensorboardLogDir != None: tensorboard_writer = TensorBoardProgressWriter( freq=10, log_dir=tensorboardLogDir, model=frcn_output) progress_writers.append(tensorboard_writer) # Set learning parameters and instantiate the trainer object lr_per_sample = [f / float(num_rois) for f in cntk_lr_per_image] lr_schedule = learning_rate_schedule(lr_per_sample, unit=UnitType.sample) mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant) learner = momentum_sgd(frcn_output.parameters, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight) trainer = Trainer(frcn_output, (ce, pe), learner, progress_writers) # Get minibatches of images and perform model training print("Training Fast R-CNN model for %s epochs." % max_epochs) log_number_of_parameters(frcn_output) for epoch in range(max_epochs): sample_count = 0 # loop over minibatches in the epoch while sample_count < epoch_size: data = minibatch_source.next_minibatch(min( mb_size, epoch_size - sample_count), input_map=input_map) if sample_count % 100 == 1: print( "Training in progress: epoch {} of {}, sample count {} of {}" .format(epoch, max_epochs, sample_count, epoch_size)) trainer.train_minibatch(data) sample_count += trainer.previous_minibatch_sample_count # count samples processed so far trainer.summarize_training_progress() # Log mean of each parameter tensor, so that we can confirm that the parameters change indeed. if tensorboardLogDir != None: for parameter in frcn_output.parameters: tensorboard_writer.write_value(parameter.uid + "/mean", np.mean(parameter.value), epoch) tensorboard_writer.write_value(parameter.uid + "/std", np.std(parameter.value), epoch) tensorboard_writer.write_value(parameter.uid + "/absSum", np.sum(np.abs(parameter.value)), epoch) if debug_output: frcn_output.save_model("frcn_py_%s.model" % (epoch + 1)) return frcn_output
def __init__(self, input_shape, nb_actions, gamma=0.95,
             explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 100000),
             learning_rate=0.01, momentum=0.8, minibatch_size=16,
             memory_size=15000, train_after=100, train_interval=100,
             target_update_interval=500, monitor=True):
    self.input_shape = input_shape
    self.nb_actions = nb_actions
    self.gamma = gamma

    self._train_after = train_after
    self._train_interval = train_interval
    self._target_update_interval = target_update_interval

    self._explorer = explorer
    self._minibatch_size = minibatch_size
    self._history = History(input_shape)
    self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
    self._num_actions_taken = 0
    self._num_trains = 0

    # Metrics accumulators
    self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

    # Original convolutional Action Value model, kept for reference:
    # with default_options(activation=relu, init=he_uniform()):
    #     self._action_value_net = Sequential([
    #         Convolution2D((8, 8), 16, strides=4),
    #         Convolution2D((4, 4), 32, strides=2),
    #         Convolution2D((3, 3), 32, strides=1),
    #         Dense(256, init=he_uniform(scale=0.01)),
    #         Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
    #     ])

    # Action Value model (used by the agent to interact with the environment)
    with default_options(activation=relu, init=he_uniform()):
        self._action_value_net = Sequential([
            Dense(7, init=he_uniform(scale=0.01)),
            Dense(8, init=he_uniform(scale=0.01)),
            #Dense(16, init=he_uniform(scale=0.01)),
            #Dense(32, init=he_uniform(scale=0.01)),
            Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
        ])
    self._action_value_net.update_signature(Tensor[input_shape])

    # Target model used to compute the target Q-values in training, updated
    # less frequently for increased stability.
    self._target_net = self._action_value_net.clone(CloneMethod.freeze)

    # Function computing Q-value targets as part of the computation graph
    @Function
    @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def compute_q_targets(post_states, rewards, terminals):
        return element_select(
            terminals,
            rewards,
            gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
        )

    # Define the loss, using the Huber loss (more robust to outliers)
    @Function
    @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
               post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def criterion(pre_states, actions, post_states, rewards, terminals):
        # Compute the q_targets
        q_targets = compute_q_targets(post_states, rewards, terminals)

        # actions is a 1-hot encoding of the action taken by the agent
        q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

        # Define the training criterion as the Huber loss function
        return huber_loss(q_targets, q_acted, 1.0)

    # Adam-based SGD
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
    m_schedule = momentum_schedule(momentum)
    vm_schedule = momentum_schedule(0.999)
    l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                 momentum=m_schedule, variance_momentum=vm_schedule)

    self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics',
                                                     model=criterion) if monitor else None
    self._learner = l_sgd
    self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
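# --- Explanatory sketch (added for illustration) ----------------------------
# compute_q_targets above builds the standard one-step Bellman target inside
# the CNTK graph: the raw reward for terminal transitions, and
# reward + gamma * max_a Q_target(s', a) otherwise. The NumPy function below
# restates the same rule outside the graph; it is an illustration only and is
# not used by the agent.
import numpy as np

def q_targets_numpy(q_post, rewards, terminals, gamma=0.95):
    # q_post: (batch, nb_actions) Q-values of the post-states from the frozen
    # target network; rewards and terminals: (batch,) arrays.
    bootstrap = gamma * q_post.max(axis=1) + rewards
    return np.where(terminals.astype(bool), rewards, bootstrap)

# Two transitions, the second one terminal: targets are [0.95*2.0 + 0.1, 1.0].
print(q_targets_numpy(np.array([[1.0, 2.0], [0.5, 0.25]]),
                      rewards=np.array([0.1, 1.0]),
                      terminals=np.array([0.0, 1.0])))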
class DeepQAgent(object):
    """
    Implementation of a Deep Q-Network agent as described in Nature 518,
    "Human-level control through deep reinforcement learning" (Mnih et al., 2015).
    """

    def __init__(self, input_shape, nb_actions, gamma=0.99,
                 explorer=LinearEpsilonAnnealingExplorer(0.91, 0.1, 910000),
                 fixpolicy=LinearEpsilonAnnealingExplorer(0.5, 0.1, 100000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._fixpolicy = fixpolicy
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        print("input_shape:", input_shape)
        print("input_shape[1:]:", input_shape[1:])
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulators
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by the agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                #Convolution2D((8, 8), 16, strides=4),
                #Convolution2D((4, 4), 32, strides=2),
                #Convolution2D((1, 1), 16, strides=1),
                Dense(25, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-value targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using the Huber loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action taken by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define the training criterion as the Huber loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam-based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics',
                                                         model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
        #self._trainer.restore_from_checkpoint(r"models_heuristic_no_image\model")

    def act(self, state):
        """ This allows the agent to select the next action to perform
        in regard to the current state of the environment.
        It follows the terminology used in the Nature paper.
        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns:
            Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (i.e. History)
        self._history.append(state)

        # If the heuristic fix policy is active, move along the axis with the
        # largest absolute distance to the goal instead of querying the network.
        if self._fixpolicy.is_exploring(self._num_actions_taken):
            diff_x = state[3] - state[0]
            diff_y = state[4] - state[1]
            diff_z = state[5] - state[2]
            diff_arr = np.array([diff_x, diff_y, diff_z])
            direction = np.argmax(np.absolute(diff_arr))
            print(diff_arr)
            if diff_arr[direction] < 0:
                fixaction = direction + 4
            else:
                fixaction = direction + 1
            self._num_actions_taken += 1
            return fixaction

        # If the policy requires the agent to explore, sample a random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append a batch axis with only one sample to evaluate
                env_with_history.reshape((1,) + env_with_history.shape))

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the action maximizing the expected reward
            action = q_values.argmax()

        # Keep track of the interval action counter
        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action
        it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicates if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (i.e. History)
        if done:
            # Plot the metrics through TensorBoard and reset the buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the
        environment dynamics. The agent computes the expected reward for
        state(t+1) and updates the expected reward at step t according to it.

        The target expectation is computed through the Target Network, a more
        stable version of the Action Value Network used to increase training
        stability. The Target Network is a frozen copy of the Action Value
        Network updated at regular intervals.
""" agent_step = self._num_actions_taken print("agent_step = ", agent_step) #time.sleep(1) if agent_step >= self._train_after: if (agent_step % self._train_interval) == 0: pre_states, actions, post_states, rewards, terminals = self._memory.minibatch( self._minibatch_size) self._trainer.train_minibatch( self._trainer.loss_function.argument_map( pre_states=pre_states, actions=Value.one_hot( actions.reshape(-1, 1).tolist(), self.nb_actions), post_states=post_states, rewards=rewards, terminals=terminals)) # Update the Target Network if needed if (agent_step % self._target_update_interval) == 0: self._target_net = self._action_value_net.clone( CloneMethod.freeze) filename = "models_heuristic_no_image_less_exploration\model%d" % agent_step print( "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$filename=", filename) self._trainer.save_checkpoint(filename) #time.sleep(100) def _plot_metrics(self): global landing_count, episode_count """Plot current buffers accumulated values to visualize agent learning """ f = open('log__heuristic_no_image_less_exploration2', 'a+') f.write('episode:' + str(episode_count) + ': exploration rate= ' + str(self._explorer._rate) + ' heuristic fix rate= ' + str(self._fixpolicy._rate) + '\n') if len(self._episode_q_means) > 0: mean_q = np.asscalar(np.mean(self._episode_q_means)) self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken) print('Mean Q per ep.', mean_q, self._num_actions_taken) f.write('Mean Q per ep. ' + str(mean_q) + ' ' + str(self._num_actions_taken) + '\n') if len(self._episode_q_stddev) > 0: std_q = np.asscalar(np.mean(self._episode_q_stddev)) self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken) print('Mean Std Q per ep.', std_q, self._num_actions_taken) f.write('Mean Std Q per ep. ' + str(std_q) + ' ' + str(self._num_actions_taken) + '\n') self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken) print('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken) f.write('Sum rewards per ep. ' + str(sum(self._episode_rewards)) + ' ' + str(self._num_actions_taken) + '\n') if landing_count > 0: f.write('****************Success landing**********' + str(landing_count) + '\n') landing_count = 0 episode_count = 0 f.write('\n')