def train_lm(testing=False):
    """Train the language model over the corpus and return the last
    measured average cross-entropy.

    Args:
        testing (bool): when True, skip the per-epoch model checkpoint
            (used by the test harness).

    Relies on module-level configuration: token_to_id_path,
    segment_sepparator, hidden_dim, learning_rate, momentum_per_sample,
    clipping_threshold_per_sample, num_epochs, train_file_path,
    sequence_length, sequences_per_batch,
    num_samples_between_progress_report.
    """
    data = DataReader(token_to_id_path, segment_sepparator)

    # Create model nodes for the source and target inputs
    input_sequence, label_sequence = create_inputs(data.vocab_dim)

    # Create the model. It has three output nodes
    # z: the input to softmax that provides the latent representation of the next token
    # cross_entropy: this is used training criterion
    # error: this a binary indicator if the model predicts the correct token
    z, cross_entropy, error = create_model(input_sequence, label_sequence, data.vocab_dim, hidden_dim)

    # For measurement we use the (build in) full softmax.
    full_ce = C.cross_entropy_with_softmax(z, label_sequence)

    # print out some useful training information
    log_number_of_parameters(z) ; print()

    # Run the training loop
    num_trained_samples = 0
    num_trained_samples_since_last_report = 0

    # Instantiate the trainer object to drive the model training
    lr_schedule = C.learning_parameter_schedule_per_sample(learning_rate)
    momentum_schedule = C.momentum_schedule_per_sample(momentum_per_sample)
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(z.parameters, lr_schedule, momentum_schedule,
                           gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                           gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, (cross_entropy, error), learner)

    last_avg_ce = 0
    for epoch_count in range(num_epochs):
        for features, labels, token_count in data.minibatch_generator(train_file_path, sequence_length, sequences_per_batch):
            arguments = ({input_sequence : features, label_sequence : labels})

            t_start = timeit.default_timer()
            trainer.train_minibatch(arguments)
            t_end = timeit.default_timer()

            # Throughput of this single minibatch (tokens per second).
            samples_per_second = token_count / (t_end - t_start)

            # Print progress report every num_samples_between_progress_report samples
            if num_trained_samples_since_last_report >= num_samples_between_progress_report or num_trained_samples == 0:
                # Measured with the full softmax, not the training criterion.
                av_ce = average_cross_entropy(full_ce, input_sequence, label_sequence, data)
                print_progress(samples_per_second, av_ce, num_trained_samples, t_start)
                num_trained_samples_since_last_report = 0
                last_avg_ce = av_ce

            num_trained_samples += token_count
            num_trained_samples_since_last_report += token_count

        if not testing:
            # after each epoch save the model
            model_filename = "models/lm_epoch%d.dnn" % epoch_count
            z.save(model_filename)
            print("Saved model to '%s'" % model_filename)

    # Last cross-entropy measured at a progress report (0 if none happened).
    return last_avg_ce
def __init__(self, input_shape, nb_actions, gamma=0.99,
             explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
             learning_rate=0.00025, momentum=0.95, minibatch_size=32,
             memory_size=500000, train_after=10000, train_interval=4,
             target_update_interval=10000, monitor=True):
    """Set up a DQN-style agent: replay memory, online/target value
    networks, the Huber-loss training criterion, and an Adam trainer.

    Args:
        input_shape: shape of the state tensor fed to the network.
        nb_actions: number of discrete actions (network output width).
        gamma: discount factor used when forming Q-learning targets.
        explorer: annealing epsilon-greedy exploration policy.
        learning_rate, momentum: Adam optimizer settings (minibatch unit).
        minibatch_size, memory_size: replay-memory sampling configuration.
        train_after, train_interval, target_update_interval: step
            thresholds consulted by the surrounding training loop.
        monitor: attach a TensorBoard progress writer when True.
    """
    self.input_shape = input_shape
    self.nb_actions = nb_actions
    self.gamma = gamma

    # Step-count thresholds; read elsewhere in the class.
    self._train_after = train_after
    self._train_interval = train_interval
    self._target_update_interval = target_update_interval

    self._explorer = explorer
    self._minibatch_size = minibatch_size
    self._history = History(input_shape)
    # Replay memory over per-frame states (input_shape[1:]), stacked 4 deep.
    self._memory = RepMem(memory_size, input_shape[1:], 4)
    self._num_actions_taken = 0

    # Per-episode metric buffers.
    self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

    # Online action-value network (an MLP here).
    # NOTE(review): Dense(input_shape) uses the state shape as the layer
    # width — confirm that is intended rather than a fixed hidden size.
    with default_options(activation=relu, init=he_uniform()):
        self._action_value_net = Sequential([
            Dense(input_shape, init=he_uniform(scale=0.01)),
            Dense(input_shape),
            Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))])
    self._action_value_net.update_signature(Tensor[input_shape])

    # Frozen copy of the online network used for stable Q-targets.
    self._target_net = self._action_value_net.clone(CloneMethod.freeze)

    # Q-learning targets inside the computation graph:
    # reward for terminal steps, reward + gamma * max Q_target otherwise.
    @Function
    @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def compute_q_targets(post_states, rewards, terminals):
        return element_select(
            terminals,
            rewards,
            gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
        )

    # Huber loss between the target and the Q-value of the taken action.
    @Function
    @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
               post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def criterion(pre_states, actions, post_states, rewards, terminals):
        q_targets = compute_q_targets(post_states, rewards, terminals)
        # actions is a one-hot encoding of the action taken.
        q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)
        return huber_loss(q_targets, q_acted, 1.0)

    # Adam-based SGD.
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
    m_schedule = momentum_schedule(momentum)
    vm_schedule = momentum_schedule(0.999)
    l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                 momentum=m_schedule, variance_momentum=vm_schedule)

    self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
    self._learner = l_sgd
    # criterion serves as both model and loss; no separate eval metric.
    self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
def create_trainer(network, epoch_size, num_epochs, minibatch_size, num_quantization_bits, progress_printer):
    """Build a data-parallel distributed momentum-SGD trainer.

    The base learning rate mirrors the Caffe recipe (0.045 there; CNTK
    weights new gradients by (1-momentum) for unit gain, hence 0.45 here),
    scaled by the minibatch size relative to 32, and decayed by a fixed
    factor every two epochs. Gradients are exchanged through a quantized
    data-parallel distributed learner.

    Returns:
        A CNTK Trainer over network['output'] with (ce, pe) criteria.
    """
    # CNTK weights new gradient by (1-momentum) for unit gain,
    # thus we divide Caffe's learning rate by (1-momentum)
    base_rate = 0.45  # equal to 0.045 in caffe
    base_rate *= minibatch_size / 32
    adjust_every = 2
    decay = 0.94

    # Hold each rate for `adjust_every` epochs, then decay it.
    rates = []
    current = base_rate
    for _ in range(0, num_epochs, adjust_every):
        rates += [current] * adjust_every
        current *= decay

    lr_schedule = learning_rate_schedule(rates, unit=UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = momentum_schedule(0.9)

    # CNTK L2 regularization is per sample, thus same as Caffe.
    local_learner = momentum_sgd(network['output'].parameters,
                                 lr_schedule, mm_schedule,
                                 l2_regularization_weight=0.0001)
    parameter_learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    return Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, progress_printer)
def init_trainer(config, text_lines, slot_value_lines):
    """Build the data reader, model graph, and momentum-SGD trainer.

    Args:
        config: object exposing hidden_dim, segment_begin, segment_end,
            learning_rate, momentum_as_time_constant, and
            clipping_threshold_per_sample.
        text_lines: training text lines.
        slot_value_lines: slot/value annotations aligned with the text.

    Returns:
        Tuple (data reader, model output node, trainer, list of the five
        model input nodes).
    """
    reader = DataReader(text_lines, slot_value_lines,
                        config.segment_begin, config.segment_end)
    vocab_dim = reader.vocab_dim
    sv_dim = reader.sv_dim

    # Model nodes for source/target inputs and the recurrent state (H, C).
    input_sequence, sv_pair, label_sequence, inputH, inputC = create_inputs(
        config.hidden_dim, sv_dim, vocab_dim)
    z = create_model(config.hidden_dim, sv_dim, vocab_dim)(
        input_sequence, inputH, inputC, sv_pair)

    # Training criterion (full softmax cross-entropy) and error metric.
    ce, err = cross_entropy_with_full_softmax(z, label_sequence, sv_dim, vocab_dim)

    # Momentum SGD with per-sample learning rate and gradient clipping.
    lr_schedule = learning_rate_schedule(config.learning_rate, UnitType.sample)
    mom_schedule = momentum_as_time_constant_schedule(config.momentum_as_time_constant)
    learner = momentum_sgd(
        z.parameters, lr_schedule, mom_schedule,
        gradient_clipping_threshold_per_sample=config.clipping_threshold_per_sample,
        gradient_clipping_with_truncation=True)
    trainer = Trainer(z, (ce, err), learner)

    return reader, z, trainer, [input_sequence, sv_pair, label_sequence, inputH, inputC]
def create_trainer(network, epoch_size, num_epochs, minibatch_size, progress_writers):
    """Create a momentum-SGD trainer with a stepwise-decaying LR schedule.

    The base rate mirrors the Caffe recipe (0.2 there; CNTK weights new
    gradients by (1-momentum) for unit gain, hence 2.0 here), scaled by
    the minibatch size relative to 128 and multiplied by 0.94 every two
    epochs.

    Returns:
        A CNTK Trainer over network['output'] with (ce, pe) criteria.
    """
    # CNTK weights new gradient by (1-momentum) for unit gain,
    # thus we divide Caffe's learning rate by (1-momentum)
    base_rate = 2.0  # equal to 0.2 in caffe
    base_rate *= minibatch_size / 128
    adjust_every = 2
    decay = 0.94

    # Hold each rate for `adjust_every` epochs, then decay it.
    rates = []
    current = base_rate
    for _ in range(0, num_epochs, adjust_every):
        rates += [current] * adjust_every
        current *= decay

    lr_schedule = learning_parameter_schedule(rates, epoch_size=epoch_size)
    mm_schedule = momentum_schedule(0.9)

    # CNTK L2 regularization is per sample, thus same as Caffe.
    learner = momentum_sgd(network['output'].parameters,
                           lr_schedule, mm_schedule,
                           l2_regularization_weight=0.0001)

    return Trainer(network['output'], (network['ce'], network['pe']), learner, progress_writers)
def main(params):
    """Train the network built by create_network() on CBF-format data
    with checkpointing, cross-validation, and a final test sweep, then
    save the trained model.

    Args:
        params (dict): expects keys 'output_folder', 'log_folder',
            'input_folder', 'prefix', 'minibatch_size', 'epoch_size',
            'max_epochs', 'restore'.
    """
    # Create output and log directories if they don't exist.
    # exist_ok=True is race-free, unlike the isdir()-then-makedirs() check.
    os.makedirs(params['output_folder'], exist_ok=True)
    os.makedirs(params['log_folder'], exist_ok=True)

    # Create the network
    network = create_network()

    # Create readers: training repeats forever; cross-validation and test
    # readers each make a single pass over the test file.
    train_reader = cbf_reader(os.path.join(params['input_folder'], 'train{}.cbf'.format(params['prefix'])),
                              is_training=True, max_samples=cntk.io.INFINITELY_REPEAT)
    cv_reader = cbf_reader(os.path.join(params['input_folder'], 'test{}.cbf'.format(params['prefix'])),
                           is_training=False, max_samples=cntk.io.FULL_DATA_SWEEP)
    test_reader = cbf_reader(os.path.join(params['input_folder'], 'test{}.cbf'.format(params['prefix'])),
                             is_training=False, max_samples=cntk.io.FULL_DATA_SWEEP)

    input_map = {
        network['input']: train_reader.streams.front,
        network['target']: train_reader.streams.label
    }

    # Create learner: Adam with a two-stage LR schedule
    # (0.1 for 40 epochs, then 0.01).
    mm_schedule = momentum_schedule(0.90)
    lr_schedule = learning_parameter_schedule([(40, 0.1), (40, 0.01)],
                                              minibatch_size=params['minibatch_size'])
    learner = cntk.adam(network['model'].parameters, lr_schedule, mm_schedule,
                        l2_regularization_weight=0.0005,
                        epoch_size=params['epoch_size'],
                        minibatch_size=params['minibatch_size'])

    # Use TensorBoard for visual logging
    log_file = os.path.join(params['log_folder'], 'log.txt')
    pp_writer = cntk.logging.ProgressPrinter(freq=10, tag='Training',
                                             num_epochs=params['max_epochs'],
                                             log_to_file=log_file)
    tb_writer = cntk.logging.TensorBoardProgressWriter(freq=10,
                                                       log_dir=params['log_folder'],
                                                       model=network['model'])

    # Create trainer and training session
    trainer = Trainer(network['model'], (network['loss'], network['metric']),
                      [learner], [pp_writer, tb_writer])
    test_config = TestConfig(minibatch_source=test_reader,
                             minibatch_size=params['minibatch_size'],
                             model_inputs_to_streams=input_map)
    cv_config = CrossValidationConfig(minibatch_source=cv_reader,
                                      frequency=(1, DataUnit.sweep),
                                      minibatch_size=params['minibatch_size'],
                                      model_inputs_to_streams=input_map)
    # Checkpoint every 10 sweeps; optionally restore from the last one.
    checkpoint_config = CheckpointConfig(os.path.join(params['output_folder'], model_name),
                                         frequency=(10, DataUnit.sweep),
                                         restore=params['restore'])

    session = training_session(trainer=trainer,
                               mb_source=train_reader,
                               mb_size=params['minibatch_size'],
                               model_inputs_to_streams=input_map,
                               max_samples=params['epoch_size'] * params['max_epochs'],
                               progress_frequency=(1, DataUnit.sweep),
                               checkpoint_config=checkpoint_config,
                               cv_config=cv_config,
                               test_config=test_config)

    cntk.logging.log_number_of_parameters(network['model'])
    session.train()

    # Save the trained model
    path = os.path.join(params['output_folder'], 'final_model.dnn')
    network['model'].save(path)
    print('Saved final model to', path)
def train():
    """Train the word2vec-style model end to end: load the pickled
    corpus, subsample frequent words, build the model, and run a
    checkpointed CNTK training session.
    """
    print('Unpickling data (this could take a short while)')
    # NOTE(review): the file handle from open() is never closed explicitly.
    training_data = pickle.load(open('tmp_textdata.pickle', 'rb'))

    print('Preprocessing data (this could take a LONG while)...')
    do_subsampling(training_data, subsampling=4e-5, prog_freq=1e7)
    print('Preprocessing is done. Final # of training words: {}'.format(len(training_data.text_as_id_list)))

    mb_source = WordMinibatchSource(training_data, max_window_size)
    mb_num_samples = 128
    mb_size = minibatch_size_schedule(mb_num_samples)

    freq_list = training_data.id2freq
    token2id = training_data.token2id  # unused below; kept as-is
    vocab_dim = len(freq_list)
    print(vocab_dim)

    input_vector, label_vector = create_inputs(vocab_dim)
    z, cross_entropy, error = create_model(input_vector, label_vector, freq_list, vocab_dim, hidden_dim)

    # lr_schedule/mom_schedule feed only the commented-out Adam learner
    # below; the active SGD learner uses lr_schedule2 (3e-3 decayed by
    # 0.8 every half corpus, for ten steps).
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.sample)
    lr_schedule2 = learning_rate_schedule([(3e-3)*(0.8**i) for i in range(10)], UnitType.sample, epoch_size=len(training_data.text_as_id_list)//2)
    mom_schedule = C.learners.momentum_schedule(0.005, UnitType.sample)
    gradient_clipping_with_truncation = True
    learner = C.learners.sgd(z.parameters,
                             lr=lr_schedule2,
                             gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                             gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    # var_mom_schedule = C.learners.momentum_schedule(0.999, UnitType.sample)
    # learner2 = C.learners.adam(z.parameters,
    #                            lr=lr_schedule,
    #                            momentum=mom_schedule,
    #                            variance_momentum=var_mom_schedule,
    #                            epsilon=1.5e-8,
    #                            gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
    #                            gradient_clipping_with_truncation=gradient_clipping_with_truncation)

    progress_printer = C.logging.ProgressPrinter(freq=200, tag='Training')
    # Checkpoint every 100k minibatches' worth of samples; never restore.
    checkpoint_config = CheckpointConfig(frequency = 100000*mb_num_samples,
                                         filename = os.path.join(os.getcwd(), "word2vec_checkpoint"),
                                         restore = False)

    trainer = Trainer(z, (cross_entropy, error), [learner], progress_writers=[progress_printer])

    input_map = {
        input_vector: mb_source.fsi,
        label_vector: mb_source.lsi
    }

    # Unbounded session (max_samples=None); progress reported once per
    # full pass over the corpus.
    session = training_session(trainer, mb_source, mb_size, input_map,
                               progress_frequency=len(training_data.text_as_id_list),
                               max_samples = None,
                               checkpoint_config=checkpoint_config,
                               cv_config=None,
                               test_config=None)

    C.logging.log_number_of_parameters(z) ; print()
    session.train()
# define loss / metrics # like Tensorflow the softmax is done # internally (if needed), so all we need are the logits ce = cross_entropy_with_softmax(logits, labels) pe = classification_error(logits, labels) # training config batch_size = 32 epochs = 15 n_batches = len(Xtrain) // batch_size # do the training # specify the training algorithm trainer = Trainer(logits, (ce, pe), adam(logits.parameters, lr=1e-2, momentum=0.9)) # helper function def get_output(node, X, Y): ret = node.forward(dict(inputs=X, labels=Y)) return list(ret[1].values())[0].mean() costs = [] errors = [] test_costs = [] test_errors = [] for i in range(epochs): cost = 0 err = 0
def simple_mnist(tensorboard_logdir=None):
    """Train a feed-forward classifier on a 19-dim feature / 2-class CTF
    dataset and return the error rate on the test set.

    Args:
        tensorboard_logdir: optional directory for TensorBoard logging;
            when None, no TensorBoard writer is created.

    Returns:
        Error rate computed by get_error_rate() on test.ctf.
    """
    input_dim = 19
    num_output_classes = 2
    num_hidden_layers = 2
    hidden_layers_dim = 1024

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim, np.float32)
    label = C.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    # scaled_input = element_times(constant(0.00390625), feature)
    z = Sequential([For(range(num_hidden_layers),
                        lambda i: Dense(hidden_layers_dim, activation=relu)),
                    Dense(num_output_classes)])(feature)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = r"."
    path = os.path.normpath(os.path.join(data_dir, "train.ctf"))
    check_path(path)

    reader_train = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        feature : reader_train.streams.features,
        label : reader_train.streams.labels
    }

    # Training config
    minibatch_size = 512
    num_samples_per_sweep = 1825000
    num_sweeps_to_train_with = 100

    # Instantiate progress writers.
    progress_writers = [ProgressPrinter(
        tag='Training',
        num_epochs=num_sweeps_to_train_with)]
    # Fix: keep a None default so the TensorBoard writes below can be
    # guarded — previously they raised NameError when no logdir was given.
    tensorboard_writer = None
    if tensorboard_logdir is not None:
        tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir, model=z)
        progress_writers.append(tensorboard_writer)

    # Instantiate the trainer object to drive the model training
    # (the unused per-sample LR schedule was removed; create_learner owns
    # the learning-rate configuration).
    learner = create_learner(model=z)
    trainer = Trainer(z, (ce, pe), learner, progress_writers)

    num_minibatches_to_train = int(num_samples_per_sweep / minibatch_size * num_sweeps_to_train_with)
    model_dir = "model"
    # Checkpoint/validation frequency: once per sweep (loop-invariant).
    freq = int(num_samples_per_sweep / minibatch_size)
    for i in range(num_minibatches_to_train):
        mb = reader_train.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(mb)

        if i > 0 and i % freq == 0:
            timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
            # NOTE(review): the checkpoint name embeds str(freq), not the
            # sweep index — looks like it was meant to be str(i // freq);
            # kept as-is to preserve existing file naming.
            current_trainer_cp = os.path.join(model_dir, timestamp + "_epoch_" + str(freq) + ".trainer")
            trainer.save_checkpoint(current_trainer_cp)

            train_error = get_error_rate(os.path.join(data_dir, "train_subset.ctf"), input_map,
                                         input_dim, num_output_classes, trainer)
            valid_error = get_error_rate(os.path.join(data_dir, "validation.ctf"), input_map,
                                         input_dim, num_output_classes, trainer)
            if tensorboard_writer is not None:
                if train_error > 0:
                    tensorboard_writer.write_value("train_error", train_error, i)
                if valid_error > 0:
                    tensorboard_writer.write_value("valid_error", valid_error, i)

    feat_path = os.path.normpath(os.path.join(data_dir, "test.ctf"))
    return get_error_rate(feat_path, input_map, input_dim, num_output_classes, trainer)
def simple_mnist(tensorboard_logdir=None):
    """Train a two-hidden-layer MLP on MNIST (CNTK text format) and
    return the average test error in percent.

    Args:
        tensorboard_logdir: optional directory for TensorBoard logging.
    """
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 2
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim, np.float32)
    label = C.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    # (pixel values scaled by 1/256 = 0.00390625)
    scaled_input = element_times(constant(0.00390625), feature)

    z = Sequential([
        For(range(num_hidden_layers),
            lambda i: Dense(hidden_layers_dim, activation=relu)),
        Dense(num_output_classes)
    ])(scaled_input)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(data_dir, 'Train-28x28_cntk_text.txt')

    reader_train = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    # training_progress_output_freq = 100
    progress_writers = [
        ProgressPrinter(
            # freq=training_progress_output_freq,
            tag='Training',
            num_epochs=num_sweeps_to_train_with)
    ]

    if tensorboard_logdir is not None:
        progress_writers.append(
            TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir, model=z))

    # Instantiate the trainer object to drive the model training
    lr = 0.001
    trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr), progress_writers)

    # Run 10 sweeps over the training data, reporting once per sweep.
    training_session(
        trainer=trainer,
        mb_source=reader_train,
        mb_size=minibatch_size,
        model_inputs_to_streams=input_map,
        max_samples=num_samples_per_sweep * num_sweeps_to_train_with,
        progress_frequency=num_samples_per_sweep
    ).train()

    # Load test data
    path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt"))
    check_path(path)

    reader_test = create_reader(path, False, input_dim, num_output_classes)

    input_map = {
        feature: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model, with per-node timing profiled.
    C.debugging.start_profiler()
    C.debugging.enable_profiler()
    C.debugging.set_node_timing(True)

    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size

    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error
    C.debugging.stop_profiler()
    trainer.print_node_timing()

    # Average of evaluation errors of all test minibatches
    return test_result * 100 / num_minibatches_to_test
def __init__(self, state_dim, action_dim, gamma=0.99, learning_rate=1e-4, momentum=0.95):
    """Build a Double-DQN agent with importance-sampling weights.

    Constructs the online conv net, a frozen target copy, the DDQN
    target computation, an IS-weighted Huber loss, and an Adam trainer.

    Args:
        state_dim: shape of the state tensor fed to the network.
        action_dim: number of discrete actions (network output width).
        gamma: discount factor for the Q-learning targets.
        learning_rate, momentum: Adam settings (minibatch LR unit).
    """
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.gamma = gamma

    with default_options(activation=relu, init=he_uniform()):
        # Convolution filter counts were halved to save on memory, no gpu :(
        self.model = Sequential([
            Convolution2D((8, 8), 16, strides=4, name='conv1'),
            Convolution2D((4, 4), 32, strides=2, name='conv2'),
            Convolution2D((3, 3), 32, strides=1, name='conv3'),
            Dense(256, init=he_uniform(scale=0.01), name='dense1'),
            Dense(action_dim, activation=None, init=he_uniform(scale=0.01), name='actions')
        ])
    self.model.update_signature(Tensor[state_dim])

    # Create the target model as a copy of the online model
    # (update_target() populates self.target_model).
    self.target_model = None
    self.update_target()

    # Graph inputs for one replay-memory minibatch.
    self.pre_states = input_variable(state_dim, name='pre_states')
    self.actions = input_variable(action_dim, name='actions')
    self.post_states = input_variable(state_dim, name='post_states')
    self.rewards = input_variable((), name='rewards')
    self.terminals = input_variable((), name='terminals')
    # Importance-sampling weights applied per transition to the loss.
    self.is_weights = input_variable((), name='is_weights')

    # Q-value of the action actually taken (actions is one-hot).
    predicted_q = reduce_sum(self.model(self.pre_states) * self.actions, axis=0)

    # DQN - calculate target q values
    # post_q = reduce_max(self.target_model(self.post_states), axis=0)

    # DDQN - calculate target q values
    # (online net selects the action, target net evaluates it)
    online_selection = one_hot(
        argmax(self.model(self.post_states), axis=0), self.action_dim)
    post_q = reduce_sum(self.target_model(self.post_states) * online_selection, axis=0)

    # Zero out the bootstrap term on terminal transitions.
    post_q = (1.0 - self.terminals) * post_q
    target_q = stop_gradient(self.rewards + self.gamma * post_q)

    # Huber loss
    delta = 1.0
    self.td_error = minus(predicted_q, target_q, name='td_error')
    abs_error = abs(self.td_error)
    errors = element_select(less(abs_error, delta),
                            square(self.td_error) * 0.5,
                            delta * (abs_error - 0.5 * delta))
    loss = errors * self.is_weights

    # Adam based SGD
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
    # NOTE(review): 'm_scheule' is a typo for 'm_schedule'; local only,
    # so behavior is unaffected.
    m_scheule = momentum_schedule(momentum)
    vm_schedule = momentum_schedule(0.999)
    self._learner = adam(self.model.parameters, lr_schedule, m_scheule,
                         variance_momentum=vm_schedule)
    self.writer = TensorBoardProgressWriter(log_dir='metrics', model=self.model)
    # loss doubles as the criterion; no separate eval metric.
    self.trainer = Trainer(self.model, (loss, None), [self._learner], self.writer)
def train_lm():
    """Train the language model and save a checkpoint after every epoch.

    Relies on module-level configuration: token_to_id_path,
    segment_sepparator, hidden_dim, learning_rate,
    momentum_as_time_constant, clipping_threshold_per_sample,
    num_epochs, train_file_path, sequence_length, sequences_per_batch,
    num_samples_between_progress_report.
    """
    data = DataReader(token_to_id_path, segment_sepparator)

    # Create model nodes for the source and target inputs
    input_sequence, label_sequence = create_inputs(data.vocab_dim)

    # Create the model. It has three output nodes
    # z: the input to softmax that provides the latent representation of the next token
    # cross_entropy: this is used training criterion
    # error: this a binary indicator if the model predicts the correct token
    z, cross_entropy, error = create_model(input_sequence, label_sequence,
                                           data.vocab_dim, hidden_dim)

    # For measurement we use the (build in) full softmax.
    full_ce = C.cross_entropy_with_softmax(z, label_sequence)

    # print out some useful training information
    log_number_of_parameters(z)
    print()

    # Run the training loop
    num_trained_samples = 0
    num_trained_samples_since_last_report = 0

    # Instantiate the trainer object to drive the model training
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.sample)
    momentum_schedule = momentum_as_time_constant_schedule(
        momentum_as_time_constant)
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(
        z.parameters, lr_schedule, momentum_schedule,
        gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
        gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, (cross_entropy, error), learner)

    for epoch_count in range(num_epochs):
        for features, labels, token_count in data.minibatch_generator(
                train_file_path, sequence_length, sequences_per_batch):
            arguments = ({input_sequence: features, label_sequence: labels})

            t_start = timeit.default_timer()
            trainer.train_minibatch(arguments)
            t_end = timeit.default_timer()

            # Throughput of this single minibatch (tokens per second).
            samples_per_second = token_count / (t_end - t_start)

            # Print progress report every num_samples_between_progress_report samples
            if num_trained_samples_since_last_report >= num_samples_between_progress_report or num_trained_samples == 0:
                av_ce = average_cross_entropy(full_ce, input_sequence, label_sequence, data)
                print_progress(samples_per_second, av_ce, num_trained_samples, t_start)
                num_trained_samples_since_last_report = 0

            num_trained_samples += token_count
            num_trained_samples_since_last_report += token_count

        # after each epoch save the model
        model_filename = "models/lm_epoch%d.dnn" % epoch_count
        # Fix: save_model() is the deprecated alias of save(); use save()
        # for consistency with the train_lm(testing=...) variant above.
        z.save(model_filename)
        print("Saved model to '%s'" % model_filename)
class DeepQAgent(object):
    """
    Implementation of Deep Q Neural Network agent like in:
        Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015)
    """
    def __init__(self, input_shape, nb_actions, gamma=0.95,
                 explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 100000),
                 learning_rate=0.01, momentum=0.8, minibatch_size=16,
                 memory_size=15000, train_after=100, train_interval=100,
                 target_update_interval=500, monitor=True):
        # Build replay memory, the online/target value networks, the
        # Huber-loss criterion, and an Adam trainer.
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma  # discount factor for Q-learning targets

        # Step-count thresholds consulted by train().
        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        # Replay memory over per-frame states (input_shape[1:]), stacked 4 deep.
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0
        self._num_trains = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        '''
        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        '''

        # Small fully-connected action-value network (the convolutional
        # variant above is disabled).
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(7, init=he_uniform(scale=0.01)),
                Dense(8, init=he_uniform(scale=0.01)),
                #Dense(16, init=he_uniform(scale=0.01)),
                #Dense(32, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        # (reward for terminal steps, reward + gamma * max Q_target otherwise).
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        # criterion serves as both model and loss; no separate eval metric.
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
        #self._trainer.restore_from_checkpoint('models/oldmodels/model800000')

    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        #print(self._num_actions_taken)
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network,
        which is a more stable version of the Action Value Network for
        increasing training stability.

        The Target Network is a frozen copy of the Action Value Network
        updated as regular intervals.
        """
        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            #if (agent_step % self._train_interval) == 0:
            # NOTE(review): the train_interval gate above is commented out,
            # so a minibatch is trained on every call once train_after is
            # reached. `client` and `zero_controls` are module globals.
            print('\nTraining minibatch\n')
            client.setCarControls(zero_controls)
            pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size)
            self._trainer.train_minibatch(
                self._trainer.loss_function.argument_map(
                    pre_states=pre_states,
                    actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions),
                    post_states=post_states,
                    rewards=rewards,
                    terminals=terminals
                )
            )
            self._num_trains += 1

            # Update the Target Network if needed
            if self._num_trains % 20 == 0:
                print('updating network')
                self._target_net = self._action_value_net.clone(CloneMethod.freeze)
                # NOTE(review): backslash path separator is Windows-only;
                # `dirname` is a module global.
                filename = dirname+"\model%d" % agent_step
                self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning
        """
        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken)
    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        """Build the DQN agent: replay memory, action-value network, frozen
        target network, Huber-loss training criterion and Adam-based trainer.

        Args:
            input_shape: Shape of the stacked observation fed to the network.
            nb_actions (int): Size of the discrete action space.
            gamma (float): Discount factor for future rewards.
            explorer: Exploration policy (epsilon-annealing by default).
                NOTE(review): the default is a single shared mutable instance
                across all agents constructed without an explicit explorer.
            learning_rate, momentum, minibatch_size: Optimizer settings.
            memory_size (int): Replay memory capacity (in transitions).
            train_after (int): Steps to collect before training starts.
            train_interval (int): Steps between training minibatches.
            target_update_interval (int): Steps between target-net refreshes.
            monitor (bool): Enable TensorBoard metric logging.
        """
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        # Replay memory stores single frames (input_shape[1:]) and
        # re-stacks 4 of them per sampled transition.
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training,
        # updated less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            # Terminal transition -> reward only; otherwise
            # reward + gamma * max_a Q_target(s', a).
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent;
            # the sum picks out Q(s, a) for the taken action.
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
    def __init__(self, in_shape, output_shape, device_id=None, learning_rate=0.00025, momentum=0.9,
                 minibatch_size=32, update_interval=10000, n_workers=1, visualizer=None):
        """
        Q Neural Network following Mnih and al. implementation and default options.

        The network has the following topology:
        Convolution(32, (8, 8))
        Convolution(64, (4, 4))
        Convolution(64, (2, 2))
        Dense(512)

        :param in_shape: Shape of the observations perceived by the learner (the neural net input)
        :param output_shape: Size of the action space (mapped to the number of output neurons)
        :param device_id: Use None to let CNTK select the best available device,
            -1 for CPU, >= 0 for GPU (default: None)
        :param learning_rate: Learning rate (default: 0.00025, as per Mnih et al.)
        :param momentum: Momentum, provided as momentum value for averaging gradients
            without unit gain filter. Note that CNTK does not currently provide an
            implementation of Graves' RmsProp with momentum; it uses AdamSGD instead.
        :param minibatch_size: Minibatch size (default: 32, as per Mnih et al.)
        :param n_workers: Number of concurrent workers for distributed training
            (default: 1, not distributed)
        :param visualizer: Optional visualizer allowing the model to save summary data
            (default: None, no visualization)

        Ref: Mnih et al.: "Human-level control through deep reinforcement learning."
        Nature 518.7540 (2015): 529-533.
        """
        assert learning_rate > 0, 'learning_rate should be > 0'
        assert 0. <= momentum < 1, 'momentum should be 0 <= momentum < 1'

        QModel.__init__(self, in_shape, output_shape)
        CntkModel.__init__(self, device_id, False, n_workers, visualizer)

        self._nb_actions = output_shape
        self._steps = 0
        self._target_update_interval = update_interval
        self._target = None

        # Input vars (batch axis only, no sequence axis)
        self._environment = Input(in_shape, name='env',
                                  dynamic_axes=(Axis.default_batch_axis()))
        self._q_targets = Input(1, name='q_targets',
                                dynamic_axes=(Axis.default_batch_axis()))
        self._actions = Input(output_shape, name='actions',
                              dynamic_axes=(Axis.default_batch_axis()))

        # Define the neural network graph
        self._model = self._build_model()(self._environment)
        # Frozen copy used to compute stable Q-targets
        self._target = self._model.clone(
            CloneMethod.freeze, {self._environment: self._environment})

        # Define the learning rate
        lr_schedule = as_learning_rate_by_sample(learning_rate, minibatch_size, momentum, True)
        lr_schedule = learning_rate_schedule(lr_schedule, UnitType.sample)

        # AdamSGD optimizer
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._model.parameters, lr_schedule,
                     momentum=m_schedule, unit_gain=True, variance_momentum=vm_schedule)

        if self.distributed_training:
            raise NotImplementedError('ASGD not implemented yet.')

        # _actions is a sparse 1-hot encoding of the actions done by the agent
        q_acted = reduce_sum(self._model * self._actions, axis=0)

        # Define the trainer with Huber Loss function
        criterion = huber_loss(q_acted, self._q_targets, 1.0)

        self._learner = l_sgd
        self._trainer = Trainer(self._model, (criterion, None), l_sgd)
class QNeuralNetwork(CntkModel, QModel):
    """ Represents a learning capable entity using CNTK """

    def __init__(self, in_shape, output_shape, device_id=None, learning_rate=0.00025, momentum=0.9,
                 minibatch_size=32, update_interval=10000, n_workers=1, visualizer=None):
        """
        Q Neural Network following Mnih and al. implementation and default options.

        The network has the following topology:
        Convolution(32, (8, 8))
        Convolution(64, (4, 4))
        Convolution(64, (2, 2))
        Dense(512)

        :param in_shape: Shape of the observations perceived by the learner (the neural net input)
        :param output_shape: Size of the action space (mapped to the number of output neurons)
        :param device_id: Use None to let CNTK select the best available device,
            -1 for CPU, >= 0 for GPU (default: None)
        :param learning_rate: Learning rate (default: 0.00025, as per Mnih et al.)
        :param momentum: Momentum, provided as momentum value for averaging gradients
            without unit gain filter. Note that CNTK does not currently provide an
            implementation of Graves' RmsProp with momentum; it uses AdamSGD instead.
        :param minibatch_size: Minibatch size (default: 32, as per Mnih et al.)
        :param n_workers: Number of concurrent workers for distributed training
            (default: 1, not distributed)
        :param visualizer: Optional visualizer allowing the model to save summary data
            (default: None, no visualization)

        Ref: Mnih et al.: "Human-level control through deep reinforcement learning."
        Nature 518.7540 (2015): 529-533.
        """
        assert learning_rate > 0, 'learning_rate should be > 0'
        assert 0. <= momentum < 1, 'momentum should be 0 <= momentum < 1'

        QModel.__init__(self, in_shape, output_shape)
        CntkModel.__init__(self, device_id, False, n_workers, visualizer)

        self._nb_actions = output_shape
        self._steps = 0
        self._target_update_interval = update_interval
        self._target = None

        # Input vars (batch axis only, no sequence axis)
        self._environment = Input(in_shape, name='env',
                                  dynamic_axes=(Axis.default_batch_axis()))
        self._q_targets = Input(1, name='q_targets',
                                dynamic_axes=(Axis.default_batch_axis()))
        self._actions = Input(output_shape, name='actions',
                              dynamic_axes=(Axis.default_batch_axis()))

        # Define the neural network graph
        self._model = self._build_model()(self._environment)
        # Frozen copy used to compute stable Q-targets
        self._target = self._model.clone(
            CloneMethod.freeze, {self._environment: self._environment})

        # Define the learning rate
        lr_schedule = as_learning_rate_by_sample(learning_rate, minibatch_size, momentum, True)
        lr_schedule = learning_rate_schedule(lr_schedule, UnitType.sample)

        # AdamSGD optimizer
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._model.parameters, lr_schedule,
                     momentum=m_schedule, unit_gain=True, variance_momentum=vm_schedule)

        if self.distributed_training:
            raise NotImplementedError('ASGD not implemented yet.')

        # _actions is a sparse 1-hot encoding of the actions done by the agent
        q_acted = reduce_sum(self._model * self._actions, axis=0)

        # Define the trainer with Huber Loss function
        criterion = huber_loss(q_acted, self._q_targets, 1.0)

        self._learner = l_sgd
        self._trainer = Trainer(self._model, (criterion, None), l_sgd)

    @property
    def loss_val(self):
        # Average loss of the most recent training minibatch
        return self._trainer.previous_minibatch_loss_average

    def _build_model(self):
        """Build the Mnih et al. convolutional Q-network (uninstantiated)."""
        with default_options(init=he_uniform(), activation=relu, bias=True):
            model = Sequential([
                Convolution((8, 8), 32, strides=(4, 4)),
                Convolution((4, 4), 64, strides=(2, 2)),
                Convolution((3, 3), 64, strides=(1, 1)),
                Dense(512, init=he_uniform(0.01)),
                Dense(self._nb_actions, activation=None, init=he_uniform(0.01))
            ])
            return model

    def train(self, x, q_value_targets, actions=None):
        """Train one minibatch and refresh the frozen target net on schedule.

        :param x: Batch of environment states
        :param q_value_targets: Precomputed Q-value targets, shape [N] or [N, 1]
        :param actions: Integer actions taken, shape [N] or [N, 1]; required
        """
        assert actions is not None, 'actions cannot be None'

        # We need to add extra dimensions to shape [N, 1] => [N, 1]
        if check_rank(q_value_targets.shape, 1):
            q_value_targets = q_value_targets.reshape((-1, 1))

        # Add extra dimensions to match shape [N, 1] required by one_hot
        if check_rank(actions.shape, 1):
            actions = actions.reshape((-1, 1))

        # We need batch axis
        if check_rank(x.shape, len(self._environment.shape)):
            x = prepend_batch_axis(x)

        self._trainer.train_minibatch({
            self._environment: x,
            self._actions: Value.one_hot(actions, self._nb_actions),
            self._q_targets: q_value_targets
        })

        # Counter number of train calls
        self._steps += 1

        # Update the model with the target one
        if (self._steps % self._target_update_interval) == 0:
            self._target = self._model.clone(
                CloneMethod.freeze, {self._environment: self._environment})

    def evaluate(self, data, model=QModel.ACTION_VALUE_NETWORK):
        """Return Q-values for `data` from the online or target network."""
        # If evaluating a single sample, expand the minibatch axis
        # (minibatch = 1, input_shape...)
        if len(data.shape) == len(self.input_shape):
            data = prepend_batch_axis(data)  # Append minibatch dim

        if model == QModel.TARGET_NETWORK:
            predictions = self._target.eval({self._environment: data})
        else:
            predictions = self._model.eval({self._environment: data})

        return predictions.squeeze()
# internally (if needed), so all we need are the logits ce = cross_entropy_with_softmax(logits, labels) pe = classification_error(logits, labels) # training config batch_size = 32 epochs = 15 n_batches = len(Xtrain) // batch_size # do the training # specify the training algorithm trainer = Trainer(logits, (ce, pe), adam(logits.parameters, lr=1e-2, momentum=0.9)) # helper function def get_output(node, X, Y): ret = node.forward(dict(inputs=X, labels=Y)) return list(ret[1].values())[0].mean() costs = [] errors = [] test_costs = [] test_errors = [] for i in range(epochs): cost = 0 err = 0
def simple_mnist(tensorboard_logdir=None):
    """Train a one-hidden-layer MNIST classifier with Adadelta, then measure
    test error with CNTK profiling enabled.

    :param tensorboard_logdir: Optional directory for TensorBoard logs.
    :return: Average classification error over the test minibatches.
    """
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 1
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim, np.float32)
    label = C.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model.
    # 0.00390625 == 1/256: scale raw byte pixels into [0, 1].
    scaled_input = element_times(constant(0.00390625), feature)

    z = Sequential([For(range(num_hidden_layers),
                        lambda i: Dense(hidden_layers_dim, activation=relu)),
                    Dense(num_output_classes)])(scaled_input)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = os.path.join(abs_path, "..", "..", "..", "DataSets", "MNIST")

    path = os.path.normpath(os.path.join(data_dir, "Train-28x28_cntk_text.txt"))
    check_path(path)

    reader_train = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        feature : reader_train.streams.features,
        label : reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    #training_progress_output_freq = 100
    progress_writers = [ProgressPrinter(
        #freq=training_progress_output_freq,
        tag='Training',
        num_epochs=num_sweeps_to_train_with)]

    if tensorboard_logdir is not None:
        progress_writers.append(TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir, model=z))

    # Instantiate the trainer object to drive the model training
    lr = learning_parameter_schedule_per_sample(1)
    trainer = Trainer(z, (ce, pe), adadelta(z.parameters, lr), progress_writers)

    training_session(
        trainer=trainer,
        mb_source = reader_train,
        mb_size = minibatch_size,
        model_inputs_to_streams = input_map,
        max_samples = num_samples_per_sweep * num_sweeps_to_train_with,
        progress_frequency=num_samples_per_sweep
    ).train()

    # Load test data
    path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt"))
    check_path(path)

    reader_test = create_reader(path, False, input_dim, num_output_classes)

    input_map = {
        feature : reader_test.streams.features,
        label : reader_test.streams.labels
    }

    # Test data for trained model.
    # Profiling is enabled only around the evaluation loop below.
    C.debugging.start_profiler()
    C.debugging.enable_profiler()
    C.debugging.set_node_timing(True)
    #C.cntk_py.disable_cpueval_optimization() # uncomment this to check CPU eval perf without optimization

    test_minibatch_size = 1024
    num_samples = 10000
    # Note: float division; the loop below truncates with int(), while the
    # final average divides by the exact (fractional) minibatch count.
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    C.debugging.stop_profiler()
    trainer.print_node_timing()

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test
def simple_mnist():
    """Train a two-hidden-layer MNIST classifier with Adadelta and return the
    average classification error over the test set.

    Reads the CTF-format MNIST files from `abs_path` (module-level constant).

    :return: Average classification error over the test minibatches.
    """
    # Local import keeps this fix self-contained; `os` is stdlib.
    import os

    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 2
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim)
    label = C.input_variable(num_output_classes)

    # Instantiate the feedforward classification model.
    # 0.00390625 == 1/256: scale raw byte pixels into [0, 1].
    scaled_input = element_times(constant(0.00390625), feature)

    with default_options(activation=relu, init=C.glorot_uniform()):
        z = Sequential([
            For(range(num_hidden_layers),
                lambda i: Dense(hidden_layers_dim)),
            Dense(num_output_classes, activation=None)
        ])(scaled_input)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    # setup the data
    # FIX: was abs_path + "\Train-28x28_cntk_text.txt" — a backslash-joined,
    # Windows-only path that only worked because "\T" is not an escape
    # sequence. Build the path portably instead.
    path = os.path.join(abs_path, "Train-28x28_cntk_text.txt")

    reader_train = MinibatchSource(
        CTFDeserializer(
            path,
            StreamDefs(features=StreamDef(field='features', shape=input_dim),
                       labels=StreamDef(field='labels', shape=num_output_classes))))

    input_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    progress_writers = [
        ProgressPrinter(tag='Training', num_epochs=num_sweeps_to_train_with)
    ]

    # Instantiate the trainer object to drive the model training
    lr = learning_rate_schedule(1, UnitType.sample)
    trainer = Trainer(z, (ce, pe), [adadelta(z.parameters, lr)], progress_writers)

    training_session(trainer=trainer,
                     mb_source=reader_train,
                     mb_size=minibatch_size,
                     model_inputs_to_streams=input_map,
                     max_samples=num_samples_per_sweep * num_sweeps_to_train_with,
                     progress_frequency=num_samples_per_sweep).train()

    # Load test data (same portable-path fix as above; "\T" was "\Test...")
    path = os.path.join(abs_path, "Test-28x28_cntk_text.txt")

    reader_test = MinibatchSource(
        CTFDeserializer(
            path,
            StreamDefs(features=StreamDef(field='features', shape=input_dim),
                       labels=StreamDef(field='labels', shape=num_output_classes))))

    input_map = {
        feature: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    test_minibatch_size = 1024
    num_samples = 10000
    # Float division: loop count truncates via int(), final average divides
    # by the exact fractional minibatch count (preserves original behavior).
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test
class DeepQAgent(object):
    """
    Implementation of Deep Q Neural Network agent like in:
        Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015)
    """

    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        """Build the agent: replay memory, action-value net, frozen target
        net, Huber-loss criterion and the Adam-based trainer.

        Args:
            input_shape: Shape of the stacked observation fed to the network.
            nb_actions (int): Size of the discrete action space.
            gamma (float): Discount factor for future rewards.
            explorer: Exploration policy (epsilon-annealing by default).
            learning_rate, momentum, minibatch_size: Optimizer settings.
            memory_size (int): Replay memory capacity (in transitions).
            train_after (int): Steps to collect before training starts.
            train_interval (int): Steps between training minibatches.
            target_update_interval (int): Steps between target-net refreshes.
            monitor (bool): Enable TensorBoard metric logging.
        """
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        # Replay memory stores single frames (input_shape[1:]) and
        # re-stacks 4 of them per sampled transition.
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training,
        # updated less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            # Terminal transition -> reward only; otherwise
            # reward + gamma * max_a Q_target(s', a).
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent;
            # the sum picks out Q(s, a) for the taken action.
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()

            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the
        environment dynamics. The agent will compute the expected reward for
        state(t+1) and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, a
        frozen copy of the Action Value Network updated at regular intervals.
        """
        # Local stdlib import keeps this edit self-contained.
        import os

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size)

                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals
                    )
                )

                # Update the Target Network if needed
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(CloneMethod.freeze)
                    # FIX: was "models\model%d" % agent_step — a backslash-joined,
                    # Windows-only path relying on "\m" not being an escape.
                    filename = os.path.join("models", "model%d" % agent_step)
                    self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning
        """
        # FIX: np.asscalar was removed in NumPy 1.23; float(...) is the
        # documented drop-in replacement for a 0-d result.
        if len(self._episode_q_means) > 0:
            mean_q = float(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = float(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken)
class DeepQAgent(object):
    """
    Implementation of Deep Q Neural Network agent like in:
        Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015)
    """

    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=200000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        """Build the agent: replay memory, action-value net, frozen target
        net, Huber-loss criterion and the Adam-based trainer.

        Args:
            input_shape: Shape of the stacked observation fed to the network.
            nb_actions (int): Size of the discrete action space.
            gamma (float): Discount factor for future rewards.
            explorer: Exploration policy (epsilon-annealing by default).
            learning_rate, momentum, minibatch_size: Optimizer settings.
            memory_size (int): Replay memory capacity (in transitions).
            train_after (int): Steps to collect before training starts.
            train_interval (int): Steps between training minibatches.
            target_update_interval (int): Steps between target-net refreshes.
            monitor (bool): Enable TensorBoard metric logging.
        """
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        # Replay memory stores single frames (input_shape[1:]) and
        # re-stacks 4 of them per sampled transition.
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training,
        # updated less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            # Terminal transition -> reward only; otherwise
            # reward + gamma * max_a Q_target(s', a).
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent;
            # the sum picks out Q(s, a) for the taken action.
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

    def load(self, model_path):
        """Restore trainer state (model and learner) from a checkpoint file."""
        self._trainer.restore_from_checkpoint(model_path)

    def act(self, state, eval=False):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state
            eval (bool): When True, never explore — always pick the greedy action.
                (Name shadows the builtin `eval`; kept for caller compatibility.)

        Returns: (int, q_values) — next action, and the raw Q-values
            (None when the action came from the explorer)
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken) and not eval:
            action = self._explorer(self.nb_actions)
            q_values = None
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1, ) + env_with_history.shape))

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action, q_values

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()

            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self, checkpoint_dir):
        """ This allows the agent to train itself to better understand the
        environment dynamics. The agent will compute the expected reward for
        state(t+1) and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, a
        frozen copy of the Action Value Network updated at regular intervals.

        :param checkpoint_dir: Directory under which "models/model<step>"
            checkpoints are written on each target-network refresh.
        """
        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(
                    self._minibatch_size)

                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(
                            actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals))

                # Update the Target Network if needed
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(
                        CloneMethod.freeze)
                    # FIX: was os.path.join(checkpoint_dir, "models\model%d" % agent_step)
                    # — the literal backslash defeats the join and is Windows-only;
                    # pass the components separately instead.
                    filename = os.path.join(checkpoint_dir, "models",
                                            "model%d" % agent_step)
                    self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning
        """
        # FIX: np.asscalar was removed in NumPy 1.23; float(...) is the
        # documented drop-in replacement for a 0-d result.
        if len(self._episode_q_means) > 0:
            mean_q = float(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = float(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken)

    def get_depth_image(self, client):
        """Fetch one 84x84 depth image (values in [0, 1]) from AirSim."""
        # get depth image from airsim
        responses = client.simGetImages([
            airsim.ImageRequest("RCCamera", airsim.ImageType.DepthPerspective,
                                True, False)
        ])

        # FIX: np.float alias was removed in NumPy 1.24 — use builtin float.
        img1d = np.array(responses[0].image_data_float, dtype=float)
        # Invert: near obstacles bright, far/invalid depths dark (clamped >= 1).
        img1d = 255 / np.maximum(np.ones(img1d.size), img1d)

        if img1d.size > 1:
            img2d = np.reshape(img1d, (responses[0].height, responses[0].width))
            image = Image.fromarray(img2d)
            im_final = np.array(image.resize((84, 84)).convert('L'))
            im_final = im_final / 255.0
            return im_final

        # AirSim returned no usable data — fall back to an empty frame.
        return np.zeros((84, 84)).astype(float)

    # Gets a coverage image from AirSim
    def get_cov_image(self, coverage_map):
        """Return the coverage-map state (normalized to [0, 1]) and its reward."""
        state, cov_reward = coverage_map.get_state_from_pose()

        #state = self.coverage_map.get_map_scaled() # debug only
        #im = Image.fromarray(np.uint8(state))
        #im.save("DistributedRL\\debug\\{}.png".format(time.time()))

        # normalize state
        state = state / 255.0

        return state, cov_reward
def simple_mnist(tensorboard_logdir=None):
    """Train a one-hidden-layer MNIST classifier with Adadelta and return the
    average classification error over the test minibatches.

    :param tensorboard_logdir: Optional directory for TensorBoard summaries.
    """
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 1
    hidden_layers_dim = 200

    # Network inputs: raw pixel vector and one-hot label.
    feature = input(input_dim, np.float32)
    label = input(num_output_classes, np.float32)

    # 0.00390625 == 1/256: bring byte-valued pixels into [0, 1] before the
    # fully-connected stack.
    scaled_input = element_times(constant(0.00390625), feature)
    z = fully_connected_classifier_net(
        scaled_input, num_output_classes, hidden_layers_dim,
        num_hidden_layers, relu)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    # Locate and validate the training data.
    dataset_dir = os.path.join(abs_path, "..", "..", "..", "DataSets", "MNIST")
    train_path = os.path.normpath(
        os.path.join(dataset_dir, "Train-28x28_cntk_text.txt"))
    check_path(train_path)

    reader_train = create_reader(train_path, True, input_dim, num_output_classes)
    stream_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10
    total_training_samples = num_samples_per_sweep * num_sweeps_to_train_with

    # Progress writers: console printer, plus TensorBoard when requested.
    writers = [
        ProgressPrinter(tag='Training', num_epochs=num_sweeps_to_train_with)
    ]
    if tensorboard_logdir is not None:
        writers.append(
            TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir,
                                      model=z))

    # Drive the training through a session on the Adadelta learner.
    trainer = Trainer(z, (ce, pe), adadelta(z.parameters), writers)
    training_session(trainer=trainer,
                     mb_source=reader_train,
                     mb_size=minibatch_size,
                     var_to_stream=stream_map,
                     max_samples=total_training_samples,
                     progress_frequency=num_samples_per_sweep).train()

    # Locate and validate the test data.
    test_path = os.path.normpath(
        os.path.join(dataset_dir, "Test-28x28_cntk_text.txt"))
    check_path(test_path)

    reader_test = create_reader(test_path, False, input_dim, num_output_classes)
    stream_map = {
        feature: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Evaluate the trained model. Note the float division: the loop runs
    # int(...) times while the final average divides by the exact fractional
    # minibatch count, matching the original behavior.
    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size

    accumulated_error = 0.0
    for _ in range(int(num_minibatches_to_test)):
        batch = reader_test.next_minibatch(test_minibatch_size,
                                           input_map=stream_map)
        accumulated_error += trainer.test_minibatch(batch)

    # Average of evaluation errors of all test minibatches
    return accumulated_error / num_minibatches_to_test
class DeepQAgent(object):
    """
    Implementation of Deep Q Neural Network agent like in:
        Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015)
    """
    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(0.91, 0.1, 910000),
                 fixpolicy=LinearEpsilonAnnealingExplorer(0.5, 0.1, 100000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        """
        :param input_shape: shape of the state tensor fed to the network.
        :param nb_actions: number of discrete actions.
        :param explorer: epsilon-annealing policy for random exploration.
        :param fixpolicy: epsilon-annealing policy gating the hand-coded
            heuristic branch in act().

        NOTE(review): the explorer/fixpolicy defaults are constructed once at
        class-definition time and shared across instances -- fine only if a
        single agent is created per process; confirm.
        """
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma
        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval
        self._explorer = explorer
        self._fixpolicy = fixpolicy
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        print("input_shape:", input_shape)
        print("input_shape[1:]", input_shape[1:])
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        # Dense-only net; the convolutional front-end is disabled below.
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                #Convolution2D((8, 8), 16, strides=4),
                #Convolution2D((4, 4), 32, strides=2),
                #Convolution2D((1, 1), 16, strides=1),
                Dense(25, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            # Terminal transitions: target is the raw reward; otherwise the
            # discounted max target-net Q-value plus the reward.
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
        #self._trainer.restore_from_checkpoint("models_heuristic_no_image\model")

    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)
        #if True:
        # Heuristic branch: while fixpolicy is still "exploring", steer along
        # the dominant axis of state[3:6] - state[0:3] (presumably
        # target-minus-current position -- confirm against the env layout).
        if self._fixpolicy.is_exploring(self._num_actions_taken):
            diff_x = state[3] - state[0]
            diff_y = state[4] - state[1]
            diff_z = state[5] - state[2]
            diff_arr = np.array([diff_x, diff_y, diff_z])
            direction = np.argmax(np.absolute(diff_arr))
            '''
            abs_x = math.fabs(diff_x)
            abs_y = math.fabs(diff_y)
            abs_z = math.fabs(diff_z)
            diff = [diff_x, diff_y, diff_z]
            abs_diff = [abs_x, abs_y, abs_z]
            print(diff, abs_diff)
            m = max(abs_diff)
            direction = diff.index(m)'''
            print(diff_arr)
            # Axis index 0..2 maps to action 1..3 (positive move) or
            # 4..6 (negative move).
            if diff_arr[direction] < 0:
                fixaction = direction + 4
            else:
                fixaction = direction + 1
            self._num_actions_taken += 1
            return fixaction

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1, ) + env_with_history.shape))

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, which is a more stable version
        of the Action Value Network for increasing training stability.

        The Target Network is a frozen copy of the Action Value Network updated at regular intervals.
        """
        agent_step = self._num_actions_taken
        print("agent_step = ", agent_step)
        #time.sleep(1)
        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(
                    self._minibatch_size)
                # Integer action ids must be 1-hot encoded for the criterion.
                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(
                            actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals))

                # Update the Target Network if needed
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(
                        CloneMethod.freeze)
                    # NOTE(review): Windows-only backslash path; prefer
                    # os.path.join for portability.
                    filename = "models_heuristic_no_image_less_exploration\model%d" % agent_step
                    print(
                        "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$filename=",
                        filename)
                    self._trainer.save_checkpoint(filename)
                    #time.sleep(100)

    def _plot_metrics(self):
        global landing_count, episode_count
        """Plot current buffers accumulated values to visualize agent learning """
        # NOTE(review): the log file handle `f` is never closed/flushed
        # explicitly -- wrap in `with open(...)` to guarantee the write.
        # np.asscalar is also deprecated (use .item()).
        f = open('log__heuristic_no_image_less_exploration2', 'a+')
        f.write('episode:' + str(episode_count) + ': exploration rate= ' +
                str(self._explorer._rate) + ' heuristic fix rate= ' +
                str(self._fixpolicy._rate) + '\n')
        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)
            print('Mean Q per ep.', mean_q, self._num_actions_taken)
            f.write('Mean Q per ep. ' + str(mean_q) + ' ' + str(self._num_actions_taken) + '\n')
        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)
            print('Mean Std Q per ep.', std_q, self._num_actions_taken)
            f.write('Mean Std Q per ep. ' + str(std_q) + ' ' + str(self._num_actions_taken) + '\n')
        self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards),
                                         self._num_actions_taken)
        print('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken)
        f.write('Sum rewards per ep. ' + str(sum(self._episode_rewards)) + ' ' +
                str(self._num_actions_taken) + '\n')
        if landing_count > 0:
            f.write('****************Success landing**********' + str(landing_count) + '\n')
        # Reset the episode-level counters after logging.
        landing_count = 0
        episode_count = 0
        f.write('\n')
    def __init__(self, input_shape, nb_actions,
                 gamma=0.95, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 100000),
                 learning_rate=0.01, momentum=0.8, minibatch_size=16,
                 memory_size=15000, train_after=100, train_interval=100,
                 target_update_interval=500, monitor=True):
        """Build the agent: replay memory, Q-network, frozen target network,
        Huber-loss training criterion and an Adam-driven CNTK trainer.

        :param input_shape: shape of the state tensor fed to the network.
        :param nb_actions: number of discrete actions.
        :param explorer: epsilon-annealing exploration policy.

        NOTE(review): the `explorer` default is created once at definition
        time and shared by all instances using the default -- confirm only one
        agent per process uses it.
        """
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma
        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval
        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0
        self._num_trains = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        '''
        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        '''
        # Small dense-only Q-network (conv variant above kept for reference).
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(7, init=he_uniform(scale=0.01)),
                Dense(8, init=he_uniform(scale=0.01)),
                #Dense(16, init=he_uniform(scale=0.01)),
                #Dense(32, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            # Terminal transitions: target is the raw reward; otherwise the
            # discounted max target-net Q-value plus the reward.
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
class DQAgent(object):
    """Minimal Deep Q-Network agent built from dense layers only."""
    # TODO(original author): hyper-parameters below were flagged for revision
    # ("should modify").
    def __init__(self, input_shape, nb_actions, gamma=0.99,
                 explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        # NOTE(review): the `explorer` default is evaluated once and shared by
        # every instance using the default -- confirm single-agent usage.
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma
        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval
        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = RepMem(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Per-episode metric buffers, flushed to TensorBoard in _plot_metrics.
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # NOTE(review): Dense(input_shape) uses the state shape as the layer
        # width -- only sensible if input_shape is an int; confirm.
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(input_shape, init=he_uniform(scale=0.01)),
                Dense(input_shape),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Frozen copy used to compute stable Q-value targets.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Q-target graph: raw reward on terminals, discounted max target-net
        # Q-value plus reward otherwise.
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Huber loss between the target and the Q-value of the taken action.
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            q_targets = compute_q_targets(post_states, rewards, terminals)
            # actions is a 1-hot encoding of the action taken by the agent.
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam-based learner.
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

    def act(self, state):
        """Pick the next action: explore per the annealing policy, otherwise
        take the argmax of the Q-network over the state history."""
        self._history.append(state)
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            env_with_history = self._history.value
            # Prepend a batch axis of size 1 for evaluation.
            q_values = self._action_value_net.eval(
                env_with_history.reshape((1,) + env_with_history.shape)
            )
            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))
            action = q_values.argmax()
        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """Record a transition; on episode end, flush metrics and reset the
        short-term history."""
        self._episode_rewards.append(reward)
        if done:
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []
            self._history.reset()
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """Run one training step every _train_interval actions once warmed up;
        refresh the frozen target net every _target_update_interval actions."""
        agent_step = self._num_actions_taken
        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size)
                # Integer action ids are 1-hot encoded for the criterion.
                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals
                    )
                )
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(CloneMethod.freeze)
                    # TODO(review): "model\model%d" is a Windows-only path
                    # ("\m" is a literal backslash here); use os.path.join.
                    filename = "model\model%d" % agent_step
                    self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Write the accumulated per-episode Q statistics and reward sum to
        the TensorBoard metrics writer."""
        # NOTE(review): np.asscalar is deprecated -- use .item().
        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)
        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)
        self._metrics_writer.write_value('Sum rewards per ep', sum(self._episode_rewards),
                                         self._num_actions_taken)
def simple_mnist():
    """Train a two-hidden-layer MNIST classifier with CNTK and evaluate it.

    :return: average classification error over the test minibatches.
    """
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 2
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim)
    label = C.input_variable(num_output_classes)

    # Instantiate the feedforward classification model
    # 0.00390625 == 1/256: scales raw pixel values into [0, 1).
    scaled_input = element_times(constant(0.00390625), feature)

    # z = Sequential([
    #     Dense(hidden_layers_dim, activation=relu),
    #     Dense(hidden_layers_dim, activation=relu),
    #     Dense(num_output_classes)])(scaled_input)

    with default_options(activation=relu, init=C.glorot_uniform()):
        z = Sequential([For(range(num_hidden_layers),
                            lambda i: Dense(hidden_layers_dim)),
                        Dense(num_output_classes, activation=None)])(scaled_input)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    # setup the data
    # NOTE(review): "\T" happens to be a literal backslash, but this path is
    # Windows-only; prefer os.path.join(abs_path, "Train-28x28_cntk_text.txt").
    path = abs_path + "\Train-28x28_cntk_text.txt"

    reader_train = MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='features', shape=input_dim),
        labels=StreamDef(field='labels', shape=num_output_classes))))

    # Map the graph's input variables onto the reader's streams.
    input_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    progress_writers = [ProgressPrinter(
        tag='Training',
        num_epochs=num_sweeps_to_train_with)]

    # Instantiate the trainer object to drive the model training
    lr = learning_rate_schedule(1, UnitType.sample)
    trainer = Trainer(z, (ce, pe), [adadelta(z.parameters, lr)], progress_writers)

    training_session(
        trainer=trainer,
        mb_source=reader_train,
        mb_size=minibatch_size,
        model_inputs_to_streams=input_map,
        max_samples=num_samples_per_sweep * num_sweeps_to_train_with,
        progress_frequency=num_samples_per_sweep
    ).train()

    # Load test data
    # NOTE(review): same Windows-only path construction as above.
    path = abs_path + "\Test-28x28_cntk_text.txt"

    reader_test = MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='features', shape=input_dim),
        labels=StreamDef(field='labels', shape=num_output_classes))))

    input_map = {
        feature: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    test_minibatch_size = 1024
    num_samples = 10000
    # NOTE(review): 10000/1024 is fractional; the loop runs int(...) == 9
    # minibatches but the average divides by 9.765..., understating the error.
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test
class LearningAgent(object):
    """Double-DQN agent with per-sample importance-sampling weights.

    Holds an online Q-network (`model`) and a frozen copy (`target_model`);
    the training graph computes a Huber loss on the TD-error, scaled by the
    importance-sampling weights supplied with each minibatch (prioritized
    replay support).
    """

    def __init__(self, state_dim, action_dim, gamma=0.99, learning_rate=1e-4, momentum=0.95):
        """
        :param state_dim: Shape of the environment state tensor.
        :param action_dim: Number of discrete actions.
        :param gamma: Discount factor for future rewards.
        :param learning_rate: Per-minibatch learning rate for Adam.
        :param momentum: First-moment momentum for Adam.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma

        with default_options(activation=relu, init=he_uniform()):
            # Convolution filter counts were halved to save on memory, no gpu :(
            self.model = Sequential([
                Convolution2D((8, 8), 16, strides=4, name='conv1'),
                Convolution2D((4, 4), 32, strides=2, name='conv2'),
                Convolution2D((3, 3), 32, strides=1, name='conv3'),
                Dense(256, init=he_uniform(scale=0.01), name='dense1'),
                Dense(action_dim, activation=None, init=he_uniform(scale=0.01),
                      name='actions')
            ])
        self.model.update_signature(Tensor[state_dim])

        # Create the target model as a copy of the online model
        self.target_model = None
        self.update_target()

        # Input variables bound per-minibatch in train().
        self.pre_states = input_variable(state_dim, name='pre_states')
        self.actions = input_variable(action_dim, name='actions')
        self.post_states = input_variable(state_dim, name='post_states')
        self.rewards = input_variable((), name='rewards')
        self.terminals = input_variable((), name='terminals')
        self.is_weights = input_variable((), name='is_weights')

        # Q-value the online net assigns to the action actually taken
        # (actions arrive 1-hot encoded).
        predicted_q = reduce_sum(self.model(self.pre_states) * self.actions, axis=0)

        # DQN - calculate target q values
        # post_q = reduce_max(self.target_model(self.post_states), axis=0)

        # DDQN - online net selects the next action, target net evaluates it.
        online_selection = one_hot(
            argmax(self.model(self.post_states), axis=0), self.action_dim)
        post_q = reduce_sum(self.target_model(self.post_states) * online_selection, axis=0)
        # Terminal transitions contribute no future value.
        post_q = (1.0 - self.terminals) * post_q
        target_q = stop_gradient(self.rewards + self.gamma * post_q)

        # Huber loss: quadratic within +/- delta, linear outside.
        delta = 1.0
        self.td_error = minus(predicted_q, target_q, name='td_error')
        abs_error = abs(self.td_error)
        errors = element_select(less(abs_error, delta),
                                square(self.td_error) * 0.5,
                                delta * (abs_error - 0.5 * delta))
        # Scale each sample's loss by its importance-sampling weight.
        loss = errors * self.is_weights

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        self._learner = adam(self.model.parameters, lr_schedule, m_schedule,
                             variance_momentum=vm_schedule)

        self.writer = TensorBoardProgressWriter(log_dir='metrics', model=self.model)
        self.trainer = Trainer(self.model, (loss, None), [self._learner], self.writer)

    def act(self, state, epsilon):
        """
        Selects an action to take based on the epsilon greedy method

        :param state: The current state
        :param epsilon: Determines the amount of exploration. (1 - full exploration, 0 - no exploration)
        """
        # BUG FIX: the original drew np.random.randn(1) (standard normal),
        # which explores with probability Phi(epsilon) > 0.5 for any epsilon
        # in [0, 1] instead of epsilon. A uniform draw in [0, 1) gives the
        # intended epsilon-greedy behavior.
        if np.random.rand() < epsilon:
            # Explore (random action)
            return np.random.choice(self.action_dim)
        else:
            # Exploit (greedy action based on knowledge)
            return self.model.eval(state).argmax()

    def train(self, s, a, r, s_, t, w):
        """
        Updates the network weights using the given minibatch data

        :param s: Tensor[state_dim] Current state
        :param a: Tensor[int] Action taken at state s
        :param r: Tensor[float] Reward received for taking action a at state s
        :param s_: Tensor[state_dim] State resulting from taking action a at state s
        :param t: Tensor[boolean] True if s_ was a terminal state and false otherwise
        :param w: Tensor[float] Importance sampling weights
        :return: Per-sample TD-errors (useful for updating replay priorities).
        """
        a = Value.one_hot(a.tolist(), self.action_dim)
        td_error = self.trainer.train_minibatch(
            {
                self.pre_states: s,
                self.actions: a,
                self.rewards: r,
                self.post_states: s_,
                self.terminals: t,
                self.is_weights: w
            },
            outputs=[self.td_error])
        return td_error[0]

    def update_target(self):
        """
        Update the target network using the online network weights
        """
        self.target_model = self.model.clone(CloneMethod.freeze)

    def checkpoint(self, filename):
        """Save the full trainer state (model + learner) to `filename`."""
        self.trainer.save_checkpoint(filename)

    def save_model(self, filename):
        """Save only the online model graph to `filename`."""
        self.model.save(filename)