def create_adam_learner(learn_params, learning_rate=0.0005,
                        gradient_clipping_threshold_per_sample=0.001):
    """Create an Adam learner."""
    lr_schedule = learners.learning_rate_schedule(learning_rate, learners.UnitType.sample)
    momentum = learners.momentum_schedule(0.90)
    gradient_clipping_with_truncation = True
    momentum_var = learners.momentum_schedule(0.999)

    lr = learners.adam(
        learn_params, lr_schedule, momentum, True, momentum_var,
        gradient_clipping_threshold_per_sample=gradient_clipping_threshold_per_sample,
        gradient_clipping_with_truncation=gradient_clipping_with_truncation)

    learner_desc = 'Alg: Adam, learning rate: {0}, momentum: {1}, gradient clip: {2}'.format(
        learning_rate, momentum[0], gradient_clipping_threshold_per_sample)
    logger.log("Create learner. {0}".format(learner_desc))
    return lr

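# A minimal usage sketch for create_adam_learner above (an illustration, not part of
# the original module). It assumes `learners` is cntk.learners and that `logger` is
# any object exposing a log() method; the toy model and random data are placeholders.
import numpy as np
import cntk as C
from cntk import learners

class _PrintLogger:
    # stand-in for the module's logger (assumption)
    def log(self, msg):
        print(msg)

logger = _PrintLogger()

features = C.input_variable(4)
labels = C.input_variable(2)
model = C.layers.Dense(2)(features)
loss = C.cross_entropy_with_softmax(model, labels)

learner = create_adam_learner(model.parameters, learning_rate=0.001)
trainer = C.Trainer(model, (loss, None), [learner])
trainer.train_minibatch({features: np.random.rand(8, 4).astype(np.float32),
                         labels: np.eye(2)[np.random.randint(0, 2, 8)].astype(np.float32)})
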
def __init__(self, input_shape, nb_actions, gamma=0.99,
             explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
             learning_rate=0.00025, momentum=0.95, minibatch_size=32,
             memory_size=500000, train_after=10000, train_interval=4,
             target_update_interval=10000, monitor=True):
    self.input_shape = input_shape
    self.nb_actions = nb_actions
    self.gamma = gamma

    self._train_after = train_after
    self._train_interval = train_interval
    self._target_update_interval = target_update_interval

    self._explorer = explorer
    self._minibatch_size = minibatch_size
    self._history = History(input_shape)
    self._memory = RepMem(memory_size, input_shape[1:], 4)
    self._num_actions_taken = 0

    self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

    with default_options(activation=relu, init=he_uniform()):
        self._action_value_net = Sequential([
            Dense(input_shape, init=he_uniform(scale=0.01)),
            Dense(input_shape),
            Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))])
    self._action_value_net.update_signature(Tensor[input_shape])

    self._target_net = self._action_value_net.clone(CloneMethod.freeze)

    @Function
    @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def compute_q_targets(post_states, rewards, terminals):
        return element_select(
            terminals, rewards,
            gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
        )

    @Function
    @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
               post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def criterion(pre_states, actions, post_states, rewards, terminals):
        q_targets = compute_q_targets(post_states, rewards, terminals)
        q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)
        return huber_loss(q_targets, q_acted, 1.0)

    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
    m_schedule = momentum_schedule(momentum)
    vm_schedule = momentum_schedule(0.999)
    l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                 momentum=m_schedule, variance_momentum=vm_schedule)

    self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics',
                                                     model=criterion) if monitor else None
    self._learner = l_sgd
    self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

def build_trainer(self):
    # Set the learning rate and the momentum parameters for the Adam optimizer.
    lr = learning_rate_schedule(self.lr, UnitType.minibatch)
    beta1 = momentum_schedule(0.9)
    beta2 = momentum_schedule(0.99)

    # Calculate the losses.
    loss_on_v = cntk.squared_error(self.R, self.v)

    pi_a_s = cntk.log(cntk.times_transpose(self.pi, self.action))
    loss_on_pi = cntk.variables.Constant(-1) * (cntk.plus(
        cntk.times(pi_a_s, cntk.minus(self.R, self.v_calc)),
        0.01 * cntk.times_transpose(self.pi, cntk.log(self.pi))))
    # loss_on_pi = cntk.times(pi_a_s, cntk.minus(self.R, self.v_calc))

    self.tensorboard_v_writer = TensorBoardProgressWriter(
        freq=10, log_dir="tensorboard_v_logs", model=self.v)
    self.tensorboard_pi_writer = TensorBoardProgressWriter(
        freq=10, log_dir="tensorboard_pi_logs", model=self.pi)
    # tensorboard --logdir=tensorboard_v_logs http://localhost:6006/
    # tensorboard --logdir=tensorboard_pi_logs http://localhost:6006/

    # Create the trainers.
    self.trainer_v = cntk.Trainer(self.v, (loss_on_v), [
        adam(self.pms_v, lr, beta1, variance_momentum=beta2,
             gradient_clipping_threshold_per_sample=2,
             l2_regularization_weight=0.01)
    ], self.tensorboard_v_writer)
    self.trainer_pi = cntk.Trainer(self.pi, (loss_on_pi), [
        adam(self.pms_pi, lr, beta1, variance_momentum=beta2,
             gradient_clipping_threshold_per_sample=2,
             l2_regularization_weight=0.01)
    ], self.tensorboard_pi_writer)

def __init__(self, name, observation_space_shape, num_actions,
             pretrained_policy=None, *args, **kwargs):
    self.name = name
    self.observation_space_shape = observation_space_shape
    self.num_actions = num_actions
    self._build_network(pretrained_policy)
    self.trainer = Trainer(
        self.value, self.loss,
        [adam(self.value.parameters, lr=0.001, momentum=0.9)])

def __init__(self, name, num_frames_to_stack, observation_space_shape,
             num_actions, pretrained_policy=None, *args, **kwargs):
    self.name = name
    self.num_frames_to_stack = num_frames_to_stack
    self.observation_space_shape = observation_space_shape
    self.num_actions = num_actions
    self._build_network(pretrained_policy)
    self.trainer = Trainer(
        self.log_probability, self.loss,
        [adam(self.probabilities.parameters, lr=0.00001, momentum=0.9)])

def __init__(self, input_shape, nb_actions, gamma=0.95,
             explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 100000),
             learning_rate=0.01, momentum=0.8, minibatch_size=16,
             memory_size=15000, train_after=100, train_interval=100,
             target_update_interval=500, monitor=True):
    self.input_shape = input_shape
    self.nb_actions = nb_actions
    self.gamma = gamma

    self._train_after = train_after
    self._train_interval = train_interval
    self._target_update_interval = target_update_interval

    self._explorer = explorer
    self._minibatch_size = minibatch_size
    self._history = History(input_shape)
    self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
    self._num_actions_taken = 0
    self._num_trains = 0

    # Metrics accumulator
    self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

    '''
    # Action Value model (used by agent to interact with the environment)
    with default_options(activation=relu, init=he_uniform()):
        self._action_value_net = Sequential([
            Convolution2D((8, 8), 16, strides=4),
            Convolution2D((4, 4), 32, strides=2),
            Convolution2D((3, 3), 32, strides=1),
            Dense(256, init=he_uniform(scale=0.01)),
            Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
        ])
    '''

    with default_options(activation=relu, init=he_uniform()):
        self._action_value_net = Sequential([
            Dense(7, init=he_uniform(scale=0.01)),
            Dense(8, init=he_uniform(scale=0.01)),
            # Dense(16, init=he_uniform(scale=0.01)),
            # Dense(32, init=he_uniform(scale=0.01)),
            Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
        ])
    self._action_value_net.update_signature(Tensor[input_shape])

    # Target model used to compute the target Q-values in training, updated
    # less frequently for increased stability.
    self._target_net = self._action_value_net.clone(CloneMethod.freeze)

    # Function computing Q-value targets as part of the computation graph
    @Function
    @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def compute_q_targets(post_states, rewards, terminals):
        return element_select(
            terminals, rewards,
            gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
        )

    # Define the loss, using the Huber loss (more robust to outliers)
    @Function
    @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
               post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def criterion(pre_states, actions, post_states, rewards, terminals):
        # Compute the Q-value targets
        q_targets = compute_q_targets(post_states, rewards, terminals)

        # actions is a 1-hot encoding of the action taken by the agent
        q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

        # Define the training criterion as the Huber loss
        return huber_loss(q_targets, q_acted, 1.0)

    # Adam-based SGD
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
    m_schedule = momentum_schedule(momentum)
    vm_schedule = momentum_schedule(0.999)
    l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                 momentum=m_schedule, variance_momentum=vm_schedule)

    self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics',
                                                     model=criterion) if monitor else None
    self._learner = l_sgd
    self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

def __init__(self, in_shape, output_shape, device_id=None,
             learning_rate=0.00025, momentum=0.9,
             minibatch_size=32, update_interval=10000,
             n_workers=1, visualizer=None):
    """
    Q Neural Network following Mnih et al.'s implementation and default options.

    The network has the following topology:
    Convolution(32, (8, 8))
    Convolution(64, (4, 4))
    Convolution(64, (2, 2))
    Dense(512)

    :param in_shape: Shape of the observations perceived by the learner (the neural net input)
    :param output_shape: Size of the action space (mapped to the number of output neurons)
    :param device_id: Use None to let CNTK select the best available device,
        -1 for CPU, >= 0 for GPU (default: None)
    :param learning_rate: Learning rate (default: 0.00025, as per Mnih et al.)
    :param momentum: Momentum, provided as momentum value for averaging gradients
        without unit gain filter. Note that CNTK does not currently provide an
        implementation of Graves' RMSProp with momentum; the AdamSGD optimizer is
        used instead. (default: 0.9)
    :param minibatch_size: Minibatch size (default: 32, as per Mnih et al.)
    :param n_workers: Number of concurrent workers for distributed training
        (default: 1, not distributed)
    :param visualizer: Optional visualizer allowing the model to save summary data
        (default: None, no visualization)

    Ref: Mnih et al.: "Human-level control through deep reinforcement learning."
    Nature 518.7540 (2015): 529-533.
    """
    assert learning_rate > 0, 'learning_rate should be > 0'
    assert 0. <= momentum < 1, 'momentum should be 0 <= momentum < 1'

    QModel.__init__(self, in_shape, output_shape)
    CntkModel.__init__(self, device_id, False, n_workers, visualizer)

    self._nb_actions = output_shape
    self._steps = 0
    self._target_update_interval = update_interval
    self._target = None

    # Input vars
    self._environment = input(in_shape, name='env',
                              dynamic_axes=(Axis.default_batch_axis()))
    self._q_targets = input(1, name='q_targets',
                            dynamic_axes=(Axis.default_batch_axis()))
    self._actions = input(output_shape, name='actions',
                          dynamic_axes=(Axis.default_batch_axis()))

    # Define the neural network graph
    self._model = self._build_model()(self._environment)
    self._target = self._model.clone(
        CloneMethod.freeze, {self._environment: self._environment})

    # Define the learning rate
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)

    # AdamSGD optimizer
    m_schedule = momentum_schedule(momentum)
    vm_schedule = momentum_schedule(0.999)
    l_sgd = adam(self._model.parameters, lr_schedule,
                 momentum=m_schedule, unit_gain=True,
                 variance_momentum=vm_schedule)

    if self.distributed_training:
        raise NotImplementedError('ASGD not implemented yet.')

    # _actions is a sparse 1-hot encoding of the actions done by the agent
    q_acted = reduce_sum(self._model * self._actions, axis=0)

    # Define the trainer with the Huber loss function
    criterion = huber_loss(q_acted, self._q_targets, 1.0)

    self._learner = l_sgd
    self._trainer = Trainer(self._model, (criterion, None), l_sgd)

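# A hypothetical sketch of the _build_model factory implied by the docstring above
# (Convolution(32, (8, 8)), Convolution(64, (4, 4)), Convolution(64, (2, 2)), Dense(512)).
# This is an illustration only, not the repository's actual implementation: the strides
# (4, 2, 1), the final action-sized output layer, and the initializers are assumptions
# borrowed from Mnih et al.-style DQN code.
from cntk.layers import Convolution2D, Dense, Sequential, default_options
from cntk.initializer import he_uniform
from cntk.ops import relu

def _build_model(self):
    with default_options(activation=relu, init=he_uniform()):
        return Sequential([
            Convolution2D((8, 8), 32, strides=4),
            Convolution2D((4, 4), 64, strides=2),
            Convolution2D((2, 2), 64, strides=1),
            Dense(512),
            Dense(self._nb_actions, activation=None)  # output layer, one neuron per action
        ])
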
def __init__(self, input_shape, nb_actions, gamma=0.99,
             explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
             learning_rate=0.00025, momentum=0.95, minibatch_size=32,
             memory_size=500000, train_after=10000, train_interval=4,
             target_update_interval=10000, monitor=True):
    self.input_shape = input_shape
    self.nb_actions = nb_actions
    self.gamma = gamma

    self._train_after = train_after
    self._train_interval = train_interval
    self._target_update_interval = target_update_interval

    self._explorer = explorer
    self._minibatch_size = minibatch_size
    self._history = History(input_shape)
    self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
    self._num_actions_taken = 0

    # Metrics accumulator
    self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

    # Action Value model (used by agent to interact with the environment)
    with default_options(activation=relu, init=he_uniform()):
        self._action_value_net = Sequential([
            Convolution2D((8, 8), 16, strides=4),
            Convolution2D((4, 4), 32, strides=2),
            Convolution2D((3, 3), 32, strides=1),
            Dense(256, init=he_uniform(scale=0.01)),
            Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
        ])
    self._action_value_net.update_signature(Tensor[input_shape])

    # Target model used to compute the target Q-values in training, updated
    # less frequently for increased stability.
    self._target_net = self._action_value_net.clone(CloneMethod.freeze)

    # Function computing Q-value targets as part of the computation graph
    @Function
    @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def compute_q_targets(post_states, rewards, terminals):
        return element_select(
            terminals, rewards,
            gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
        )

    # Define the loss, using the Huber loss (more robust to outliers)
    @Function
    @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
               post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
    def criterion(pre_states, actions, post_states, rewards, terminals):
        # Compute the Q-value targets
        q_targets = compute_q_targets(post_states, rewards, terminals)

        # actions is a 1-hot encoding of the action taken by the agent
        q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

        # Define the training criterion as the Huber loss
        return huber_loss(q_targets, q_acted, 1.0)

    # Adam-based SGD
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
    m_schedule = momentum_schedule(momentum)
    vm_schedule = momentum_schedule(0.999)
    l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                 momentum=m_schedule, variance_momentum=vm_schedule)

    self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics',
                                                     model=criterion) if monitor else None
    self._learner = l_sgd
    self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

# define loss / metrics
# like TensorFlow, the softmax is applied internally (if needed),
# so all we need are the logits
ce = cross_entropy_with_softmax(logits, labels)
pe = classification_error(logits, labels)

# training config
batch_size = 32
epochs = 15
n_batches = len(Xtrain) // batch_size

# do the training: specify the training algorithm
trainer = Trainer(logits, (ce, pe), adam(logits.parameters, lr=1e-2, momentum=0.9))

# helper function
def get_output(node, X, Y):
    ret = node.forward(dict(inputs=X, labels=Y))
    return list(ret[1].values())[0].mean()

costs = []
errors = []
test_costs = []
test_errors = []

for i in range(epochs):
    cost = 0
    err = 0
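    # NOTE: the original snippet is truncated at this point. What follows is a plausible
    # sketch of the rest of the loop, not the original code. It assumes `inputs` is the
    # CNTK input variable the logits were built from, and that Xtest/Ytest hold the test set.
    for j in range(n_batches):
        xb = Xtrain[j * batch_size:(j + 1) * batch_size]
        yb = Ytrain[j * batch_size:(j + 1) * batch_size]
        trainer.train_minibatch({inputs: xb, labels: yb})
        cost += trainer.previous_minibatch_loss_average
        err += trainer.previous_minibatch_evaluation_average
    costs.append(cost / n_batches)
    errors.append(err / n_batches)
    test_costs.append(get_output(ce, Xtest, Ytest))
    test_errors.append(get_output(pe, Xtest, Ytest))
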
def train_and_evaluate(reader_train, reader_test, network_name, epoch_size,
                       max_epochs, minibatch_size, model_dir=None, log_dir=None,
                       tensorboard_logdir=None, gen_heartbeat=False, fp16=False):
    """
    :param reader_train:
    :param reader_test:
    :param network_name:
    :param epoch_size: number of samples per epoch
    :param max_epochs: number of epochs to train
    :param model_dir:
    :param log_dir:
    :param tensorboard_logdir:
    :param gen_heartbeat:
    :param fp16:
    :return: accuracy and elapsed time
    """
    set_computation_network_trace_level(0)

    # Input variables denoting the features and label data
    input_var = C.input_variable((num_channels, image_height, image_width), name='features')
    label_var = C.input_variable((num_classes))

    with C.default_options(dtype=np.float32):
        # create model, and configure learning parameters
        model = create_cifar10_model(input_var, 3, num_classes)

        # loss and metric
        loss = cross_entropy_with_softmax(model, label_var)
        error_rate = classification_error(model, label_var)

    # shared training parameters
    # Set learning parameters: piecewise-constant learning rate that drops
    # at the epochs listed in check_point
    lr_per_sample = []
    check_point = [80, 120, 160, 180]
    lrs = [3e-2, 3e-3, 3e-4, 3e-4, 5e-5]
    for i in range(max_epochs + 1):
        if i in range(0, check_point[0]):
            lr_per_sample.append(lrs[0])
        if i in range(check_point[0], check_point[1]):
            lr_per_sample.append(lrs[1])
        if i in range(check_point[1], check_point[2]):
            lr_per_sample.append(lrs[2])
        if i in range(check_point[2], check_point[3]):
            lr_per_sample.append(lrs[3])
        if i > check_point[3]:
            lr_per_sample.append(lrs[4])

    lr_schedule = learning_parameter_schedule(lr_per_sample,
                                              minibatch_size=minibatch_size,
                                              epoch_size=epoch_size)
    mm_schedule = momentum_schedule(0.9, minibatch_size)  # momentum

    # progress writers
    progress_writers = [
        ProgressPrinter(tag='Training', num_epochs=max_epochs,
                        gen_heartbeat=gen_heartbeat)
    ]
    tensorboard_writer = None
    if tensorboard_logdir is not None:
        tensorboard_writer = TensorBoardProgressWriter(
            freq=10, log_dir=tensorboard_logdir, model=model)
        progress_writers.append(tensorboard_writer)

    # trainer object
    l2_reg_weight = 0.0001
    learner = adam(model.parameters, lr_schedule, mm_schedule,
                   l2_regularization_weight=l2_reg_weight)
    trainer = Trainer(model, (loss, error_rate), learner, progress_writers)

    # define mapping from reader streams to network inputs
    input_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    log_number_of_parameters(model)
    print("*********Training Start*********")
    start = time.clock()

    for epoch in range(max_epochs):  # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = reader_train.next_minibatch(
                min(minibatch_size, epoch_size - sample_count),
                input_map=input_map)  # fetch minibatch
            trainer.train_minibatch(data)  # update model with it
            sample_count += trainer.previous_minibatch_sample_count  # count samples processed so far
        trainer.summarize_training_progress()

        # Log the mean of each parameter tensor, so that we can confirm that
        # the parameters do change during training.
        if tensorboard_writer:
            for parameter in model.parameters:
                tensorboard_writer.write_value(parameter.uid + "/mean",
                                               reduce_mean(parameter).eval(),
                                               epoch)

        if model_dir:
            model.save(
                os.path.join(model_dir, network_name + "_{}.dnn".format(epoch)))
        enable_profiler()  # begin to collect profiler data after the first epoch

    # Evaluation parameters
    test_epoch_size = 10000
    minibatch_size = 32

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0

    while sample_count < test_epoch_size:
        current_minibatch = min(minibatch_size, test_epoch_size - sample_count)
        # Fetch the next test minibatch
        data = reader_test.next_minibatch(current_minibatch, input_map=input_map)
        # Evaluate the model on the minibatch
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch
        # Keep track of the number of samples processed so far
        sample_count += data[label_var].num_samples

    print("")
    trainer.summarize_test_progress()
    print("")
    elapsed = (time.clock() - start)
    return 1 - metric_numer / metric_denom, elapsed

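# An equivalent, more compact way to build the piecewise learning-rate list used in
# train_and_evaluate above. This is an illustrative sketch, not part of the original
# code. Note that the original loop never appends a value for epoch 180, because
# range(160, 180) excludes it and the final branch requires i > 180; the helper below
# assigns lrs[4] from epoch 180 onwards instead.
check_point = [80, 120, 160, 180]
lrs = [3e-2, 3e-3, 3e-4, 3e-4, 5e-5]

def lr_for_epoch(epoch):
    # learning rate for the first checkpoint boundary the epoch has not yet reached
    for k, boundary in enumerate(check_point):
        if epoch < boundary:
            return lrs[k]
    return lrs[-1]

max_epochs = 200  # illustrative value; inside train_and_evaluate this is the argument
lr_per_sample = [lr_for_epoch(i) for i in range(max_epochs + 1)]
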
input_sequences, labels = create_model_placeholders()

if USE_SAVED_MODEL:
    model = load_model(MODEL_FILE_PATH)
else:
    model = create_model()
z = model(input_sequences)

ce = cross_entropy_with_softmax(z, labels)
errs = classification_error(z, labels)

momentum_schedule = momentum_schedule_per_sample(0.9990913221888589)
# Note: the two clipping settings below are defined but never passed to the learner.
clipping_threshold_per_sample = 5.0
gradient_clipping_with_truncation = True
learner = adam(z.parameters, 0.001, momentum_schedule)
trainer = Trainer(z, (ce, errs), [learner])

for e in range(EPOCHS):
    arguments = get_random_batch()
    if TRAIN:
        trainer.train_minibatch(arguments)
    if e % LOG_FREQUENCY == 0:
        print('Epoch: ' + str(e) +
              ', Average Classification Error: {:,.0%}'.format(
                  pd.DataFrame(errs.eval(arguments))[0].mean()))

if TRAIN:
    z.save(MODEL_FILE_PATH)
    print("Saved model to '%s'" % MODEL_FILE_PATH)

def __init__(self, state_dim, action_dim, gamma=0.99,
             learning_rate=1e-4, momentum=0.95):
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.gamma = gamma

    with default_options(activation=relu, init=he_uniform()):
        # Convolution filter counts were halved to save on memory (no GPU available)
        self.model = Sequential([
            Convolution2D((8, 8), 16, strides=4, name='conv1'),
            Convolution2D((4, 4), 32, strides=2, name='conv2'),
            Convolution2D((3, 3), 32, strides=1, name='conv3'),
            Dense(256, init=he_uniform(scale=0.01), name='dense1'),
            Dense(action_dim, activation=None, init=he_uniform(scale=0.01), name='actions')
        ])
    self.model.update_signature(Tensor[state_dim])

    # Create the target model as a copy of the online model
    self.target_model = None
    self.update_target()

    self.pre_states = input_variable(state_dim, name='pre_states')
    self.actions = input_variable(action_dim, name='actions')
    self.post_states = input_variable(state_dim, name='post_states')
    self.rewards = input_variable((), name='rewards')
    self.terminals = input_variable((), name='terminals')
    self.is_weights = input_variable((), name='is_weights')

    predicted_q = reduce_sum(self.model(self.pre_states) * self.actions, axis=0)

    # DQN - calculate target q values
    # post_q = reduce_max(self.target_model(self.post_states), axis=0)

    # DDQN - calculate target q values
    online_selection = one_hot(
        argmax(self.model(self.post_states), axis=0), self.action_dim)
    post_q = reduce_sum(self.target_model(self.post_states) * online_selection, axis=0)
    post_q = (1.0 - self.terminals) * post_q
    target_q = stop_gradient(self.rewards + self.gamma * post_q)

    # Huber loss
    delta = 1.0
    self.td_error = minus(predicted_q, target_q, name='td_error')
    abs_error = abs(self.td_error)
    errors = element_select(less(abs_error, delta),
                            square(self.td_error) * 0.5,
                            delta * (abs_error - 0.5 * delta))
    loss = errors * self.is_weights

    # Adam-based SGD
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
    m_schedule = momentum_schedule(momentum)
    vm_schedule = momentum_schedule(0.999)
    self._learner = adam(self.model.parameters, lr_schedule, m_schedule,
                         variance_momentum=vm_schedule)

    self.writer = TensorBoardProgressWriter(log_dir='metrics', model=self.model)
    self.trainer = Trainer(self.model, (loss, None), [self._learner], self.writer)
