def train(X, contents_Y, styles_Y, ctx, lr, num_epochs, lr_decay_epoch):
    X, styles_Y_gram, trainer = get_inits(X, ctx, lr, styles_Y)
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[1, num_epochs],
                            legend=['content', 'style', 'TV'],
                            ncols=2, figsize=(7, 2.5))
    for epoch in range(1, num_epochs + 1):
        with autograd.record():
            contents_Y_hat, styles_Y_hat = extract_features(
                X, content_layers, style_layers)
            contents_l, styles_l, tv_l, l = compute_loss(
                X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram)
        l.backward()
        trainer.step(1)
        nd.waitall()
        if epoch % lr_decay_epoch == 0:
            trainer.set_learning_rate(trainer.learning_rate * 0.1)
        if epoch % 10 == 0:
            animator.add(epoch, [nd.add_n(*contents_l).asscalar(),
                                 nd.add_n(*styles_l).asscalar(),
                                 tv_l.asscalar()])
        if epoch % 100 == 0:
            d2l.plt.imsave('neural-style' + str(epoch) + '.png',
                           postprocess(X).asnumpy())
    return X

def compute_loss(X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram):
    # Calculate the content, style, and total variation losses, respectively
    contents_l = [content_loss(Y_hat, Y) * content_weight
                  for Y_hat, Y in zip(contents_Y_hat, contents_Y)]
    styles_l = [style_loss(Y_hat, Y) * style_weight
                for Y_hat, Y in zip(styles_Y_hat, styles_Y_gram)]
    tv_l = tv_loss(X) * tv_weight
    # Add up all the losses
    l = nd.add_n(*styles_l) + nd.add_n(*contents_l) + tv_l
    return contents_l, styles_l, tv_l, l

def compute_loss(X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram):
    # Compute the content, style, and total variation losses, respectively
    contents_l = [content_loss(Y_hat, Y) * content_weight
                  for Y_hat, Y in zip(contents_Y_hat, contents_Y)]
    styles_l = [style_loss(Y_hat, Y) * style_weight
                for Y_hat, Y in zip(styles_Y_hat, styles_Y_gram)]
    tv_l = tv_loss(X) * tv_weight
    # Sum up all the losses
    l = nd.add_n(*styles_l) + nd.add_n(*contents_l) + tv_l
    return contents_l, styles_l, tv_l, l

def compute_loss(X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram):
    contents_l = [
        content_loss(Y_hat, Y) * content_weight
        for Y_hat, Y in zip(contents_Y_hat, contents_Y)
    ]
    styles_l = [
        style_loss(Y_hat, Y) * style_weight
        for Y_hat, Y in zip(styles_Y_hat, styles_Y_gram)
    ]
    tv_l = tv_loss(X) * tv_weight
    l = nd.add_n(*styles_l) + nd.add_n(*contents_l) + tv_l
    return contents_l, styles_l, tv_l, l

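# The three compute_loss variants above all collect per-layer losses in a
# Python list and rely on * to unpack that list into positional arguments of
# nd.add_n. A minimal, standalone sketch of that pattern (not taken from any
# of the snippets above):

from mxnet import nd

per_layer_losses = [nd.array([1.0]), nd.array([2.0]), nd.array([0.5])]
# nd.add_n takes NDArrays as positional arguments, so the list is unpacked
total = nd.add_n(*per_layer_losses)
print(total)  # [3.5]
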
def corr2d_multi_in(X, K):
    """
    First iterate along the 0th (channel) dimension of X and K.
    Then use * to turn the result list into positional arguments
    of the add_n function, which adds them up.
    """
    # zip(ndarray, ndarray) pairs the channels so X and K are traversed together
    return nd.add_n(*[tool.corr2d(x, k) for x, k in zip(X, K)])

def forward(self, x_list):
    '''
    Parameters
    ----------
    x_list: list[mx.ndarray], shape is (batch_size, num_of_vertices,
            num_of_features, num_of_timesteps)

    Returns
    ----------
    Y_hat: mx.ndarray, shape is (batch_size, num_of_vertices,
           num_for_prediction)
    '''
    if len(x_list) != len(self.submodules):
        raise ValueError("number of submodules does not equal "
                         "the length of the input list")

    num_of_vertices_set = {i.shape[1] for i in x_list}
    if len(num_of_vertices_set) != 1:
        raise ValueError("Different num_of_vertices detected! "
                         "Check whether your inputs have the same "
                         "size on axis 1.")

    batch_size_set = {i.shape[0] for i in x_list}
    if len(batch_size_set) != 1:
        raise ValueError("Inputs must have the same batch size!")

    submodule_outputs = [
        self.submodules[idx](x_list[idx]) for idx in range(len(x_list))
    ]
    return nd.add_n(*submodule_outputs)

def compute_loss(res_img, weights, contents_features_h, styles_features_h,
                 contents_features, styles_features_gram):
    content_weight, style_weight, tv_weight = weights
    contents_l = [
        content_loss(c_f_h, c_f) * content_weight
        for c_f_h, c_f in zip(contents_features_h, contents_features)
    ]
    contents_l = nd.add_n(*contents_l).asscalar()
    styles_l = [
        style_loss(s_f_h, s_f_gram) * style_weight
        for s_f_h, s_f_gram in zip(styles_features_h, styles_features_gram)
    ]
    styles_l = nd.add_n(*styles_l).asscalar()
    tv_l = (tv_loss(res_img) * tv_weight).asscalar()
    total_l = contents_l + styles_l + tv_l
    return total_l, contents_l, styles_l, tv_l

def corr1d_multi_in(X, K):
    """Multi-input-channel 1-D cross-correlation.

    :param X: input, one 1-D array per channel
    :param K: kernel, one 1-D array per channel
    :return: single-channel 1-D output
    """
    # First iterate along the 0th (channel) dimension of X and K, then use *
    # to turn the result list into positional arguments of add_n to add them up
    return nd.add_n(*[corr1d(x, k) for x, k in zip(X, K)])

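# A usage sketch for the corr1d_multi_in above. corr1d itself is not defined
# in these snippets; the single-channel version below is an assumption that
# matches the standard 1-D cross-correlation:

from mxnet import nd

def corr1d(x, k):
    # Single-channel 1-D cross-correlation
    w = k.shape[0]
    y = nd.zeros(x.shape[0] - w + 1)
    for i in range(y.shape[0]):
        y[i] = (x[i:i + w] * k).sum()
    return y

X = nd.array([[0, 1, 2, 3, 4, 5, 6],
              [1, 2, 3, 4, 5, 6, 7],
              [2, 3, 4, 5, 6, 7, 8]])
K = nd.array([[1, 2], [3, 4], [-1, -3]])
print(corr1d_multi_in(X, K))  # [ 2.  8. 14. 20. 26. 32.]
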
def train(X, contents_Y, styles_Y, ctx, lr, max_epochs, lr_decay_epoch):
    X, styles_Y_gram, trainer = get_inits(X, ctx, lr, styles_Y)
    for i in range(max_epochs):
        start = time.time()
        with autograd.record():
            contents_Y_hat, styles_Y_hat = extract_features(
                X, content_layers, style_layers)
            contents_l, styles_l, tv_l, l = compute_loss(
                X, contents_Y_hat, styles_Y_hat, contents_Y, styles_Y_gram)
        l.backward()
        trainer.step(1)
        nd.waitall()
        if i % 50 == 0 and i != 0:
            print('epoch %3d, content loss %.2f, style loss %.2f, '
                  'TV loss %.2f, %.2f sec'
                  % (i, nd.add_n(*contents_l).asscalar(),
                     nd.add_n(*styles_l).asscalar(), tv_l.asscalar(),
                     time.time() - start))
        if i % lr_decay_epoch == 0 and i != 0:
            trainer.set_learning_rate(trainer.learning_rate * 0.1)
            print('change lr to %.1e' % trainer.learning_rate)
    return X

def test_ActivationRegularizationLoss(alpha: float):
    ar = ActivationRegularizationLoss(alpha, batch_axis=0)
    inputs = [
        nd.arange(1000).reshape(10, 10, 10),
        nd.arange(1000).reshape(10, 10, 10),
        nd.arange(1000).reshape(10, 10, 10),
    ]
    ar_result = ar(*inputs)
    outputs = [
        alpha * nd.mean((array * array), axis=0, exclude=True)
        for array in inputs
    ]
    assert np.isclose(nd.add_n(*outputs).asnumpy(), ar_result.asnumpy()).all()

def test_TemporalActivationRegularizationLoss(beta: float):
    tar = TemporalActivationRegularizationLoss(beta, time_axis=1, batch_axis=0)
    inputs = [
        nd.arange(1000).reshape(10, 10, 10),
        nd.arange(1000).reshape(10, 10, 10),
        nd.arange(1000).reshape(10, 10, 10),
    ]
    tar_result = tar(*inputs)
    outputs = [
        beta * nd.mean((array[:, 1:, :] - array[:, :-1, :]) ** 2,
                       axis=0, exclude=True)
        for array in inputs
    ]
    assert np.isclose(nd.add_n(*outputs).asnumpy(), tar_result.asnumpy()).all()

def corr2d_multi_in(X, K):
    # First iterate along the 0th (channel) dimension of X and K, then use *
    # to turn the result list into positional arguments of add_n to add them up
    #
    # [d2l.corr2d(x, k) for x, k in zip(X, K)]
    # [0]
    # [[19. 25.]
    #  [37. 43.]]
    # [1]
    # [[37. 47.]
    #  [67. 77.]]
    # [0] + [1] =
    # [[ 56.  72.]
    #  [104. 120.]]
    return nd.add_n(*[d2l.corr2d(x, k) for x, k in zip(X, K)])

def global_norm(arrays: Union[Generator[NDArray, NDArray, NDArray],
                              List[NDArray], Tuple[NDArray]]) -> NDArray:
    """
    Calculate the global norm of a list or tuple of NDArrays using the formula:
    `global_norm = sqrt(sum([l2norm(p)**2 for p in parameters]))`

    :param arrays: list or tuple of parameters to calculate the global norm of
    :return: single-value NDArray
    """
    def _norm(array):
        if array.stype == 'default':
            x = array.reshape((-1,))
            return nd.dot(x, x)
        return array.norm().square()

    total_norm = nd.add_n(*[_norm(arr) for arr in arrays])
    total_norm = nd.sqrt(total_norm)
    return total_norm

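# A quick sanity check for the global_norm above: with arrays [3.] and [4.],
# the global norm is sqrt(3**2 + 4**2) = 5. A minimal sketch, assuming the
# function above is in scope:

from mxnet import nd

arrays = [nd.array([3.0]), nd.array([4.0])]
print(global_norm(arrays))  # [5.]
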
def grad_global_norm(parameters, max_norm):
    """Calculate the 2-norm of gradients of parameters, and how much they should
    be scaled down such that their 2-norm does not exceed `max_norm`.

    If gradients exist for more than one context for a parameter, the user
    needs to explicitly call ``trainer.allreduce_grads`` so that the gradients
    are summed first before calculating the 2-norm.

    .. note::

        This function is only for use when `update_on_kvstore` is set to False
        in trainer.

    Example::

        trainer = Trainer(net.collect_params(), update_on_kvstore=False, ...)
        for x, y in mx.gluon.utils.split_and_load(X, [mx.gpu(0), mx.gpu(1)]):
            with mx.autograd.record():
                y = net(x)
                loss = loss_fn(y, label)
            loss.backward()
        trainer.allreduce_grads()
        norm, ratio = grad_global_norm(net.collect_params().values(), max_norm)
        trainer.update(batch_size * ratio)
        ...

    Parameters
    ----------
    parameters : list of Parameters

    Returns
    -------
    NDArray
        Total norm. Shape is (1,)
    NDArray
        Ratio for rescaling gradients based on max_norm s.t.
        grad = grad / ratio. If total norm is NaN, ratio will be NaN, too.
        Shape is (1,)
    NDArray
        Whether the total norm is finite. Shape is (1,)
    """
    # collect gradient arrays
    arrays = []
    idx = 0
    for p in parameters:
        if p.grad_req != 'null':
            p_grads = p.list_grad()
            arrays.append(p_grads[idx % len(p_grads)])
            idx += 1
    assert len(arrays) > 0, 'No parameter found available for gradient norm.'

    # compute gradient norms
    def _norm(array):
        # TODO(haibin) norm operator does not support fp16 safe reduction.
        # Issue is tracked at:
        # https://github.com/apache/incubator-mxnet/issues/14126
        x = array.reshape((-1,)).astype('float32', copy=False)
        return nd.dot(x, x)

    norm_arrays = [_norm(arr) for arr in arrays]

    # group norm arrays by ctx
    def group_by_ctx(arr_list):
        groups = collections.defaultdict(list)
        for arr in arr_list:
            ctx = arr.context
            groups[ctx].append(arr)
        return groups

    norm_groups = group_by_ctx(norm_arrays)

    # reduce
    ctx, dtype = arrays[0].context, 'float32'
    norms = [nd.add_n(*g).as_in_context(ctx) for g in norm_groups.values()]
    total_norm = nd.add_n(*norms).sqrt()
    scale = total_norm / max_norm
    # is_finite = 0 if NaN or Inf, 1 otherwise.
    is_finite = nd.contrib.isfinite(scale)
    # If scale is finite, nd.maximum selects the max between scale and 1.
    # That is, 1 is returned if total_norm does not exceed max_norm.
    # If scale is NaN or Inf, the result of nd.maximum is undefined, so we
    # use choices.take to propagate NaN or Inf instead.
    scale_or_one = nd.maximum(nd.ones((1,), dtype=dtype, ctx=ctx), scale)
    choices = nd.concat(scale, scale_or_one, dim=0)
    chosen_scale = choices.take(is_finite)
    return total_norm, chosen_scale, is_finite

def corr2d_multi_in(X, K):
    return nd.add_n(*[corr2d(x, k) for x, k in zip(X, K)])

def accumulate_gradients(self,
                         inputs: Dict[str, np.ndarray],
                         targets: List[np.ndarray],
                         additional_fetches: List[Tuple[int, str]] = None,
                         importance_weights: np.ndarray = None,
                         no_accumulation: bool = False) -> Tuple[float, List[float], float, list]:
    """
    Runs a forward & backward pass, clips gradients if needed, and accumulates
    them into the gradient accumulator.

    :param inputs: environment states (observation, etc.) as well as extra
        inputs required by loss. Shape of ndarray is
        (batch_size, observation_space_size) or
        (batch_size, observation_space_size, stack_size)
    :param targets: targets required by loss (e.g. sum of discounted rewards)
    :param additional_fetches: additional fetches to calculate and return.
        Each fetch is specified as an (int, str) tuple of head-type-index and
        fetch-name. The tuple is obtained from each head.
    :param importance_weights: ndarray of shape (batch_size,) to multiply with
        batch loss.
    :param no_accumulation: if True, set gradient values to the new gradients,
        otherwise sum with previously calculated gradients
    :return: tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
        total_loss (float): sum of all head losses
        losses (list of float): list of all losses, target losses first,
            followed by regularization losses. The specifics depend on the
            network parameters (number of heads, etc.)
        norm_unclipped_grads (float): global norm of all gradients before any
            gradient clipping is applied
        fetched_tensors: all values for additional_fetches
    """
    if self.accumulated_gradients is None:
        self.reset_accumulated_gradients()

    embedders = [emb.embedder_name for emb in self.model.nets[0].input_embedders]
    nd_inputs = tuple(nd.array(inputs[emb]) for emb in embedders)

    assert self.middleware.__class__.__name__ != 'LSTMMiddleware', \
        "LSTM middleware not supported"

    targets = force_list(targets)
    with autograd.record():
        out_per_head = utils.split_outputs_per_head(self.model(*nd_inputs),
                                                    self.model.output_heads)
        tgt_per_loss = utils.split_targets_per_loss(targets, self.losses)

        losses = list()
        regularizations = list()
        additional_fetches = [(k, None) for k in additional_fetches]
        for h, h_loss, h_out, l_tgt in zip(self.model.output_heads, self.losses,
                                           out_per_head, tgt_per_loss):
            l_in = utils.get_loss_agent_inputs(inputs,
                                               head_type_idx=h.head_type_idx,
                                               loss=h_loss)
            # Align arguments with loss.loss_forward and convert to NDArray
            l_args = utils.to_mx_ndarray(
                utils.align_loss_args(h_out, l_in, l_tgt, h_loss))
            # Calculate loss and all auxiliary outputs
            loss_outputs = utils.loss_output_dict(
                utils.to_list(h_loss(*l_args)), h_loss.output_schema)
            if LOSS_OUT_TYPE_LOSS in loss_outputs:
                losses.extend(loss_outputs[LOSS_OUT_TYPE_LOSS])
            if LOSS_OUT_TYPE_REGULARIZATION in loss_outputs:
                regularizations.extend(
                    loss_outputs[LOSS_OUT_TYPE_REGULARIZATION])

            # Set additional fetches
            for i, fetch in enumerate(additional_fetches):
                # fetch key is a tuple of (head_type_index, fetch_name)
                head_type_idx, fetch_name = fetch[0]
                if head_type_idx == h.head_type_idx:
                    assert fetch[1] is None  # sanity check that fetch is None
                    additional_fetches[i] = (fetch[0], loss_outputs[fetch_name])

        # Total loss is losses plus regularizations (NOTE: order is important)
        total_loss_list = losses + regularizations
        total_loss = nd.add_n(*total_loss_list)

    # Calculate gradients
    total_loss.backward()

    assert self.optimizer_type != 'LBFGS', 'LBFGS not supported'

    # allreduce gradients from all contexts
    self.trainer.allreduce_grads()

    # Calculate global norm of gradients
    # FIXME global norm is returned even when not used for clipping!
    #   Is this necessary?
    # FIXME global norm might be calculated twice if clipping method is
    #   global norm
    norm_unclipped_grads = utils.global_norm(self._model_grads)

    # Clip gradients
    if self.network_parameters.clip_gradients:
        utils.clip_grad(
            self._model_grads,
            clip_method=self.network_parameters.gradients_clipping_method,
            clip_val=self.network_parameters.clip_gradients,
            inplace=True)

    # Update self.accumulated_gradients depending on no_accumulation flag
    if no_accumulation:
        for acc_grad, model_grad in zip(self.accumulated_gradients,
                                        self._model_grads):
            acc_grad[:] = model_grad
    else:
        for acc_grad, model_grad in zip(self.accumulated_gradients,
                                        self._model_grads):
            acc_grad += model_grad

    # results of additional fetches
    fetched_tensors = [fetch[1] for fetch in additional_fetches]

    # convert everything to numpy or scalar before returning
    result = utils.asnumpy_or_asscalar((total_loss, total_loss_list,
                                        norm_unclipped_grads, fetched_tensors))
    return result

def test_add_n():
    x = [nd.ones(LARGE_X) for j in range(SMALL_Y)]
    y = nd.add_n(*x)
    assert y[0] == SMALL_Y
    assert y[-1] == SMALL_Y

def test_add_n():
    x = [nd.ones(LARGE_X)]
    y = nd.add_n(*x)
    assert y[0] == 1
    assert y[-1] == 1

def corr2d_multi_in(X, K):
    # First iterate along the channel dimension of X and K
    return nd.add_n(*[d2l.corr2d(x, k) for x, k in zip(X, K)])

def corr2d_multi_in(X, K):
    # for x, k in zip(X, K):
    #     print(d2l.corr2d(x, k))
    return nd.add_n(*[d2l.corr2d(x, k) for x, k in zip(X, K)])

def sum_loss(loss, pred, truths, weights):
    return nd.add_n(
        *[w * loss(yhat, y) for w, yhat, y in zip(weights, pred, truths)])

def sum_loss(loss, preds, truths, weights):
    # loss: a function, e.g. content_loss.
    return nd.add_n(
        *[w * loss(yhat, y) for w, yhat, y in zip(weights, preds, truths)])

def clip_grad_global_norm(parameters, max_norm, check_isfinite=True):
    """Rescales gradients of parameters so that the sum of their 2-norm is
    smaller than `max_norm`.

    If gradients exist for more than one context for a parameter, user needs
    to explicitly call ``trainer.allreduce_grads`` so that the gradients are
    summed first before calculating the 2-norm.

    .. note::

        This function is only for use when `update_on_kvstore` is set to False
        in trainer. In cases where training happens on multiple contexts, this
        method should be used in conjunction with ``trainer.allreduce_grads()``
        and ``trainer.update()``. (**not** ``trainer.step()``)

    Example::

        trainer = Trainer(net.collect_params(), update_on_kvstore=False, ...)
        for x, y in mx.gluon.utils.split_and_load(X, [mx.gpu(0), mx.gpu(1)]):
            with mx.autograd.record():
                y = net(x)
                loss = loss_fn(y, label)
            loss.backward()
        trainer.allreduce_grads()
        nlp.utils.clip_grad_global_norm(net.collect_params().values(), max_norm)
        trainer.update(batch_size)
        ...

    Parameters
    ----------
    parameters : list of Parameters
    max_norm : float
    check_isfinite : bool, default True
        If True, check that the total_norm is finite (not nan or inf). This
        requires a blocking .asscalar() call.

    Returns
    -------
    NDArray or float
        Total norm. Return type is NDArray of shape (1,) if check_isfinite is
        False. Otherwise a float is returned.
    """
    def _norm(array):
        if array.stype == 'default':
            x = array.reshape((-1,))
            return nd.dot(x, x)
        return array.norm().square()

    arrays = []
    i = 0
    for p in parameters:
        if p.grad_req != 'null':
            grad_list = p.list_grad()
            arrays.append(grad_list[i % len(grad_list)])
            i += 1
    assert len(arrays) > 0, \
        'No parameter found available for gradient norm clipping.'
    ctx, dtype = arrays[0].context, arrays[0].dtype
    total_norm = nd.add_n(*[_norm(arr).as_in_context(ctx) for arr in arrays])
    total_norm = nd.sqrt(total_norm)
    if check_isfinite:
        total_norm = total_norm.asscalar()
        if not np.isfinite(total_norm):
            warnings.warn(UserWarning('nan or inf is detected. '
                                      'Clipping results will be undefined.'),
                          stacklevel=2)
    scale = max_norm / (total_norm + 1e-8)
    if check_isfinite:
        scale = nd.array([scale], dtype=dtype, ctx=ctx)
    scale = nd.min(
        nd.concat(scale, nd.ones((1,), dtype=dtype, ctx=ctx), dim=0))
    for p in parameters:
        if p.grad_req != 'null':
            for arr in p.list_grad():
                arr *= scale.as_in_context(arr.context)
    return total_norm

def content_loss(content_y_hat, content_y, weights):
    loss = []
    for y, y_hat, w in zip(content_y, content_y_hat, weights):
        loss.append(w * nd.mean(nd.abs(y - y_hat), axis=0, exclude=True))
    if len(loss) == 0:
        return 0
    return nd.add_n(*loss)

def sum_loss(loss, preds, truths, weights):
    return nd.add_n(*[w * loss(yhat, y)
                      for w, yhat, y in zip(weights, preds, truths)])

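# The sum_loss variants in this collection weight each per-layer loss before
# summing with nd.add_n. A minimal usage sketch with a stand-in loss function
# (the real call sites pass e.g. content_loss or style_loss):

from mxnet import nd

def l2(yhat, y):
    return ((yhat - y) ** 2).mean()

preds = [nd.array([1.0, 2.0]), nd.array([3.0])]
truths = [nd.array([0.0, 0.0]), nd.array([0.0])]
weights = [1.0, 0.5]
print(sum_loss(l2, preds, truths, weights))  # 1.0 * 2.5 + 0.5 * 9.0 = [7.]
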
def corr2d_multi_in(X, K):
    # First iterate along the 0th (channel) dimension of X and K, then use *
    # to turn the result list into positional arguments of add_n to add them up
    return nd.add_n(*[d2l.corr2d(x, k) for x, k in zip(X, K)])

from mxnet import nd


def conv_2d(X, K):
    # Single-channel 2-D cross-correlation
    h, w = K.shape
    Y = nd.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = (X[i:i + h, j:j + w] * K).sum()
    return Y


def conv_2d_multi_in(X, K):
    return nd.add_n(*[conv_2d(x, k) for x, k in zip(X, K)])


# 2 * 3 * 3
X = nd.array([[[0, 1, 2], [3, 4, 5], [6, 7, 8]],
              [[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
# 2 * 2 * 2
K = nd.array([[[0, 1], [2, 3]],
              [[1, 2], [3, 4]]])

items = []
for x, k in zip(X, K):
    items.append(conv_2d(x, k))

print(nd.add_n(X, X))
print(nd.add_n(*[X, X]))
print(nd.add_n(*items))
print(conv_2d_multi_in(X, K))


def conv_2d_multi_in_out(X, K):
    return nd.stack(*[conv_2d_multi_in(X, k) for k in K])


print(K)
print(nd.stack(K, K + 1, K + 2).shape)
print(conv_2d_multi_in_out(X, [K, K + 1, K + 2]))


def conv2d_multi_in_out_1x1(X, K):
    # A 1x1 convolution is equivalent to a matrix multiplication
    # applied across the channel dimension
    c_i, h, w = X.shape
    c_o = K.shape[0]
    X = X.reshape((c_i, h * w))
    K = K.reshape((c_o, c_i))
    Y = nd.dot(K, X)
    return Y.reshape((c_o, h, w))

def corr2d_multi_in(X, K):
    # We first iterate along the 0th (channel) dimension of X and K. Then we
    # use * to turn the result list into positional arguments of add_n to
    # add them up.
    return nd.add_n(*[corr2d(x, k) for x, k in zip(X, K)])

def corr2d_multi_in(X, K):
    # First iterate along the 0th (channel) dimension of X and K,
    # then use * to turn the result list into positional arguments
    # of add_n to add them up
    return nd.add_n(*[corr2d(x, k) for x, k in zip(X, K)])

def corr1d_multi_in(X, K):
    # We first iterate along the 0th (channel) dimension of X and K. Then we
    # use * to turn the result list into positional arguments of add_n to
    # add them up.
    return nd.add_n(*[corr1d(x, k) for x, k in zip(X, K)])

def corr2d_multi_in(X, K):
    # First, traverse along the 0th dimension (channel dimension) of X and K.
    # Then, add them together by using * to turn the result list into
    # positional arguments of the add_n function
    return nd.add_n(*[d2l.corr2d(x, k) for x, k in zip(X, K)])

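# A worked example for the corr2d_multi_in variants above, using the inputs
# that produce the per-channel results shown in the comments earlier in this
# collection (a sketch; corr2d here stands in for d2l.corr2d):

from mxnet import nd

def corr2d(X, K):
    # Single-channel 2-D cross-correlation
    h, w = K.shape
    Y = nd.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = (X[i:i + h, j:j + w] * K).sum()
    return Y

X = nd.array([[[0, 1, 2], [3, 4, 5], [6, 7, 8]],
              [[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
K = nd.array([[[0, 1], [2, 3]], [[1, 2], [3, 4]]])
print(nd.add_n(*[corr2d(x, k) for x, k in zip(X, K)]))
# [[ 56.  72.]
#  [104. 120.]]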