def get_rmse_log(net, X_train, y_train):
    """Gets the root MSE between the logarithms of the prediction and the truth."""
    num_train = X_train.shape[0]
    clipped_preds = np.clip(net(X_train), 1, float('inf'))
    return np.sqrt(2 * np.sum(square_loss(
        np.log(clipped_preds), np.log(y_train))).item() / num_train)
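# A minimal NumPy-only sketch of the same metric with hypothetical values
# (not from the original): RMSE between log-predictions and log-labels,
# with predictions clipped to [1, inf) so the logarithm stays finite.
# The factor 2 above only undoes the 1/2 inside Gluon's L2Loss.
# `onp` is plain NumPy, aliased to avoid clashing with the MXNet `np` used above.
import numpy as onp

preds = onp.array([105000.0, 0.5, 230000.0])   # hypothetical predictions
labels = onp.array([100000.0, 1.0, 250000.0])  # hypothetical labels
clipped = onp.clip(preds, 1, None)
log_rmse_value = onp.sqrt(onp.mean((onp.log(clipped) - onp.log(labels)) ** 2))
print(log_rmse_value)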
def multibox_prior(data, sizes, ratios):
    """Generate anchor boxes with different shapes centered on each pixel."""
    # data: (batch, channels, height, width)
    in_height, in_width = data.shape[-2:]
    device, num_sizes, num_ratios = data.ctx, len(sizes), len(ratios)
    boxes_per_pixel = num_sizes + num_ratios - 1
    size_tensor = np.array(sizes, ctx=device)
    ratio_tensor = np.array(ratios, ctx=device)
    # Offsets are required to move the anchor to the center of a pixel.
    # Since a pixel has height=1 and width=1, we offset the centers by 0.5
    offset_w, offset_h = 0.5, 0.5
    steps_h = 1.0 / in_height  # Scaled steps in the y axis
    steps_w = 1.0 / in_width   # Scaled steps in the x axis

    # Generate all center points for the anchor boxes
    center_h = (np.arange(in_height, ctx=device) + offset_h) * steps_h
    center_w = (np.arange(in_width, ctx=device) + offset_w) * steps_w
    shift_x, shift_y = np.meshgrid(center_w, center_h)
    shift_x, shift_y = shift_x.reshape(-1), shift_y.reshape(-1)

    # Generate boxes_per_pixel heights and widths that are later used to
    # create the anchor box corner coordinates (xmin, ymin, xmax, ymax):
    # concat (various sizes, first ratio) and (first size, various ratios)
    w = np.concatenate((size_tensor * np.sqrt(ratio_tensor[0]),
                        size_tensor[0] * np.sqrt(ratio_tensor[1:]))) \
        * in_height / in_width
    h = np.concatenate((size_tensor / np.sqrt(ratio_tensor[0]),
                        sizes[0] / np.sqrt(ratio_tensor[1:])))
    # Divide by 2 to get the half height and half width
    anchor_manipulations = np.tile(
        np.stack((-w, -h, w, h)).T, (in_height * in_width, 1)) / 2

    # Each center point will have boxes_per_pixel anchor boxes, so generate a
    # grid of all anchor box centers with boxes_per_pixel repeats
    out_grid = np.stack([shift_x, shift_y, shift_x, shift_y],
                        axis=1).repeat(boxes_per_pixel, axis=0)
    output = out_grid + anchor_manipulations
    return np.expand_dims(output, axis=0)
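# A hedged usage sketch (assumes an MXNet environment where
# `from mxnet import np, npx; npx.set_np()` has been run): for a feature map
# of height h and width w, multibox_prior returns
# (1, h * w * (len(sizes) + len(ratios) - 1), 4) anchors in
# (xmin, ymin, xmax, ymax) format, normalized to [0, 1].
X = np.zeros((1, 3, 4, 6))  # dummy input: h=4, w=6
anchors = multibox_prior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5])
print(anchors.shape)        # expected: (1, 4 * 6 * 5, 4) = (1, 120, 4)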
def evaluator(network, inter_matrix, test_data, ctx):
    scores = []
    for values in inter_matrix:
        feat = gluon.utils.split_and_load(values, ctx, even_split=False)
        scores.extend([network(i).asnumpy() for i in feat])
    recons = np.array([item for sublist in scores for item in sublist])
    # Calculate the test RMSE
    rmse = np.sqrt(np.sum(np.square(test_data - np.sign(test_data) * recons))
                   / np.sum(np.sign(test_data)))
    return float(rmse)
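# A small plain-NumPy illustration with hypothetical data (not from the
# original) of the masked RMSE above: np.sign(test_data) is 1 for observed
# ratings and 0 for missing ones, so both the squared error and the count
# include only the observed entries.
import numpy as onp

test = onp.array([[5.0, 0.0], [0.0, 3.0]])   # 0 marks unobserved entries
recon = onp.array([[4.0, 2.5], [1.0, 3.5]])  # hypothetical reconstruction
mask = onp.sign(test)
rmse = onp.sqrt(onp.sum(onp.square(test - mask * recon)) / onp.sum(mask))
print(rmse)  # sqrt(((5 - 4)^2 + (3 - 3.5)^2) / 2)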
def log_rmse(net, features, labels):
    # To further stabilize the value when the logarithm is taken,
    # set values less than 1 to 1
    clipped_preds = np.clip(net(features), 1, float('inf'))
    return np.sqrt(2 * loss(np.log(clipped_preds), np.log(labels)).mean())
def __init__(self, d_model, d_kv, d_ff, is_decoder, num_heads=12,
             dropout_prob=0.1, layer_norm_eps=1E-6, activation='relu',
             init_factor=1.0, layout='NT', dtype='float32'):
    super().__init__()
    self._d_model = d_model
    self._d_kv = d_kv
    self._d_ff = d_ff
    self._is_decoder = is_decoder
    self._num_heads = num_heads
    self._inner_dim = self._num_heads * self._d_kv
    self._dtype = dtype
    assert layout in ['TN', 'NT'], \
        'Invalid layout: {}. Only "TN" and "NT" are supported.'.format(layout)
    self._layout = layout
    self._time_axis = 1 if self.layout == 'NT' else 0

    self.self_attn_layer_norm = RMSNorm(
        in_channels=d_model,
        center=False,
        scale=True,
        gamma_initializer=Constant(1.0 * init_factor),
        variance_epsilon=layer_norm_eps,
        dtype=dtype)
    # Avoid scaling before the softmax; see
    # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
    self.self_attn_q = nn.Dense(
        units=self._inner_dim,
        in_units=d_model,
        flatten=False,
        use_bias=False,
        weight_initializer=Normal((d_model * d_kv) ** -0.5 * init_factor),
        dtype=dtype)
    self.self_attn_k = nn.Dense(
        units=self._inner_dim,
        in_units=d_model,
        flatten=False,
        use_bias=False,
        weight_initializer=Normal(d_model ** -0.5 * init_factor),
        dtype=dtype)
    self.self_attn_v = nn.Dense(
        units=self._inner_dim,
        in_units=d_model,
        flatten=False,
        use_bias=False,
        weight_initializer=Normal(d_model ** -0.5 * init_factor),
        dtype=dtype)
    self.self_attn = MultiHeadAttentionCell(
        query_units=self._inner_dim,
        num_heads=num_heads,
        attention_dropout=dropout_prob,
        scaled=False,
        normalized=False,
        dtype=dtype,
        layout='NTK' if layout == 'NT' else 'TNK',
        use_einsum=False)
    self.self_attn_proj = nn.Dense(
        units=d_model,
        in_units=self._inner_dim,
        flatten=False,
        use_bias=False,
        weight_initializer=Normal(self._inner_dim ** -0.5 * init_factor),
        dtype=dtype)
    if is_decoder:
        self.cross_attn_layer_norm = RMSNorm(
            in_channels=d_model,
            center=False,
            scale=True,
            gamma_initializer=Constant(1.0 * init_factor),
            variance_epsilon=layer_norm_eps,
            dtype=dtype)
        # Avoid scaling before the softmax
        self.cross_attn_q = nn.Dense(
            units=self._inner_dim,
            in_units=d_model,
            flatten=False,
            use_bias=False,
            weight_initializer=Normal((d_model * d_kv) ** -0.5 * init_factor),
            dtype=dtype)
        self.cross_attn_k = nn.Dense(
            units=self._inner_dim,
            in_units=d_model,
            flatten=False,
            use_bias=False,
            weight_initializer=Normal(d_model ** -0.5 * init_factor),
            dtype=dtype)
        self.cross_attn_v = nn.Dense(
            units=self._inner_dim,
            in_units=d_model,
            flatten=False,
            use_bias=False,
            weight_initializer=Normal(d_model ** -0.5 * init_factor),
            dtype=dtype)
        self.cross_attn = MultiHeadAttentionCell(
            query_units=self._inner_dim,
            num_heads=num_heads,
            attention_dropout=dropout_prob,
            scaled=False,
            normalized=False,
            dtype=dtype,
            layout='NTK' if layout == 'NT' else 'TNK',
            use_einsum=False)
        self.cross_attn_proj = nn.Dense(
            units=d_model,
            in_units=self._inner_dim,
            flatten=False,
            use_bias=False,
            weight_initializer=Normal(self._inner_dim ** -0.5 * init_factor),
            dtype=dtype)
    assert activation in ['relu', 'gated-gelu'], \
        '{} is not supported. Please choose from "relu" and "gated-gelu"'.format(activation)
    # The weight_initializer here is equivalent to Normal(in_units ** -0.5 * init_factor)
    self.ffn = PositionwiseFFN(
        units=d_model,
        hidden_size=d_ff,
        use_bias=False,
        activation_dropout=dropout_prob,
        dropout=dropout_prob,
        weight_initializer=Xavier('gaussian', 'in', np.sqrt(init_factor)),
        activation='relu' if activation == 'relu' else 'gelu(tanh)',
        use_gated_activation=False if activation == 'relu' else True,
        normalization='rms_norm',
        layer_norm_eps=layer_norm_eps,
        pre_norm=True,
        dtype=dtype,
        center=False,
        scale=True,
        gamma_initializer=Constant(1.0 * init_factor))
    self.dropout = nn.Dropout(dropout_prob)
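# Shape bookkeeping for the attention projections above, with hypothetical
# sizes (not taken from the original): the Q/K/V layers map
# d_model -> num_heads * d_kv and the output projection maps
# num_heads * d_kv -> d_model, so the inner attention width need not equal d_model.
d_model, num_heads, d_kv = 512, 8, 64
inner_dim = num_heads * d_kv    # 512 in this example
# self_attn_q weight shape:    (inner_dim, d_model)
# self_attn_proj weight shape: (d_model, inner_dim)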
def multi_head_dot_attn(query, key, value, mask=None, edge_scores=None,
                        dropout: float = 0.0, scaled: bool = True,
                        normalized: bool = False, eps: float = 1E-6,
                        query_head_units: Optional[int] = None,
                        layout: str = 'NKT', use_einsum: bool = False,
                        dtype=np.float32):
    """Multihead dot product attention between the query, key, and value.

    scaled is False, normalized is False:
        D(h_q, h_k) = <h_q, h_k>
    scaled is True, normalized is False:
        D(h_q, h_k) = <h_q, h_k> / sqrt(dim_q)
    scaled is False, normalized is True:
        D(h_q, h_k) = <h_q / ||h_q||, h_k / ||h_k||>
    scaled is True, normalized is True:
        D(h_q, h_k) = <h_q / ||h_q||, h_k / ||h_k||> / sqrt(dim_q)

    If edge_scores is provided, we will calculate the attention as
        scores = D(h_q, h_k) + EdgeScore_{q, k}

    Parameters
    ----------
    query
        Query. The shape depends on the layout
        - layout is 'NKT'
            Shape (batch_size, num_heads, query_length, key_dim)
        - layout is 'NTK'
            Shape (batch_size, query_length, num_heads, key_dim)
        - layout is 'TNK'
            Shape (query_length, batch_size, num_heads, key_dim)
    key
        Key. The shape depends on the layout
        - layout is 'NKT'
            Shape (batch_size, num_heads, mem_length, key_dim)
        - layout is 'NTK'
            Shape (batch_size, mem_length, num_heads, key_dim)
        - layout is 'TNK'
            Shape (mem_length, batch_size, num_heads, key_dim)
    value
        Value. The shape depends on the layout
        - layout is 'NKT'
            Shape (batch_size, num_heads, mem_length, value_dim)
        - layout is 'NTK'
            Shape (batch_size, mem_length, num_heads, value_dim)
        - layout is 'TNK'
            Shape (mem_length, batch_size, num_heads, value_dim)
    mask
        Mask between the query and the memory.
        Shape (batch_size, query_length, mem_length)
    edge_scores
        The edge attention score. The shape can be any shape that is
        broadcastable to (batch_size, num_heads, query_length, mem_length)
    dropout
        Dropout rate
    scaled
        Whether to divide the attention weights by the sqrt of the query
        dimension. This is first proposed in "[NIPS2017] Attention is all
        you need."::

            score = <h_q, h_k> / sqrt(dim_q)

    normalized
        If turned on, the cosine distance is used, i.e.::

            score = <h_q / ||h_q||, h_k / ||h_k||>

    eps
        The epsilon value used in L2 normalization
    query_head_units
        The units of each query head. If it is not given, we will estimate it
        via the shape_array of the query.
    layout
        This stands for the layout of the attention cell. The shape of the
        input/output will depend on the layout. Currently, we support 'NKT',
        'NTK' and 'TNK', in which 'N' means the batch_size, 'K' means the
        head, and 'T' means the length dimension.
    use_einsum
        Whether to use einsum for the computation

    Returns
    -------
    context_vec
        - layout is 'NKT' or 'NTK'
            Shape (batch_size, query_length, num_heads * value_units)
        - layout is 'TNK'
            Shape (query_length, batch_size, num_heads * value_units)
    additional_info
        scores:
            Shape (batch_size, num_head, query_length, mem_length)
        attn_weight:
            Shape (batch_size, num_head, query_length, mem_length)
    """
    # TODO(sxjscience) Profile layout
    if normalized:
        query = l2_normalize(query, axis=-1, eps=eps)
        key = l2_normalize(key, axis=-1, eps=eps)
    if scaled:
        if query_head_units is None:
            query_shape = npx.shape_array(query)
            scale = np.sqrt(query_shape[-1])
        else:
            scale = math.sqrt(query_head_units)
    else:
        scale = None
    if layout == 'NKT':
        # 1. Expand the dimension of the mask:
        #    (B, L_query, L_mem) --> (B, 1, L_query, L_mem)
        if mask is not None:
            mask = np.expand_dims(mask, axis=1)
        # 2. Calculate the attention weights
        #    Score: (B, N, L_query, C_Q) X (B, N, L_mem, C_Q) --> (B, N, L_query, L_mem)
        scores = npx.batch_dot(query, key, transpose_b=True)
        if edge_scores is not None:
            scores = scores + edge_scores
        if scaled:
            scores = scores / scale
        attn_weights = masked_softmax(scores, mask, dtype=dtype, axis=-1)
        attn_weights = npx.dropout(attn_weights, p=dropout)
        # 3. Calculate the context vector
        #    (B, N, L_query, L_mem) X (B, N, L_mem, C_V) --> (B, L_query, N * C_V)
        if use_einsum:
            context_vec = np.einsum('bnij,bnjc->binc', attn_weights, value)
        else:
            context_vec = npx.batch_dot(attn_weights, value).transpose((0, 2, 1, 3))
        context_vec = npx.reshape(context_vec, (-2, -2, -1))
    elif layout == 'NTK':
        # 1. Expand the dimension of the mask:
        #    (B, L_query, L_mem) --> (B, 1, L_query, L_mem)
        if mask is not None:
            mask = np.expand_dims(mask, axis=1)
        # 2. Calculate the attention weights
        #    Score: (B, L_query, N, C_Q) X (B, L_mem, N, C_Q) --> (B, N, L_query, L_mem)
        if use_einsum:
            scores = np.einsum('binc,bjnc->bnij', query, key)
        else:
            scores = npx.batch_dot(np.swapaxes(query, 1, 2),
                                   np.swapaxes(key, 1, 2),
                                   transpose_b=True)
        if edge_scores is not None:
            scores = scores + edge_scores
        if scaled:
            scores = scores / scale
        attn_weights = masked_softmax(scores, mask, dtype=dtype)
        attn_weights = npx.dropout(attn_weights, p=dropout)
        # 3. Calculate the context vector
        #    (B, N, L_query, L_mem) X (B, L_mem, N, C_V) --> (B, L_query, N * C_V)
        if use_einsum:
            context_vec = np.einsum('bnij,bjnc->binc', attn_weights, value)
        else:
            context_vec = npx.batch_dot(attn_weights,
                                        np.swapaxes(value, 1, 2)).transpose((0, 2, 1, 3))
        context_vec = npx.reshape(context_vec, (-2, -2, -1))
    elif layout == 'TNK':
        # 1. Expand the dimension of the mask:
        #    (B, L_query, L_mem) --> (B, 1, L_query, L_mem)
        if mask is not None:
            mask = np.expand_dims(mask, axis=1)
        # 2. Calculate the attention weights
        #    Score: (L_query, B, N, C_Q) X (L_mem, B, N, C_Q) --> (B, N, L_query, L_mem)
        #    This layout structure can be implemented very efficiently because B and N
        #    are consecutive to each other. To have a clear picture of what's happening,
        #    we may consider the (i, j)th element of the output:
        #        out[i, j, :, :] = query[:, i, j, :] X key[:, i, j, :].T,
        #    which is just one GEMM call. We can thus implement the whole kernel via a
        #    single call of batched GEMM with stride.
        if use_einsum:
            scores = np.einsum('ibnc,jbnc->bnij', query, key)
        else:
            scores = npx.batch_dot(query.transpose((1, 2, 0, 3)),
                                   key.transpose((1, 2, 3, 0)))
        if edge_scores is not None:
            scores = scores + edge_scores
        if scaled:
            scores = scores / scale
        attn_weights = masked_softmax(scores, mask, dtype=dtype)
        attn_weights = npx.dropout(attn_weights, p=dropout)
        # 3. Calculate the context vector
        #    (B, N, L_query, L_mem) X (L_mem, B, N, C_V) --> (L_query, B, N * C_V)
        #    Again, we can implement it via a single call to batched GEMM with stride.
        #    Shape (B, N, L_query, C_V)
        if use_einsum:
            context_vec = np.einsum('bnij,jbnc->ibnc', attn_weights, value)
        else:
            context_vec = npx.batch_dot(attn_weights,
                                        value.transpose((1, 2, 0, 3))).transpose((2, 0, 1, 3))
        context_vec = npx.reshape(context_vec, (-2, -2, -1))
    else:
        raise NotImplementedError('layout="{}" is not supported! '
                                  'We only support layout = "NKT", "NTK", and "TNK".'
                                  .format(layout))
    return context_vec, [scores, attn_weights]
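# A pure-NumPy sketch (no mask, no dropout, hypothetical sizes) of the 'NTK'
# branch above, to make the einsum shape bookkeeping concrete.
# `onp` is plain NumPy, aliased to avoid clashing with the MXNet `np` above.
import numpy as onp

B, L_q, L_m, N, C = 2, 3, 4, 2, 5            # batch, query len, mem len, heads, head dim
q = onp.random.rand(B, L_q, N, C)
k = onp.random.rand(B, L_m, N, C)
v = onp.random.rand(B, L_m, N, C)
scores = onp.einsum('binc,bjnc->bnij', q, k) / onp.sqrt(C)            # (B, N, L_q, L_m)
weights = onp.exp(scores) / onp.exp(scores).sum(-1, keepdims=True)    # plain softmax
ctx = onp.einsum('bnij,bjnc->binc', weights, v).reshape(B, L_q, N * C)
print(scores.shape, ctx.shape)               # (2, 2, 3, 4) (2, 3, 10)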
def dot_attn_score(query, key, scaled=True, normalized=False, eps=1E-6,
                   layout='NT'):
    """The inner function call to calculate the score used in dot-product attention.

    We support multiple leading batch dimensions.

    scaled is True:
        D(h_q, h_k) = <h_q, h_k> / sqrt(dim_q)
    normalized is True:
        D(h_q, h_k) = <h_q / ||h_q||, h_k / ||h_k||>
    both scaled and normalized:
        D(h_q, h_k) = <h_q / ||h_q||, h_k / ||h_k||> / sqrt(dim_q)

    Parameters
    ----------
    query : symbol or ndarray
        - layout is 'NT'
            (B0, ..., BN, query_length, query_dim)
        - layout is 'TN'
            (query_length, B0, ..., BN, query_dim)
    key : symbol or ndarray
        - layout is 'NT'
            (B0, ..., BN, key_length, key_dim)
        - layout is 'TN'
            (key_length, B0, ..., BN, key_dim)
    scaled : bool
        Whether to divide the query by the square root of the query_dim.
        If True: D(h_q, h_k) = <h_q, h_k> / sqrt(dim_q)
    normalized : bool
        Whether to normalize the query and the key embeddings.
        If True: D(h_q, h_k) = <h_q / ||h_q||, h_k / ||h_k||>
    eps : float
        The epsilon used in the normalization
    layout
        The layout of the layer. Can be 'TN' or 'NT'.

    Returns
    -------
    scores : symbol or ndarray
        (B0, ..., BN, query_length, key_length)
    """
    if normalized:
        query = l2_normalize(query, -1, eps=eps)
        key = l2_normalize(key, -1, eps=eps)
    if scaled:
        query_shape = npx.shape_array(query)
        # TODO(sxjscience) Remove .astype(np.float32).
        # Wait for https://github.com/apache/incubator-mxnet/issues/18084
        query_units = query_shape[-1].astype(np.float32)
        query = query / np.sqrt(query_units)
    if layout == 'NT':
        scores = npx.batch_dot(query, key, transpose_b=True)
    else:
        raise NotImplementedError('layout={} is not supported.'
                                  ' Currently, only layout = "NT" is implemented!'
                                  .format(layout))
    return scores
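# A small plain-NumPy check (hypothetical shapes) of what the 'NT' branch
# computes: scores[..., i, j] = <query[..., i, :], key[..., j, :]> / sqrt(query_dim).
import numpy as onp

q = onp.random.rand(2, 3, 8)                  # (batch, query_length, query_dim)
k = onp.random.rand(2, 5, 8)                  # (batch, key_length, key_dim)
scores = onp.matmul(q / onp.sqrt(8.0), k.transpose(0, 2, 1))
print(scores.shape)                           # (2, 3, 5)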
def forward(self, data):
    var = np.power(data, 2).mean(-1, keepdims=True)
    data = data * np.reciprocal(np.sqrt(var + self._epsilon))
    return data * self.gamma.data() + self.beta.data()
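# A plain-NumPy sketch of the normalization above with hypothetical gamma/beta
# values: each vector is divided by the root of its mean square, then scaled
# and shifted.
import numpy as onp

x = onp.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
eps, gamma, beta = 1e-6, 1.0, 0.0
rms = onp.sqrt((x ** 2).mean(-1, keepdims=True) + eps)
y = (x / rms) * gamma + beta
print(y)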
def forward(self, x):
    # Compute the mean square in float32 for numerical stability
    var = np.power(x.astype('float32'), 2).mean(-1, keepdims=True)
    x = x * np.reciprocal(np.sqrt(var + self.variance_epsilon))
    # Convert back to float16 if the scale parameter is stored in half precision
    if self.gemma.dtype == 'float16':
        x = x.astype('float16')
    return self.gemma * x
def log_rmse(net, features, labels):
    # To further stabilize the value when the logarithm is taken,
    # set values less than 1 to 1
    net_out = net(features)
    clipped_preds = np.clip(net_out, 1, float('inf'))
    return np.sqrt(2 * loss(np.log(clipped_preds), np.log(labels)).mean())