def adjust_kp(kp_source, kp_driving, kp_driving_initial,
              adapt_movement_scale=1, use_relative_movement=False,
              use_relative_jacobian=False):
    kp_new = {k: v for k, v in kp_driving.items()}

    if use_relative_movement:
        kp_value_diff = (kp_driving['value'] - kp_driving_initial['value'])
        kp_value_diff *= adapt_movement_scale
        kp_new['value'] = kp_value_diff + kp_source['value']

        if use_relative_jacobian:
            jacobian_diff = F.batch_matmul(
                kp_driving['jacobian'],
                F.reshape(
                    F.batch_inv(
                        F.reshape(kp_driving_initial['jacobian'],
                                  (-1, ) + kp_driving_initial['jacobian'].shape[-2:],
                                  inplace=False)),
                    kp_driving_initial['jacobian'].shape))
            kp_new['jacobian'] = F.batch_matmul(
                jacobian_diff, kp_source['jacobian'])

    return kp_new
def warp_coordinates(self, coordinates):
    theta = self.theta
    theta = F.reshape(
        theta, theta.shape[:1] + (1,) + theta.shape[1:], inplace=False)

    if coordinates.shape[0] == self.bs:
        transformed = F.batch_matmul(
            F.tile(theta[:, :, :, :2], (1, coordinates.shape[1], 1, 1)),
            F.reshape(coordinates, coordinates.shape + (1,),
                      inplace=False)) + theta[:, :, :, 2:]
    else:
        # Integer division: F.tile expects integer repetition counts.
        transformed = F.batch_matmul(
            F.tile(theta[:, :, :, :2], (1, coordinates.shape[1], 1, 1)),
            F.tile(F.reshape(coordinates, coordinates.shape + (1,), inplace=False),
                   (self.bs // coordinates.shape[0], 1, 1, 1))) + theta[:, :, :, 2:]
    transformed = F.reshape(
        transformed, transformed.shape[:-1], inplace=False)

    if self.tps:
        control_points = self.control_points
        control_params = self.control_params
        distances = F.reshape(
            coordinates, (coordinates.shape[0], -1, 1, 2), inplace=False) \
            - F.reshape(control_points, (1, 1, -1, 2))
        distances = F.sum(F.abs(distances), axis=distances.ndim - 1)

        result = distances ** 2
        result = result * F.log(distances + 1e-6)
        result = result * control_params
        result = F.sum(result, axis=2)
        result = F.reshape(
            result, (self.bs, coordinates.shape[1], 1), inplace=False)
        transformed = transformed + result

    return transformed
def attnblock(h, r=8, fix_parameters=False, sn=True, test=False):
    """Attention block"""
    x = h

    # 1x1 convolutions
    b, c, s0, s1 = h.shape
    c_r = c // r
    assert c_r > 0
    f_x = convolution(h, c_r, kernel=(1, 1), pad=(0, 0), stride=(1, 1),
                      name="f", with_bias=False, sn=sn, test=test)
    g_x = convolution(h, c_r, kernel=(1, 1), pad=(0, 0), stride=(1, 1),
                      name="g", with_bias=False, sn=sn, test=test)
    h_x = convolution(h, c, kernel=(1, 1), pad=(0, 0), stride=(1, 1),
                      name="h", with_bias=False, sn=sn, test=test)

    # Attend
    attn = F.batch_matmul(f_x.reshape([b, c_r, -1]),
                          g_x.reshape([b, c_r, -1]),
                          transpose_a=True)
    attn = F.softmax(attn, 1)
    h_x = h_x.reshape([b, c, -1])
    o = F.batch_matmul(h_x, attn)
    o = F.reshape(o, [b, c, s0, s1])

    # Shortcut
    gamma = get_parameter_or_create(
        "gamma", [1, 1, 1, 1], ConstantInitializer(0.), not fix_parameters)
    y = gamma * o + x
    return y
def attn_block(x, name, num_heads=4, fix_parameters=False):
    """Multihead attention block"""
    B, C, H, W = x.shape

    with nn.parameter_scope(name):
        # Get query, key, value
        h = normalize(x, name="norm")
        # nin(3 * C) -> split is faster?
        q = nin(h, C, name="q")
        k = nin(h, C, name="k")
        v = nin(h, C, name="v")

        # Attention
        w = F.batch_matmul(F.reshape(q, (B * num_heads, -1, H * W)),
                           F.reshape(k, (B * num_heads, -1, H * W)),
                           transpose_a=True)
        w = F.mul_scalar(w, int(C)**(-0.5), inplace=True)
        assert w.shape == (B * num_heads, H * W, H * W)
        w = F.softmax(w, axis=-1)

        h = F.reshape(v, (B * num_heads, -1, H * W))
        h = F.batch_matmul(h, w)
        h = F.reshape(h, (B, C, H, W))

        # output projection
        h = nin(h, C, name='proj_out', zeroing_w=True)

    assert h.shape == x.shape
    return F.add2(h, x, inplace=True)
def _scaled_dot_product_attention(q, k, v, attn_mask, dropout):
    B, Nt, E = q.shape
    q *= float(E)**-0.5

    # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
    attn = F.batch_matmul(q, k, transpose_b=True)
    if attn_mask is not None:
        attn += attn_mask
    attn_output_weights = F.softmax(attn, axis=len(attn.shape) - 1)
    if dropout > 0.0:
        # Apply dropout to the normalized weights (the original assigned the
        # dropout result to the pre-softmax logits and discarded it).
        attn_output_weights = F.dropout(attn_output_weights, p=dropout)

    # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
    attn_output = F.batch_matmul(attn_output_weights, v)
    return attn_output, attn_output_weights
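# Minimal usage sketch (illustration only, not part of the original source;
# assumes the file's usual `import numpy as np`, `import nnabla as nn`,
# `import nnabla.functions as F`). It exercises _scaled_dot_product_attention
# with random inputs to show the (B, Nt, E) / (B, Ns, E) shape convention.
def _example_scaled_dot_product_attention():
    import numpy as np
    import nnabla as nn

    B, Nt, Ns, E = 2, 5, 7, 16
    q = nn.Variable.from_numpy_array(np.random.randn(B, Nt, E).astype(np.float32))
    k = nn.Variable.from_numpy_array(np.random.randn(B, Ns, E).astype(np.float32))
    v = nn.Variable.from_numpy_array(np.random.randn(B, Ns, E).astype(np.float32))
    out, weights = _scaled_dot_product_attention(q, k, v, attn_mask=None, dropout=0.0)
    out.forward()
    print(out.shape, weights.shape)  # (2, 5, 16) and (2, 5, 7)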
def batch_matmul_backward(inputs, transpose_a=False, transpose_b=False):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dc = inputs[0]
    a = inputs[1]
    b = inputs[2]
    if (transpose_a, transpose_b) == (True, True):
        da = F.batch_matmul(b, dc, True, True)
        db = F.batch_matmul(dc, a, True, True)
    elif (transpose_a, transpose_b) == (True, False):
        da = F.batch_matmul(b, dc, False, True)
        db = F.batch_matmul(a, dc, False, False)
    elif (transpose_a, transpose_b) == (False, True):
        da = F.batch_matmul(dc, b, False, False)
        db = F.batch_matmul(dc, a, True, False)
    elif (transpose_a, transpose_b) == (False, False):
        da = F.batch_matmul(dc, b, False, True)
        db = F.batch_matmul(a, dc, True, False)
    da = _sum(da, a)
    db = _sum(db, b)
    return da, db
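# Quick numpy check (illustration only, not from the original source) of the
# gradient identities implemented above for the
# (transpose_a, transpose_b) == (False, False) case: for C = A @ B,
# dL/dA = dC @ B^T and dL/dB = A^T @ dC.
def _check_batch_matmul_grad_identities():
    import numpy as np

    rng = np.random.default_rng(0)
    a = rng.standard_normal((4, 3, 5))   # batch of A matrices
    b = rng.standard_normal((4, 5, 2))   # batch of B matrices
    dc = rng.standard_normal((4, 3, 2))  # upstream gradient dL/dC

    da = dc @ np.swapaxes(b, -1, -2)     # mirrors F.batch_matmul(dc, b, False, True)
    db = np.swapaxes(a, -1, -2) @ dc     # mirrors F.batch_matmul(a, dc, True, False)

    # Finite-difference check on one entry of A, with L = sum(C * dc)
    eps = 1e-6
    a_eps = a.copy()
    a_eps[0, 0, 0] += eps
    numeric = ((a_eps @ b - a @ b) * dc).sum() / eps
    print(np.isclose(numeric, da[0, 0, 0], atol=1e-4))  # True
    print(da.shape, db.shape)  # (4, 3, 5) (4, 5, 2)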
def batch_inv_backward(inputs):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]
    x0_inv = get_output(x0, "BatchInv")
    t01 = -F.batch_matmul(x0_inv, dy, True, False)
    dx0 = F.batch_matmul(t01, x0_inv, False, True)
    return dx0
def compute_context(prev_state):
    batch_size = prev_state.shape[0]
    ht = PF.affine(prev_state, attention_units, with_bias=False, name='Waht')
    # -> (batch_size, attention_units)
    ht = F.reshape(ht, (batch_size, 1, attention_units))
    # -> (batch_size, 1, attention_units)
    ht = F.broadcast(ht, (batch_size, sentence_length_source, attention_units))
    # -> (batch_size, sentence_length_source, attention_units)
    attention = F.tanh(hs + ht)
    # -> (batch_size, sentence_length_source, attention_units)
    attention = time_distributed(PF.affine)(
        attention, 1, with_bias=False, name='attention')
    # -> (batch_size, sentence_length_source, 1)
    attention = F.softmax(attention, axis=1)
    # -> (batch_size, sentence_length_source, 1)
    context = F.batch_matmul(hs, attention, transpose_a=True)
    context = F.reshape(context, (batch_size, attention_units))
    return context
def classification_loss_with_orthogonal_loss(
        pred_logit: nn.Variable,
        label: nn.Variable,
        transformation_mat: nn.Variable,
        reg_weight=0.001) -> Tuple[nn.Variable, Dict[str, nn.Variable]]:
    """Classification loss with orthogonal regularization loss.

    Args:
        pred_logit (nn.Variable): predicted logits, shape (batch, num_classes)
        label (nn.Variable): label, shape (batch, 1)
        transformation_mat (nn.Variable): transformation matrix, shape (batch, K, K)

    Returns:
        Tuple[nn.Variable, Dict[str, nn.Variable]]: loss and internal losses
    """
    cross_entropy_loss = F.softmax_cross_entropy(pred_logit, label)
    classify_loss = F.mean(cross_entropy_loss)

    # Enforce the transformation to be an orthogonal matrix
    mat_squared = F.batch_matmul(
        transformation_mat, F.transpose(transformation_mat, (0, 2, 1)))
    batch_size, k, _ = transformation_mat.shape
    target_array = np.tile(np.eye(k, dtype=np.float32), (batch_size, 1, 1))
    target = nn.Variable.from_numpy_array(target_array)
    mat_diff = mat_squared - target

    # Frobenius norm
    mat_diff = F.reshape(mat_diff, (batch_size, -1))
    mat_loss = F.mean(F.norm(mat_diff, axis=1))

    return classify_loss + mat_loss * reg_weight, {
        "classify_loss": classify_loss,
        "mat_loss": mat_loss,
        "mat_diff": mat_diff,
    }
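# Hedged usage sketch (illustration only, not from the original source; assumes the
# file's usual nnabla/numpy imports): builds the combined loss for random logits,
# labels, and transformation matrices, then evaluates it once.
def _example_classification_loss_with_orthogonal_loss():
    import numpy as np
    import nnabla as nn

    batch, num_classes, k = 4, 10, 64
    pred_logit = nn.Variable.from_numpy_array(
        np.random.randn(batch, num_classes).astype(np.float32))
    label = nn.Variable.from_numpy_array(
        np.random.randint(0, num_classes, size=(batch, 1)))
    transformation_mat = nn.Variable.from_numpy_array(
        np.random.randn(batch, k, k).astype(np.float32))

    loss, internals = classification_loss_with_orthogonal_loss(
        pred_logit, label, transformation_mat)
    loss.forward()
    print(loss.d, internals["classify_loss"].shape, internals["mat_loss"].shape)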
def Bahdanau_attention(query, values, out_features, scope):
    r"""Return the Bahdanau attention mechanism.

    Args:
        query (nn.Variable): A query of size (B, 1, C).
        values (nn.Variable): Values of size (B, T, C).
        out_features (int): The projected dimensionality.
        scope (str): Parameter scope.

    Returns:
        nn.Variable: The context vector.
        nn.Variable: The attention weight vector.
    """
    with nn.parameter_scope(scope):
        x = PF.affine(query, out_features, base_axis=2,
                      with_bias=False, name='query')
        y = PF.affine(values, out_features, base_axis=2,
                      with_bias=False, name='values')
        # scores of shape (B, T, 1)
        scores = PF.affine(F.tanh(x + y), 1, base_axis=2,
                           with_bias=False, name='scores')
        # attention_weights of shape (B, 1, T)
        attention_weights = F.softmax(
            scores, axis=1).reshape((query.shape[0], 1, -1))
        # context_vector shape after sum == (B, 1, C)
        context_vector = F.batch_matmul(attention_weights, values)

    return context_vector, attention_weights
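# Hedged usage sketch (illustration only, not from the original source; assumes the
# file's usual nnabla/numpy imports): applies Bahdanau_attention to a random
# query/values pair under a fresh parameter scope and prints the output shapes.
def _example_bahdanau_attention():
    import numpy as np
    import nnabla as nn

    B, T, C = 2, 12, 32
    query = nn.Variable.from_numpy_array(np.random.randn(B, 1, C).astype(np.float32))
    values = nn.Variable.from_numpy_array(np.random.randn(B, T, C).astype(np.float32))
    context, weights = Bahdanau_attention(query, values, out_features=64,
                                          scope='bahdanau_example')
    context.forward()
    print(context.shape, weights.shape)  # (2, 1, 32) and (2, 1, 12)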
def dyn_2d_filter(x, lf_2d, k_sz):
    """
    Dynamic 2d filtering
    """
    with nn.parameter_scope('Dynamic_2D_Filtering'):
        f_localexpand = nn.Variable.from_numpy_array(
            np.eye(k_sz[0] * k_sz[1], k_sz[0] * k_sz[1]))
        f_localexpand = F.reshape(
            f_localexpand,
            (k_sz[0], k_sz[1], 1, k_sz[0] * k_sz[1]))  # (9, 9, 1, 81)
        f_localexpand = F.transpose(
            f_localexpand, (3, 0, 1, 2))  # (81, 9, 9, 1)
        x_sz = x.shape
        x = F.reshape(x, (x_sz[0], x_sz[1], x_sz[2], 1))  # (1, 100, 170, 1)
        x_localexpand = F.convolution(
            x, f_localexpand, stride=(1, 1), pad=(4, 4),
            channel_last=True)  # (1, 100, 170, 81)
        x_le_sz = x_localexpand.shape
        x_localexpand = F.reshape(
            x_localexpand,
            (x_le_sz[0], x_le_sz[1], x_le_sz[2], 1, x_le_sz[3]))
        y = F.batch_matmul(x_localexpand, lf_2d)
        y_sz = y.shape
        y = F.reshape(y, (y_sz[0], y_sz[1], y_sz[2], y_sz[4]))
    return y
def sample_noise(inpt_size, out_size):
    _f = lambda x: F.sign(x) * F.pow_scalar(F.abs(x), 0.5)
    noise = _f(F.randn(shape=(inpt_size + out_size, )))
    eps_w = F.batch_matmul(F.reshape(noise[:inpt_size], (1, -1)),
                           F.reshape(noise[inpt_size:], (1, -1)), True)
    eps_b = noise[inpt_size:]
    return eps_w, eps_b
def logits(image, text):
    image_features = encode_image(image)
    text_features = encode_text(text)

    # normalized features
    image_features = image_features / \
        F.norm(image_features, axis=1, keepdims=True)
    text_features = text_features / \
        F.norm(text_features, axis=1, keepdims=True)

    # cosine similarity as logits
    logit_scale = nn.parameter.get_parameter_or_create(
        name='logit_scale', shape=())
    logit_scale = F.exp(logit_scale)

    image_features = image_features.reshape(
        (1, image_features.shape[0], image_features.shape[1]))
    text_features = F.transpose(text_features, (1, 0))
    text_features = text_features.reshape(
        (1, text_features.shape[0], text_features.shape[1]))

    per_image = F.batch_matmul(image_features, text_features).reshape(
        (image_features.shape[0], -1))
    logits_per_image = logit_scale.reshape((1, 1)) * per_image
    logits_per_text = F.transpose(logits_per_image, (1, 0))

    # shape = [global_batch_size, global_batch_size]
    return logits_per_image, logits_per_text
def transform(point, center, scale, resolution, invert=False):
    """Generate an affine transformation matrix.

    Given a set of points, a center, a scale and a target resolution, the
    function generates an affine transformation matrix. If invert is ``True``
    it will produce the inverse transformation.

    Arguments:
        point {numpy.array} -- the input 2D point
        center {numpy.array} -- the center around which to perform the transformations
        scale {float} -- the scale of the face/object
        resolution {float} -- the output resolution

    Keyword Arguments:
        invert {bool} -- define whether the function should produce the direct
            or the inverse transformation matrix (default: {False})
    """
    point.append(1)

    h = 200.0 * scale
    t = F.matrix_diag(F.constant(1, [3]))
    t.d[0, 0] = resolution / h
    t.d[1, 1] = resolution / h
    t.d[0, 2] = resolution * (-center[0] / h + 0.5)
    t.d[1, 2] = resolution * (-center[1] / h + 0.5)

    if invert:
        t = F.reshape(F.batch_inv(F.reshape(t, [1, 3, 3])), [3, 3])

    _pt = nn.Variable.from_numpy_array(point)
    new_point = F.reshape(F.batch_matmul(
        F.reshape(t, [1, 3, 3]), F.reshape(_pt, [1, 3, 1])), [3, ])[0:2]

    return new_point.d.astype(int)
def vision_transformer(x, input_res, patch_size, v_width, v_layers, v_heads,
                       embed_dim):
    scale = v_width**-0.5

    with nn.parameter_scope("visual"):
        con1_w = nn.parameter.get_parameter_or_create(
            name="conv1/W", shape=(v_width, 3, patch_size, patch_size))
        x = F.convolution(
            x, con1_w, bias=None,
            stride=(patch_size, patch_size))  # shape = [*, width, grid, grid]

        # shape = [*, width, grid ** 2]
        x = F.reshape(x, (x.shape[0], x.shape[1], -1))
        x = F.transpose(x, (0, 2, 1))  # shape = [*, grid ** 2, width]

        z = np.zeros((x.shape[0], 1, x.shape[-1]))
        zeros = nn.Variable.from_numpy_array(z)
        class_embed = nn.parameter.get_parameter_or_create(
            name="class_embedding", shape=(v_width, )).reshape(
                (x.shape[0], 1, v_width))
        # shape = [*, grid ** 2 + 1, width]
        x = F.concatenate(class_embed + zeros, x, axis=1)

        positional_embedding = nn.parameter.get_parameter_or_create(
            name='positional_embedding',
            shape=((input_res // patch_size)**2 + 1, v_width)).reshape(
                (x.shape[0], x.shape[1], v_width))
        x = x + positional_embedding

        ln_pre_w = nn.parameter.get_parameter_or_create(
            name="ln_pre/W", shape=(v_width, )).reshape((1, 1, v_width))
        ln_pre_b = nn.parameter.get_parameter_or_create(
            name="ln_pre/b", shape=(v_width, )).reshape((1, 1, v_width))
        x = F.layer_normalization(x, ln_pre_b, ln_pre_w, batch_axis=(0, 1))

        x = F.transpose(x, (1, 0, 2))  # NLD -> LND
        x = transformer(x, v_width, v_layers, v_heads)
        x = F.transpose(x, (1, 0, 2))  # LND -> NLD

        ln_post_w = nn.parameter.get_parameter_or_create(
            name="ln_post/W", shape=(v_width, )).reshape((1, 1, v_width))
        ln_post_b = nn.parameter.get_parameter_or_create(
            name="ln_post/b", shape=(v_width, )).reshape((1, 1, v_width))
        x = F.slice(x, stop=(x.shape[0], 1, x.shape[2]))
        x = F.layer_normalization(x, ln_post_b, ln_post_w)

        if 'proj' in nn.get_parameters():
            visual_proj = nn.parameter.get_parameter_or_create(
                name="proj", shape=(v_width, embed_dim)).reshape(
                    (1, v_width, -1))
            x = F.batch_matmul(x, visual_proj)

        x = x.reshape((-1, embed_dim))

    return x
def build_self_attention_model(train=True):
    x = nn.Variable((batch_size, max_len))
    t = nn.Variable((batch_size, 1))
    mask = get_mask(x)
    attention_mask = (F.constant(1, shape=mask.shape) - mask) * F.constant(
        np.finfo(np.float32).min, shape=mask.shape)

    with nn.parameter_scope('embedding'):
        h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
    with nn.parameter_scope('forward'):
        h_f = lstm(h, hidden_size, mask=mask,
                   return_sequences=True, return_state=False)
    with nn.parameter_scope('backward'):
        h_b = lstm(h[:, ::-1, ], hidden_size, mask=mask,
                   return_sequences=True, return_state=False)[:, ::-1, ]
    h = F.concatenate(h_f, h_b, axis=2)
    if train:
        h = F.dropout(h, p=dropout_ratio)

    with nn.parameter_scope('da'):
        a = F.tanh(time_distributed(PF.affine)(h, da))
    if train:
        a = F.dropout(a, p=dropout_ratio)
    with nn.parameter_scope('r'):
        a = time_distributed(PF.affine)(a, r)
    if train:
        a = F.dropout(a, p=dropout_ratio)

    a = F.softmax(a + attention_mask, axis=1)
    m = F.batch_matmul(a, h, transpose_a=True)

    with nn.parameter_scope('output_mlp'):
        output = F.relu(PF.affine(m, output_mlp_size))
    if train:
        output = F.dropout(output, p=dropout_ratio)
    with nn.parameter_scope('output'):
        y = F.sigmoid(PF.affine(output, 1))

    accuracy = F.mean(F.equal(F.round(y), t))
    loss = F.mean(F.binary_cross_entropy(y, t)) \
        + attention_penalty_coef * frobenius(
            F.batch_matmul(a, a, transpose_a=True) - batch_eye(batch_size, r))
    return x, t, accuracy, loss
def _spectral_norm_outer_most_dim_backward(dw_sn, w, u, itr=1, eps=1e-12):
    # Forward recomputation

    w_shape = w.shape
    d0 = np.prod(w.shape[0:-1])  # In
    d1 = w.shape[-1]             # Out
    w = F.reshape(w, [d0, d1])
    u = F.reshape(u, [d1, 1])
    # Power method
    for _ in range(itr):
        # v
        v = F.affine(w, u)
        v = v / ((F.sum(v ** 2.0, keepdims=True) + eps) ** 0.5)
        v = F.reshape(v, [1, d0])
        # u
        u = F.affine(v, w)
        u = u / ((F.sum(u ** 2.0, keepdims=True) + eps) ** 0.5)
        u = F.reshape(u, [d1, 1])
    # No grad
    u = no_grad(u)
    v = no_grad(v)
    # Spectral normalization
    vw = F.affine(v, w)
    sigma = F.affine(vw, u)
    w_sn = w / sigma
    # The following process is not necessary for gradient calculation
    # w_sn = F.reshape(w_sn, w_shape)

    # Backward for spectral norm
    dw_sn = dw_sn.reshape(w.shape)
    # Sum for broadcast backward
    S = sum_for_arithmetics(dw_sn * w_sn, sigma)
    # Add batch axis
    S = S.reshape((1,) + S.shape)
    u = u.reshape((1,) + u.shape)
    v = v.reshape((1,) + v.shape)
    m = F.batch_matmul(v, S, transpose_a=True)
    m = F.batch_matmul(m, u, transpose_b=True)
    # Remove batch axis
    m = m.reshape((m.shape[1], m.shape[2]))
    dw = (dw_sn - m) / sigma
    dw = dw.reshape(w_shape)

    return dw, None
def attention(query, key, value, mask: Optional[nn.Variable] = None,
              train: bool = True, dropout_ratio: float = 0.1,
              fix_parameters=False):
    '''
    A global attention layer

    Args:
        query (nnabla.Variable): A shape of [B, sen_len_query, units]
        key (nnabla.Variable): A shape of [B, sen_len_memory, units]
        value (nnabla.Variable): A shape of [B, sen_len_memory, units]
        mask (nnabla.Variable): A shape of [B, sen_len_query, sen_len_memory]
        fix_parameters (bool): Fix parameters (Set need_grad=False).

    Returns:
        nn.Variable: A shape of [B, sen_len_query, units].
    '''
    batch_size, sentence_length_query, embedding_size = query.shape
    batch_size, sentence_length_memory, embedding_size = key.shape
    q = query  # -> (batch_size, sentence_length_query, embedding_size)
    k = key    # -> (batch_size, sentence_length_memory, embedding_size)
    v = value  # -> (batch_size, sentence_length_memory, embedding_size)

    logit = F.batch_matmul(q, k, transpose_b=True) * (embedding_size**-0.5)
    # -> (batch_size, sentence_length_query, sentence_length_memory)

    # mask has a shape of (batch_size, sentence_length_query, sentence_length_memory)
    if mask is not None:
        logit += get_attention_logit_mask(mask)

    attention_weights = F.softmax(logit, axis=2)
    # -> (batch_size, sentence_length_query, sentence_length_memory)

    if train:
        attention_weights = F.dropout(attention_weights, p=dropout_ratio)

    attention_output = F.batch_matmul(attention_weights, v)
    # -> (batch_size, sentence_length_query, embedding_size)

    return attention_output
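# Hedged usage sketch (illustration only, not from the original source; assumes the
# file's usual nnabla/numpy imports): runs the global attention layer without a
# mask and with dropout disabled, and prints the output shape.
def _example_attention():
    import numpy as np
    import nnabla as nn

    B, Lq, Lm, E = 2, 4, 6, 8
    query = nn.Variable.from_numpy_array(np.random.randn(B, Lq, E).astype(np.float32))
    key = nn.Variable.from_numpy_array(np.random.randn(B, Lm, E).astype(np.float32))
    value = nn.Variable.from_numpy_array(np.random.randn(B, Lm, E).astype(np.float32))
    out = attention(query, key, value, mask=None, train=False)
    out.forward()
    print(out.shape)  # (2, 4, 8)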
def equivariance_jacobian_loss(kp_driving_jacobian, arithmetic_jacobian,
                               trans_kp_jacobian, weight):
    jacobian_transformed = F.batch_matmul(
        arithmetic_jacobian, trans_kp_jacobian)

    normed_driving = F.reshape(
        F.batch_inv(
            F.reshape(kp_driving_jacobian,
                      (-1, ) + kp_driving_jacobian.shape[-2:])),
        kp_driving_jacobian.shape)
    normed_transformed = jacobian_transformed
    value = F.batch_matmul(normed_driving, normed_transformed)

    eye = nn.Variable.from_numpy_array(np.reshape(np.eye(2), (1, 1, 2, 2)))

    jacobian_loss = F.mean(F.absolute_error(eye, value))
    loss = weight * jacobian_loss
    return loss
def compute_mel(self, wave):
    hp = self.hparams
    reals, imags = F.stft(wave, window_size=hp.win_length,
                          stride=hp.hop_length, fft_size=hp.n_fft)
    linear = F.pow_scalar(
        F.add2(F.pow_scalar(reals, 2), F.pow_scalar(imags, 2)), 0.5)
    mels = F.batch_matmul(self.basis, linear)
    mels = F.log(F.clip_by_value(mels, 1e-5, np.inf)).apply(need_grad=False)
    return mels
def create_sparse_motions(source_image, kp_driving, kp_source, num_kp):
    bs, _, h, w = source_image.shape
    identity_grid = make_coordinate_grid((h, w))
    identity_grid = F.reshape(
        identity_grid, (1, 1, h, w, 2))  # (1, 1, h, w, 2)
    coordinate_grid = identity_grid - \
        F.reshape(kp_driving['value'], (bs, num_kp, 1, 1, 2), inplace=False)

    if 'jacobian' in kp_driving:
        jacobian = F.batch_matmul(
            kp_source['jacobian'],
            F.reshape(
                F.batch_inv(
                    F.reshape(kp_driving['jacobian'],
                              (-1, ) + kp_driving['jacobian'].shape[-2:],
                              inplace=False)),
                kp_driving['jacobian'].shape))
        # what it does:
        # batched_driving_jacobian = F.reshape(kp_driving['jacobian'], (-1,) + kp_driving['jacobian'].shape[-2:])
        # batched_inverse_jacobian = F.batch_inv(batched_driving_jacobian)
        # inverse_jacobian = F.reshape(batched_inverse_jacobian, kp_driving['jacobian'].shape)

        jacobian = F.reshape(
            jacobian, jacobian.shape[:-2] + (1, 1) + jacobian.shape[-2:])
        jacobian = F.broadcast(
            jacobian, jacobian.shape[:2] + (h, w) + jacobian.shape[-2:])
        coordinate_grid = F.batch_matmul(
            jacobian, F.reshape(coordinate_grid, coordinate_grid.shape + (1, )))
        coordinate_grid = F.reshape(
            coordinate_grid, coordinate_grid.shape[:-1])

    driving_to_source = coordinate_grid + \
        F.reshape(kp_source['value'], (bs, num_kp, 1, 1, 2), inplace=False)

    # background feature
    identity_grid = F.broadcast(identity_grid, (bs, 1, h, w, 2))
    sparse_motions = F.concatenate(identity_grid, driving_to_source, axis=1)
    return sparse_motions
def encode_text(text):
    param_dict = nn.get_parameters()

    embed_dim = param_dict['text_projection'].shape[1]
    context_length = param_dict['positional_embedding'].shape[0]
    vocab_size = param_dict['token_embedding/W'].shape[0]
    transformer_width = param_dict['ln_final/W'].shape[0]
    transformer_heads = transformer_width // 64
    transformer_layers = len(
        set(
            k.split('/')[2] for k in param_dict.keys()
            if k.startswith('transformer/resblocks')))

    token_embedding = nn.parameter.get_parameter_or_create(
        name='token_embedding/W', shape=(vocab_size, transformer_width))
    x = F.embed(text, token_embedding)  # [batch_size, n_ctx, d_model]

    positional_embedding = nn.parameter.get_parameter_or_create(
        name='positional_embedding',
        shape=(context_length, transformer_width)).reshape(
            (1, context_length, transformer_width))
    x = x + positional_embedding

    x = F.transpose(x, (1, 0, 2))  # NLD -> LND
    x = transformer(x, transformer_width, transformer_layers,
                    transformer_heads,
                    attn_mask=build_attn_mask(context_length))
    x = F.transpose(x, (1, 0, 2))  # LND -> NLD

    ln_final_W = nn.parameter.get_parameter_or_create(
        name='ln_final/W', shape=(transformer_width, )).reshape(
            (1, 1, transformer_width))
    ln_final_b = nn.parameter.get_parameter_or_create(
        name='ln_final/b', shape=(transformer_width, )).reshape(
            (1, 1, transformer_width))
    x = F.layer_normalization(x, ln_final_b, ln_final_W, batch_axis=(0, 1))

    idx = F.max(text, axis=-1, only_index=True)
    idx.forward()
    x = x[list(range(x.shape[0])), idx.d].reshape((1, x.shape[0], -1))

    text_projection = nn.parameter.get_parameter_or_create(
        name='text_projection',
        shape=(transformer_width, embed_dim)).reshape(
            (1, transformer_width, embed_dim))
    x = F.batch_matmul(x, text_projection)

    x = x.reshape((-1, embed_dim))

    return x
def unit_sphere_intersection(self, camloc, raydir):
    BR, _ = raydir.shape
    a = 1.0  # raydir is already normalized
    b = 2.0 * F.batch_matmul(F.reshape(camloc, (BR, 1, 3)),
                             F.reshape(raydir, (BR, 3, 1)))
    c = F.batch_matmul(F.reshape(camloc, (BR, 1, 3)),
                       F.reshape(camloc, (BR, 3, 1))) - 1.0
    D = b ** 2 - 4 * a * c
    mask = F.reshape(F.greater_scalar(D, 0.0), (BR, 1))

    b = F.reshape(b, (BR, 1))
    D = F.reshape(D, (BR, 1))
    D = mask * D
    D_sqrt = D ** 0.5
    t_start = -(b + D_sqrt) / (2 * a)
    t_finish = -(b - D_sqrt) / (2 * a)

    t_start = t_start * mask + self.t_near * (1 - mask)
    t_finish = t_finish * mask + self.t_far * (1 - mask)

    return t_start, t_finish, mask
def compute_mel(wave, basis, hp):
    r"""Compute the mel-spectrogram from the waveform.

    Args:
        wave (nn.Variable): Waveform variable of shape (B, 1, L).
        basis (nn.Variable): Basis for mel-spectrogram computation.
        hp (HParams): Hyper-parameters.

    Returns:
        nn.Variable: Output variable.
    """
    reals, imags = stft(wave, window_size=hp.win_length,
                        stride=hp.hop_length, fft_size=hp.n_fft)
    linear = (reals**2 + imags**2)**0.5
    mels = F.batch_matmul(basis, linear)
    mels = F.log(F.clip_by_value(mels, 1e-5, np.inf))
    return mels
def sample_network(x_curr, sdf_cur, raydir, grad_curr):
    """
    x_curr: Points (B, R, 3) either on surface or not
    sdf_cur: SDF on x_curr (B, R, 1)
    raydir: Ray direction (B, R, 3)
    grad_curr: Gradients on x_curr (B, R, 3)
    """
    # Denominator
    de = F.batch_matmul(grad_curr[..., np.newaxis, :],
                        raydir[..., np.newaxis, :], transpose_b=True)
    de = de.reshape(sdf_cur.shape)
    de_inv = (1.0 / de).apply(need_grad=False)
    de_inv = F.minimum_scalar(de_inv, 1e30).apply(
        need_grad=False)  # (numerical issue de = cos(x, y) = 0)

    # Differentiable intersection point (discrete update of implicit differentiation)
    sdf_cur0 = sdf_cur.get_unlinked_variable(need_grad=False)
    x_hat = x_curr - (sdf_cur - sdf_cur0) * de_inv * raydir

    return x_hat
def log_mel_spectrogram(wave, sr, window_size, n_mels=80):
    """Return log mel-spectrogram.

    Args:
        wave (nn.Variable): Input waveform of shape (B, 1, L).
        sr (int): Sampling rate.
        window_size (int): Window size.
        n_mels (int): Number of mel banks.

    Returns:
        nn.Variable: Log mel-spectrogram.
    """
    linear = spectrogram(wave, window_size)
    mel_basis = librosa_mel_fn(sr, window_size, n_mels=n_mels,
                               fmin=80.0, fmax=7600.0)
    basis = nn.Variable.from_numpy_array(mel_basis[None, ...])
    mels = F.batch_matmul(basis, linear)
    return F.log(mels * 1e4 + 1.0)
def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    shape_a = inputs[0].shape
    shape_b = inputs[1].shape
    if shape_a[:-2] != shape_b[:-2]:
        raise ValueError(
            "shape_a[:-2] ({}) != shape_b[:-2] ({}).\n"
            "Implicit broadcast is not supported now.".format(
                shape_a[:-2], shape_b[:-2]))

    # Args
    transpose_a = self.forward_func.info.args["transpose_a"]
    transpose_b = self.forward_func.info.args["transpose_b"]

    # Inputs
    x0 = inputs[0].data
    x1 = inputs[1].data
    dy = inputs[2].data
    # Outputs
    dx0 = outputs[0].data
    dx1 = outputs[1].data
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_x1 = inputs[1].grad
    g_dy = inputs[2].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad
    g_dx1 = outputs[1].grad

    # Computation
    if prop_down[0]:
        # condition
        if (transpose_a, transpose_b) == (True, True):
            g_x0_ = F.batch_matmul(g_dx1, dy, True, True)
        if (transpose_a, transpose_b) == (True, False):
            g_x0_ = F.batch_matmul(g_dx1, dy, False, True)
        if (transpose_a, transpose_b) == (False, True):
            g_x0_ = F.batch_matmul(dy, g_dx1, False, False)
        if (transpose_a, transpose_b) == (False, False):
            g_x0_ = F.batch_matmul(dy, g_dx1, False, True)
        # reshape for batch axes
        if g_x0_.shape != g_x0.shape:
            g_x0_ = F.reshape(g_x0_, g_x0.shape)
        if accum[0]:
            g_x0 += g_x0_
        else:
            g_x0.copy_from(g_x0_)

    if prop_down[1]:
        # condition
        if (transpose_a, transpose_b) == (True, True):
            g_x1_ = F.batch_matmul(dy, g_dx0, True, True)
        if (transpose_a, transpose_b) == (True, False):
            g_x1_ = F.batch_matmul(g_dx0, dy, False, False)
        if (transpose_a, transpose_b) == (False, True):
            g_x1_ = F.batch_matmul(dy, g_dx0, True, False)
        if (transpose_a, transpose_b) == (False, False):
            g_x1_ = F.batch_matmul(g_dx0, dy, True, False)
        # reshape for batch axes
        if g_x1_.shape != g_x1.shape:
            g_x1_ = F.reshape(g_x1_, g_x1.shape)
        if accum[1]:
            g_x1 += g_x1_
        else:
            g_x1.copy_from(g_x1_)

    if prop_down[2]:
        t1 = F.batch_matmul(g_dx0, x1, transpose_a, transpose_b)
        t2 = F.batch_matmul(x0, g_dx1, transpose_a, transpose_b)
        g_dy_ = t1 + t2
        if accum[2]:
            g_dy += g_dy_
        else:
            g_dy.copy_from(g_dy_)
                                   with_file_cache=False)

x = nn.Variable([batch_size, window_size * 2])
with nn.parameter_scope('W_in'):
    h = PF.embed(x, vocab_size, embedding_size)
h = F.mean(h, axis=1)
h = expand_dims(h, axis=-1)  # (batch_size, embedding_size, 1)

t = nn.Variable([batch_size, 1])
t_neg = nn.Variable([batch_size, k])
with nn.parameter_scope('W_out'):
    _t = PF.embed(t, vocab_size, embedding_size)          # (batch_size, 1, embedding_size)
    _t_neg = PF.embed(t_neg, vocab_size, embedding_size)  # (batch_size, k, embedding_size)

t_score = F.sigmoid(F.reshape(F.batch_matmul(_t, h), shape=(batch_size, 1)))
t_neg_score = F.sigmoid(
    F.reshape(F.batch_matmul(_t_neg, h), shape=(batch_size, k)))

t_loss = F.binary_cross_entropy(t_score, F.constant(1, shape=(batch_size, 1)))
t_neg_loss = F.binary_cross_entropy(
    t_neg_score, F.constant(0, shape=(batch_size, k)))

loss = F.mean(F.sum(t_loss, axis=1) + F.sum(t_neg_loss, axis=1))

# Create solver.
solver = S.Adam()
solver.set_parameters(nn.get_parameters())

trainer = Trainer(inputs=[x, t, t_neg], loss=loss, solver=solver)
trainer.run(train_data_iter, valid_data_iter, epochs=max_epoch)
def cond_att_lstm(x,
                  parent_index,
                  mask,
                  context,
                  context_mask,
                  state_size,
                  att_hidden_size,
                  initial_state=None,
                  initial_cell=None,
                  hist=None,
                  dropout=0,
                  train=True,
                  w_init=None,
                  inner_w_init=None,
                  b_init=I.ConstantInitializer(0),
                  forget_bias_init=I.ConstantInitializer(1)):
    """
    x: (batch_size, length, input_size)
    parent_index: (batch_size, length)
    mask: (batch_size, length)
    context: (batch_size, context_length, context_size)
    context_mask: (batch_size, context_length)
    hist: (batch_size, l, state_size)
    """
    batch_size, length, input_size = x.shape
    _, context_length, context_size = context.shape

    if w_init is None:
        w_init = I.UniformInitializer(
            I.calc_uniform_lim_glorot(input_size, state_size))
    if inner_w_init is None:
        inner_w_init = orthogonal

    retain_prob = 1.0 - dropout
    z_w = nn.Variable((batch_size, 4, input_size), need_grad=False)
    z_w.d = 1
    z_u = nn.Variable((batch_size, 4, state_size), need_grad=False)
    z_u.d = 1

    if dropout > 0:
        if train:
            z_w = F.dropout(z_w, p=retain_prob)
            z_u = F.dropout(z_u, p=retain_prob)
        z_w *= retain_prob
        z_u *= retain_prob
    z_w = F.reshape(z_w, (batch_size, 4, 1, input_size))
    z_w = F.broadcast(z_w, (batch_size, 4, length, input_size))
    z_w = F.split(z_w, axis=1)
    z_u = F.split(z_u, axis=1)
    xi = z_w[0] * x
    xf = z_w[1] * x
    xc = z_w[2] * x
    xo = z_w[3] * x

    with nn.parameter_scope("cond_att_lstm"):
        # (batch_size, length, state_size)
        with nn.parameter_scope("lstm"):
            xi = PF.affine(
                xi, state_size, base_axis=2, w_init=w_init,
                b_init=b_init, name="Wi")
            xf = PF.affine(
                xf, state_size, base_axis=2, w_init=w_init,
                b_init=forget_bias_init, name="Wf")
            xc = PF.affine(
                xc, state_size, base_axis=2, w_init=w_init,
                b_init=b_init, name="Wc")
            xo = PF.affine(
                xo, state_size, base_axis=2, w_init=w_init,
                b_init=b_init, name="Wo")

        with nn.parameter_scope("context"):
            # context_att_trans: (batch_size, context_size, att_hidden_size)
            context_att_trans = PF.affine(
                context, att_hidden_size, base_axis=2, w_init=w_init,
                b_init=b_init, name="layer1_c")

        if initial_state is None:
            h = nn.Variable((batch_size, state_size), need_grad=False)
            h.data.zero()
        else:
            h = initial_state

        if initial_cell is None:
            c = nn.Variable((batch_size, state_size), need_grad=False)
            c.data.zero()
        else:
            c = initial_cell

        if hist is None:
            hist = nn.Variable((batch_size, 1, state_size), need_grad=False)
            hist.data.zero()

        # (batch_size, state_size)
        xi = split(xi, axis=1)
        xf = split(xf, axis=1)
        xc = split(xc, axis=1)
        xo = split(xo, axis=1)

        mask = F.reshape(mask, [batch_size, length, 1])  # (batch_size, length, 1)
        mask = F.broadcast(mask, [batch_size, length, state_size])
        # (batch_size, state_size)
        mask = split(mask, axis=1)

        # (batch_size, max_action_length)
        parent_index = parent_index + 1  # index == 0 means that parent is root
        # (batch_size)
        parent_index = split(parent_index, axis=1)

        hs = []
        cs = []
        ctx = []

        for i, f, c2, o, m, p in zip(xi, xf, xc, xo, mask, parent_index):
            h_num = hist.shape[1]
            with nn.parameter_scope("context"):
                h_att_trans = PF.affine(
                    h, att_hidden_size, with_bias=False, w_init=w_init,
                    name="layer1_h")  # (batch_size, att_hidden_size)
                h_att_trans = F.reshape(
                    h_att_trans, (batch_size, 1, att_hidden_size))
                h_att_trans = F.broadcast(
                    h_att_trans, (batch_size, context_length, att_hidden_size))
                att_hidden = F.tanh(context_att_trans + h_att_trans)
                att_raw = PF.affine(
                    att_hidden, 1, base_axis=2, w_init=w_init, b_init=b_init)
                # (batch_size, context_length, 1)
                att_raw = F.reshape(att_raw, (batch_size, context_length))
                ctx_att = F.exp(att_raw - F.max(att_raw, axis=1, keepdims=True))
                ctx_att = ctx_att * context_mask
                ctx_att = ctx_att / F.sum(ctx_att, axis=1, keepdims=True)
                ctx_att = F.reshape(ctx_att, (batch_size, context_length, 1))
                ctx_att = F.broadcast(
                    ctx_att, (batch_size, context_length, context_size))
                ctx_vec = F.sum(
                    context * ctx_att, axis=1)  # (batch_size, context_size)

            # parent_history
            p = F.reshape(p, (batch_size, 1))
            p = F.one_hot(p, (h_num, ))
            p = F.reshape(p, (batch_size, 1, h_num))
            par_h = F.batch_matmul(p, hist)  # [batch_size, 1, state_size]
            par_h = F.reshape(par_h, (batch_size, state_size))

            with nn.parameter_scope("lstm"):
                i_t = PF.affine(
                    z_u[0] * h,
                    state_size,
                    w_init=inner_w_init(state_size, state_size),
                    with_bias=False,
                    name="Ui")
                i_t += PF.affine(
                    ctx_vec,
                    state_size,
                    w_init=inner_w_init(context_size, state_size),
                    with_bias=False,
                    name="Ci")
                i_t += PF.affine(
                    par_h,
                    state_size,
                    w_init=inner_w_init(state_size, state_size),
                    with_bias=False,
                    name="Pi")
                i_t = F.sigmoid(i + i_t)
                f_t = PF.affine(
                    z_u[1] * h,
                    state_size,
                    w_init=inner_w_init(state_size, state_size),
                    with_bias=False,
                    name="Uf")
                f_t += PF.affine(
                    ctx_vec,
                    state_size,
                    w_init=inner_w_init(context_size, state_size),
                    with_bias=False,
                    name="Cf")
                f_t += PF.affine(
                    par_h,
                    state_size,
                    w_init=inner_w_init(state_size, state_size),
                    with_bias=False,
                    name="Pf")
                f_t = F.sigmoid(f + f_t)
                c_t = PF.affine(
                    z_u[2] * h,
                    state_size,
                    w_init=inner_w_init(state_size, state_size),
                    with_bias=False,
                    name="Uc")
                c_t += PF.affine(
                    ctx_vec,
                    state_size,
                    w_init=inner_w_init(context_size, state_size),
                    with_bias=False,
                    name="Cc")
                c_t += PF.affine(
                    par_h,
                    state_size,
                    w_init=inner_w_init(state_size, state_size),
                    with_bias=False,
                    name="Pc")
                c_t = f_t * c + i_t * F.tanh(c2 + c_t)
                o_t = PF.affine(
                    z_u[3] * h,
                    state_size,
                    w_init=inner_w_init(state_size, state_size),
                    with_bias=False,
                    name="Uo")
                o_t += PF.affine(
                    ctx_vec,
                    state_size,
                    w_init=inner_w_init(context_size, state_size),
                    with_bias=False,
                    name="Co")
                o_t += PF.affine(
                    par_h,
                    state_size,
                    w_init=inner_w_init(state_size, state_size),
                    with_bias=False,
                    name="Po")
                o_t = F.sigmoid(o + o_t)
                h_t = o_t * F.tanh(c_t)

            h_t = (1 - m) * h + m * h_t
            c_t = (1 - m) * c + m * c_t
            h = h_t
            c = c_t

            h_t = F.reshape(h_t, (batch_size, 1, state_size), inplace=False)
            c_t = F.reshape(c_t, (batch_size, 1, state_size), inplace=False)
            ctx_vec = F.reshape(
                ctx_vec, (batch_size, 1, context_size), inplace=False)
            hs.append(h_t)
            cs.append(c_t)
            ctx.append(ctx_vec)

            hist = F.concatenate(
                hist, h_t, axis=1)  # (batch_size, h_num + 1, state_size)

    return concatenate(*hs, axis=1), concatenate(*cs, axis=1), \
        concatenate(*ctx, axis=1), hist
def _spectral_norm_backward(dw_sn, w, u, dim=0, itr=1, eps=1e-12):
    # Forward recomputation

    w_shape = w.shape
    # Transpose if the output dimension is not the left-most dimension.
    if dim != 0:
        dims_transpose = [dim] + [i for i in range(len(w_shape)) if i != dim]
        w = F.transpose(w, dims_transpose)
        w_shape = w.shape
    d0 = w.shape[0]            # Out
    d1 = np.prod(w.shape[1:])  # In
    w = F.reshape(w, [d0, d1])
    u = F.reshape(u, [1, d0])
    # Power method
    for _ in range(itr):
        # v
        v = F.affine(u, w)
        v = v / ((F.sum(v ** 2.0, keepdims=True) + eps) ** 0.5)
        v = F.reshape(v, [d1, 1])
        # u
        u = F.affine(w, v)
        u = u / ((F.sum(u ** 2.0, keepdims=True) + eps) ** 0.5)
        u = F.reshape(u, [1, d0])
    # No grad
    u = no_grad(u)
    v = no_grad(v)
    # Spectral normalization
    wv = F.affine(w, v)
    sigma = F.affine(u, wv)
    w_sn = w / sigma
    # The following process is not necessary for gradient calculation
    # w_sn = F.reshape(w_sn, w_shape)
    # # Transpose again if the output dimension is not the left-most dimension.
    # if dim != 0:
    #     dims_transpose = [i for i in range(1, dim + 1)] \
    #         + [0] + [i for i in range(dim + 1, len(w_shape))]
    #     w_sn = F.transpose(w_sn, dims_transpose)

    # Backward

    # Backward for post-transpose
    if dim != 0:
        dims_transpose = [dim] + [i for i in range(len(w_shape)) if i != dim]
        dw_sn = F.transpose(dw_sn, dims_transpose)
    dw_sn = dw_sn.reshape(w.shape)

    # Backward for spectral norm
    # Sum for broadcast backward
    S = sum_for_arithmetics(dw_sn * w_sn, sigma)
    # Add batch axis
    S = S.reshape((1,) + S.shape)
    u = u.reshape((1,) + u.shape)
    v = v.reshape((1,) + v.shape)
    m = F.batch_matmul(u, S, transpose_a=True)
    m = F.batch_matmul(m, v, transpose_b=True)
    # Remove batch axis
    m = m.reshape((m.shape[1], m.shape[2]))
    dw = (dw_sn - m) / sigma

    # Backward for pre-transpose
    dw = dw.reshape(w_shape)
    if dim != 0:
        dims_transpose = [i for i in range(1, dim + 1)] \
            + [0] + [i for i in range(dim + 1, len(w_shape))]
        dw = F.transpose(dw, dims_transpose)

    return dw, None