def epoch_predict(env, args, model, loader):
    """Predict in one epoch"""
    model.eval()
    arcs, rels, probs = [], [], []
    for words, feats in loader():
        # ignore the first token of each sentence
        tmp_words = layers.pad(words[:, 1:],
                               paddings=[0, 0, 1, 0],
                               pad_value=args.pad_index)
        mask = tmp_words != args.pad_index
        lens = nn.reduce_sum(mask, -1)
        s_arc, s_rel = model(words, feats)
        arc_preds, rel_preds = decode(args, s_arc, s_rel, mask)
        arcs.extend(
            layers.split(nn.masked_select(arc_preds, mask),
                         lens.numpy().tolist()))
        rels.extend(
            layers.split(nn.masked_select(rel_preds, mask),
                         lens.numpy().tolist()))
        if args.prob:
            arc_probs = nn.index_sample(layers.softmax(s_arc, -1),
                                        layers.unsqueeze(arc_preds, -1))
            probs.extend(
                layers.split(
                    nn.masked_select(layers.squeeze(arc_probs, axes=[-1]),
                                     mask), lens.numpy().tolist()))
    arcs = [seq.numpy().tolist() for seq in arcs]
    rels = [env.REL.vocab[seq.numpy().tolist()] for seq in rels]
    probs = [[round(p, 3) for p in seq.numpy().tolist()] for seq in probs]
    return arcs, rels, probs
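# Minimal standalone sketch (dummy data, not from the original repo): how
# `layers.split` with a list of per-sentence lengths unbatches a flattened
# prediction tensor, as `epoch_predict` above does after `nn.masked_select`.
# The lengths and values here are illustrative assumptions.
import numpy as np
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers

with dg.guard():
    # predictions for 3 sentences of lengths 2, 4 and 3, already flattened
    flat_preds = dg.to_variable(np.arange(9, dtype="int64"))
    lens = [2, 4, 3]
    per_sentence = layers.split(flat_preds, lens, dim=0)
    # -> [[0, 1], [2, 3, 4, 5], [6, 7, 8]]
    print([seq.numpy().tolist() for seq in per_sentence])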
def forward(self, input, pre_encode_hidden):
    pre_hidden, encode_hidden = layers.split(
        pre_encode_hidden,
        num_or_sections=[self._hiden_size, self._encode_hiden_size],
        dim=1)
    concat_input_hidden = layers.concat([input, pre_hidden, encode_hidden], 1)

    gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
    gate_input = layers.elementwise_add(gate_input, self._gate_bias)
    gate_input = self._gate_activation(gate_input)
    r, u = layers.split(gate_input, num_or_sections=2, dim=1)

    r_hidden = r * pre_hidden

    candidate = layers.matmul(
        layers.concat([input, r_hidden, encode_hidden], 1),
        self._candidate_weight)
    candidate = layers.elementwise_add(candidate, self._candidate_bias)
    c = self._activation(candidate)

    new_hidden = u * pre_hidden + (1 - u) * c
    return new_hidden
def forward(self, x, y, **kargs):
    """Adaptive Normalization forward.

    Args:
        x (N x C1 x *): input.
        y (N x C2): conditional information.

    Returns:
        out (N x C1 x *): output.
    """
    residual_dim = len(x.shape) - len(y.shape)
    if self.projection:
        if self.separate_projection:
            gamma = self.fc_gamma(y)
            beta = self.fc_beta(y)
            for _ in range(residual_dim):
                gamma = L.unsqueeze(gamma, -1)
                beta = L.unsqueeze(beta, -1)
        else:
            # project the conditional input to produce gamma and beta
            y = self.fc(y)
            for _ in range(residual_dim):
                y = L.unsqueeze(y, -1)
            gamma, beta = L.split(y, num_or_sections=2, dim=1)
    else:
        for _ in range(residual_dim):
            y = L.unsqueeze(y, -1)
        gamma, beta = L.split(y, 2, 1)
    x = self.norm(x) if self.norm is not None else x
    out = x * (1 + gamma) + beta
    return out
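# Minimal sketch (dummy data, not from the original module): how the per-channel
# gamma/beta computed above broadcast over the spatial dims of x once they are
# unsqueezed to the same rank. Shapes and values here are illustrative assumptions.
import numpy as np
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as L

with dg.guard():
    x = dg.to_variable(np.ones((2, 3, 8, 8), dtype="float32"))     # N x C x H x W
    gamma = dg.to_variable(np.full((2, 3), 0.5, dtype="float32"))  # N x C
    beta = dg.to_variable(np.zeros((2, 3), dtype="float32"))       # N x C
    for _ in range(len(x.shape) - len(gamma.shape)):
        gamma = L.unsqueeze(gamma, -1)
        beta = L.unsqueeze(beta, -1)
    out = x * (1 + gamma) + beta   # gamma/beta broadcast over H and W
    print(out.shape)               # [2, 3, 8, 8]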
def seq2seq_api_rnn(input_embedding, len=3, init_hiddens=None,
                    init_cells=None):
    class EncoderCell(layers.RNNCell):
        def __init__(self,
                     num_layers,
                     hidden_size,
                     dropout_prob=0.,
                     forget_bias=0.):
            self.num_layers = num_layers
            self.hidden_size = hidden_size
            self.dropout_prob = dropout_prob
            self.lstm_cells = []
            for i in range(num_layers):
                self.lstm_cells.append(
                    layers.LSTMCell(
                        hidden_size,
                        forget_bias=forget_bias,
                        param_attr=fluid.ParamAttr(
                            initializer=fluid.initializer.UniformInitializer(
                                low=-init_scale, high=init_scale))))

        def call(self, step_input, states):
            new_states = []
            for i in range(self.num_layers):
                out, new_state = self.lstm_cells[i](step_input, states[i])
                step_input = layers.dropout(
                    out,
                    self.dropout_prob,
                    dropout_implementation='upscale_in_train'
                ) if self.dropout_prob > 0 else out
                new_states.append(new_state)
            return step_input, new_states

    cell = EncoderCell(num_layers, hidden_size, dropout)
    output, new_states = layers.rnn(
        cell,
        inputs=input_embedding,
        initial_states=[
            [hidden, cell] for hidden, cell in zip([
                layers.reshape(init_hidden, shape=[-1, hidden_size])
                for init_hidden in layers.split(
                    init_hiddens, num_or_sections=num_layers, dim=0)
            ], [
                layers.reshape(init_cell, shape=[-1, hidden_size])
                for init_cell in layers.split(
                    init_cells, num_or_sections=num_layers, dim=0)
            ])
        ],
        time_major=False)
    last_hidden = layers.stack([hidden for hidden, _ in new_states], 0)
    last_cell = layers.stack([cell for _, cell in new_states], 0)
    return output, last_hidden, last_cell
def forward(self, z, condition=None):
    """Transform a random noise sampled from a standard Gaussian distribution
    into a sample from the target distribution, and output the mean and log
    standard deviation of the output distribution.

    Args:
        z (Variable): shape(B, T), random noise sampled from a standard
            Gaussian distribution.
        condition (Variable, optional): shape(B, F, T), dtype float, the
            upsampled condition. Defaults to None.

    Returns:
        (z, out_mu, out_log_std)
        z (Variable): shape(B, T), dtype float, transformed noise, it is the
            synthesized waveform.
        out_mu (Variable): shape(B, T), dtype float, means of the output
            distributions.
        out_log_std (Variable): shape(B, T), dtype float, log standard
            deviations of the output distributions.
    """
    for i, flow in enumerate(self.flows):
        theta = flow(z, condition)  # w, mu, log_std [0: T]
        w, mu, log_std = F.split(theta, 3, dim=-1)  # (B, T, 1) for each
        mu = F.squeeze(mu, [-1])  # [0: T]
        log_std = F.squeeze(log_std, [-1])  # [0: T]
        z = z * F.exp(log_std) + mu  # [0: T]

        if i == 0:
            out_mu = mu
            out_log_std = log_std
        else:
            out_mu = out_mu * F.exp(log_std) + mu
            out_log_std += log_std

    return z, out_mu, out_log_std
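# Minimal sketch (dummy shapes, not the real flow module): splitting a flow's
# output `theta` of shape (B, T, 3) into w, mu, log_std and applying the affine
# transform z -> z * exp(log_std) + mu, as each iteration above does.
import numpy as np
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as F

with dg.guard():
    B, T = 2, 5
    z = dg.to_variable(np.random.randn(B, T).astype("float32"))
    theta = dg.to_variable(np.random.randn(B, T, 3).astype("float32"))
    w, mu, log_std = F.split(theta, 3, dim=-1)  # each (B, T, 1)
    mu = F.squeeze(mu, [-1])                    # (B, T)
    log_std = F.squeeze(log_std, [-1])          # (B, T)
    z = z * F.exp(log_std) + mu                 # affine transform
    print(z.shape)                              # [2, 5]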
def add_input(self, x_t, speaker_embed=None):
    """Takes a step of inputs and returns a step of outputs. It works
    similarly to the `forward` method, but in a `step-in-step-out` fashion.

    Args:
        x_t (Variable): shape(B, C_in, T=1), dtype float32, the input of
            Conv1DGLU layer, where B means batch_size, C_in means the input
            channels.
        speaker_embed (Variable): shape(B, C_sp), dtype float32, speaker
            embed, where C_sp means speaker embedding size.

    Returns:
        x (Variable): shape(B, C_out), the output of Conv1DGLU, where C_out
            means the `num_filter`.
    """
    residual = x_t
    x_t = F.dropout(x_t,
                    self.dropout,
                    dropout_implementation="upscale_in_train")
    x_t = self.conv.add_input(x_t)
    content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1)

    if speaker_embed is not None:
        sp = F.softsign(self.fc(speaker_embed))
        content_t = F.elementwise_add(content_t, sp, axis=0)

    # glu
    x_t = F.sigmoid(gate_t) * content_t

    if self.residual:
        x_t = F.scale(x_t + residual, np.sqrt(0.5))
    return x_t
def forward(self, x, condition=None):
    """Conv1D gated-tanh block.

    Args:
        x (Variable): shape(B, C_res, T), dtype float32, the input. (B stands
            for batch_size, C_res stands for residual channels, T stands for
            time steps.)
        condition (Variable, optional): shape(B, C_cond, T), the condition.
            It has been upsampled in time steps, so it has the same time
            steps as the input does. (C_cond stands for the condition's
            channels.) Defaults to None.

    Returns:
        (residual, skip_connection)
        residual (Variable): shape(B, C_res, T), the residual, which is used
            as the input to the next ResidualBlock.
        skip_connection (Variable): shape(B, C_res, T), the skip connection.
            This output is accumulated with that of other ResidualBlocks.
    """
    time_steps = x.shape[-1]
    h = x

    # dilated conv
    h = self.conv(h)
    if h.shape[-1] != time_steps:
        h = h[:, :, :time_steps]

    # condition
    if condition is not None:
        h += self.condition_proj(condition)

    # gated tanh
    content, gate = F.split(h, 2, dim=1)
    z = F.sigmoid(gate) * F.tanh(content)

    # projection
    residual = F.scale(z + x, math.sqrt(.5))
    skip_connection = z
    return residual, skip_connection
def forward(self, x, speaker_embed=None):
    """
    Args:
        x (Variable): shape(B, C_in, T), dtype float32, the input of
            Conv1DGLU layer, where B means batch_size, C_in means the input
            channels and T means input time steps.
        speaker_embed (Variable): shape(B, C_sp), dtype float32, speaker
            embed, where C_sp means speaker embedding size.

    Returns:
        x (Variable): shape(B, C_out, T), the output of Conv1DGLU, where
            C_out means the `num_filters`.
    """
    residual = x
    x = F.dropout(x,
                  self.dropout,
                  dropout_implementation="upscale_in_train")
    x = self.conv(x)
    content, gate = F.split(x, num_or_sections=2, dim=1)

    if speaker_embed is not None:
        sp = F.softsign(self.fc(speaker_embed))
        content = F.elementwise_add(content, sp, axis=0)

    # glu
    x = F.sigmoid(gate) * content

    if self.residual:
        x = F.scale(x + residual, np.sqrt(0.5))
    return x
def forward(self, input, bias=None, padding=None):
    """
    input: input feature (B, T, C)
    padding: only used when using causal conv; we pad manually
    """
    input_dropped = F.dropout(input,
                              1. - self.keep_prob,
                              dropout_implementation="upscale_in_train")
    if self.causal:
        assert padding is not None
        input_dropped = F.concat([padding, input_dropped], axis=1)
    hidden = self.conv(input_dropped)

    if self.has_bias:
        assert bias is not None
        transformed_bias = F.softsign(self.bias_affine(bias))
        hidden_embedded = hidden + F.unsqueeze(transformed_bias, [1])
    else:
        hidden_embedded = hidden

    # glu
    content, gate = F.split(hidden, num_or_sections=2, dim=-1)
    # the content half is taken from the bias-augmented activation
    content = hidden_embedded[:, :, :self.in_channel]
    hidden = F.sigmoid(gate) * content

    # residual
    hidden = F.scale(input + hidden, math.sqrt(0.5))
    return hidden
def add_input(self, x, condition=None):
    """Add a step input. This method works similarly to `forward`, but in a
    `step-in-step-out` fashion.

    Args:
        x (Variable): shape(B, C_res, T=1), input for a step, dtype float32.
        condition (Variable, optional): shape(B, C_cond, T=1), condition for
            a step, dtype float32. Defaults to None.

    Returns:
        (residual, skip_connection)
        residual (Variable): shape(B, C_res, T=1), the residual for a step,
            which is used as the input to the next ResidualBlock.
        skip_connection (Variable): shape(B, C_res, T=1), the skip connection
            for a step. This output is accumulated with that of other
            ResidualBlocks.
    """
    h = x

    # dilated conv
    h = self.conv.add_input(h)

    # condition
    if condition is not None:
        h += self.condition_proj(condition)

    # gated tanh
    content, gate = F.split(h, 2, dim=1)
    z = F.sigmoid(gate) * F.tanh(content)

    # projection
    residual = F.scale(z + x, np.sqrt(0.5))
    skip_connection = z
    return residual, skip_connection
def sample_from_mog(self, y):
    """Sample from the output distribution where the output distribution is
    a mixture of Gaussians.

    Args:
        y (Variable): shape(B, T, C_output), dtype float32, the parameters of
            the output distribution. It is the concatenation of 3 parts: the
            logits of every distribution, the mean of each distribution and
            the log standard deviation of each distribution. Each part's
            shape is (B, T, n_mixture), where `n_mixture` means the number of
            Gaussians in the mixture.

    Returns:
        Variable: shape(B, T), waveform sampled from the output distribution.
    """
    batch_size, time_steps, output_dim = y.shape
    n_mixture = output_dim // 3

    w, mu, log_std = F.split(y, 3, dim=-1)

    reshaped_w = F.reshape(w, (batch_size * time_steps, n_mixture))
    prob_ids = F.sampling_id(F.softmax(reshaped_w))
    prob_ids = F.reshape(prob_ids, (batch_size, time_steps))
    prob_ids = prob_ids.numpy()

    index = np.array([[[b, t, prob_ids[b, t]] for t in range(time_steps)]
                      for b in range(batch_size)]).astype("int32")
    index_var = dg.to_variable(index)

    mu_ = F.gather_nd(mu, index_var)
    log_std_ = F.gather_nd(log_std, index_var)

    dist = D.Normal(mu_, F.exp(log_std_))
    samples = dist.sample(shape=[])
    samples = F.clip(samples, min=-1., max=1.)
    return samples
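# Minimal standalone sketch (dummy data, hypothetical shapes): picking one
# Gaussian per (batch, time) cell with `sampling_id` and gathering its mean via
# `gather_nd`, mirroring the indexing pattern used in `sample_from_mog` above.
import numpy as np
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as F

with dg.guard():
    B, T, n_mixture = 2, 4, 3
    w = dg.to_variable(np.random.randn(B, T, n_mixture).astype("float32"))
    mu = dg.to_variable(np.random.randn(B, T, n_mixture).astype("float32"))

    probs = F.softmax(F.reshape(w, (B * T, n_mixture)))
    ids = F.reshape(F.sampling_id(probs), (B, T)).numpy()  # component per cell

    index = np.array([[[b, t, ids[b, t]] for t in range(T)]
                      for b in range(B)]).astype("int32")
    mu_sel = F.gather_nd(mu, dg.to_variable(index))         # (B, T)
    print(mu_sel.shape)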
def forward(self, x, *cond_inputs, norm_weights=(None, None), **kwargs):
    """Spatially Adaptive Normalization (SPADE) forward."""
    output = self.norm(x)
    for i in range(len(cond_inputs)):
        if cond_inputs[i] is None:
            continue

        if type(cond_inputs[i]) == list:
            cond_input, mask = cond_inputs[i]
            mask = L.image_resize(mask,
                                  size=x.shape[2:],
                                  resample='BILINEAR',
                                  align_corners=False)
        else:
            cond_input = cond_inputs[i]
            mask = None

        label_map = L.image_resize(cond_input, x.shape[2:])
        if norm_weights is None or norm_weights[0] is None or i != 0:
            affine_params = self.mlps[i](label_map)
        else:
            affine_params = self.mlps[i](label_map, conv_weights=norm_weights)

        gamma, beta = L.split(affine_params, 2, 1)
        if mask is not None:
            gamma = gamma * (1 - mask)
            beta = beta * (1 - mask)
        output = output * (1 + gamma) + beta
    return output
def forward(self, input, class_id, input_class_emb=False):
    if isinstance(input, list):
        codes = [input[0]]
        codes += [
            input[2 * i + 1:2 * i + 3] for i in range(len(input) // 2)
        ]
    else:
        codes = layers.split(input, self.num_split, 1)

    if not input_class_emb:
        class_emb = self.embed_y(class_id)  # 128
    else:
        class_emb = class_id

    out = self.noise_fc(codes[0])
    out = layers.transpose(
        layers.reshape(out, (out.shape[0], 4, 4, -1)), (0, 3, 1, 2))
    for i, (code, gblock) in enumerate(zip(codes[1:], self.blocks)):
        if isinstance(input, list):
            condition = [layers.concat([c, class_emb], 1) for c in code]
        else:
            condition = layers.concat([code, class_emb], 1)
        out = gblock(out, condition)

    out = self.output_layer_bn(out)
    out = layers.relu(out)
    out = self.output_layer_conv(out)

    return (layers.tanh(out) + 1) / 2
def forward(self, input, pre_hidden):
    xu_t, xr_t, xc_t = layers.split(input, num_or_sections=3, dim=-1)

    gate_input = layers.matmul(x=pre_hidden, y=self._gate_weight)
    gate_input = layers.elementwise_add(gate_input, self._gate_bias)
    hu_t, hr_t = layers.split(gate_input, num_or_sections=2, dim=-1)

    u_add = layers.elementwise_add(xu_t, hu_t)
    r_add = layers.elementwise_add(xr_t, hr_t)
    u = self._gate_activation(u_add)
    r = self._gate_activation(r_add)
    r_hidden = r * pre_hidden

    candidate = layers.matmul(r_hidden, self._candidate_weight)
    candidate = layers.elementwise_add(xc_t, candidate)
    candidate = layers.elementwise_add(candidate, self._candidate_bias)
    c = self._activation(candidate)

    new_hidden = (1 - u) * pre_hidden + u * c
    return new_hidden
def gru_step(self, input, hidden, mask=None):
    """gru step"""
    hidden_array = []
    for i in range(self.num_layers):
        hidden_temp = layers.slice(hidden, axes=[0], starts=[i], ends=[i + 1])
        hidden_temp = layers.reshape(hidden_temp,
                                     shape=[-1, self.hidden_size])
        hidden_array.append(hidden_temp)

    last_hidden_array = []
    for k in range(self.num_layers):
        trans_input = layers.matmul(input, self.weight_input_array[k])
        trans_input += self.bias_input_array[k]
        trans_hidden = layers.matmul(hidden_array[k],
                                     self.weight_hidden_array[k])
        trans_hidden += self.bias_hidden_array[k]

        input_array = layers.split(trans_input, num_or_sections=3, dim=-1)
        trans_array = layers.split(trans_hidden, num_or_sections=3, dim=-1)

        reset_gate = layers.sigmoid(input_array[0] + trans_array[0])
        input_gate = layers.sigmoid(input_array[1] + trans_array[1])
        new_gate = layers.tanh(input_array[2] + reset_gate * trans_array[2])

        new_hidden = new_gate + input_gate * (hidden_array[k] - new_gate)

        if mask:
            neg_mask = layers.fill_constant_batch_size_like(
                input=mask, shape=[1], value=1.0, dtype='float32') - mask
            new_hidden = new_hidden * mask + hidden_array[k] * neg_mask

        last_hidden_array.append(new_hidden)
        input = new_hidden

        if self.dropout and self.dropout > 0.0:
            input = layers.dropout(input, dropout_prob=self.dropout)

    last_hidden = layers.concat(last_hidden_array, 0)
    last_hidden = layers.reshape(
        last_hidden, shape=[self.num_layers, -1, self.hidden_size])

    return input, last_hidden
def forward(self, audio, mel, audio_start, clip_kl=True):
    """Compute loss of ClariNet model.

    Args:
        audio (Variable): shape(B, T_audio), dtype float32, ground truth
            waveform.
        mel (Variable): shape(B, F, T_mel), dtype float32, condition (mel
            spectrogram here).
        audio_start (Variable): shape(B, ), dtype int64, audio start
            positions.
        clip_kl (bool, optional): whether to clip kl_loss by maximum=100.
            Defaults to True.

    Returns:
        Dict(str, Variable)
        loss (Variable): shape(1, ), dtype float32, total loss.
        kl (Variable): shape(1, ), dtype float32, kl divergence between the
            teacher's output distribution and the student's output
            distribution.
        regularization (Variable): shape(1, ), dtype float32, a
            regularization term of the KL divergence.
        spectrogram_frame_loss (Variable): shape(1, ), dtype float32, stft
            loss, the L1-distance of the magnitudes of the spectrograms of
            the ground truth waveform and the synthesized waveform.
    """
    batch_size, audio_length = audio.shape  # audio clip's length

    z = F.gaussian_random(audio.shape)
    condition = self.encoder(mel)  # (B, C, T)
    condition_slice = crop(condition, audio_start, audio_length)

    x, s_means, s_scales = self.student(z, condition_slice)  # all [0: T]
    s_means = s_means[:, 1:]  # (B, T-1), time steps [1: T]
    s_scales = s_scales[:, 1:]  # (B, T-1), time steps [1: T]
    s_clipped_scales = F.clip(s_scales, self.min_log_scale, 100.)

    # teacher outputs single gaussian
    y = self.teacher(x[:, :-1], condition_slice[:, :, 1:])
    _, t_means, t_scales = F.split(y, 3, -1)  # time steps [1: T]
    t_means = F.squeeze(t_means, [-1])  # (B, T-1), time steps [1: T]
    t_scales = F.squeeze(t_scales, [-1])  # (B, T-1), time steps [1: T]
    t_clipped_scales = F.clip(t_scales, self.min_log_scale, 100.)

    s_distribution = D.Normal(s_means, F.exp(s_clipped_scales))
    t_distribution = D.Normal(t_means, F.exp(t_clipped_scales))

    # kl divergence loss, so we only need to sample once? no MC
    kl = s_distribution.kl_divergence(t_distribution)
    if clip_kl:
        kl = F.clip(kl, -100., 10.)
    # context size dropped
    kl = F.reduce_mean(kl[:, self.teacher.context_size:])
    # major diff here
    regularization = F.mse_loss(t_scales[:, self.teacher.context_size:],
                                s_scales[:, self.teacher.context_size:])

    # introduce information from real target
    spectrogram_frame_loss = F.mse_loss(self.stft.magnitude(audio),
                                        self.stft.magnitude(x))
    loss = kl + self.lmd * regularization + spectrogram_frame_loss
    loss_dict = {
        "loss": loss,
        "kl_divergence": kl,
        "regularization": regularization,
        "stft_loss": spectrogram_frame_loss
    }
    return loss_dict
def forward(self, x):
    """Forward network"""
    mask = layers.reduce_any(x != self.pad_index, -1)
    lens = nn.reduce_sum(mask, -1)
    masked_x = nn.masked_select(x, mask)
    h, _ = self.transformer(masked_x)
    feat_embed = nn.pad_sequence_paddle(
        layers.split(h, lens.numpy().tolist(), dim=0), self.pad_index)
    return feat_embed
def _build_distribution(self, enc_final_state=None):
    enc_hidden = [
        layers.concat(state, axis=-1) for state in enc_final_state
    ]
    enc_hidden = layers.concat(enc_hidden, axis=-1)
    z_mean_log_var = layers.fc(input=enc_hidden,
                               size=self.latent_size * 2,
                               name='fc_dist')
    z_mean, z_log_var = layers.split(z_mean_log_var, 2, -1)
    return z_mean, z_log_var
def epoch_predict(env, args, model, loader):
    """Predict in one epoch"""
    connections, deprels, probabilities = [], [], []
    pad_index = args.pad_index
    bos_index = args.bos_index
    eos_index = args.eos_index
    for batch, inputs in enumerate(loader(), start=1):
        if args.encoding_model.startswith("ernie"):
            words = inputs[0]
            connection_prob, deprel_prob, words = model(words)
        else:
            words, feats = inputs
            connection_prob, deprel_prob, words = model(words, feats)
        mask = layers.logical_and(
            layers.logical_and(words != pad_index, words != bos_index),
            words != eos_index,
        )
        lens = nn.reduce_sum(mask, -1)
        connection_predicts, deprel_predicts = decode(args, connection_prob,
                                                      deprel_prob, mask)
        connections.extend(
            layers.split(nn.masked_select(connection_predicts, mask),
                         lens.numpy().tolist()))
        deprels.extend(
            layers.split(nn.masked_select(deprel_predicts, mask),
                         lens.numpy().tolist()))
        if args.prob:
            arc_probs = nn.index_sample(
                layers.softmax(connection_prob, -1),
                layers.unsqueeze(connection_predicts, -1))
            probabilities.extend(
                layers.split(
                    nn.masked_select(layers.squeeze(arc_probs, axes=[-1]),
                                     mask),
                    lens.numpy().tolist(),
                ))
    connections = [seq.numpy().tolist() for seq in connections]
    deprels = [env.REL.vocab[seq.numpy().tolist()] for seq in deprels]
    probabilities = [[round(p, 3) for p in seq.numpy().tolist()]
                     for seq in probabilities]
    return connections, deprels, probabilities
def forward(self, x):
    """Forward network"""
    mask = layers.reduce_any(x != self.pad_index, -1)
    lens = nn.reduce_sum(mask, -1)
    masked_x = nn.masked_select(x, mask)
    char_mask = masked_x != self.pad_index
    emb = self.embed(masked_x)
    _, (h, _) = self.lstm(emb, char_mask, self.pad_index)
    h = layers.concat(layers.unstack(h), axis=-1)
    feat_embed = nn.pad_sequence_paddle(
        layers.split(h, lens.numpy().tolist(), dim=0), self.pad_index)
    return feat_embed
def flat_words(self, words):
    pad_index = self.args.pad_index
    lens = nn.reduce_sum(words != pad_index, dim=-1)
    position = layers.cumsum(
        lens + layers.cast((lens == 0), "int32"), axis=1) - 1
    flat_words = nn.masked_select(words, words != pad_index)
    flat_words = nn.pad_sequence_paddle(
        layers.split(flat_words,
                     layers.reduce_sum(lens, -1).numpy().tolist(),
                     pad_index))
    max_len = flat_words.shape[1]
    position = nn.mask_fill(position, position >= max_len, max_len - 1)
    return flat_words, position
def pad_packed_sequence(self, x, batch_sizes, unsorted_indices):
    """Pads a packed sequence."""
    h_size = x.shape[1]
    split_x = layers.split(x, batch_sizes, dim=0)
    max_bs = batch_sizes[0]
    step_embs = []
    for step, cur_bs in enumerate(batch_sizes):
        pad_emb = layers.zeros(shape=(max_bs - cur_bs, h_size),
                               dtype=x.dtype)
        step_emb = layers.concat(input=(split_x[step], pad_emb))
        step_embs.append(step_emb)
    new_x = layers.stack(step_embs, axis=1)
    new_x = layers.index_select(new_x, unsorted_indices)
    return new_x
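# Minimal standalone sketch (dummy data, not part of the original class): how
# splitting the packed tensor by `batch_sizes`, zero-padding each time step, and
# stacking along axis=1 recovers a (batch, time, hidden) layout, as
# `pad_packed_sequence` above does. Padding is skipped here when a step is full.
import numpy as np
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers

with dg.guard():
    # packed data: time step 0 holds 3 sequences, step 1 holds 2, step 2 holds 1
    x = dg.to_variable(np.arange(12, dtype="float32").reshape(6, 2))
    batch_sizes = [3, 2, 1]
    max_bs, h_size = batch_sizes[0], x.shape[1]

    split_x = layers.split(x, batch_sizes, dim=0)
    steps = []
    for step, cur_bs in enumerate(batch_sizes):
        step_emb = split_x[step]
        if cur_bs < max_bs:
            pad = layers.zeros(shape=[max_bs - cur_bs, h_size],
                               dtype="float32")
            step_emb = layers.concat([step_emb, pad])
        steps.append(step_emb)
    padded = layers.stack(steps, axis=1)  # (batch=3, time=3, hidden=2)
    print(padded.shape)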
def forward(self, input, pre_hidden, pre_cell):
    concat_input_hidden = layers.concat([input, pre_hidden], 1)

    gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
    gate_input = layers.elementwise_add(gate_input, self._bias)
    i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)

    new_cell = layers.elementwise_add(
        layers.elementwise_mul(
            pre_cell,
            layers.sigmoid(layers.elementwise_add(f, self._forget_bias))),
        layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)))

    new_hidden = layers.tanh(new_cell) * layers.sigmoid(o)

    return new_hidden, new_cell
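# Minimal standalone sketch (dummy weights, hypothetical sizes): the LSTM gate
# math above with plain tensors, showing how one matmul followed by a 4-way split
# yields the input (i), candidate (j), forget (f) and output (o) gates.
import numpy as np
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers

with dg.guard():
    batch, input_size, hidden_size = 2, 5, 4
    forget_bias = 1.0
    x = dg.to_variable(np.random.randn(batch, input_size).astype("float32"))
    h = dg.to_variable(np.random.randn(batch, hidden_size).astype("float32"))
    c = dg.to_variable(np.zeros((batch, hidden_size), dtype="float32"))
    W = dg.to_variable(
        np.random.randn(input_size + hidden_size,
                        4 * hidden_size).astype("float32"))

    gate_input = layers.matmul(layers.concat([x, h], 1), W)
    i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
    new_c = layers.sigmoid(f + forget_bias) * c + \
        layers.sigmoid(i) * layers.tanh(j)
    new_h = layers.tanh(new_c) * layers.sigmoid(o)
    print(new_h.shape)  # [2, 4]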
def __call__(self, x):
    bond_feature = get_bond_feature_dims()
    bond_input = L.split(x, num_or_sections=len(bond_feature), dim=-1)
    outputs = None
    count = 0
    for _x, _bond_input_dim in zip(bond_input, bond_feature):
        count += 1
        emb = L.embedding(_x,
                          size=(_bond_input_dim, self.emb_dim),
                          param_attr=F.ParamAttr(
                              name=self.name + '_bond_feat_%s' % count))
        if outputs is None:
            outputs = emb
        else:
            outputs = outputs + emb
    return outputs
def for_rnn_net(self):
    x = layers.data(shape=[BATCH_SIZE, SEQ_LEN, INPUT_DIM],
                    dtype="float32",
                    name="x",
                    append_batch_size=False)
    split_x = layers.split(x, num_or_sections=SEQ_LEN, dim=1)
    h_pre = fluid.layers.zeros(shape=[BATCH_SIZE, 1, INPUT_DIM],
                               dtype="float32")
    for i in range(SEQ_LEN):
        x_t = split_x[i]
        h = layers.scale(x=layers.elementwise_add(x=h_pre, y=x_t),
                         scale=self.scale)
        h_pre = h
    return layers.mean(h_pre)
def forward(self, x, *cond_inputs, **kwargs):
    output = self.norm(x) if self.norm is not None else x
    for i in range(len(cond_inputs)):
        if cond_inputs[i] is None:
            continue
        label_map = L.image_resize(cond_inputs[i],
                                   out_shape=x.shape[2:],
                                   resample='NEAREST')
        if self.separate_projection:
            hidden = self.mlps[i](label_map)
            gamma = self.gammas[i](hidden)
            beta = self.betas[i](hidden)
        else:
            affine_params = self.mlps[i](label_map)
            gamma, beta = L.split(affine_params, 2, 1)
        output = output * (1 + gamma) + beta
    return output
def forward(self, input, state):
    pre_hidden, pre_cell = state

    # Each of i, f, c, o is computed as Wx + Wh + b, i.e. W(x ++ h) + b, so in
    # practice we multiply the concatenated [x, h] by a single weight matrix
    # and add one bias:
    #   [x, h] has shape [batch_size, input_size + hidden_size]
    #   W      has shape [input_size + hidden_size, 4 * hidden_size]
    #   b      has shape [4 * hidden_size]

    # concatenate along the feature axis
    # shape: [batch_size, input_size + hidden_size]
    concat_input_hidden = L.concat([input, pre_hidden], axis=1)

    # compute Wx + Wh + b
    # shape: [batch_size, 4 * hidden_size]
    gate_input = L.matmul(x=concat_input_hidden, y=self._weight)

    # shape: [batch_size, 4 * hidden_size]
    gate_input = L.elementwise_add(gate_input, self._bias)

    # split i, f, c, o along the last axis, so each has last dim hidden_size
    i, f, c, o = L.split(gate_input, num_or_sections=4, dim=-1)

    # new_c = pre_c * sigmoid(f + forget_bias) + sigmoid(i) * tanh(c)
    # shape: [batch_size, hidden_size]
    new_cell = L.elementwise_add(
        L.elementwise_mul(
            pre_cell,
            L.sigmoid(L.elementwise_add(f, self._forget_bias))),
        L.elementwise_mul(L.sigmoid(i), L.tanh(c)))

    # new_h = tanh(new_c) * sigmoid(o)
    # shape: [batch_size, hidden_size]
    new_hidden = L.tanh(new_cell) * L.sigmoid(o)

    return new_hidden, [new_hidden, new_cell]
def pairwise_hinge(self):
    """pairwise model"""
    poi_repr = L.split(self.poi_repr, 2, dim=0)
    pos_repr, neg_repr = poi_repr
    pos_pred = L.cos_sim(self.query_repr, pos_repr)
    neg_pred = L.cos_sim(self.query_repr, neg_repr)

    mode = 'hinge_loss'
    # logistic: log(1 + e^{-z}); hinge: max(0, 1 - z)
    if 'hinge_loss' == mode:
        theta_z = L.relu(1 + neg_pred - pos_pred)
    elif 'logistic_loss' == mode:
        theta_z = L.log(1 + L.exp(neg_pred - pos_pred))
    self.loss = L.reduce_mean(theta_z)
    pos_cnt = L.reduce_sum(
        L.cast(L.greater_than(pos_pred, neg_pred), dtype="float32"))
    neg_cnt = L.reduce_sum(
        L.cast(L.less_than(pos_pred, neg_pred), dtype="float32"))
    self.order = pos_cnt / (1e-5 + neg_cnt)
    self.metrics = [self.loss, self.order]
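# Minimal sketch (dummy scores, not tied to the model above): the pairwise hinge
# loss max(0, 1 - (pos - neg)) and the "order" ratio computed from positive and
# negative similarity scores.
import numpy as np
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as L

with dg.guard():
    pos_pred = dg.to_variable(np.array([[0.9], [0.2]], dtype="float32"))
    neg_pred = dg.to_variable(np.array([[0.1], [0.8]], dtype="float32"))

    loss = L.reduce_mean(L.relu(1 + neg_pred - pos_pred))
    pos_cnt = L.reduce_sum(L.cast(L.greater_than(pos_pred, neg_pred), "float32"))
    neg_cnt = L.reduce_sum(L.cast(L.less_than(pos_pred, neg_pred), "float32"))
    order = pos_cnt / (1e-5 + neg_cnt)
    print(loss.numpy(), order.numpy())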
def weight_layers(lm_embeddings, name="", l2_coef=0.0):
    '''
    Weight the layers of a biLM with trainable scalar weights to compute
    ELMo representations.

    Input:
        lm_embeddings(list): representations of 2 layers from biLM.
        name: a string prefix used for the trainable variable names.
        l2_coef: the l2 regularization coefficient $\lambda$.
            Pass None or 0.0 for no regularization.

    Output:
        weighted_lm_layers: weighted embeddings from biLM.
    '''
    n_lm_layers = len(lm_embeddings)
    W = layers.create_parameter(
        [n_lm_layers, ],
        dtype="float32",
        name=name + "ELMo_w",
        attr=fluid.ParamAttr(
            name=name + "ELMo_w",
            initializer=fluid.initializer.Constant(0.0),
            regularizer=fluid.regularizer.L2Decay(l2_coef)))
    normed_weights = layers.softmax(W + 1.0 / n_lm_layers)
    splited_normed_weights = layers.split(normed_weights, n_lm_layers, dim=0)

    # compute the weighted, normalized LM activations
    pieces = []
    for w, t in zip(splited_normed_weights, lm_embeddings):
        pieces.append(t * w)
    sum_pieces = layers.sums(pieces)

    # scale the weighted sum by gamma
    gamma = layers.create_parameter(
        [1],
        dtype="float32",
        name=name + "ELMo_gamma",
        attr=fluid.ParamAttr(
            name=name + "ELMo_gamma",
            initializer=fluid.initializer.Constant(1.0)))
    weighted_lm_layers = sum_pieces * gamma
    return weighted_lm_layers
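# Minimal usage sketch (hypothetical shapes and program setup, not from the
# original repo): feeding two biLM layer representations of shape
# (batch, seq_len, emb_dim) through `weight_layers` in a static fluid program.
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    layer0 = layers.data(name="layer0", shape=[-1, 10, 8], dtype="float32",
                         append_batch_size=False)
    layer1 = layers.data(name="layer1", shape=[-1, 10, 8], dtype="float32",
                         append_batch_size=False)
    elmo = weight_layers([layer0, layer1], name="demo_", l2_coef=0.0)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)
feed = {
    "layer0": np.random.randn(4, 10, 8).astype("float32"),
    "layer1": np.random.randn(4, 10, 8).astype("float32"),
}
out, = exe.run(main_prog, feed=feed, fetch_list=[elmo])
print(out.shape)  # (4, 10, 8)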
def forward(self, input, pre_hidden):
    concat_input_hidden = layers.concat([input, pre_hidden], 1)

    gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
    gate_input = layers.elementwise_add(gate_input, self._gate_bias)
    gate_input = self._gate_activation(gate_input)
    r, u = layers.split(gate_input, num_or_sections=2, dim=1)

    r_hidden = r * pre_hidden

    candidate = layers.matmul(layers.concat([input, r_hidden], 1),
                              self._candidate_weight)
    candidate = layers.elementwise_add(candidate, self._candidate_bias)
    c = self._activation(candidate)

    new_hidden = u * pre_hidden + (1 - u) * c
    return new_hidden