def forward(self, src_tokens):
    # embed tokens and positions
    x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # temporal convolutions
    for proj, conv in zip(self.projections, self.convolutions):
        residual = x if proj is None else proj(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = conv(x)
        x = F.glu(x, dim=2)
        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding) * math.sqrt(0.5)

    return x, y
def forward(self, src_tokens, src_lengths):
    # embed tokens and positions
    x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x.transpose(0, 1)

    # project to size of convolution
    x = self.fc1(x)

    encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
    if not encoder_padding_mask.any():
        encoder_padding_mask = None

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # temporal convolutions
    for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
        residual = x if proj is None else proj(x)

        if encoder_padding_mask is not None:
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        x = F.dropout(x, p=self.dropout, training=self.training)
        padding_l = (conv.kernel_size[0] - 1) // 2
        padding_r = conv.kernel_size[0] // 2
        x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
        x = conv(x)
        x = F.glu(x, dim=2)

        if attention is not None:
            x = attention(x)

        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    if encoder_padding_mask is not None:
        encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
        x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5)

    # transposing the output to T x B x C and saving to a file
    if self._encoder_states_dir:
        self._save_encoder_state(x.transpose(1, 0), "batch-%s.pt")

    return {
        'encoder_out': (x, y),
        'encoder_padding_mask': encoder_padding_mask,  # B x T
    }
def forward_features(self, source: torch.Tensor) -> torch.Tensor:
    if self.feature_grad_mult > 0:
        features = self.feature_extractor(source)
        if self.feature_grad_mult != 1.0:
            features = GradMultiply.apply(features, self.feature_grad_mult)
    else:
        with torch.no_grad():
            features = self.feature_extractor(source)
    return features
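# The snippets above and below scale gradients with GradMultiply. For reference,
# here is a minimal sketch of a fairseq-style GradMultiply autograd function
# (an assumed implementation, not copied from this codebase): the forward pass
# is the identity, and the backward pass multiplies the incoming gradient by a
# fixed scale.
import torch


class GradMultiplySketch(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, scale):
        ctx.scale = scale
        return x.clone()  # identity in the forward direction

    @staticmethod
    def backward(ctx, grad):
        return grad * ctx.scale, None  # no gradient w.r.t. the scale itself


# Example: GradMultiplySketch.apply(x, 0.1) leaves activations unchanged but
# shrinks the gradient flowing back into the feature extractor by 10x.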
def forward(self, source, padding_mask, **kwargs):
    features_only = kwargs.get("features_only", False)
    mask = kwargs.get("mask", True)

    if self.feature_grad_mult > 0:
        features = self.feature_extractor(source)
        if self.feature_grad_mult != 1.0:
            features = GradMultiply.apply(features, self.feature_grad_mult)
    else:
        with torch.no_grad():
            features = self.feature_extractor(source)

    features = features.transpose(1, 2)

    if padding_mask is not None:
        extra = padding_mask.size(1) % features.size(1)
        if extra > 0:
            padding_mask = padding_mask[:, :-extra]
        padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1)
        padding_mask = padding_mask.all(-1)
        num_frames = (~padding_mask).float().sum()
        features_pen = (features.float().pow(2).sum(-1) * (~padding_mask).float()).sum() / num_frames
    else:
        features_pen = features.float().pow(2).mean()

    if self.post_extract_proj is not None:
        features = self.post_extract_proj(features)

    features = self.dropout_input(features)

    if mask:
        x, mask_indices = self.apply_mask(features, padding_mask)
    else:
        x = features
        mask_indices = None

    x = self.encoder(x, padding_mask=padding_mask)

    if features_only:
        return {"x": x, "padding_mask": padding_mask}

    x = self.phone_proj(x)

    result = {
        "x": x,
        "padding_mask": padding_mask,
        "features_pen": features_pen,
    }
    return result
def forward(self, guess_tokens, guess_lengths, marker):
    # embed tokens and positions
    # print(self.embed_positions(guess_tokens)[0, 0])
    x = self.embed_tokens(guess_tokens) + self.embed_positions0(
        guess_tokens, marker=marker, mark=0) + self.embed_positions1(
        guess_tokens, marker=marker, mark=1)
    x = F.dropout(x, p=self.dropout, training=self.training)
    guess_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # temporal convolutions
    for proj, conv in zip(self.projections, self.convolutions):
        residual = x if proj is None else proj(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        if conv.kernel_size[0] % 2 == 1:
            # padding is implicit in the conv
            x = conv(x)
        else:
            padding_l = (conv.kernel_size[0] - 1) // 2
            padding_r = conv.kernel_size[0] // 2
            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
            x = conv(x)
        x = F.glu(x, dim=2)
        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + guess_embedding) * math.sqrt(0.5)

    return x, y
def forward(self, source, padding_mask=None, mask=True, features_only=False, **kwargs):
    if self.feature_grad_mult > 0:
        features = self.feature_extractor(source)
        if self.feature_grad_mult != 1.0:
            features = GradMultiply.apply(features, self.feature_grad_mult)
    else:
        with torch.no_grad():
            features = self.feature_extractor(source)

    features_pen = features.float().pow(2).mean()

    features = features.transpose(1, 2)
    features = self.layer_norm(features)

    if padding_mask is not None:
        extra = padding_mask.size(1) % features.size(1)
        if extra > 0:
            padding_mask = padding_mask[:, :-extra]
        padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1)
        padding_mask = padding_mask.all(-1)

    if self.post_extract_proj is not None:
        features = self.post_extract_proj(features)

    features = self.dropout_input(features)

    if mask:
        x, mask_indices = self.apply_mask(features, padding_mask)
    else:
        x = features
        mask_indices = None

    x = self.encoder(x, padding_mask=padding_mask)  # B, T, V
    size = x.size()
    x = x[mask_indices]  # B, N, V

    if features_only:
        return {"x": x, "padding_mask": padding_mask, "mask_indices": mask_indices}

    x = self.phone_proj(x)  # B, N, V

    result = {
        "x": x,
        "padding_mask": padding_mask,
        "mask_indices": mask_indices,
        "features_pen": features_pen,
        "size": size,
    }
    return result
def forward(self, src_tokens, src_lengths):
    # embed tokens and positions
    x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x.transpose(0, 1)

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # temporal convolutions
    for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
        residual = x if proj is None else proj(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        padding_l = (conv.kernel_size[0] - 1) // 2
        padding_r = conv.kernel_size[0] // 2
        x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
        x = conv(x)
        x = F.glu(x, dim=2)
        if attention is not None:
            x = attention(x)
        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5)

    return {
        'encoder_out': (x, y),
    }
def forward(self, src_tokens):
    # position embedding (source side is left-padded)
    positions = Variable(
        make_positions(src_tokens.data, self.dictionary.pad(),
                       left_pad=LanguagePairDataset.LEFT_PAD_SOURCE))

    # embed tokens and positions
    x = self.embed_tokens(src_tokens) + self.embed_positions(positions)
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x

    # project to size of convolution, in case embedding size != channel size
    x = self.fc1(x)

    # B x T x C -> T x B x C (C = channels)
    x = x.transpose(0, 1)

    # temporal convolutions
    for proj, conv in zip(self.projections, self.convolutions):
        # project the residual if the dim of x does not match the conv output dim
        residual = x if proj is None else proj(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = conv(x)
        x = F.glu(x, dim=-1)
        x = (x + residual) * math.sqrt(0.5)  # residual connection

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding -> encoder_out[0], embed_dim
    x = self.fc2(x)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention -> encoder_out[1]
    # z + e for the conditional input c, scaled by sqrt(0.5)
    y = (x + input_embedding) * math.sqrt(0.5)

    return x, y
def forward(self, src_tokens):
    positions = Variable(
        make_positions(src_tokens.data, self.dictionary.pad(),
                       left_pad=LanguagePairDataset.LEFT_PAD_SOURCE))

    # embed tokens and positions
    x = self.embed_tokens(src_tokens) + self.embed_positions(positions)
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # temporal convolutions
    for proj, conv in zip(self.projections, self.convolutions):
        residual = x if proj is None else proj(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = conv(x)
        x = F.glu(x, dim=-1)
        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding) * math.sqrt(0.5)

    return x, y
def forward(self, source, padding_mask=None, mask=True, features_only=False, alignments=None):
    if self.feature_grad_mult > 0:
        features = self.feature_extractor(source)
        if self.feature_grad_mult != 1.0:
            features = GradMultiply.apply(features, self.feature_grad_mult)
    else:
        with torch.no_grad():
            features = self.feature_extractor(source)

    compute_alignment_metrics = self.cfg.compute_alignment_metrics and alignments is not None

    features_pen = features.float().pow(2).mean()

    features = features.transpose(1, 2)
    features = self.layer_norm(features)
    unmasked_features = features.clone()

    if padding_mask is not None:
        assert padding_mask.size(1) == 1
        padding_mask = padding_mask.squeeze(1)
        scale = padding_mask.size(1) // features.size(1)
        # should be 0, since the first CNN reduces the number of features by
        # a factor of [scale] (due to the architecture choice)
        extra = padding_mask.size(1) % features.size(1)
        assert extra == 0
        padding_mask = padding_mask[:, ::scale]
        assert np.all(padding_mask.shape == features.shape[:-1])
        if compute_alignment_metrics:
            alignments = alignments[:, ::scale]

    if self.post_extract_proj is not None:
        features = self.post_extract_proj(features)

    features = self.dropout_input(features)
    unmasked_features = self.dropout_features(unmasked_features)

    num_vars = None
    code_ppl = None
    prob_ppl = None
    curr_temp = None

    if self.input_quantizer:
        q = self.input_quantizer(features, produce_targets=False)
        features = q["x"]
        num_vars = q["num_vars"]
        code_ppl = q["code_perplexity"]
        prob_ppl = q["prob_perplexity"]
        curr_temp = q["temp"]
        features = self.project_inp(features)

    # [!] careful: renaming this function's padding_mask argument appears to
    # break things because of indirect dependencies on pass-through **kwargs
    if mask:
        x, mask_indices = self.apply_mask(features, padding_mask)
        if mask_indices is not None:
            y = unmasked_features[mask_indices].view(
                unmasked_features.size(0), -1, unmasked_features.size(-1))
            if compute_alignment_metrics:
                alignments = alignments[mask_indices]
        else:
            y = unmasked_features
    else:
        x = features
        y = unmasked_features
        mask_indices = None

    x = self.encoder(x, padding_mask=padding_mask)

    if features_only:
        return {"x": x, "padding_mask": padding_mask}

    if self.quantizer:
        q = self.quantizer(y, produce_targets=compute_alignment_metrics)
        y = q["x"]
        num_vars = q["num_vars"]
        code_ppl = q["code_perplexity"]
        prob_ppl = q["prob_perplexity"]
        curr_temp = q["temp"]
        y = self.project_q(y)

        if self.negatives_from_everywhere:
            neg_cands, *_ = self.quantizer(unmasked_features, produce_targets=False)
            negs, _ = self.sample_negatives(neg_cands, y.size(1))
            negs = self.project_q(negs)
        else:
            negs, _ = self.sample_negatives(y, y.size(1))

        if self.codebook_negatives > 0:
            cb_negs = self.quantizer.sample_from_codebook(
                y.size(0) * y.size(1), self.codebook_negatives)
            cb_negs = cb_negs.view(self.codebook_negatives, y.size(0),
                                   y.size(1), -1)  # order doesn't matter
            cb_negs = self.project_q(cb_negs)
            negs = torch.cat([negs, cb_negs], dim=0)
    else:
        y = self.project_q(y)

        if self.negatives_from_everywhere:
            negs, _ = self.sample_negatives(unmasked_features, y.size(1))
            negs = self.project_q(negs)
        else:
            negs, _ = self.sample_negatives(y, y.size(1))

    x = x[mask_indices].view(x.size(0), -1, x.size(-1))

    if self.target_glu:
        y = self.target_glu(y)
        negs = self.target_glu(negs)

    x = self.final_proj(x)
    x = self.compute_preds(x, y, negs)

    result = {
        "x": x,
        "padding_mask": padding_mask,
        "features_pen": features_pen,
    }

    if prob_ppl is not None:
        result["prob_perplexity"] = prob_ppl
        result["code_perplexity"] = code_ppl
        result["num_vars"] = num_vars
        result["temp"] = curr_temp

    if compute_alignment_metrics:
        result = {
            **result,
            **self.get_alignment_metrics(q["targets"], alignments),
        }

    return result
def forward(self, src_tokens, src_lengths=None):
    """
    Args:
        src_tokens (LongTensor): tokens in the source language of shape
            `(batch, src_len)`
        src_lengths (LongTensor): lengths of each source sentence of shape
            `(batch)`

    Returns:
        dict:
            - **encoder_out** (tuple): a tuple with two elements, where the
              first element is the last encoder layer's output and the
              second element is the same quantity summed with the input
              embedding (used for attention). The shape of both tensors is
              `(batch, src_len, embed_dim)`.
            - **encoder_padding_mask** (ByteTensor): the positions of
              padding elements of shape `(batch, src_len)`
    """
    src_lengths = (src_tokens.ne(self.pad)).long().sum(dim=1)

    # embed tokens and positions
    x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # used to mask padding in input
    encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
    if not encoder_padding_mask.any():
        encoder_padding_mask = None

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    residuals = [x]
    # temporal convolutions
    for proj, conv, res_layer in zip(self.projections, self.convolutions, self.residuals):
        if res_layer > 0:
            residual = residuals[-res_layer]
            residual = residual if proj is None else proj(residual)
        else:
            residual = None

        if encoder_padding_mask is not None:
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        x = F.dropout(x, p=self.dropout, training=self.training)
        if conv.kernel_size[0] % 2 == 1:
            # padding is implicit in the conv
            x = conv(x)
        else:
            padding_l = (conv.kernel_size[0] - 1) // 2
            padding_r = conv.kernel_size[0] // 2
            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
            x = conv(x)
        x = F.glu(x, dim=2)

        if residual is not None:
            x = (x + residual) * math.sqrt(0.5)
        residuals.append(x)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    if encoder_padding_mask is not None:
        encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
        x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding) * math.sqrt(0.5)

    return {
        'encoder_out': (x, y),
        'encoder_padding_mask': encoder_padding_mask,  # B x T
    }
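# A small, self-contained check of the even-kernel padding used in the encoders
# above (a sketch, assuming the convolutions run over T x B x C tensors with
# stride 1 and no built-in padding, as with ConvTBC): F.pad's last pair of
# values pads the first (time) dimension, and splitting the total padding of
# k - 1 as (k - 1) // 2 on the left and k // 2 on the right keeps the output
# length equal to the input length.
import torch
import torch.nn.functional as F


def pad_for_even_kernel(x, kernel_size):
    # x: T x B x C; pad along T so that a stride-1 conv with `kernel_size`
    # and no implicit padding preserves the sequence length
    padding_l = (kernel_size - 1) // 2
    padding_r = kernel_size // 2
    return F.pad(x, (0, 0, 0, 0, padding_l, padding_r))


t, b, c, k = 7, 2, 4, 4  # even kernel size
x = torch.randn(t, b, c)
padded = pad_for_even_kernel(x, k)
assert padded.size(0) - (k - 1) == t  # conv output length == original T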
def forward(
    self,
    source,
    padding_mask=None,
    mask=True,
    features_only=False,
    layer=None,
    mask_indices=None,
    mask_channel_indices=None,
    padding_count=None,
):
    features = source

    if self.feature_grad_mult > 0:
        features = self.feature_extractor(features)
        if self.feature_grad_mult != 1.0:
            features = GradMultiply.apply(features, self.feature_grad_mult)
    else:
        with torch.no_grad():
            features = self.feature_extractor(features)

    features = features.transpose(1, 2)
    features = self.layer_norm(features)

    orig_padding_mask = padding_mask

    if padding_mask is not None and padding_mask.any():
        input_lengths = (1 - padding_mask.long()).sum(-1)
        # apply conv formula to get real output_lengths
        output_lengths = self._get_feat_extract_output_lengths(input_lengths)

        padding_mask = torch.zeros(
            features.shape[:2], dtype=features.dtype, device=features.device
        )

        # these two operations makes sure that all values
        # before the output lengths indices are attended to
        padding_mask[
            (
                torch.arange(padding_mask.shape[0], device=padding_mask.device),
                output_lengths - 1,
            )
        ] = 1
        padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()
    else:
        padding_mask = None

    if self.post_extract_proj is not None:
        features = self.post_extract_proj(features)

    pre_encoder_features = None
    if self.cfg.ema_transformer_only:
        pre_encoder_features = features.clone()

    features = self.dropout_input(features)

    if mask:
        x, mask_indices = self.apply_mask(
            features,
            padding_mask,
            mask_indices=mask_indices,
            mask_channel_indices=mask_channel_indices,
        )
    else:
        x = features
        mask_indices = None

    x, layer_results = self.encoder(
        x,
        padding_mask=padding_mask,
        layer=layer,
    )

    if features_only:
        return {
            "x": x,
            "padding_mask": padding_mask,
            "layer_results": layer_results,
        }

    result = {
        "losses": {},
    }

    with torch.no_grad():
        self.ema.model.eval()

        if self.cfg.ema_transformer_only:
            y, layer_results = self.ema.model.extract_features(
                pre_encoder_features,
                padding_mask=padding_mask,
                min_layer=self.cfg.encoder_layers - self.average_top_k_layers,
            )
            y = {
                "x": y,
                "padding_mask": padding_mask,
                "layer_results": layer_results,
            }
        else:
            y = self.ema.model.extract_features(
                source=source,
                padding_mask=orig_padding_mask,
                mask=False,
            )

        target_layer_results = [l[2] for l in y["layer_results"]]

        permuted = False
        if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer:
            target_layer_results = [
                tl.permute(1, 2, 0) for tl in target_layer_results  # TBC -> BCT
            ]
            permuted = True

        if self.cfg.batch_norm_target_layer:
            target_layer_results = [
                F.batch_norm(
                    tl.float(), running_mean=None, running_var=None, training=True
                )
                for tl in target_layer_results
            ]

        if self.cfg.instance_norm_target_layer:
            target_layer_results = [
                F.instance_norm(tl.float()) for tl in target_layer_results
            ]

        if permuted:
            target_layer_results = [
                tl.transpose(1, 2) for tl in target_layer_results  # BCT -> BTC
            ]

        if self.cfg.group_norm_target_layer:
            target_layer_results = [
                F.layer_norm(tl.float(), tl.shape[-2:])
                for tl in target_layer_results
            ]

        if self.cfg.layer_norm_target_layer:
            target_layer_results = [
                F.layer_norm(tl.float(), tl.shape[-1:])
                for tl in target_layer_results
            ]

        y = sum(target_layer_results) / len(target_layer_results)

        if self.cfg.layer_norm_targets:
            y = F.layer_norm(y.float(), y.shape[-1:])

        if self.cfg.instance_norm_targets:
            y = F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2)

        if not permuted:
            y = y.transpose(0, 1)

    y = y[mask_indices]
    x = x[mask_indices]
    x = self.final_proj(x)

    sz = x.size(-1)

    if self.loss_beta == 0:
        loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1)
    else:
        loss = F.smooth_l1_loss(
            x.float(), y.float(), reduction="none", beta=self.loss_beta
        ).sum(dim=-1)

    if self.loss_scale is not None:
        scale = self.loss_scale
    else:
        scale = 1 / math.sqrt(sz)

    result["losses"]["regression"] = loss.sum() * scale

    if "sample_size" not in result:
        result["sample_size"] = loss.numel()

    with torch.no_grad():
        result["target_var"] = self.compute_var(y)
        result["pred_var"] = self.compute_var(x.float())

    if self.num_updates > 5000 and result["target_var"] < self.cfg.min_target_var:
        logger.error(
            f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
        )
        raise Exception(
            f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
        )
    if self.num_updates > 5000 and result["pred_var"] < self.cfg.min_pred_var:
        logger.error(
            f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
        )
        raise Exception(
            f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
        )

    if self.ema is not None:
        result["ema_decay"] = self.ema.get_decay() * 1000

    return result
def forward(self, src_tokens, src_lengths):
    torch.set_printoptions(threshold=8000)
    x1 = self.embed_tokens(src_tokens)
    x2 = self.embed_positions(src_tokens)
    if src_tokens.lt(0).sum() > 0:
        print("negative voc idx (voc size {})".format(len(self.dictionary)))
        print(src_tokens)
        exit()
    x = x1 + x2
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # used to mask padding in input: 1 where the pad index occurs, 0 otherwise
    encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
    if not encoder_padding_mask.any():
        encoder_padding_mask = None

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    residuals = [x]
    # temporal convolutions
    for proj, conv, res_layer in zip(self.projections, self.convolutions, self.residuals):
        if res_layer > 0:
            residual = residuals[-res_layer]
            residual = residual if proj is None else proj(residual)
        else:
            residual = None

        if encoder_padding_mask is not None:
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        x = F.dropout(x, p=self.dropout, training=self.training)
        if conv.kernel_size[0] % 2 == 1:
            # padding is implicit in the conv
            x = conv(x)
        else:
            padding_l = (conv.kernel_size[0] - 1) // 2
            padding_r = conv.kernel_size[0] // 2
            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
            x = conv(x)
        x = F.glu(x, dim=2)

        if residual is not None:
            x = (x + residual) * math.sqrt(self.normalization_constant)
        residuals.append(x)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    if encoder_padding_mask is not None:
        encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
        x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

    # scale gradients (this only affects backward, not forward)
    if self.num_attention_layers > 0:
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding) * math.sqrt(self.normalization_constant)

    return {
        'encoder_out': (x, y),
        'encoder_padding_mask': encoder_padding_mask,  # B x T
    }
def target_predict(self, source, padding_mask=None, mask=True, mask_indices=None):
    with torch.no_grad():
        if self.feature_grad_mult > 0:
            features = self.feature_extractor_target(source)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            with torch.no_grad():
                features = self.feature_extractor_target(source)

        features_pen = features.float().pow(2).mean()

        features = features.transpose(1, 2)
        features = self.layer_norm(features)
        unmasked_features = features.clone()

        if padding_mask is not None:
            extra = padding_mask.size(1) % features.size(1)
            if extra > 0:
                padding_mask = padding_mask[:, :-extra]
            padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1)
            padding_mask = padding_mask.all(-1)

        if self.post_extract_proj_target is not None:
            features = self.post_extract_proj_target(features)
        # print("target feature", features.size())

        features = self.dropout_input(features)
        unmasked_features = self.dropout_features(unmasked_features)

        num_vars = None
        code_ppl = None
        prob_ppl = None
        curr_temp = None

        if self.input_quantizer:
            if self.shared_quantizer:
                q = self.input_quantizer(features, produce_targets=False)
            else:
                q = self.input_quantizer(features, produce_targets=False)
            features = q["x"]
            num_vars = q["num_vars"]
            code_ppl = q["code_perplexity"]
            prob_ppl = q["prob_perplexity"]
            curr_temp = q["temp"]
            if self.shared_quantizer:
                features = self.project_inp(features)
            else:
                features = self.project_inp_target(features)

        if mask:
            x, mask_indices = self.apply_mask(features, padding_mask, mask_indices)
            if mask_indices is not None:
                y = unmasked_features[mask_indices].view(
                    unmasked_features.size(0), -1, unmasked_features.size(-1))
            else:
                y = unmasked_features
        else:
            x = features
            y = unmasked_features
            mask_indices = None

        x = self.encoder_target(x, padding_mask=padding_mask)
        # print("target encoded", x.size())

        if self.quantizer:
            if self.shared_quantizer:
                q = self.quantizer(y, produce_targets=False)
            else:
                q = self.quantizer_target(y, produce_targets=False)
            y = q["x"]
            num_vars = q["num_vars"]
            code_ppl = q["code_perplexity"]
            prob_ppl = q["prob_perplexity"]
            curr_temp = q["temp"]
            if self.shared_quantizer:
                y = self.project_q(y)
            else:
                y = self.project_q_target(y)
        else:
            y = self.project_q(y)

        # print("target before masking", x.size())
        x = x[mask_indices].view(x.size(0), -1, x.size(-1))
        # print("target after masking", x.size())

        if self.target_glu:
            y = self.target_glu_target(y)
            # NOTE: no negatives are sampled in this target-side pass, so the
            # corresponding `negs = self.target_glu_target(negs)` line from the
            # student forward is omitted here to avoid a NameError.

        x = self.final_proj_target(x)
        # x = self.compute_preds(x, y, negs)
        # print("target before prediction", x.size())

        result = {
            "x": x,
            "padding_mask": padding_mask,
            "features_pen": features_pen,
        }

        if prob_ppl is not None:
            result["prob_perplexity"] = prob_ppl
            result["code_perplexity"] = code_ppl
            result["num_vars"] = num_vars
            result["temp"] = curr_temp

        return result
def forward(
    self,
    source,
    padding_mask=None,
    mask=True,
    features_only=False,
    layer=None,
    fix_n=0,
    mask_indices=None,
    mask_channel_indices=None,
    padding_count=None,
):
    if self.feature_grad_mult > 0:
        features = self.feature_extractor(source)
        if self.feature_grad_mult != 1.0:
            features = GradMultiply.apply(features, self.feature_grad_mult)
    else:
        with torch.no_grad():
            features = self.feature_extractor(source)

    features_pen = features.float().pow(2).mean()

    features = features.transpose(1, 2)
    features = self.layer_norm(features)
    unmasked_features = features.clone()

    if padding_mask is not None and padding_mask.any():
        input_lengths = (1 - padding_mask.long()).sum(-1)
        # apply conv formula to get real output_lengths
        output_lengths = self._get_feat_extract_output_lengths(input_lengths)

        padding_mask = torch.zeros(features.shape[:2],
                                   dtype=features.dtype,
                                   device=features.device)

        # these two operations make sure that all values
        # before the output length indices are attended to
        padding_mask[(
            torch.arange(padding_mask.shape[0], device=padding_mask.device),
            output_lengths - 1,
        )] = 1
        padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()
    else:
        padding_mask = None

    if self.post_extract_proj is not None:
        features = self.post_extract_proj(features)

    features = self.dropout_input(features)
    unmasked_features = self.dropout_features(unmasked_features)

    num_vars = None
    code_ppl = None
    prob_ppl = None
    curr_temp = None

    if self.input_quantizer:
        q = self.input_quantizer(features, produce_targets=False)
        features = q["x"]
        num_vars = q["num_vars"]
        code_ppl = q["code_perplexity"]
        prob_ppl = q["prob_perplexity"]
        curr_temp = q["temp"]
        features = self.project_inp(features)

    if mask:
        x, mask_indices = self.apply_mask(
            features,
            padding_mask,
            mask_indices=mask_indices,
            mask_channel_indices=mask_channel_indices,
        )
        if not is_xla_tensor(x) and mask_indices is not None:
            # tpu-comment: reducing the size in a dynamic way causes
            # too many recompilations on xla.
            y = unmasked_features[mask_indices].view(
                unmasked_features.size(0), -1, unmasked_features.size(-1))
        else:
            y = unmasked_features
    else:
        x = features
        y = unmasked_features
        mask_indices = None

    x, layer_results = self.encoder(x,
                                    padding_mask=padding_mask,
                                    layer=layer,
                                    fix_n=fix_n)

    if features_only:
        return {
            "x": x,
            "padding_mask": padding_mask,
            "features": unmasked_features,
            "layer_results": layer_results,
        }

    x = self.final_proj(x)
    result = {
        "features": x,
        # "features_padding_mask": padding_mask,
        # "layer_results": layer_results,
    }

    if self.quantizer:
        q = self.quantizer(y, produce_targets=False)
        y = q["x"]
        num_vars = q["num_vars"]
        code_ppl = q["code_perplexity"]
        prob_ppl = q["prob_perplexity"]
        curr_temp = q["temp"]
        y = self.project_q(y)

        q_unmasked = self.quantizer(unmasked_features, produce_targets=False)
        quant_features = q_unmasked["x"]
        # quant_features = self.q2h_proj(q_unmasked["x"])

        if self.negatives_from_everywhere:
            # neg_cands = self.quantizer(unmasked_features, produce_targets=False)["x"]
            neg_cands = q_unmasked["x"]
            negs, _ = self.sample_negatives(
                neg_cands,
                y.size(1),
                padding_count=padding_count,
            )
            negs = self.project_q(negs)
        else:
            negs, _ = self.sample_negatives(
                y,
                y.size(1),
                padding_count=padding_count,
            )

        if self.codebook_negatives > 0:
            cb_negs = self.quantizer.sample_from_codebook(
                y.size(0) * y.size(1), self.codebook_negatives)
            cb_negs = cb_negs.view(self.codebook_negatives, y.size(0),
                                   y.size(1), -1)  # order doesn't matter
            cb_negs = self.project_q(cb_negs)
            negs = torch.cat([negs, cb_negs], dim=0)
    else:
        y = self.project_q(y)

        if self.negatives_from_everywhere:
            negs, _ = self.sample_negatives(
                unmasked_features,
                y.size(1),
                padding_count=padding_count,
            )
            negs = self.project_q(negs)
        else:
            negs, _ = self.sample_negatives(
                y,
                y.size(1),
                padding_count=padding_count,
            )

    if not is_xla_tensor(x):
        # tpu-comment: reducing the size in a dynamic way causes
        # too many recompilations on xla.
        x = x[mask_indices].view(x.size(0), -1, x.size(-1))

    if self.target_glu:
        y = self.target_glu(y)
        negs = self.target_glu(negs)

    # x = self.final_proj(x)
    x = self.compute_preds(x, y, negs)

    result["x"] = x
    result["padding_mask"] = padding_mask
    result["features_pen"] = features_pen
    result["quant_features"] = quant_features

    if prob_ppl is not None:
        result["prob_perplexity"] = prob_ppl
        result["code_perplexity"] = code_ppl
        result["num_vars"] = num_vars
        result["temp"] = curr_temp

    return result
def forward(self, source, padding_mask=None, mask=True, features_only=False):
    if self.feature_grad_mult > 0:
        features = self.feature_extractor(source)
        if self.feature_grad_mult != 1.0:
            features = GradMultiply.apply(features, self.feature_grad_mult)
    else:
        with torch.no_grad():
            features = self.feature_extractor(source)

    features_pen = features.float().pow(2).mean()

    features = features.transpose(1, 2)
    features = self.layer_norm(features)
    unmasked_features = features.clone()

    if padding_mask is not None:
        input_lengths = (1 - padding_mask.long()).sum(-1)
        # apply conv formula to get real output_lengths
        output_lengths = self._get_feat_extract_output_lengths(input_lengths)

        padding_mask = torch.zeros(features.shape[:2],
                                   dtype=features.dtype,
                                   device=features.device)

        # these two operations makes sure that all values
        # before the output lengths indices are attended to
        padding_mask[(torch.arange(padding_mask.shape[0], device=padding_mask.device),
                      output_lengths - 1)] = 1
        padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()

    if self.post_extract_proj is not None:
        features = self.post_extract_proj(features)

    features = self.dropout_input(features)
    unmasked_features = self.dropout_features(unmasked_features)

    num_vars = None
    code_ppl = None
    prob_ppl = None
    curr_temp = None

    if self.input_quantizer:
        q = self.input_quantizer(features, produce_targets=False)
        features = q["x"]
        num_vars = q["num_vars"]
        code_ppl = q["code_perplexity"]
        prob_ppl = q["prob_perplexity"]
        curr_temp = q["temp"]
        features = self.project_inp(features)

    if mask:
        x, mask_indices = self.apply_mask(features, padding_mask)
        if mask_indices is not None:
            y = unmasked_features[mask_indices].view(
                unmasked_features.size(0), -1, unmasked_features.size(-1))
        else:
            y = unmasked_features
    else:
        x = features
        y = unmasked_features
        mask_indices = None

    x = self.encoder(x, padding_mask=padding_mask)

    if features_only:
        return {"x": x, "padding_mask": padding_mask}

    if self.quantizer:
        q = self.quantizer(y, produce_targets=False)
        y = q["x"]
        num_vars = q["num_vars"]
        code_ppl = q["code_perplexity"]
        prob_ppl = q["prob_perplexity"]
        curr_temp = q["temp"]
        y = self.project_q(y)

        if self.negatives_from_everywhere:
            neg_cands, *_ = self.quantizer(unmasked_features, produce_targets=False)
            negs, _ = self.sample_negatives(neg_cands, y.size(1))
            negs = self.project_q(negs)
        else:
            negs, _ = self.sample_negatives(y, y.size(1))

        if self.codebook_negatives > 0:
            cb_negs = self.quantizer.sample_from_codebook(
                y.size(0) * y.size(1), self.codebook_negatives)
            cb_negs = cb_negs.view(self.codebook_negatives, y.size(0),
                                   y.size(1), -1)  # order doesn't matter
            cb_negs = self.project_q(cb_negs)
            negs = torch.cat([negs, cb_negs], dim=0)
    else:
        y = self.project_q(y)

        if self.negatives_from_everywhere:
            negs, _ = self.sample_negatives(unmasked_features, y.size(1))
            negs = self.project_q(negs)
        else:
            negs, _ = self.sample_negatives(y, y.size(1))

    x = x[mask_indices].view(x.size(0), -1, x.size(-1))

    if self.target_glu:
        y = self.target_glu(y)
        negs = self.target_glu(negs)

    x = self.final_proj(x)
    x = self.compute_preds(x, y, negs)

    result = {
        "x": x,
        "padding_mask": padding_mask,
        "features_pen": features_pen,
    }

    if prob_ppl is not None:
        result["prob_perplexity"] = prob_ppl
        result["code_perplexity"] = code_ppl
        result["num_vars"] = num_vars
        result["temp"] = curr_temp

    return result
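# The padding-mask downsampling above relies on _get_feat_extract_output_lengths.
# A minimal sketch of what such a helper typically computes (assumed here, based
# on the standard Conv1d length formula and a wav2vec 2.0-style list of
# (dim, kernel, stride) conv layer specs):
import torch


def feat_extract_output_lengths_sketch(input_lengths: torch.LongTensor, conv_layer_specs):
    def conv_out_length(length, kernel, stride):
        # floor((length - kernel) / stride) + 1, assuming no padding and dilation 1
        return torch.div(length - kernel, stride, rounding_mode="floor") + 1

    for _, kernel, stride in conv_layer_specs:
        input_lengths = conv_out_length(input_lengths, kernel, stride)
    return input_lengths.to(torch.long)


# e.g. the default wav2vec 2.0 extractor maps 16000 samples (1 s of 16 kHz audio)
# to roughly 49 frames.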
def forward(self, src_tokens, src_lengths, src_doctopic, src_wordtopics):
    # embed tokens and positions
    # src_doctopic: batchsize x 512
    # src_wordtopics: batchsize x wordcount x 512
    src_doctopic_ext = src_doctopic.unsqueeze(1)  # batchsize x 1 x 512
    src_wordtopics_doctopic = src_wordtopics * src_doctopic_ext  # batchsize x wordcount x 512
    # Variant 2) additionally L2-normalizes the combined topics (April 29th):
    # src_wordtopics_doctopic = F.normalize(src_wordtopics_doctopic, p=2, dim=2)

    x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)  # batchsize x wordcount x 512
    # word_embedding = torch.FloatTensor(vector_dict.get_embedding(src_tokens.cpu().numpy(), self.embed_dim)).to('cuda')
    # x = word_embedding + self.embed_positions(src_tokens)  # batchsize x wordcount x 512

    # concat wordtopics*doctopic to (wordembedding + posembedding)
    x = torch.cat((x, src_wordtopics_doctopic), 2)
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # temporal convolutions
    for proj, conv in zip(self.projections, self.convolutions):
        residual = x if proj is None else proj(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        padding_l = (conv.kernel_size[0] - 1) // 2
        padding_r = conv.kernel_size[0] // 2
        x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
        x = conv(x)

        # NGTU BEGIN
        part_1, part_2 = x[:, :, :self.embed_dim], x[:, :, self.embed_dim:]
        part_1 = torch.tanh(part_1)
        x = torch.cat([part_1, part_2], dim=2)
        x = F.glu(x, dim=2)
        x = self.lay_norm(x + residual)
        # NGTU END

        # original GLU:
        # x = F.glu(x, dim=2)
        # x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding) * math.sqrt(0.5)

    return x, y
def forward(
    self,
    src_tokens,
    src_lengths,
    return_all_hiddens=False,
    padding_mask=None,
    features_only=True,
):
    mask = self.training or self.alway_mask

    if self.feature_grad_mult > 0 and self.training:
        features = self.subsample(src_tokens)
        if self.feature_grad_mult != 1.0:
            features = GradMultiply.apply(features, self.feature_grad_mult)
    else:
        with torch.no_grad():
            features = self.subsample(src_tokens)

    features = features.transpose(1, 2)
    features = self.feat_layer_norm(features)
    if self.feat_proj is not None:
        features = self.feat_proj(features)

    if padding_mask is not None:
        input_lengths = (1 - padding_mask.long()).sum(-1)
        # apply conv formula to get real output_lengths
        output_lengths = self._get_feat_extract_output_lengths(input_lengths)

        padding_mask = torch.zeros(features.shape[:2],
                                   dtype=features.dtype,
                                   device=features.device)

        # these two operations makes sure that all values
        # before the output lengths indices are attended to
        padding_mask[(
            torch.arange(padding_mask.shape[0], device=padding_mask.device),
            output_lengths - 1,
        )] = 1
        padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()

    features = self.feat_scale * features if self.feat_scale != 1.0 else features
    unmasked_features = features.clone()

    features = self.dropout_input(features)
    unmasked_features = self.dropout_features(unmasked_features)

    if mask:
        x, mask_indices = self.apply_mask(features, padding_mask)
    else:
        x = features
        mask_indices = None

    def cal_transformer_layers(x, encoder_padding_mask, return_all_hiddens=False):
        # x: B x T x C
        positions = self.embed_positions(x.transpose(1, 2)).transpose(1, 2)
        x = x + positions
        if not self.normalize_before:
            x = self.layer_norm(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        encoder_states = []
        for layer in self.layers:
            x = layer(x, encoder_padding_mask)
            if return_all_hiddens:
                encoder_states.append(x)

        if self.normalize_before:
            x = self.layer_norm(x)
        return x, encoder_states

    x, encoder_states = cal_transformer_layers(x, padding_mask, return_all_hiddens)

    if features_only:
        return {
            "encoder_out": [x],  # [T x B x C]
            "encoder_padding_mask": [padding_mask] if padding_mask is not None else [],  # B x T
            "encoder_embedding": [],
            # "encoder_states": encoder_states,  # List[T x B x C]
            "src_tokens": [],
            "src_lengths": [],
            "mask_indices": [mask_indices],
        }

    x_unmasked = x
    if self.mask_prob > 0 or self.mask_channel_prob > 0:
        x_unmasked, _ = cal_transformer_layers(unmasked_features, padding_mask)

    return {
        "encoder_out": [x],  # [T x B x C]
        "encoder_unmasked_out": [x_unmasked],  # [T x B x C]
        "encoder_padding_mask": [padding_mask] if padding_mask is not None else [],  # B x T
        "encoder_embedding": [],
        # "encoder_states": encoder_states,  # List[T x B x C]
        "src_tokens": [],
        "src_lengths": [],
        "mask_indices": [mask_indices] if mask_indices is not None else [],  # B x T
    }
def forward(self, source, padding_mask=None, mask=True, features_only=False):
    if self.feature_grad_mult > 0:
        features = self.feature_extractor(source)
        if self.feature_grad_mult != 1.0:
            features = GradMultiply.apply(features, self.feature_grad_mult)
    else:
        with torch.no_grad():
            features = self.feature_extractor(source)

    features_pen = features.float().pow(2).mean()

    features = features.transpose(1, 2)
    features = self.layer_norm(features)
    unmasked_features = features.clone()

    if padding_mask is not None:
        extra = padding_mask.size(1) % features.size(1)
        if extra > 0:
            padding_mask = padding_mask[:, :-extra]
        padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1)
        padding_mask = padding_mask.all(-1)

    if self.post_extract_proj is not None:
        features = self.post_extract_proj(features)

    features = self.dropout_input(features)
    unmasked_features = self.dropout_features(unmasked_features)

    num_vars = None
    code_ppl = None
    prob_ppl = None
    curr_temp = None

    if mask:
        x, mask_indices = self.apply_mask(features, padding_mask)
        if mask_indices is not None:
            y = unmasked_features[mask_indices].view(
                unmasked_features.size(0), -1, unmasked_features.size(-1))
        else:
            y = unmasked_features
    else:
        x = features
        y = unmasked_features
        mask_indices = None

    x = self.encoder(x, padding_mask=padding_mask)

    if features_only:
        return {"x": x, "padding_mask": padding_mask}

    y = self.project_q(y)

    if self.negatives_from_everywhere:
        negs, _ = self.sample_negatives(unmasked_features, y.size(1))
        negs = self.project_q(negs)
    else:
        negs, _ = self.sample_negatives(y, y.size(1))

    logits_ali = self.phone_proj(x[mask_indices])  # N, V

    x = x[mask_indices].view(x.size(0), -1, x.size(-1))

    if self.target_glu:
        y = self.target_glu(y)
        negs = self.target_glu(negs)

    x = self.final_proj(x)
    x = self.compute_preds(x, y, negs)

    result = {
        "x": x,
        "padding_mask": padding_mask,
        "mask_indices": mask_indices,
        "features_pen": features_pen,
        "logits_ali": logits_ali,
    }
    return result
def forward(self, src_tokens, src_lengths, src_doctopic, src_wordtopics):
    # embed tokens and positions
    # Shapes:
    #   src_doctopic: batchsize x 512
    #   src_wordtopics: batchsize x wordcount x 512

    # Variant 1: encoder/decoder with enc(t', tD), dec(tD)
    src_doctopic_ext = src_doctopic.unsqueeze(1)  # batchsize x 1 x 512
    src_wordtopics_doctopic = src_wordtopics * src_doctopic_ext  # batchsize x wordcount x 512
    # Variant 2: encoder/decoder with enc(t'), dec(tD)
    # src_wordtopics_doctopic = src_wordtopics  # batchsize x wordcount x 512

    x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)  # batchsize x wordcount x 512

    # concat wordtopics*doctopic to (wordembedding + posembedding)
    x = torch.cat((x, src_wordtopics_doctopic), 2)
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # temporal convolutions
    for proj, conv in zip(self.projections, self.convolutions):
        residual = x if proj is None else proj(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        padding_l = (conv.kernel_size[0] - 1) // 2
        padding_r = conv.kernel_size[0] // 2
        x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
        x = conv(x)
        x = F.glu(x, dim=2)  # GLU
        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding) * math.sqrt(0.5)

    return x, y
def forward(self, src_tokens, src_lengths):
    # embed tokens and positions
    x = self.embed_tokens(src_tokens)
    x = F.dropout2d(x, p=self.token_dropout, training=self.training)
    x += self.embed_positions(src_tokens)
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # used to mask padding in input
    encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
    if not encoder_padding_mask.any():
        encoder_padding_mask = None

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    residuals = [x]
    # temporal convolutions
    for proj, conv, res_layer in zip(self.projections, self.convolutions, self.residuals):
        if res_layer > 0:
            residual = residuals[-res_layer]
            residual = residual if proj is None else proj(residual)
        else:
            residual = None

        if encoder_padding_mask is not None:
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        x = F.dropout(x, p=self.dropout, training=self.training)
        if conv.kernel_size[0] % 2 == 1:
            # padding is implicit in the conv
            x = conv(x)
        else:
            padding_l = (conv.kernel_size[0] - 1) // 2
            padding_r = conv.kernel_size[0] // 2
            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
            x = conv(x)
        x = F.glu(x, dim=2)

        if residual is not None:
            x = (x + residual) * math.sqrt(self.normalization_constant)
        residuals.append(x)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # project back to size of embedding
    x = self.fc2(x)

    if encoder_padding_mask is not None:
        encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
        x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding) * math.sqrt(self.normalization_constant)

    return {
        'encoder_out': (x, y),
        'encoder_padding_mask': encoder_padding_mask,  # B x T
    }
def mult_rst_grad(self, rst, ratio):
    assert isinstance(rst, dict)  # instead of EncoderOut
    assert len(rst["encoder_out"]) == 1
    rst["encoder_out"][0] = GradMultiply.apply(rst["encoder_out"][0], ratio)
    return rst
def forward(self, source, padding_mask=None, mask=True, features_only=False):
    if self.feature_grad_mult > 0:
        features = self.feature_extractor(source)
        if self.feature_grad_mult != 1.0:
            features = GradMultiply.apply(features, self.feature_grad_mult)
    else:
        with torch.no_grad():
            features = self.feature_extractor(source)

    features_pen = features.float().pow(2).mean()

    features = features.transpose(1, 2)
    features = self.layer_norm(features)
    unmasked_features = features.clone()

    if padding_mask is not None:
        extra = padding_mask.size(1) % features.size(1)
        if extra > 0:
            padding_mask = padding_mask[:, :-extra]
        padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1)
        padding_mask = padding_mask.all(-1)

    if self.post_extract_proj is not None:
        features = self.post_extract_proj(features)

    features = self.dropout_input(features)
    unmasked_features = self.dropout_features(unmasked_features)

    num_vars = None
    code_ppl = None
    prob_ppl = None
    curr_temp = None

    if self.input_quantizer:
        q = self.input_quantizer(features, produce_targets=False)
        features = q["x"]
        num_vars = q["num_vars"]
        code_ppl = q["code_perplexity"]
        prob_ppl = q["prob_perplexity"]
        curr_temp = q["temp"]
        features = self.project_inp(features)

    if mask:
        x, mask_indices = self.apply_mask(features, padding_mask)
        if mask_indices is not None:
            y = unmasked_features[mask_indices].view(
                unmasked_features.size(0), -1, unmasked_features.size(-1))
        else:
            y = unmasked_features
    else:
        x = features
        y = unmasked_features
        mask_indices = None

    x = self.encoder(x, padding_mask=padding_mask)

    if features_only:
        return {"x": x, "padding_mask": padding_mask}

    if self.quantizer:
        q = self.quantizer(y, produce_targets=False)
        y = q["x"]
        num_vars = q["num_vars"]
        code_ppl = q["code_perplexity"]
        prob_ppl = q["prob_perplexity"]
        curr_temp = q["temp"]
        y = self.project_q(y)

        if self.negatives_from_everywhere:
            neg_cands, *_ = self.quantizer(unmasked_features, produce_targets=False)
            negs, _ = self.sample_negatives(neg_cands, y.size(1))
            negs = self.project_q(negs)
        else:
            negs, _ = self.sample_negatives(y, y.size(1))

        if self.codebook_negatives > 0:
            cb_negs = self.quantizer.sample_from_codebook(
                y.size(0) * y.size(1), self.codebook_negatives)
            cb_negs = cb_negs.view(self.codebook_negatives, y.size(0),
                                   y.size(1), -1)  # order doesn't matter
            cb_negs = self.project_q(cb_negs)
            negs = torch.cat([negs, cb_negs], dim=0)
    else:
        y = self.project_q(y)

        if self.negatives_from_everywhere:
            negs, _ = self.sample_negatives(unmasked_features, y.size(1))
            negs = self.project_q(negs)
        else:
            negs, _ = self.sample_negatives(y, y.size(1))

    x = x[mask_indices].view(x.size(0), -1, x.size(-1))

    if self.target_glu:
        y = self.target_glu(y)
        negs = self.target_glu(negs)

    x = self.final_proj(x)
    x = self.compute_preds(x, y, negs)

    result = {
        "x": x,
        "padding_mask": padding_mask,
        "features_pen": features_pen,
    }

    if prob_ppl is not None:
        result["prob_perplexity"] = prob_ppl
        result["code_perplexity"] = code_ppl
        result["num_vars"] = num_vars
        result["temp"] = curr_temp

    return result
def forward(self, text_sequences, text_positions=None, lengths=None, speaker_embed=None):
    assert self.n_speakers == 1 or speaker_embed is not None

    # embed text_sequences
    x = self.embed_tokens(text_sequences)
    if text_positions is not None:
        x += self.embed_text_positions(text_positions)
    x = F.dropout(x, p=self.dropout, training=self.training)

    # embed speakers
    if speaker_embed is not None:
        # expand speaker embedding for all time steps
        # (B, N) -> (B, T, N)
        ss = speaker_embed.size()
        speaker_embed = speaker_embed.unsqueeze(1).expand(ss[0], x.size(1), ss[-1])
        speaker_embed_btc = speaker_embed
        speaker_embed_tbc = speaker_embed.transpose(0, 1)
        x += F.softsign(self.speaker_fc1(speaker_embed_btc))

    input_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    use_convtbc = isinstance(self.convolutions[0], _ConvTBC)
    # TBC case: B x T x C -> T x B x C
    # Generic case: B x T x C -> B x C x T
    x = x.transpose(0, 1) if use_convtbc else x.transpose(1, 2)

    # 1D conv blocks
    for proj, speaker_proj, conv in zip(self.projections, self.speaker_projections, self.convolutions):
        residual = x if proj is None else proj(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = conv(x)
        splitdim = -1 if use_convtbc else 1
        a, b = x.split(x.size(splitdim) // 2, dim=splitdim)
        if speaker_proj is not None:
            softsign = F.softsign(
                speaker_proj(speaker_embed_tbc if use_convtbc else speaker_embed_btc))
            softsign = softsign if use_convtbc else softsign.transpose(1, 2)
            a = a + softsign
        x = a * F.sigmoid(b)
        x = (x + residual) * math.sqrt(0.5)

    # back to batch first
    x = x.transpose(0, 1) if use_convtbc else x.transpose(1, 2)

    # project back to size of embedding
    keys = self.fc2(x)
    if speaker_embed is not None:
        keys += F.softsign(self.speaker_fc2(speaker_embed_btc))

    # scale gradients (this only affects backward, not forward)
    if self.num_attention_layers is not None:
        keys = GradMultiply.apply(keys, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    values = (keys + input_embedding) * math.sqrt(0.5)

    return keys, values
def forward(self, src_tokens, src_lengths):
    """
    Args:
        src_tokens (LongTensor): tokens in the source language of shape
            `(batch, src_len)`
        src_lengths (LongTensor): lengths of each source sentence of shape
            `(batch)`

    Returns:
        dict:
            - **encoder_out** (tuple): a tuple with two elements, where the
              first element is the last encoder layer's output and the
              second element is the same quantity summed with the input
              embedding (used for attention). The shape of both tensors is
              `(batch, src_len, embed_dim)`.
            - **encoder_padding_mask** (ByteTensor): the positions of
              padding elements of shape `(batch, src_len)`
    """
    # ELMo: map token ids back to strings and embed them with ELMo
    src_sens = src_tokens.cpu().numpy().tolist()
    for i in range(len(src_sens)):
        for j in range(len(src_sens[i])):
            src_sens[i][j] = self.id2token[src_sens[i][j]]
    char_ids = batch_to_ids(src_sens)
    elmo_embeds = self.elmo(char_ids.to('cuda'))
    elmo_embeds = elmo_embeds['elmo_representations']
    input_elmo_embeds = elmo_embeds[0]
    output_elmo_embeds = None
    if self.args.num_output_repr == 2:
        output_elmo_embeds = elmo_embeds[1]

    if torch.cuda.is_available():
        input_elmo_embeds = input_elmo_embeds.to('cuda')
        if output_elmo_embeds is not None:
            output_elmo_embeds = output_elmo_embeds.to('cuda')

    if self.args.merge_mode == 'sum':
        input_elmo_embeds = self.elmo_projection(input_elmo_embeds)
        if output_elmo_embeds is not None:
            output_elmo_embeds = self.elmo_projection(output_elmo_embeds)

    if self.args.use_other_embed:
        # embed tokens and positions
        if self.args.merge_mode == 'sum':
            x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens) + input_elmo_embeds
        else:
            x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
            x = torch.cat((x, input_elmo_embeds), dim=-1)
    else:
        # just use the ELMo representation
        if self.args.merge_mode == 'sum':
            x = input_elmo_embeds + self.embed_positions(src_tokens)
        else:
            x = torch.cat((self.embed_positions(src_tokens), input_elmo_embeds), dim=-1)

    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # used to mask padding in input
    encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
    if not encoder_padding_mask.any():
        encoder_padding_mask = None

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    residuals = [x]
    # temporal convolutions
    for proj, conv, res_layer in zip(self.projections, self.convolutions, self.residuals):
        if res_layer > 0:
            residual = residuals[-res_layer]
            residual = residual if proj is None else proj(residual)
        else:
            residual = None

        if encoder_padding_mask is not None:
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        x = F.dropout(x, p=self.dropout, training=self.training)
        if conv.kernel_size[0] % 2 == 1:
            # padding is implicit in the conv
            x = conv(x)
        else:
            padding_l = (conv.kernel_size[0] - 1) // 2
            padding_r = conv.kernel_size[0] // 2
            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
            x = conv(x)
        x = F.glu(x, dim=2)

        if residual is not None:
            x = (x + residual) * math.sqrt(0.5)
        residuals.append(x)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # ELMo and `concat`
    if output_elmo_embeds is not None and self.args.merge_mode == 'concat':
        x = torch.cat((x, output_elmo_embeds), dim=-1)

    # project back to size of embedding
    x = self.fc2(x)

    # ELMo and `sum`
    if output_elmo_embeds is not None and self.args.merge_mode == 'sum':
        x += output_elmo_embeds

    if encoder_padding_mask is not None:
        encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
        x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding) * math.sqrt(0.5)

    return {
        'encoder_out': (x, y),
        'encoder_padding_mask': encoder_padding_mask,  # B x T
    }
def forward(self, src_tokens, src_lengths):
    """
    Args:
        src_tokens (LongTensor): tokens in the source language of shape
            `(batch, src_len)`
        src_lengths (LongTensor): lengths of each source sentence of shape
            `(batch)`

    Returns:
        dict:
            - **encoder_out** (tuple): a tuple with two elements, where the
              first element is the last encoder layer's output and the
              second element is the same quantity summed with the input
              embedding (used for attention). The shape of both tensors is
              `(batch, src_len, embed_dim)`.
            - **encoder_padding_mask** (ByteTensor): the positions of
              padding elements of shape `(batch, src_len)`
    """
    # embed tokens and positions
    x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
    x = F.dropout(x, p=self.dropout, training=self.training)
    input_embedding = x

    # project to size of convolution
    x = self.fc1(x)

    # used to mask padding in input
    encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
    if not encoder_padding_mask.any():
        encoder_padding_mask = None

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    residuals = [x]
    # temporal convolutions
    for proj, convs, res_layer in zip(self.projections, self.inner_convolutions, self.residuals):
        if res_layer > 0:
            residual = residuals[-res_layer]
            residual = residual if proj is None else proj(residual)
        else:
            residual = None

        if encoder_padding_mask is not None:
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        x = F.dropout(x, p=self.dropout, training=self.training)

        # apply every inner convolution to the same input and sum the results
        conv_list = []
        for conv in convs:
            if conv.kernel_size[0] % 2 == 1:
                # padding is implicit in the conv
                t = conv(x)
            else:
                padding_l = (conv.kernel_size[0] - 1) // 2
                padding_r = conv.kernel_size[0] // 2
                t = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
                t = conv(t)
            conv_list.append(t)

        x = torch.stack(conv_list, dim=1)
        x = x.sum(dim=1)
        x = F.glu(x, dim=2)

        # TODO(naetherm): the layer normalization below used to be applied only
        # after the last encoder layer; here it is placed between encoder layers
        x = F.layer_norm(x, x.shape)

        if residual is not None:
            x = (x + residual) * math.sqrt(0.5)
        residuals.append(x)

    # T x B x C -> B x T x C
    x = x.transpose(1, 0)

    # TODO(naetherm): first plausible position for the layer normalization

    # project back to size of embedding
    x = self.fc2(x)

    # TODO(naetherm): second plausible position for the layer normalization
    # x = F.layer_norm(x, x.shape)

    if encoder_padding_mask is not None:
        encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
        x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

    # TODO(naetherm): third plausible position for the layer normalization

    # scale gradients (this only affects backward, not forward)
    x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

    # add output to input embedding for attention
    y = (x + input_embedding) * math.sqrt(0.5)

    return {
        'encoder_out': (x, y),
        'encoder_padding_mask': encoder_padding_mask,  # B x T
    }