      raise RevertingUnsavedModelException(
        "revert_to_best_model() is illegal because this model has never been saved.")
    for subcol_name, subcol in self.subcols.items():
      data_file = os.path.join(self._data_files[0], subcol_name)
      loaded = torch.load(data_file)
      subcol.load_state_dict(loaded.state_dict())

  def global_collection(self):
    return self.subcols

  def parameter_count(self) -> numbers.Integral:
    return sum(p.numel() for p in self.subcols.parameters())

ParamCollection = xnmt.resolve_backend(ParamCollectionDynet, ParamCollectionTorch)


class ResourceCollection(object):
  def __init__(self, subcol_name):
    self.resources = []
    self.subcol_name = subcol_name

  def add(self, orig_file, save_postfix):
    self.resources.append((orig_file, save_postfix))
    return ResourceFile(filename=f"{self.subcol_name}-{save_postfix}")

  def save(self, data_dir):
    for orig_file, save_postfix in self.resources:
      shutil.copyfile(orig_file,
@xnmt.require_torch
class LayerNormTorch(Serializable, transforms.Transform):
  yaml_tag = "!LayerNorm"

  @serializable_init
  def __init__(self, hidden_dim: numbers.Integral) -> None:
    my_params = param_collections.ParamManager.my_params(self)
    self.layer_norm = nn.LayerNorm(normalized_shape=hidden_dim).to(xnmt.device)
    my_params.append(self.layer_norm)

  def transform(self, x: tt.Tensor) -> tt.Tensor:
    return self.layer_norm(x)

LayerNorm = xnmt.resolve_backend(LayerNormDynet, LayerNormTorch)

BN_EPS = 0.1
BN_MOMENTUM = 0.1


@xnmt.require_dynet
class BatchNormDynet(Serializable, transforms.Transform, transducers.SeqTransducer):
  """
  Implements batch normalization according to Ioffe and Szegedy, 2015.

  Supports application to matrices or higher-order tensors, in which case one dimension is interpreted
  as the time dimension and sequential batch norm is applied.

  A known issue is that the running mean / std is not reverted when reverting the parameters to the best model,
    elif comb_method == "avg":
      return batched_expr.mean()
    else:
      raise ValueError(f"Unknown batch combination method '{comb_method}', expected 'sum' or 'avg'.")

  def get_factored_loss_val(self, comb_method: str = "sum") -> 'FactoredLossVal':
    return FactoredLossVal({k: self._combine_batches(v, comb_method).cpu().data.numpy()
                            for k, v in self.expr_factors.items()})

FactoredLossExpr = xnmt.resolve_backend(FactoredLossExprDynet, FactoredLossExprTorch)


class FactoredLossVal(object):
  """
  Loss consisting of (unbatched) float values, with one value per loss factor.

  Used to represent losses accumulated across several training steps.
  """

  def __init__(self, loss_dict=None) -> None:
    if loss_dict is None:
      loss_dict = {}
    self._loss_dict = loss_dict

  def __iadd__(self, other: 'FactoredLossVal') -> 'FactoredLossVal':
    """
                                                       multiplicator=-100.0)
    normalized = F.softmax(scores, dim=-1)
    self.attention_vecs.append(normalized)
    return normalized

  def params_from_dynet(self, arrays, state_dict):
    assert len(arrays) == 4
    return {'0.weight': arrays[0],
            '0.bias': arrays[2],
            '1.weight': arrays[1],
            '2.weight': arrays[3]}

MlpAttender = xnmt.resolve_backend(MlpAttenderDynet, MlpAttenderTorch)


@xnmt.require_dynet
class DotAttenderDynet(Attender, Serializable):
  """
  Implements dot product attention of https://arxiv.org/abs/1508.04025
  Also (optionally) performs scaling of https://arxiv.org/abs/1706.03762

  Args:
    scale: whether to perform scaling
  """
  yaml_tag = '!DotAttender'

  @serializable_init
  def get_final_states(self) -> List[FinalTransducerState]:
    return self._final_states

  def transduce(self, src: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    src_tensor = src.as_tensor()
    out_mask = src.mask
    if self.downsample_by > 1:
      assert src_tensor.dim() == 3, \
        f"Downsampling only supported for tensors of order two (+ batch). Found dims {src_tensor.size()}"
      batch_size, seq_len, hidden_dim = src_tensor.size()
      if seq_len % self.downsample_by != 0:
        raise ValueError("For downsampling, sequence lengths must be multiples of the total reduce factor. "
                         "Configure batcher accordingly.")
      src_tensor = src_tensor.view((batch_size, seq_len // self.downsample_by, hidden_dim * self.downsample_by))
      if out_mask:
        out_mask = out_mask.lin_subsampled(reduce_factor=self.downsample_by)
    output = self.transform.transform(src_tensor)
    output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=out_mask)
    self._final_states = [FinalTransducerState(output_seq[-1])]
    return output_seq

TransformSeqTransducer = xnmt.resolve_backend(TransformSeqTransducerDynet, TransformSeqTransducerTorch)
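# Illustrative sketch (not part of xnmt): the downsampling in transduce() above folds every
# `downsample_by` consecutive timesteps into one position by widening the feature dimension,
# i.e. (batch, seq_len, hidden) -> (batch, seq_len // k, hidden * k). The function name and
# shapes below are made up for demonstration.
def _downsampling_shape_example():
  import torch
  k = 2
  x = torch.arange(2 * 6 * 4, dtype=torch.float32).view(2, 6, 4)  # (batch=2, seq=6, hidden=4)
  folded = x.view(2, 6 // k, 4 * k)                               # (2, 3, 8)
  # Each output position concatenates k consecutive input timesteps along the feature axis,
  # which is why seq_len must be divisible by the total reduce factor.
  assert torch.equal(folded[0, 0], torch.cat([x[0, 0], x[0, 1]]))
  return folded.size()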
    if multiplicator is not None:
      mask_expr = torch.as_tensor(np.expand_dims(self.np_arr, axis=1) * multiplicator,
                                  dtype=tensor_expr.dtype, device=xnmt.device)
    else:
      mask_expr = torch.as_tensor(np.expand_dims(self.np_arr, axis=1),
                                  dtype=tensor_expr.dtype, device=xnmt.device)
    ret = tensor_expr + mask_expr
    assert ret.size() == tensor_expr.size()
    return ret

Mask = xnmt.resolve_backend(MaskDynet, MaskTorch)


class Batcher(object):
  """
  A template class to convert a list of sentences to several batches of sentences.

  Args:
    batch_size: batch size
    granularity: 'sent' or 'word'
    pad_src_to_multiple: pad source sentences so that their length is a multiple of this integer.
    sort_within_by_trg_len: whether to sort by reverse trg len inside a batch
  """

  def __init__(self,
               batch_size: numbers.Integral,
               granularity: str = 'sent',
                               axis=0)
    arrays[2] = np.concatenate([arrays[2][:h_dim],
                                arrays[2][h_dim:h_dim * 2] + 1,
                                arrays[2][h_dim * 3:],
                                arrays[2][h_dim * 2:h_dim * 3]],
                               axis=0)
    return {'0.0.weight_ih': arrays[0],
            '0.0.weight_hh': arrays[1],
            '0.0.bias_ih': arrays[2],
            '0.0.bias_hh': np.zeros_like(arrays[2])}

UniLSTMSeqTransducer = xnmt.resolve_backend(UniLSTMSeqTransducerDynet, UniLSTMSeqTransducerTorch)


class BiLSTMSeqTransducer(transducers.SeqTransducer, Serializable):
  """
  This implements a bidirectional LSTM and requires about 8.5% less memory per timestep than DyNet's
  CompactVanillaLSTMBuilder due to avoiding concat operations.
  It uses 2 :class:`xnmt.lstm.UniLSTMSeqTransducer` objects in each layer.

  Args:
    layers: number of layers
    input_dim: input dimension
    hidden_dim: hidden dimension
    var_dropout: dropout probability (variational recurrent + vertical dropout)
    param_init: how to initialize weight matrices. In case of an InitializerSequence, the order is
                fwd_l0, bwd_l0, fwd_l1, bwd_l1, ..
    bias_init: how to initialize bias vectors. In case of an InitializerSequence, the order is
               fwd_l0, bwd_l0, fwd_l1, bwd_l1, ..
  def get_final_states(self) -> List[transducers.FinalTransducerState]:
    return self._final_states

  def transduce(self, src: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    sent_len = src.sent_len()
    batch_size = tt.batch_size(src[0])
    embeddings = self.embeddings(torch.tensor([list(range(sent_len))] * batch_size).to(xnmt.device))
    # embeddings = dy.strided_select(dy.parameter(self.embedder), [1,1], [0,0], [self.input_dim, sent_len])
    if self.op == 'sum':
      output = embeddings + src.as_tensor()
    elif self.op == 'concat':
      output = tt.concatenate([embeddings, src.as_tensor()])
    else:
      raise ValueError(f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")')
    if self.train and self.dropout > 0.0:
      output = tt.dropout(output, self.dropout)
    output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=src.mask)
    self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
    return output_seq

PositionalSeqTransducer = xnmt.resolve_backend(PositionalSeqTransducerDynet, PositionalSeqTransducerTorch)
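# Illustrative sketch (not part of xnmt): with op == 'sum', each timestep receives a learned
# position embedding added elementwise to its input vector; with op == 'concat', the two are
# stacked along the feature axis instead. All names and shapes below are hypothetical.
def _positional_embedding_example():
  import torch
  import torch.nn as nn
  batch_size, sent_len, hidden_dim = 2, 5, 8
  pos_table = nn.Embedding(num_embeddings=100, embedding_dim=hidden_dim)
  positions = torch.arange(sent_len).unsqueeze(0).expand(batch_size, sent_len)
  pos_emb = pos_table(positions)                       # (2, 5, 8)
  src = torch.randn(batch_size, sent_len, hidden_dim)  # stand-in for src.as_tensor()
  summed = src + pos_emb                               # 'sum': shape unchanged, (2, 5, 8)
  concatenated = torch.cat([pos_emb, src], dim=-1)     # 'concat': feature dim doubles, (2, 5, 16)
  return summed.size(), concatenated.size()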
    self.expr_transposed_tensor = self.as_tensor().transpose(1, 2)
    return self.expr_transposed_tensor

  # should get rid of this:
  # def dim(self) -> tuple:
  #   """
  #   Return dimension of the expression sequence
  #
  #   Returns:
  #     result of self.as_tensor().dim(), without explicitly constructing that tensor
  #   """
  #   if self.has_tensor(): return self.as_tensor().dim()
  #   else:
  #     return tuple(list(self[0].dim()[0]) + [len(self)]), self[0].dim()[1]

ExpressionSequence = xnmt.resolve_backend(ExpressionSequenceDynet, ExpressionSequenceTorch)


@xnmt.require_dynet
class LazyNumpyExpressionSequenceDynet(ExpressionSequence):
  """
  This is initialized via numpy arrays, and dynet expressions are only created once a consumer
  requests representation as list or tensor.
  """

  def __init__(self, lazy_data: np.ndarray, mask: Optional['batchers.Mask'] = None) -> None:
    """
    Args:
      lazy_data: numpy array, or Batcher.Batch of numpy arrays
    """
    self.lazy_data = lazy_data
    self.expr_list, self.expr_tensor, self.expr_transposed_tensor = None, None, None
    self.mask = mask
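# Illustrative sketch (not part of xnmt): the lazy sequence above keeps the raw numpy data and
# only materializes a backend tensor the first time a consumer asks for one, caching the result
# for subsequent calls. The class below is a stripped-down, hypothetical version of that pattern.
class _LazySequenceSketch:
  def __init__(self, lazy_data):
    self.lazy_data = lazy_data  # raw numpy array, kept until a tensor is actually needed
    self._tensor = None         # filled in on the first as_tensor() call

  def as_tensor(self):
    if self._tensor is None:
      import torch
      self._tensor = torch.as_tensor(self.lazy_data)  # conversion happens only once
    return self._tensor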
    else:
      embeddings = []
      seq_len = x.sent_len()
      for single_sent in x:
        assert single_sent.sent_len() == seq_len
      for word_i in range(seq_len):
        batch = batchers.mark_as_batch([single_sent[word_i] for single_sent in x])
        embeddings.append(self.embed(batch))
    return expression_seqs.ExpressionSequence(expr_list=embeddings,
                                              mask=x.mask if batchers.is_batched(x) else None)

SentEmbedder = xnmt.resolve_backend(SentEmbedderDy, SentEmbedderTorch)


@xnmt.require_dynet
class DenseWordEmbedderDynet(SentEmbedder, transforms.Linear, Serializable):
  """
  Word embeddings via full matrix.

  Args:
    emb_dim: embedding dimension
    weight_noise: apply Gaussian noise with given standard deviation to embeddings
    word_dropout: drop out word types with a certain probability, sampling word types on a per-sentence level,
                  see https://arxiv.org/abs/1512.05287
    fix_norm: fix the norm of word vectors to be radius r, see https://arxiv.org/abs/1710.01329
    param_init: how to initialize weight matrices
    bias_init: how to initialize bias vectors
    vocab_size: vocab size or None
               momentum: numbers.Real = 0.0,
               weight_decay: numbers.Real = 0.0,
               dampening: numbers.Real = 0.0,
               nesterov: bool = False,
               skip_noisy: bool = False,
               rescale_grads: numbers.Real = 5.0) -> None:
    super().__init__(optimizer=torch.optim.SGD(params=ParamManager.global_collection().parameters(),
                                               lr=e0,
                                               momentum=momentum,
                                               weight_decay=weight_decay,
                                               dampening=dampening,
                                               nesterov=nesterov),
                     skip_noisy=skip_noisy,
                     rescale_grads=rescale_grads)

SimpleSGDTrainer = xnmt.resolve_backend(SimpleSGDTrainerDynet, SimpleSGDTrainerTorch)


@xnmt.require_dynet
class MomentumSGDTrainer(XnmtOptimizerDynet, Serializable):
  """
  Stochastic gradient descent with momentum.

  This is a modified version of the SGD algorithm that uses momentum to stabilize the gradient trajectory.

  Args:
    e0: Initial learning rate
    mom: Momentum
    skip_noisy: keep track of a moving average and a moving standard deviation of the log of the gradient norm
                values, and abort a step if the norm of the gradient exceeds four standard deviations of the
                moving average. Reference: https://arxiv.org/pdf/1804.09849.pdf
    rescale_grads: rescale gradients if the observed norm is larger than this given norm
    self.bias = bias
    self.input_dim = input_dim
    self.output_dim = output_dim

    my_params = param_collections.ParamManager.my_params(self)
    self.linear = nn.Linear(in_features=input_dim, out_features=output_dim, bias=bias).to(xnmt.device)
    my_params.append(self.linear)
    my_params.init_params(param_init, bias_init)

  def transform(self, input_expr: tt.Tensor) -> tt.Tensor:
    return self.linear(input_expr)

Linear = xnmt.resolve_backend(LinearDynet, LinearTorch)


@xnmt.require_dynet
class NonLinearDynet(Transform, Serializable):
  """
  Linear projection with optional bias and non-linearity.

  Args:
    input_dim: input dimension
    output_dim: hidden dimension
    bias: whether to add a bias
    activation: One of ``tanh``, ``relu``, ``sigmoid``, ``elu``, ``selu``, ``asinh`` or ``identity``.
    param_init: how to initialize weight matrices
    bias_init: how to initialize bias vectors
  """
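# Illustrative sketch (not part of xnmt): a NonLinear transform is simply a Linear projection
# followed by an elementwise activation. The minimal torch version below is an assumption for
# illustration only, not the DyNet implementation documented above.
def _non_linear_sketch(input_dim=4, output_dim=3):
  import torch
  import torch.nn as nn
  projection = nn.Linear(in_features=input_dim, out_features=output_dim, bias=True)
  activation = torch.tanh           # one of tanh / relu / sigmoid / elu / selu / identity
  x = torch.randn(2, input_dim)     # (batch, input_dim)
  return activation(projection(x))  # (batch, output_dim)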
""" Initialize given weights. Args: weights: parameter tensor to be initialized """ raise NotImplementedError("subclasses must implement initializer()") def __getitem__(self, key) -> None: """ Initialize using position-specific initializer. Default is to use same initializer across positions, unless InitializerSequence is used. """ return self ParamInitializer = xnmt.resolve_backend(ParamInitializerDynet, ParamInitializerTorch) class InitializerSequence(Serializable, ParamInitializer): """ Sequence of position-specific initializers. This can be used when a componenent has several parameter tensors that should each be initialized using a different initializer. Examples would be components with multiple layers, and/or several sets of weight matrices that serve different purposes. The most commonly needed use case of this may be the case of a NumpyInitializer, where one wants to manually specify all network weights using respective numpy arrays. Args: sequence: sequence of initializers
    # Do scaled dot product [batch*num_heads, length, length], rows are keys, columns are queries
    attn_score = torch.matmul(k.transpose(1, 2), q) / sqrt(self.head_dim)
    if expr_seq.mask is not None:
      mask = torch.Tensor(np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0) * -1e10).to(xnmt.device)
      attn_score = attn_score + mask.unsqueeze(2)
    attn_prob = torch.nn.Softmax(dim=1)(attn_score)
    # attn_prob = dy.softmax(attn_score, d=1)
    if self.train and self.dropout > 0.0:
      attn_prob = tt.dropout(attn_prob, self.dropout)
    # Reduce using attention and resize to match [(length, model_size) x batch]
    o = torch.matmul(v, attn_prob).view(x_batch, self.input_dim, x_len).transpose(1, 2)
    # Final transformation
    o = self.lin_o(o)
    # o = bo + o * Wo

    expr_seq = expression_seqs.ExpressionSequence(expr_tensor=o, mask=expr_seq.mask)
    self._final_states = [transducers.FinalTransducerState(expr_seq[-1], None)]
    return expr_seq

MultiHeadAttentionSeqTransducer = xnmt.resolve_backend(MultiHeadAttentionSeqTransducerDynet,
                                                       MultiHeadAttentionSeqTransducerTorch)
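# Illustrative sketch (not part of xnmt): the core of the transducer above is standard scaled
# dot-product attention, softmax(K^T Q / sqrt(d)) applied to V. The sketch below uses the more
# common (batch, length, dim) layout rather than the head-folded, feature-first layout of the
# code above; all names and shapes are assumptions for illustration.
def _scaled_dot_product_attention_sketch():
  import math
  import torch
  import torch.nn.functional as F
  batch, length, head_dim = 2, 5, 8
  q = torch.randn(batch, length, head_dim)
  k = torch.randn(batch, length, head_dim)
  v = torch.randn(batch, length, head_dim)
  scores = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(head_dim)  # (batch, length, length)
  probs = F.softmax(scores, dim=-1)                                  # each row sums to 1 over key positions
  return torch.matmul(probs, v)                                      # (batch, length, head_dim)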
  def sample(self, x: tt.Tensor, n: numbers.Integral, temperature: numbers.Real = 1.0):
    raise NotImplementedError()

  def calc_loss(self, x: tt.Tensor, y: Union[numbers.Integral, List[numbers.Integral]]) -> tt.Tensor:
    if self.label_smoothing:
      # following this implementation:
      # https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/train.py
      pred = self.calc_scores(x)
      eps = self.label_smoothing
      n_class = self.output_dim
      gold = torch.tensor(y).to(xnmt.device)
      one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)
      # one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / n_class # original version does not add up to 1
      one_hot = one_hot * (1 - eps) + eps / n_class
      log_prb = F.log_softmax(pred, dim=1)
      return -(torch.matmul(one_hot.unsqueeze(1), log_prb.unsqueeze(2)).squeeze(2))  # neg dot product
    else:
      # scores = torch.nn.LogSoftmax(dim=-1)(self.calc_scores(x))
      # return F.nll_loss(input=scores, target=torch.tensor(y).to(xnmt.device), reduction='none')
      if np.isscalar(y):
        y = [y]
      return F.cross_entropy(self.calc_scores(x),
                             target=torch.tensor(y, dtype=torch.long).to(xnmt.device),
                             reduction='none')

  def calc_probs(self, x: tt.Tensor) -> tt.Tensor:
    return torch.nn.Softmax(dim=-1)(self.calc_scores(x))

  def calc_log_probs(self, x: tt.Tensor) -> tt.Tensor:
    return torch.nn.LogSoftmax(dim=-1)(self.calc_scores(x))

Softmax = xnmt.resolve_backend(SoftmaxDynet, SoftmaxTorch)
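# Illustrative sketch (not part of xnmt): with label smoothing as in calc_loss() above, the one-hot
# target is replaced by a distribution that puts (1 - eps) + eps / n_class on the gold class and
# eps / n_class on every other class, so the target still sums to 1 (unlike the commented-out
# variant above). The function name and numbers below are made up for demonstration.
def _label_smoothing_example(eps=0.1, n_class=4, gold=2):
  import torch
  one_hot = torch.zeros(n_class)
  one_hot[gold] = 1.0
  smoothed = one_hot * (1 - eps) + eps / n_class
  # e.g. eps=0.1, n_class=4 -> [0.025, 0.025, 0.925, 0.025], which sums to 1.0
  assert abs(smoothed.sum().item() - 1.0) < 1e-6
  return smoothed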