Example #1
            raise RevertingUnsavedModelException(
                "revert_to_best_model() is illegal because this model has never been saved."
            )
        for subcol_name, subcol in self.subcols.items():
            data_file = os.path.join(self._data_files[0], subcol_name)
            loaded = torch.load(data_file)
            subcol.load_state_dict(loaded.state_dict())

    def global_collection(self):
        return self.subcols

    def parameter_count(self) -> numbers.Integral:
        return sum(p.numel() for p in self.subcols.parameters())


ParamCollection = xnmt.resolve_backend(ParamCollectionDynet,
                                       ParamCollectionTorch)


class ResourceCollection(object):
    def __init__(self, subcol_name):
        self.resources = []
        self.subcol_name = subcol_name

    def add(self, orig_file, save_postfix):
        self.resources.append((orig_file, save_postfix))
        return ResourceFile(filename=f"{self.subcol_name}-{save_postfix}")

    def save(self, data_dir):
        for orig_file, save_postfix in self.resources:
            shutil.copyfile(
                orig_file,
Example #2
@xnmt.require_torch
class LayerNormTorch(Serializable, transforms.Transform):
    yaml_tag = "!LayerNorm"

    @serializable_init
    def __init__(self, hidden_dim: numbers.Integral) -> None:
        my_params = param_collections.ParamManager.my_params(self)
        self.layer_norm = nn.LayerNorm(normalized_shape=hidden_dim).to(
            xnmt.device)
        my_params.append(self.layer_norm)

    def transform(self, x: tt.Tensor) -> tt.Tensor:
        return self.layer_norm(x)


LayerNorm = xnmt.resolve_backend(LayerNormDynet, LayerNormTorch)
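
# Illustrative sketch (not xnmt code): LayerNormTorch.transform() above simply
# applies nn.LayerNorm over the last (hidden) dimension. The same call on a
# toy (batch, time, hidden) tensor with made-up sizes:
import torch
import torch.nn as nn

hidden_dim = 16
layer_norm = nn.LayerNorm(normalized_shape=hidden_dim)
x = torch.randn(2, 5, hidden_dim)  # (batch, time, hidden)
y = layer_norm(x)                  # normalized over the hidden dimension
print(y.shape)                     # torch.Size([2, 5, 16])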

BN_EPS = 0.1
BN_MOMENTUM = 0.1


@xnmt.require_dynet
class BatchNormDynet(Serializable, transforms.Transform,
                     transducers.SeqTransducer):
    """
  Implements batch normalization according to Ioffe and Szegedy, 2015.

  Supports application to matrices or higher-order tensors, in which case one dimension is interpreted as the time
  dimension and sequential batch norm is applied.

  A known issue is that the running mean / std is not reverted when reverting the parameters to the best model,
Example #3
        elif comb_method == "avg":
            return batched_expr.mean()
        else:
            raise ValueError(
                f"Unknown batch combination method '{comb_method}', expected 'sum' or 'avg'.'"
            )

    def get_factored_loss_val(self,
                              comb_method: str = "sum") -> 'FactoredLossVal':
        return FactoredLossVal({
            k: self._combine_batches(v, comb_method).cpu().data.numpy()
            for k, v in self.expr_factors.items()
        })


FactoredLossExpr = xnmt.resolve_backend(FactoredLossExprDynet,
                                        FactoredLossExprTorch)
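
# Illustrative sketch (not xnmt code): get_factored_loss_val() reduces each
# batched loss expression with either a sum or an average before moving it to
# numpy. The same reduction on a made-up per-sentence loss vector:
import torch

batched_loss = torch.tensor([0.5, 1.0, 1.5])  # one loss value per sentence
print(batched_loss.sum().item())              # comb_method == "sum" -> 3.0
print(batched_loss.mean().item())             # comb_method == "avg" -> 1.0
print(batched_loss.sum().cpu().data.numpy())  # numpy value, as in the code above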


class FactoredLossVal(object):
    """
  Loss consisting of (unbatched) float values, with one value per loss factor.

  Used to represent losses accumulated across several training steps.
  """
    def __init__(self, loss_dict=None) -> None:
        if loss_dict is None:
            loss_dict = {}
        self._loss_dict = loss_dict

    def __iadd__(self, other: 'FactoredLossVal') -> 'FactoredLossVal':
        """
Example #4
                                                       multiplicator=-100.0)
        normalized = F.softmax(scores, dim=-1)
        self.attention_vecs.append(normalized)
        return normalized

    def params_from_dynet(self, arrays, state_dict):
        assert len(arrays) == 4
        return {
            '0.weight': arrays[0],
            '0.bias': arrays[2],
            '1.weight': arrays[1],
            '2.weight': arrays[3]
        }


MlpAttender = xnmt.resolve_backend(MlpAttenderDynet, MlpAttenderTorch)
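
# Illustrative sketch (not xnmt code): params_from_dynet() above maps a list of
# DyNet weight arrays onto the state_dict keys of the PyTorch module. Loading
# such a mapping into a module whose keys happen to match ('0.weight',
# '0.bias', '1.weight', '2.weight'); layer sizes are made up:
import numpy as np
import torch
import torch.nn as nn

mlp = nn.Sequential(nn.Linear(4, 4),              # '0.weight', '0.bias'
                    nn.Linear(4, 4, bias=False),  # '1.weight'
                    nn.Linear(4, 1, bias=False))  # '2.weight'
arrays = [np.random.randn(4, 4), np.random.randn(4, 4),
          np.random.randn(4), np.random.randn(1, 4)]
state_dict = {'0.weight': arrays[0], '0.bias': arrays[2],
              '1.weight': arrays[1], '2.weight': arrays[3]}
mlp.load_state_dict({k: torch.as_tensor(v, dtype=torch.float32)
                     for k, v in state_dict.items()})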


@xnmt.require_dynet
class DotAttenderDynet(Attender, Serializable):
    """
  Implements the dot-product attention of https://arxiv.org/abs/1508.04025.
  Also (optionally) performs the scaling of https://arxiv.org/abs/1706.03762.

  Args:
    scale: whether to perform scaling
  """

    yaml_tag = '!DotAttender'

    @serializable_init
Example #5
    def get_final_states(self) -> List[FinalTransducerState]:
        return self._final_states

    def transduce(
        self, src: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        src_tensor = src.as_tensor()
        out_mask = src.mask
        if self.downsample_by > 1:
            assert src_tensor.dim()==3, \
              f"Downsampling only supported for tensors of order two (+ batch). Found dims {src_tensor.size()}"
            batch_size, seq_len, hidden_dim = src_tensor.size()
            if seq_len % self.downsample_by != 0:
                raise ValueError(
                    "For downsampling, sequence lengths must be multiples of the total reduce factor. "
                    "Configure batcher accordingly.")
            src_tensor = src_tensor.view(
                (batch_size, seq_len // self.downsample_by,
                 hidden_dim * self.downsample_by))
            if out_mask:
                out_mask = out_mask.lin_subsampled(
                    reduce_factor=self.downsample_by)
        output = self.transform.transform(src_tensor)
        output_seq = expression_seqs.ExpressionSequence(expr_tensor=output,
                                                        mask=out_mask)
        self._final_states = [FinalTransducerState(output_seq[-1])]
        return output_seq


TransformSeqTransducer = xnmt.resolve_backend(TransformSeqTransducerDynet,
                                              TransformSeqTransducerTorch)
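
# Illustrative sketch (not xnmt code): the downsampling branch above folds
# every `downsample_by` consecutive time steps into the hidden dimension with
# a single view(). The same reshape on a toy tensor with made-up sizes:
import torch

downsample_by = 2
src_tensor = torch.randn(3, 6, 8)    # (batch, seq_len, hidden)
batch_size, seq_len, hidden_dim = src_tensor.size()
assert seq_len % downsample_by == 0  # otherwise the code above raises ValueError
downsampled = src_tensor.view(batch_size,
                              seq_len // downsample_by,
                              hidden_dim * downsample_by)
print(downsampled.shape)             # torch.Size([3, 3, 16])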
Example #6
            if multiplicator is not None:
                mask_expr = torch.as_tensor(
                    np.expand_dims(self.np_arr, axis=1) * multiplicator,
                    dtype=tensor_expr.dtype,
                    device=xnmt.device)
            else:
                mask_expr = torch.as_tensor(np.expand_dims(self.np_arr,
                                                           axis=1),
                                            dtype=tensor_expr.dtype,
                                            device=xnmt.device)
            ret = tensor_expr + mask_expr
            assert ret.size() == tensor_expr.size()
            return ret


Mask = xnmt.resolve_backend(MaskDynet, MaskTorch)
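
# Illustrative sketch (not xnmt code): add_to_tensor_expr() above broadcasts a
# (batch, time) 0/1 numpy mask against a tensor after inserting a singleton
# axis, optionally scaled by a large negative multiplicator so masked
# positions are pushed towards -inf before a softmax. Shapes are made up:
import numpy as np
import torch

np_arr = np.array([[0., 0., 1.],                 # 1 marks padded positions
                   [0., 1., 1.]])                # (batch, time)
scores = torch.zeros(2, 4, 3)                    # (batch, hidden, time)
mask_expr = torch.as_tensor(np.expand_dims(np_arr, axis=1) * -100.0,
                            dtype=scores.dtype)  # (batch, 1, time)
masked = scores + mask_expr                      # broadcasts over the hidden axis
assert masked.size() == scores.size()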


class Batcher(object):
    """
  A template class to convert a list of sentences to several batches of sentences.

  Args:
    batch_size: batch size
    granularity: 'sent' or 'word'
    pad_src_to_multiple: pad source sentences so that their length is a multiple of this integer.
    sort_within_by_trg_len: whether to sort by reverse trg len inside a batch
  """
    def __init__(self,
                 batch_size: numbers.Integral,
                 granularity: str = 'sent',
Example #7
                                   axis=0)
        arrays[2] = np.concatenate([
            arrays[2][:h_dim], arrays[2][h_dim:h_dim * 2] + 1,
            arrays[2][h_dim * 3:], arrays[2][h_dim * 2:h_dim * 3]
        ],
                                   axis=0)

        return {
            '0.0.weight_ih': arrays[0],
            '0.0.weight_hh': arrays[1],
            '0.0.bias_ih': arrays[2],
            '0.0.bias_hh': np.zeros_like(arrays[2])
        }


UniLSTMSeqTransducer = xnmt.resolve_backend(UniLSTMSeqTransducerDynet,
                                            UniLSTMSeqTransducerTorch)
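
# Illustrative sketch (not xnmt code): the conversion above slices each
# parameter array into h_dim-sized gate blocks, swaps the last two blocks and
# shifts the second (forget-gate) bias block by 1 so the DyNet layout matches
# what PyTorch expects. The same block swap on a toy bias vector; gate
# ordering and sizes are made up for illustration:
import numpy as np

h_dim = 2
bias = np.arange(4 * h_dim, dtype=np.float32)  # four consecutive gate blocks
reordered = np.concatenate([bias[:h_dim],                # first block unchanged
                            bias[h_dim:h_dim * 2] + 1,   # second block shifted by 1
                            bias[h_dim * 3:],            # last two blocks swapped
                            bias[h_dim * 2:h_dim * 3]],
                           axis=0)
print(reordered)  # [0. 1. 3. 4. 6. 7. 4. 5.]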


class BiLSTMSeqTransducer(transducers.SeqTransducer, Serializable):
    """
  This implements a bidirectional LSTM and requires about 8.5% less memory per timestep
  than DyNet's CompactVanillaLSTMBuilder due to avoiding concat operations.
  It uses 2 :class:`xnmt.lstm.UniLSTMSeqTransducer` objects in each layer.

  Args:
    layers: number of layers
    input_dim: input dimension
    hidden_dim: hidden dimension
    var_dropout: dropout probability (variational recurrent + vertical dropout)
    param_init: how to initialize weight matrices. In case of an InitializerSequence, the order is fwd_l0, bwd_l0, fwd_l1, bwd_l1, ..
    bias_init: how to initialize bias vectors. In case of an InitializerSequence, the order is fwd_l0, bwd_l0, fwd_l1, bwd_l1, ..
Example #8
    def get_final_states(self) -> List[transducers.FinalTransducerState]:
        return self._final_states

    def transduce(
        self, src: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        sent_len = src.sent_len()
        batch_size = tt.batch_size(src[0])
        embeddings = self.embeddings(
            torch.tensor([list(range(sent_len))] * batch_size).to(xnmt.device))
        # embeddings = dy.strided_select(dy.parameter(self.embedder), [1,1], [0,0], [self.input_dim, sent_len])
        if self.op == 'sum':
            output = embeddings + src.as_tensor()
        elif self.op == 'concat':
            output = tt.concatenate([embeddings, src.as_tensor()])
        else:
            raise ValueError(
                f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")'
            )
        if self.train and self.dropout > 0.0:
            output = tt.dropout(output, self.dropout)
        output_seq = expression_seqs.ExpressionSequence(expr_tensor=output,
                                                        mask=src.mask)
        self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
        return output_seq


PositionalSeqTransducer = xnmt.resolve_backend(PositionalSeqTransducerDynet,
                                               PositionalSeqTransducerTorch)
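
# Illustrative sketch (not xnmt code): the transducer above looks up one
# learned embedding per position (0 .. sent_len-1) for every sentence in the
# batch and adds it to the input. A standalone version with made-up sizes:
import torch
import torch.nn as nn

max_len, hidden_dim = 100, 8
batch_size, sent_len = 2, 5
embeddings_table = nn.Embedding(max_len, hidden_dim)
positions = torch.tensor([list(range(sent_len))] * batch_size)  # (batch, sent_len)
pos_emb = embeddings_table(positions)                           # (batch, sent_len, hidden)
src = torch.randn(batch_size, sent_len, hidden_dim)
output = pos_emb + src                                          # op == 'sum'
print(output.shape)                                             # torch.Size([2, 5, 8])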
Example #9
      self.expr_transposed_tensor = self.as_tensor().transpose(1,2)
    return self.expr_transposed_tensor

  # should get rid of this:
  # def dim(self) -> tuple:
  #   """
  #   Return dimension of the expression sequence
  #
  #   Returns:
  #     result of self.as_tensor().dim(), without explicitly constructing that tensor
  #   """
  #   if self.has_tensor(): return self.as_tensor().dim()
  #   else:
  #     return tuple(list(self[0].dim()[0]) + [len(self)]), self[0].dim()[1]

ExpressionSequence = xnmt.resolve_backend(ExpressionSequenceDynet, ExpressionSequenceTorch)

@xnmt.require_dynet
class LazyNumpyExpressionSequenceDynet(ExpressionSequence):
  """
  This is initialized via numpy arrays, and DyNet expressions are only created
  once a consumer requests a representation as a list or tensor.
  """
  def __init__(self, lazy_data: np.ndarray, mask: Optional['batchers.Mask'] = None) -> None:
    """
    Args:
      lazy_data: numpy array, or Batcher.Batch of numpy arrays
    """
    self.lazy_data = lazy_data
    self.expr_list, self.expr_tensor, self.expr_transposed_tensor = None, None, None
    self.mask = mask
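
# Illustrative sketch (not xnmt code): both classes above cache their tensor
# views (expr_tensor, expr_transposed_tensor) and build them only on first
# access. The same lazy-caching pattern, independent of xnmt's classes:
import numpy as np
import torch

class LazySeq:
  def __init__(self, lazy_data: np.ndarray) -> None:
    self.lazy_data = lazy_data
    self.expr_tensor = None          # built on first as_tensor() call

  def as_tensor(self) -> torch.Tensor:
    if self.expr_tensor is None:     # convert only once, then reuse
      self.expr_tensor = torch.as_tensor(self.lazy_data)
    return self.expr_tensor

seq = LazySeq(np.zeros((2, 3)))
assert seq.expr_tensor is None
assert seq.as_tensor() is seq.as_tensor()  # second call returns the cached tensor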
Example #10
        else:
            embeddings = []
            seq_len = x.sent_len()
            for single_sent in x:
                assert single_sent.sent_len() == seq_len
            for word_i in range(seq_len):
                batch = batchers.mark_as_batch(
                    [single_sent[word_i] for single_sent in x])
                embeddings.append(self.embed(batch))

        return expression_seqs.ExpressionSequence(
            expr_list=embeddings,
            mask=x.mask if batchers.is_batched(x) else None)


SentEmbedder = xnmt.resolve_backend(SentEmbedderDy, SentEmbedderTorch)
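
# Illustrative sketch (not xnmt code): the batched branch above embeds one
# time step at a time, gathering the word at that position from every
# sentence and embedding the resulting batch. The same loop with plain token
# ids and nn.Embedding; vocabulary size and ids are made up:
import torch
import torch.nn as nn

vocab_size, emb_dim = 10, 4
embed = nn.Embedding(vocab_size, emb_dim)
sentences = [[1, 2, 3], [4, 5, 6]]  # batch of equal-length sentences
seq_len = len(sentences[0])
embeddings = []
for word_i in range(seq_len):
    batch = torch.tensor([sent[word_i] for sent in sentences])  # ids at this position
    embeddings.append(embed(batch))                             # (batch, emb_dim)
print(len(embeddings), embeddings[0].shape)  # 3 torch.Size([2, 4])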


@xnmt.require_dynet
class DenseWordEmbedderDynet(SentEmbedder, transforms.Linear, Serializable):
    """
  Word embeddings via full matrix.

  Args:
    emb_dim: embedding dimension
    weight_noise: apply Gaussian noise with given standard deviation to embeddings
    word_dropout: drop out word types with a certain probability, sampling word types on a per-sentence level, see https://arxiv.org/abs/1512.05287
    fix_norm: fix the norm of word vectors to be radius r, see https://arxiv.org/abs/1710.01329
    param_init: how to initialize weight matrices
    bias_init: how to initialize bias vectors
    vocab_size: vocab size or None
Example #11
               momentum: numbers.Real = 0.0,
               weight_decay: numbers.Real = 0.0,
               dampening: numbers.Real = 0.0,
               nesterov: bool = False,
               skip_noisy: bool = False,
               rescale_grads: numbers.Real = 5.0) -> None:
    super().__init__(optimizer=torch.optim.SGD(params=ParamManager.global_collection().parameters(),
                                               lr=e0,
                                               momentum=momentum,
                                               weight_decay=weight_decay,
                                               dampening=dampening,
                                               nesterov=nesterov),
                     skip_noisy=skip_noisy,
                     rescale_grads=rescale_grads)

SimpleSGDTrainer = xnmt.resolve_backend(SimpleSGDTrainerDynet, SimpleSGDTrainerTorch)
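
# Illustrative sketch (not xnmt code): SimpleSGDTrainerTorch above is a thin
# wrapper around torch.optim.SGD over the global parameter collection, and
# rescale_grads corresponds to gradient-norm clipping. A standalone version
# on a made-up module:
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(params=model.parameters(),
                            lr=0.1,          # e0
                            momentum=0.0,
                            weight_decay=0.0,
                            dampening=0.0,
                            nesterov=False)
loss = model(torch.randn(3, 4)).sum()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)  # rescale_grads
optimizer.step()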

@xnmt.require_dynet
class MomentumSGDTrainer(XnmtOptimizerDynet, Serializable):
  """
  Stochastic gradient descent with momentum

  This is a modified version of the SGD algorithm with momentum to stabilize the gradient trajectory.

  Args:
    e0: Initial learning rate
    mom: Momentum
    skip_noisy: keep track of a moving average and a moving standard deviation of the log of the gradient norm
                          values, and abort a step if the norm of the gradient exceeds four standard deviations of the
                          moving average. Reference: https://arxiv.org/pdf/1804.09849.pdf
    rescale_grads: rescale gradients if the observed norm is larger than this given norm
Example #12
        self.bias = bias
        self.input_dim = input_dim
        self.output_dim = output_dim

        my_params = param_collections.ParamManager.my_params(self)
        self.linear = nn.Linear(in_features=input_dim,
                                out_features=output_dim,
                                bias=bias).to(xnmt.device)
        my_params.append(self.linear)
        my_params.init_params(param_init, bias_init)

    def transform(self, input_expr: tt.Tensor) -> tt.Tensor:
        return self.linear(input_expr)


Linear = xnmt.resolve_backend(LinearDynet, LinearTorch)


@xnmt.require_dynet
class NonLinearDynet(Transform, Serializable):
    """
  Linear projection with optional bias and non-linearity.

  Args:
    input_dim: input dimension
    output_dim: hidden dimension
    bias: whether to add a bias
    activation: One of ``tanh``, ``relu``, ``sigmoid``, ``elu``, ``selu``, ``asinh`` or ``identity``.
    param_init: how to initialize weight matrices
    bias_init: how to initialize bias vectors
  """
Example #13
        """
    Initialize given weights.

    Args:
      weights: parameter tensor to be initialized
    """
        raise NotImplementedError("subclasses must implement initializer()")

    def __getitem__(self, key) -> None:
        """
    Initialize using a position-specific initializer. The default is to use the same initializer across positions, unless an InitializerSequence is used.
    """
        return self


ParamInitializer = xnmt.resolve_backend(ParamInitializerDynet,
                                        ParamInitializerTorch)
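
# Illustrative sketch (not xnmt code): __getitem__() above returns the
# initializer itself, so by default the same initializer is reused for every
# parameter tensor; a sequence can instead return a different one per
# position. The pattern with plain torch.nn.init functions; the class names
# here are made up:
import torch.nn as nn

class XavierInit:
    def initializer(self, weights):
        nn.init.xavier_uniform_(weights)

    def __getitem__(self, key):
        return self                  # default: same initializer at every position

class InitSequence:
    def __init__(self, sequence):
        self.sequence = sequence

    def __getitem__(self, key):
        return self.sequence[key]    # position-specific initializer

layer = nn.Linear(4, 4)
InitSequence([XavierInit(), XavierInit()])[0].initializer(layer.weight)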


class InitializerSequence(Serializable, ParamInitializer):
    """
  Sequence of position-specific initializers.

  This can be used when a component has several parameter tensors that should each be initialized using a different
  initializer. Examples would be components with multiple layers, and/or several sets of weight matrices that serve
  different purposes.

  The most common use case may be a NumpyInitializer, where one wants to manually specify
  all network weights using respective numpy arrays.

  Args:
    sequence: sequence of initializers
Example #14
        # Do scaled dot product [batch*num_heads, length, length], rows are keys, columns are queries
        attn_score = torch.matmul(k.transpose(1, 2), q) / sqrt(self.head_dim)
        if expr_seq.mask is not None:
            mask = torch.Tensor(
                np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0) *
                -1e10).to(xnmt.device)
            attn_score = attn_score + mask.unsqueeze(2)
        attn_prob = torch.nn.Softmax(dim=1)(attn_score)
        # attn_prob = dy.softmax(attn_score, d=1)
        if self.train and self.dropout > 0.0:
            attn_prob = tt.dropout(attn_prob, self.dropout)
        # Reduce using attention and resize to match [(length, model_size) x batch]
        o = torch.matmul(v, attn_prob).view(x_batch, self.input_dim,
                                            x_len).transpose(1, 2)
        # Final transformation
        o = self.lin_o(o)
        # o = bo + o * Wo

        expr_seq = expression_seqs.ExpressionSequence(expr_tensor=o,
                                                      mask=expr_seq.mask)

        self._final_states = [
            transducers.FinalTransducerState(expr_seq[-1], None)
        ]

        return expr_seq


MultiHeadAttentionSeqTransducer = xnmt.resolve_backend(
    MultiHeadAttentionSeqTransducerDynet, MultiHeadAttentionSeqTransducerTorch)
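
# Illustrative sketch (not xnmt code): the core of the transducer above is
# scaled dot-product attention with keys, queries and values laid out as
# (batch*num_heads, head_dim, length), a softmax over the key axis (dim=1),
# and a weighted sum of the values. Sizes are made up:
import torch
from math import sqrt

bh, head_dim, length = 4, 8, 5  # batch*num_heads, head_dim, length
k = torch.randn(bh, head_dim, length)
q = torch.randn(bh, head_dim, length)
v = torch.randn(bh, head_dim, length)
attn_score = torch.matmul(k.transpose(1, 2), q) / sqrt(head_dim)  # (bh, length, length)
attn_prob = torch.nn.Softmax(dim=1)(attn_score)  # normalize over key positions
o = torch.matmul(v, attn_prob)                   # (bh, head_dim, length)
print(o.shape)                                   # torch.Size([4, 8, 5])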
Example #15
  def sample(self, x: tt.Tensor, n: numbers.Integral, temperature: numbers.Real=1.0):
    raise NotImplementedError()

  def calc_loss(self, x: tt.Tensor, y: Union[numbers.Integral, List[numbers.Integral]]) -> tt.Tensor:
    if self.label_smoothing:
      # following this implementation:
      # https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/train.py
      pred = self.calc_scores(x)
      eps = self.label_smoothing
      n_class = self.output_dim
      gold = torch.tensor(y).to(xnmt.device)
      one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1,1), 1)
      # one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / n_class  # original version does not add up to 1
      one_hot = one_hot * (1 - eps) + eps / n_class
      log_prb = F.log_softmax(pred, dim=1)
      return -(torch.matmul(one_hot.unsqueeze(1), log_prb.unsqueeze(2)).squeeze(2)) # neg dot product
    else:
      # scores = torch.nn.LogSoftmax(dim=-1)(self.calc_scores(x))
      # return F.nll_loss(input=scores, target=torch.tensor(y).to(xnmt.device), reduction='none')
      if np.isscalar(y): y = [y]
      return F.cross_entropy(self.calc_scores(x), target=torch.tensor(y,dtype=torch.long).to(xnmt.device), reduction='none')

  def calc_probs(self, x: tt.Tensor) -> tt.Tensor:
    return torch.nn.Softmax(dim=-1)(self.calc_scores(x))

  def calc_log_probs(self, x: tt.Tensor) -> tt.Tensor:
    return torch.nn.LogSoftmax(dim=-1)(self.calc_scores(x))

Softmax = xnmt.resolve_backend(SoftmaxDynet, SoftmaxTorch)
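
# Illustrative sketch (not xnmt code): the label-smoothing branch above spreads
# eps of the probability mass uniformly over all classes and takes the negative
# dot product with the log-softmax of the scores. Sizes and targets are made up:
import torch
import torch.nn.functional as F

eps, n_class = 0.1, 5
pred = torch.randn(3, n_class)  # unnormalized scores for a batch of 3
gold = torch.tensor([0, 2, 4])
one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)
one_hot = one_hot * (1 - eps) + eps / n_class  # smoothed targets, rows sum to 1
log_prb = F.log_softmax(pred, dim=1)
loss = -(torch.matmul(one_hot.unsqueeze(1), log_prb.unsqueeze(2)).squeeze(2))
print(loss.shape)  # torch.Size([3, 1])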