    def __init__(self, name, **kwargs):
        super(CharConvEmbeddings, self).__init__()
        self.vsz = kwargs.get('vsz')
        self.dsz = kwargs.get('dsz')
        self.finetune = kwargs.get('finetune', True)
        weights = kwargs.get('weights')
        if weights is None:
            self.embeddings = nn.Embedding(self.vsz, self.dsz, padding_idx=0)
        else:
            self.embeddings = pytorch_embedding(weights)
        char_filtsz = kwargs.get('cfiltsz', [3])
        if is_sequence(char_filtsz[0]):
            char_hsz = [pair[1] for pair in char_filtsz]
            char_filtsz = [pair[0] for pair in char_filtsz]
        else:
            char_hsz = kwargs.get('wsz', 30)

        activation_type = kwargs.get('activation', 'tanh')
        pdrop = kwargs.get('pdrop', 0.5)
        self.char_comp = ParallelConv(self.dsz, char_hsz, char_filtsz, activation_type, pdrop)
        wchsz = self.char_comp.outsz
        self.linear = pytorch_linear(wchsz, wchsz)
        gating = kwargs.get('gating', 'skip')
        GatingConnection = SkipConnection if gating == 'skip' else Highway
        num_gates = kwargs.get('num_gates', 1)

        gates = [('gate-{}'.format(i), GatingConnection(wchsz)) for i in range(num_gates)]
        projsz = kwargs.get('projsz')
        if projsz is not None:
            gates.append(('proj', pytorch_linear(self.char_comp.outsz, projsz)))
            self.char_comp.outsz = projsz
        self.gating_seq = nn.Sequential(OrderedDict(gates))
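The `gating` kwarg above chooses between a residual skip connection and a highway gate over the pooled character features. The library's `SkipConnection` and `Highway` classes are not shown in this listing, so the following is a minimal, standalone sketch of the two gating styles (the class names and internals here are illustrative assumptions, not the library's implementation):

import torch
import torch.nn as nn

class SkipGate(nn.Module):
    """Residual (skip) connection: y = x + f(x)."""
    def __init__(self, sz):
        super().__init__()
        self.proj = nn.Linear(sz, sz)

    def forward(self, x):
        return x + torch.relu(self.proj(x))

class HighwayGate(nn.Module):
    """Highway connection: y = t * f(x) + (1 - t) * x, with a learned transform gate t."""
    def __init__(self, sz):
        super().__init__()
        self.proj = nn.Linear(sz, sz)
        self.transform = nn.Linear(sz, sz)

    def forward(self, x):
        t = torch.sigmoid(self.transform(x))
        return t * torch.relu(self.proj(x)) + (1 - t) * x

x = torch.randn(2, 100)                                # (batch, wchsz)
gates = nn.Sequential(SkipGate(100), HighwayGate(100))
print(gates(x).shape)                                  # torch.Size([2, 100])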
Example #2
    def __init__(self,
                 tgt_embeddings,
                 dropout=0.5,
                 layers=1,
                 hsz=None,
                 num_heads=4,
                 scale=True,
                 **kwargs):
        super(TransformerDecoderWrapper, self).__init__()
        self.tgt_embeddings = tgt_embeddings
        dsz = self.tgt_embeddings.get_dsz()
        if hsz is None:
            hsz = dsz

        self.transformer_decoder = TransformerDecoderStack(num_heads,
                                                           d_model=hsz,
                                                           pdrop=dropout,
                                                           scale=scale,
                                                           layers=layers)

        self.proj_to_dsz = self._identity
        self.proj_to_hsz = self._identity
        if hsz != dsz:
            self.proj_to_hsz = pytorch_linear(dsz, hsz)
            self.proj_to_dsz = pytorch_linear(hsz, dsz)
            del self.proj_to_dsz.weight
            self.proj_to_dsz.weight = torch.nn.Parameter(
                self.proj_to_hsz.weight.transpose(0, 1), requires_grad=True)

        self.preds = pytorch_linear(dsz, self.tgt_embeddings.get_vsz())

        do_weight_tying = bool(kwargs.get('tie_weights', False))
        if do_weight_tying:
            # nn.Linear(dsz, vsz).weight and the embedding table are both (vsz, dsz), so the Parameter can be shared directly
            self.preds.weight = self.tgt_embeddings.weight
Example #3
    def __init__(self, num_heads, d_model, pdrop, scale=True, activation_type='relu', d_ff=None):
        super(TransformerEncoder, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff if d_ff is not None else 4 * d_model
        self.self_attn = MultiHeadedAttention(num_heads, d_model, pdrop, scale=scale)
        self.ffn = nn.Sequential(pytorch_linear(self.d_model, self.d_ff),
                                 pytorch_activation(activation_type),
                                 pytorch_linear(self.d_ff, self.d_model))
        self.ln1 = nn.LayerNorm(self.d_model, eps=1e-12)
        self.ln2 = nn.LayerNorm(self.d_model, eps=1e-12)
        self.dropout = nn.Dropout(pdrop)
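The forward pass of this encoder block is not included in the snippet. Below is a standalone sketch of the usual post-layer-norm wiring these submodules suggest (self-attention and feed-forward sublayers, each followed by dropout, a residual add, and a LayerNorm). It uses torch.nn.MultiheadAttention in place of the library's MultiHeadedAttention, so the original call signature and masking behavior are not reproduced:

import torch
import torch.nn as nn

class PostLNEncoderLayer(nn.Module):
    """Standalone sketch: x -> LN(x + dropout(attn(x))) -> LN(x + dropout(ffn(x)))."""
    def __init__(self, num_heads, d_model, pdrop, d_ff=None):
        super().__init__()
        d_ff = d_ff if d_ff is not None else 4 * d_model
        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=pdrop)
        self.ffn = nn.Sequential(nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model))
        self.ln1 = nn.LayerNorm(d_model, eps=1e-12)
        self.ln2 = nn.LayerNorm(d_model, eps=1e-12)
        self.dropout = nn.Dropout(pdrop)

    def forward(self, x):                        # x: (T, B, d_model)
        a, _ = self.self_attn(x, x, x)           # self-attention sublayer
        x = self.ln1(x + self.dropout(a))
        return self.ln2(x + self.dropout(self.ffn(x)))

x = torch.randn(5, 2, 512)                       # (time, batch, d_model)
print(PostLNEncoderLayer(8, 512, 0.1)(x).shape)  # torch.Size([5, 2, 512])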
Example #4
    def __init__(self,
                 tgt_embeddings,
                 dropout=0.5,
                 layers=1,
                 hsz=None,
                 num_heads=4,
                 scale=True,
                 **kwargs):
        super().__init__()
        self.tgt_embeddings = tgt_embeddings
        dsz = self.tgt_embeddings.get_dsz()
        if hsz is None:
            hsz = dsz

        d_ff = int(kwargs.get('d_ff', 4 * hsz))
        rpr_k = kwargs.get('rpr_k')
        d_k = kwargs.get('d_k')
        activation = kwargs.get('activation', 'relu')
        layer_drop = float(kwargs.get('layer_drop', 0.0))
        scale = bool(kwargs.get('scale', True))

        self.transformer_decoder = TransformerDecoderStack(
            num_heads,
            d_model=hsz,
            d_ff=d_ff,
            pdrop=dropout,
            scale=scale,
            layers=layers,
            rpr_k=rpr_k,
            d_k=d_k,
            activation_type=activation,
            layer_drop=layer_drop)

        self.proj_to_dsz = self._identity
        self.proj_to_hsz = self._identity
        if hsz != dsz:
            self.proj_to_hsz = pytorch_linear(dsz, hsz)
            self.proj_to_dsz = pytorch_linear(hsz, dsz)
            del self.proj_to_dsz.weight
            self.proj_to_dsz.weight = torch.nn.Parameter(
                self.proj_to_hsz.weight.transpose(0, 1), requires_grad=True)

        do_weight_tying = bool(kwargs.get('tie_weights', True))
        if do_weight_tying:
            if hsz != self.tgt_embeddings.get_dsz():
                raise ValueError(
                    "weight tying requires hsz == embedding dsz, got {} hsz and {} dsz"
                    .format(hsz, self.tgt_embeddings.get_dsz()))
            self.preds = WeightTieDense(self.tgt_embeddings)
        else:
            self.preds = pytorch_linear(dsz, self.tgt_embeddings.get_vsz())
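WeightTieDense itself is not shown in this listing. The standalone sketch below (the class name and bias handling are illustrative assumptions) captures the idea: reuse the (vsz, dsz) embedding table as the output projection matrix, which is also why the check above requires hsz to equal the embedding dsz:

import torch
import torch.nn as nn
import torch.nn.functional as F

class TiedProjection(nn.Module):
    """Weight-tied output layer: logits = h @ E^T + b, sharing the (vsz, dsz) embedding table E."""
    def __init__(self, embedding: nn.Embedding):
        super().__init__()
        self.embedding = embedding
        self.bias = nn.Parameter(torch.zeros(embedding.num_embeddings))

    def forward(self, h):                      # h: (..., dsz)
        return F.linear(h, self.embedding.weight, self.bias)

emb = nn.Embedding(100, 32)                    # vsz=100, dsz=32
logits = TiedProjection(emb)(torch.randn(2, 7, 32))
print(logits.shape)                            # torch.Size([2, 7, 100])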
Example #5
    def __init__(self, d_model, pdrop, activation_type='relu', d_ff=None):
        """Constructor, takes in model size (which is the external currency of each block) and the feed-forward size

        :param d_model: The model size.  This is the size passed through each block
        :param d_ff: The feed-forward internal size, which is typically 4x larger, used internally
        :param pdrop: The probability of dropping output
        :param activation_type: The activation function to use (defaults to 'relu')
        """
        super(FFN, self).__init__()
        if d_ff is None:
            d_ff = 4 * d_model
        self.expansion = pytorch_linear(d_model, d_ff)
        self.squeeze = pytorch_linear(d_ff, d_model)
        self.dropout = nn.Dropout(pdrop)
        self.act = pytorch_activation(activation_type)
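The FFN forward is not shown in the snippet. A plausible standalone reading of how expansion, act, dropout, and squeeze compose is sketched below; the exact placement of dropout in the library's implementation is an assumption, and plain nn.Linear/nn.ReLU stand in for pytorch_linear/pytorch_activation:

import torch
import torch.nn as nn

class FFNSketch(nn.Module):
    """Position-wise feed-forward: expand d_model -> d_ff, activate, drop, squeeze back to d_model."""
    def __init__(self, d_model, pdrop, d_ff=None):
        super().__init__()
        d_ff = 4 * d_model if d_ff is None else d_ff
        self.expansion = nn.Linear(d_model, d_ff)
        self.squeeze = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(pdrop)
        self.act = nn.ReLU()

    def forward(self, x):
        return self.squeeze(self.dropout(self.act(self.expansion(x))))

print(FFNSketch(512, 0.1)(torch.randn(2, 5, 512)).shape)  # torch.Size([2, 5, 512])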
Example #6
    def __init__(self,
                 dsz,
                 hsz=None,
                 num_heads=4,
                 layers=1,
                 dropout=0.5,
                 **kwargs):
        super().__init__()
        if hsz is None:
            hsz = dsz
        self.proj = pytorch_linear(dsz, hsz) if hsz != dsz else self._identity
        d_ff = int(kwargs.get('d_ff', 4 * hsz))
        rpr_k = kwargs.get('rpr_k')
        d_k = kwargs.get('d_k')
        layer_drop = float(kwargs.get('layer_drop', 0.0))
        activation = kwargs.get('activation', 'relu')
        scale = bool(kwargs.get('scale', True))
        self.transformer = TransformerEncoderStack(num_heads,
                                                   d_model=hsz,
                                                   d_ff=d_ff,
                                                   pdrop=dropout,
                                                   scale=scale,
                                                   layers=layers,
                                                   rpr_k=rpr_k,
                                                   d_k=d_k,
                                                   activation=activation,
                                                   layer_drop=layer_drop)
Example #7
    def __init__(self, tgt_embeddings, **kwargs):
        """Construct an RNN decoder.  It provides the input size, the rest is up to the impl.

        The default implementation provides an RNN cell, followed by a linear projection, out to a softmax

        :param tgt_embeddings: The target embeddings
        :param kwargs:
        :return: void
        """
        super().__init__()
        self.hsz = kwargs['hsz']
        self.arc_policy = create_seq2seq_arc_policy(**kwargs)
        self.tgt_embeddings = tgt_embeddings
        rnntype = kwargs.get('rnntype', 'lstm')
        layers = kwargs.get('layers', 1)
        feed_input = kwargs.get('feed_input', True)
        dsz = tgt_embeddings.get_dsz()
        if feed_input:
            self.input_i = self._feed_input
            dsz += self.hsz
        else:
            self.input_i = self._basic_input
        pdrop = kwargs.get('dropout', 0.5)
        self.decoder_rnn = rnn_cell(dsz, self.hsz, rnntype, layers, pdrop)
        self.dropout = torch.nn.Dropout(pdrop)
        self.init_attn(**kwargs)

        do_weight_tying = bool(kwargs.get('tie_weights', True))

        if do_weight_tying:
            if self.hsz != self.tgt_embeddings.get_dsz():
                raise ValueError("weight tying requires hsz == embedding dsz, got {} hsz and {} dsz".format(self.hsz, self.tgt_embeddings.get_dsz()))
            self.preds = WeightTieDense(self.tgt_embeddings)
        else:
            self.preds = pytorch_linear(self.hsz, self.tgt_embeddings.get_vsz())
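When feed_input is enabled above, the decoder RNN input grows from dsz to dsz + self.hsz because the previous step's attentional output is concatenated onto the current target embedding (input feeding). The _feed_input helper is not shown; this minimal sketch with illustrative shapes is only meant to show where that extra hsz comes from:

import torch

def feed_input(embed_t, attn_output):
    """Concatenate the previous attention output onto the current target embedding."""
    return torch.cat([embed_t, attn_output], dim=-1)

embed_t = torch.randn(2, 300)                 # (batch, dsz)
attn_t = torch.randn(2, 512)                  # (batch, hsz)
print(feed_input(embed_t, attn_t).shape)      # torch.Size([2, 812])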
Example #8
    def __init__(self, h, d_model, dropout=0.1, scale=False):
        """Constructor for multi-headed attention

        :param h: The number of heads
        :param d_model: The model hidden size
        :param dropout (``float``): The amount of dropout to use
        :param scale: If True, use scaled dot-product attention; otherwise unscaled dot-product attention
        """
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.w_Q = pytorch_linear(d_model, d_model)
        self.w_K = pytorch_linear(d_model, d_model)
        self.w_V = pytorch_linear(d_model, d_model)
        self.w_O = pytorch_linear(d_model, d_model)
        self.attn_fn = scaled_dot_product_attention if scale else dot_product_attention
        self.attn = None
        self.dropout = nn.Dropout(dropout)
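The constructor only selects between scaled_dot_product_attention and dot_product_attention; neither function appears in this listing. Below is a standard standalone sketch of scaled dot-product attention, softmax(QK^T / sqrt(d_k)) V; the library's exact signature and masking convention may differ:

import math
import torch

def scaled_dot_product_attention_sketch(query, key, value, mask=None, dropout=None):
    """query/key: (..., T, d_k), value: (..., T, d_v); returns (context, attention weights)."""
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = torch.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    return torch.matmul(weights, value), weights

q = k = v = torch.randn(2, 4, 10, 64)          # (batch, heads, time, d_k)
context, attn = scaled_dot_product_attention_sketch(q, k, v)
print(context.shape, attn.shape)               # torch.Size([2, 4, 10, 64]) torch.Size([2, 4, 10, 10])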
Example #9
    def __init__(self, tgt_embeddings, dropout=0.5, layers=1, hsz=None, num_heads=4,
                 activation='gelu',
                 rpr_k=None,
                 layer_norm_eps=1e-6,
                 layer_drop=0.0, scale=True, rpr_value_on=True, ra_type=None,
                 d_k=None,
                 d_ff=None,
                 transformer_type=None,
                 **kwargs):
        super().__init__()
        self.tgt_embeddings = tgt_embeddings
        dsz = self.tgt_embeddings.get_dsz()
        if hsz is None:
            hsz = dsz

        if d_ff is None:
            d_ff = 4 * hsz

        self.transformer_decoder = TransformerDecoderStack(num_heads, d_model=hsz, d_ff=d_ff,
                                                           pdrop=dropout, scale=scale, layers=layers,
                                                           rpr_k=rpr_k, d_k=d_k, activation_type=activation,
                                                           layer_drop=layer_drop, layer_norm_eps=layer_norm_eps,
                                                           rpr_value_on=rpr_value_on, ra_type=ra_type, transformer_type=transformer_type)

        self.proj_to_hsz = self._identity
        self.proj_to_dsz = self._identity
        if hsz != dsz:
            self.proj_to_hsz = pytorch_linear(dsz, hsz)
            self.proj_to_dsz = pytorch_linear(hsz, dsz)
            del self.proj_to_dsz.weight
            self.proj_to_dsz.weight = torch.nn.Parameter(self.proj_to_hsz.weight.transpose(0, 1), requires_grad=True)

        do_weight_tying = bool(kwargs.get('tie_weights', True))
        if do_weight_tying:
            if hsz != self.tgt_embeddings.get_dsz():
                raise ValueError("weight tying requires hsz == embedding dsz, got {} hsz and {} dsz".format(hsz, self.tgt_embeddings.get_dsz()))
            self.preds = WeightTieDense(self.tgt_embeddings)
        else:
            self.preds = pytorch_linear(dsz, self.tgt_embeddings.get_vsz())
Example #10
    def __init__(self, tgt_embeddings, dropout=0.5, layers=1, hsz=None, num_heads=4, scale=True, **kwargs):
        super(TransformerDecoderWrapper, self).__init__()
        self.tgt_embeddings = tgt_embeddings
        dsz = self.tgt_embeddings.get_dsz()
        if hsz is None:
            hsz = dsz

        self.transformer_decoder = TransformerDecoderStack(num_heads, d_model=hsz, pdrop=dropout, scale=scale, layers=layers)

        self.proj_to_dsz = self._identity
        self.proj_to_hsz = self._identity
        if hsz != dsz:
            self.proj_to_hsz = pytorch_linear(dsz, hsz)
            self.proj_to_dsz = pytorch_linear(hsz, dsz)
            del self.proj_to_dsz.weight
            self.proj_to_dsz.weight = torch.nn.Parameter(self.proj_to_hsz.weight.transpose(0, 1), requires_grad=True)

        do_weight_tying = bool(kwargs.get('tie_weights', False))

        self.preds = pytorch_linear(dsz, self.tgt_embeddings.get_vsz())
        if do_weight_tying:
            # nn.Linear(dsz, vsz).weight and the embedding table are both (vsz, dsz), so the Parameter can be shared directly
            self.preds.weight = self.tgt_embeddings.weight
Example #11
    def __init__(self,
                 dsz,
                 hsz=None,
                 num_heads=4,
                 layers=1,
                 dropout=0.5,
                 **kwargs):
        super(TransformerEncoderWrapper, self).__init__()
        if hsz is None:
            hsz = dsz
        self.proj = pytorch_linear(dsz, hsz) if hsz != dsz else self._identity
        self.transformer = TransformerEncoderStack(num_heads,
                                                   d_model=hsz,
                                                   pdrop=dropout,
                                                   scale=True,
                                                   layers=layers)
Example #12
    def __init__(self, dsz, hsz=None, num_heads=4, layers=1, dropout=0.5,
                 activation='relu',
                 rpr_k=None,
                 layer_norm_eps=1e-6,
                 layer_drop=0.0, scale=True, rpr_value_on=True, ra_type=None,
                 d_k=None,
                 d_ff=None,
                 transformer_type=None,
                 **kwargs):
        super().__init__()
        if hsz is None:
            hsz = dsz
        self.proj = pytorch_linear(dsz, hsz) if hsz != dsz else self._identity
        if d_ff is None:
            d_ff = 4 * hsz

        self.transformer = TransformerEncoderStack(num_heads, d_model=hsz, d_ff=d_ff,
                                                   pdrop=dropout, scale=scale, layers=layers,
                                                   rpr_k=rpr_k, d_k=d_k, activation=activation, layer_drop=layer_drop,
                                                   layer_norm_eps=layer_norm_eps,
                                                   rpr_value_on=rpr_value_on, ra_type=ra_type, transformer_type=transformer_type)
Example #13
    def __init__(self, tgt_embeddings, **kwargs):
        """Construct an RNN decoder.  It provides the input size, the rest is up to the impl.

        The default implementation provides an RNN cell, followed by a linear projection, out to a softmax

        :param tgt_embeddings: The target embeddings
        :param kwargs:
        :return: void
        """
        super(RNNDecoder, self).__init__()
        self.hsz = kwargs['hsz']
        self.arc_policy = create_seq2seq_arc_policy(**kwargs)
        self.tgt_embeddings = tgt_embeddings
        rnntype = kwargs['rnntype']
        layers = kwargs['layers']
        feed_input = kwargs.get('feed_input', True)
        dsz = tgt_embeddings.get_dsz()
        if feed_input:
            self.input_i = self._feed_input
            dsz += self.hsz
        else:
            self.input_i = self._basic_input
        pdrop = kwargs.get('dropout', 0.5)
        self.decoder_rnn = pytorch_rnn_cell(dsz, self.hsz, rnntype, layers,
                                            pdrop)
        self.dropout = torch.nn.Dropout(pdrop)
        self.init_attn(**kwargs)

        do_weight_tying = bool(kwargs.get('tie_weights', False))
        is_valid_tying = self.hsz == self.tgt_embeddings.get_dsz()

        self.preds = pytorch_linear(self.hsz, self.tgt_embeddings.get_vsz())
        if do_weight_tying:
            if is_valid_tying:
                tie_weight(self.preds, self.tgt_embeddings.embeddings)
            else:
                raise ValueError(
                    "weight tying only valid when prediction projection "
                    "layer's hidden size == embedding weight dimensions")
Example #14
    def __init__(self, name, **kwargs):
        super(CharConvEmbeddings, self).__init__()
        self.vsz = kwargs.get('vsz')
        self.dsz = kwargs.get('dsz')
        self.finetune = kwargs.get('finetune', True)
        weights = kwargs.get('weights')
        if weights is None:
            self.embeddings = nn.Embedding(self.vsz, self.dsz, padding_idx=0)
        else:
            self.embeddings = pytorch_embedding(weights)
        char_filtsz = kwargs.get('cfiltsz', [3])
        char_hsz = kwargs.get('wsz', 30)
        activation_type = kwargs.get('activation', 'tanh')
        pdrop = kwargs.get('pdrop', 0.5)
        self.char_comp = ParallelConv(self.dsz, char_hsz, char_filtsz, activation_type, pdrop)
        wchsz = self.char_comp.outsz
        self.linear = pytorch_linear(wchsz, wchsz)
        gating = kwargs.get('gating', 'skip')
        GatingConnection = SkipConnection if gating == 'skip' else Highway
        num_gates = kwargs.get('num_gates', 1)
        self.gating_seq = nn.Sequential(OrderedDict(
            [('gate-{}'.format(i), GatingConnection(wchsz)) for i in range(num_gates)]
        ))
Example #15
    def __init__(self, tgt_embeddings, **kwargs):
        """Construct an RNN decoder.  It provides the input size, the rest is up to the impl.

        The default implementation provides an RNN cell, followed by a linear projection, out to a softmax

        :param tgt_embeddings: The target embeddings
        :param kwargs:
        :return: void
        """
        super(RNNDecoder, self).__init__()
        self.hsz = kwargs['hsz']
        self.arc_policy = create_seq2seq_arc_policy(**kwargs)
        self.tgt_embeddings = tgt_embeddings
        rnntype = kwargs['rnntype']
        layers = kwargs['layers']
        feed_input = kwargs.get('feed_input', True)
        dsz = tgt_embeddings.get_dsz()
        if feed_input:
            self.input_i = self._feed_input
            dsz += self.hsz
        else:
            self.input_i = self._basic_input
        pdrop = kwargs.get('dropout', 0.5)
        self.decoder_rnn = pytorch_rnn_cell(dsz, self.hsz, rnntype, layers, pdrop)
        self.dropout = torch.nn.Dropout(pdrop)
        self.init_attn(**kwargs)

        do_weight_tying = bool(kwargs.get('tie_weights', False))
        is_valid_tying = self.hsz == self.tgt_embeddings.get_dsz()

        self.preds = pytorch_linear(self.hsz, self.tgt_embeddings.get_vsz())
        if do_weight_tying:
            if is_valid_tying:
                tie_weight(self.preds, self.tgt_embeddings.embeddings)
            else:
                raise ValueError("weight tying only valid when prediction projection "
                                 "layer's hidden size == embedding weight dimensions")
Example #16
    def __init__(self):
        super().__init__()
        self.tgt_embeddings = nn.Embedding(100, 10)  # vsz, dsz
        self.preds = pytorch_linear(10, 100)  # hsz, output_sz
        self.preds.weight = self.tgt_embeddings.weight  # tied weights
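As a quick sanity check on the tied-weight pattern above (plain torch.nn modules, mirroring the snippet): the assignment makes both modules reference one and the same Parameter, and the shapes line up because nn.Linear stores its weight as (out_features, in_features) = (vsz, dsz):

import torch.nn as nn

emb = nn.Embedding(100, 10)
preds = nn.Linear(10, 100)
preds.weight = emb.weight                      # both modules now share a single Parameter
assert preds.weight.data_ptr() == emb.weight.data_ptr()
assert preds.weight.shape == (100, 10)         # (out_features, in_features) == (vsz, dsz)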
Example #17
    def __init__(self, dsz, hsz=None, num_heads=4, layers=1, dropout=0.5, **kwargs):
        super(TransformerEncoderWrapper, self).__init__()
        if hsz is None:
            hsz = dsz
        self.proj = pytorch_linear(dsz, hsz) if hsz != dsz else self._identity
        self.transformer = TransformerEncoderStack(num_heads, d_model=hsz, pdrop=dropout, scale=True, layers=layers)