def __init__(self, name, **kwargs):
    super(CharConvEmbeddings, self).__init__()
    self.vsz = kwargs.get('vsz')
    self.dsz = kwargs.get('dsz')
    self.finetune = kwargs.get('finetune', True)
    weights = kwargs.get('weights')
    if weights is None:
        self.embeddings = nn.Embedding(self.vsz, self.dsz, padding_idx=0)
    else:
        self.embeddings = pytorch_embedding(weights)
    char_filtsz = kwargs.get('cfiltsz', [3])
    if is_sequence(char_filtsz[0]):
        # cfiltsz was given as (filter width, num filters) pairs
        char_hsz = [pair[1] for pair in char_filtsz]
        char_filtsz = [pair[0] for pair in char_filtsz]
    else:
        char_hsz = kwargs.get('wsz', 30)
    activation_type = kwargs.get('activation', 'tanh')
    pdrop = kwargs.get('pdrop', 0.5)
    self.char_comp = ParallelConv(self.dsz, char_hsz, char_filtsz, activation_type, pdrop)
    wchsz = self.char_comp.outsz
    self.linear = pytorch_linear(wchsz, wchsz)
    gating = kwargs.get('gating', 'skip')
    GatingConnection = SkipConnection if gating == 'skip' else Highway
    num_gates = kwargs.get('num_gates', 1)
    gates = [('gate-{}'.format(i), GatingConnection(wchsz)) for i in range(num_gates)]
    projsz = kwargs.get('projsz')
    if projsz is not None:
        # Optional final projection, folded into the gating stack
        gates.append(('proj', pytorch_linear(self.char_comp.outsz, projsz)))
        self.char_comp.outsz = projsz
    self.gating_seq = nn.Sequential(OrderedDict(gates))
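# Illustration (values invented here, not from the source): how the (width, count)
# pair form of `cfiltsz` is split into parallel filter widths and per-width output sizes.
example_cfiltsz = [(1, 32), (2, 32), (3, 64)]            # (filter width, num filters) pairs
example_hsz = [pair[1] for pair in example_cfiltsz]      # [32, 32, 64]
example_widths = [pair[0] for pair in example_cfiltsz]   # [1, 2, 3]
# ParallelConv concatenates its pooled outputs, so outsz == sum(example_hsz) == 128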
def __init__(self, tgt_embeddings, dropout=0.5, layers=1, hsz=None, num_heads=4, scale=True, **kwargs):
    super(TransformerDecoderWrapper, self).__init__()
    self.tgt_embeddings = tgt_embeddings
    dsz = self.tgt_embeddings.get_dsz()
    if hsz is None:
        hsz = dsz
    self.transformer_decoder = TransformerDecoderStack(num_heads, d_model=hsz, pdrop=dropout,
                                                       scale=scale, layers=layers)
    self.proj_to_dsz = self._identity
    self.proj_to_hsz = self._identity
    if hsz != dsz:
        self.proj_to_hsz = pytorch_linear(dsz, hsz)
        self.proj_to_dsz = pytorch_linear(hsz, dsz)
        del self.proj_to_dsz.weight
        # Share storage with the up-projection's weight, transposed
        self.proj_to_dsz.weight = torch.nn.Parameter(self.proj_to_hsz.weight.transpose(0, 1),
                                                     requires_grad=True)
    self.preds = pytorch_linear(dsz, self.tgt_embeddings.get_vsz())
    do_weight_tying = bool(kwargs.get('tie_weights', False))
    if do_weight_tying:
        # nn.Linear(dsz, vsz) stores its weight as (vsz, dsz), the same shape as the
        # embedding table, so the Parameter can be shared directly, with no transpose
        self.preds.weight = self.tgt_embeddings.weight
def __init__(self, num_heads, d_model, pdrop, scale=True, activation_type='relu', d_ff=None):
    super(TransformerEncoder, self).__init__()
    self.d_model = d_model
    self.d_ff = d_ff if d_ff is not None else 4 * d_model
    self.self_attn = MultiHeadedAttention(num_heads, d_model, pdrop, scale=scale)
    self.ffn = nn.Sequential(pytorch_linear(self.d_model, self.d_ff),
                             pytorch_activation(activation_type),
                             pytorch_linear(self.d_ff, self.d_model))
    self.ln1 = nn.LayerNorm(self.d_model, eps=1e-12)
    self.ln2 = nn.LayerNorm(self.d_model, eps=1e-12)
    self.dropout = nn.Dropout(pdrop)
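# Sketch only: the residual wiring the submodules above suggest, assuming the common
# post-layer-norm ordering and a self_attn(q, k, v, mask) call signature. The class's
# real `forward` is not shown in this section.
def transformer_encoder_forward_sketch(layer, x, mask=None):
    # Self-attention sublayer: residual add, then layer norm
    x = layer.ln1(x + layer.dropout(layer.self_attn(x, x, x, mask)))
    # Position-wise feed-forward sublayer: residual add, then layer norm
    return layer.ln2(x + layer.dropout(layer.ffn(x)))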
def __init__(self, tgt_embeddings, dropout=0.5, layers=1, hsz=None, num_heads=4, scale=True, **kwargs):
    super().__init__()
    self.tgt_embeddings = tgt_embeddings
    dsz = self.tgt_embeddings.get_dsz()
    if hsz is None:
        hsz = dsz
    d_ff = int(kwargs.get('d_ff', 4 * hsz))
    rpr_k = kwargs.get('rpr_k')
    d_k = kwargs.get('d_k')
    activation = kwargs.get('activation', 'relu')
    layer_drop = float(kwargs.get('layer_drop', 0.0))
    self.transformer_decoder = TransformerDecoderStack(num_heads, d_model=hsz, d_ff=d_ff,
                                                       pdrop=dropout, scale=scale, layers=layers,
                                                       rpr_k=rpr_k, d_k=d_k,
                                                       activation_type=activation,
                                                       layer_drop=layer_drop)
    self.proj_to_dsz = self._identity
    self.proj_to_hsz = self._identity
    if hsz != dsz:
        self.proj_to_hsz = pytorch_linear(dsz, hsz)
        self.proj_to_dsz = pytorch_linear(hsz, dsz)
        del self.proj_to_dsz.weight
        # Share storage with the up-projection's weight, transposed
        self.proj_to_dsz.weight = torch.nn.Parameter(self.proj_to_hsz.weight.transpose(0, 1),
                                                     requires_grad=True)
    do_weight_tying = bool(kwargs.get('tie_weights', True))
    if do_weight_tying:
        if hsz != self.tgt_embeddings.get_dsz():
            raise ValueError("weight tying requires hsz == embedding dsz, "
                             "got {} hsz and {} dsz".format(hsz, self.tgt_embeddings.get_dsz()))
        self.preds = WeightTieDense(self.tgt_embeddings)
    else:
        self.preds = pytorch_linear(dsz, self.tgt_embeddings.get_vsz())
def __init__(self, d_model, pdrop, activation_type='relu', d_ff=None):
    """Constructor, takes in model size (which is the external currency of each block) and the feed-forward size

    :param d_model: The model size.  This is the size passed through each block
    :param d_ff: The feed-forward internal size, which is typically 4x larger, used internally
    :param pdrop: The probability of dropping output
    """
    super(FFN, self).__init__()
    if d_ff is None:
        d_ff = 4 * d_model
    self.expansion = pytorch_linear(d_model, d_ff)
    self.squeeze = pytorch_linear(d_ff, d_model)
    self.dropout = nn.Dropout(pdrop)
    self.act = pytorch_activation(activation_type)
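# Sketch only: the position-wise forward these submodules imply (expand d_model -> d_ff,
# activate, drop, squeeze back to d_model); the exact dropout placement is an assumption,
# not taken from the source.
def ffn_forward_sketch(ffn, x):
    return ffn.squeeze(ffn.dropout(ffn.act(ffn.expansion(x))))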
def __init__(self, dsz, hsz=None, num_heads=4, layers=1, dropout=0.5, **kwargs):
    super().__init__()
    if hsz is None:
        hsz = dsz
    self.proj = pytorch_linear(dsz, hsz) if hsz != dsz else self._identity
    d_ff = int(kwargs.get('d_ff', 4 * hsz))
    rpr_k = kwargs.get('rpr_k')
    d_k = kwargs.get('d_k')
    layer_drop = float(kwargs.get('layer_drop', 0.0))
    activation = kwargs.get('activation', 'relu')
    scale = bool(kwargs.get('scale', True))
    self.transformer = TransformerEncoderStack(num_heads, d_model=hsz, d_ff=d_ff,
                                               pdrop=dropout, scale=scale, layers=layers,
                                               rpr_k=rpr_k, d_k=d_k, activation=activation,
                                               layer_drop=layer_drop)
def __init__(self, tgt_embeddings, **kwargs): """Construct an RNN decoder. It provides the input size, the rest is up to the impl. The default implementation provides an RNN cell, followed by a linear projection, out to a softmax :param input_dim: The input size :param kwargs: :return: void """ super().__init__() self.hsz = kwargs['hsz'] self.arc_policy = create_seq2seq_arc_policy(**kwargs) self.tgt_embeddings = tgt_embeddings rnntype = kwargs.get('rnntype', 'lstm') layers = kwargs.get('layers', 1) feed_input = kwargs.get('feed_input', True) dsz = tgt_embeddings.get_dsz() if feed_input: self.input_i = self._feed_input dsz += self.hsz else: self.input_i = self._basic_input pdrop = kwargs.get('dropout', 0.5) self.decoder_rnn = rnn_cell(dsz, self.hsz, rnntype, layers, pdrop) self.dropout = torch.nn.Dropout(pdrop) self.init_attn(**kwargs) do_weight_tying = bool(kwargs.get('tie_weights', True)) if do_weight_tying: if self.hsz != self.tgt_embeddings.get_dsz(): raise ValueError("weight tying requires hsz == embedding dsz, got {} hsz and {} dsz".format(self.hsz, self.tgt_embeddings.get_dsz())) self.preds = WeightTieDense(self.tgt_embeddings) else: self.preds = pytorch_linear(self.hsz, self.tgt_embeddings.get_vsz())
def __init__(self, h, d_model, dropout=0.1, scale=False):
    """Constructor for multi-headed attention

    :param h: The number of heads
    :param d_model: The model hidden size
    :param dropout (``float``): The amount of dropout to use
    :param scale: If true, use scaled dot-product attention, otherwise plain dot-product
    """
    super(MultiHeadedAttention, self).__init__()
    assert d_model % h == 0, "d_model must be evenly divisible by the number of heads"
    self.d_k = d_model // h
    self.h = h
    self.w_Q = pytorch_linear(d_model, d_model)
    self.w_K = pytorch_linear(d_model, d_model)
    self.w_V = pytorch_linear(d_model, d_model)
    self.w_O = pytorch_linear(d_model, d_model)
    self.attn_fn = scaled_dot_product_attention if scale else dot_product_attention
    self.attn = None
    self.dropout = nn.Dropout(dropout)
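# `scaled_dot_product_attention` is referenced above but not shown. A standard
# implementation of the scaled variant (the 1/sqrt(d_k) factor is what the `scale`
# flag toggles); the mask and dropout handling here are assumptions:
import math
import torch
import torch.nn.functional as F

def scaled_dot_product_attention_sketch(query, key, value, mask=None, dropout=None):
    """Standard SDP attention: softmax(QK^T / sqrt(d_k)) V."""
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = F.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    return torch.matmul(weights, value), weights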
def __init__(self, tgt_embeddings, dropout=0.5, layers=1, hsz=None, num_heads=4, activation='gelu',
             rpr_k=None, layer_norm_eps=1e-6, layer_drop=0.0, scale=True, rpr_value_on=True,
             ra_type=None, d_k=None, d_ff=None, transformer_type=None, **kwargs):
    super().__init__()
    self.tgt_embeddings = tgt_embeddings
    dsz = self.tgt_embeddings.get_dsz()
    if hsz is None:
        hsz = dsz
    if d_ff is None:
        d_ff = 4 * hsz
    self.transformer_decoder = TransformerDecoderStack(num_heads, d_model=hsz, d_ff=d_ff, pdrop=dropout,
                                                       scale=scale, layers=layers, rpr_k=rpr_k, d_k=d_k,
                                                       activation_type=activation, layer_drop=layer_drop,
                                                       layer_norm_eps=layer_norm_eps,
                                                       rpr_value_on=rpr_value_on, ra_type=ra_type,
                                                       transformer_type=transformer_type)
    self.proj_to_hsz = self._identity
    self.proj_to_dsz = self._identity
    if hsz != dsz:
        self.proj_to_hsz = pytorch_linear(dsz, hsz)
        self.proj_to_dsz = pytorch_linear(hsz, dsz)
        del self.proj_to_dsz.weight
        # Share storage with the up-projection's weight, transposed
        self.proj_to_dsz.weight = torch.nn.Parameter(self.proj_to_hsz.weight.transpose(0, 1),
                                                     requires_grad=True)
    do_weight_tying = bool(kwargs.get('tie_weights', True))
    if do_weight_tying:
        if hsz != self.tgt_embeddings.get_dsz():
            raise ValueError("weight tying requires hsz == embedding dsz, "
                             "got {} hsz and {} dsz".format(hsz, self.tgt_embeddings.get_dsz()))
        self.preds = WeightTieDense(self.tgt_embeddings)
    else:
        self.preds = pytorch_linear(dsz, self.tgt_embeddings.get_vsz())
def __init__(self, tgt_embeddings, dropout=0.5, layers=1, hsz=None, num_heads=4, scale=True, **kwargs):
    super(TransformerDecoderWrapper, self).__init__()
    self.tgt_embeddings = tgt_embeddings
    dsz = self.tgt_embeddings.get_dsz()
    if hsz is None:
        hsz = dsz
    self.transformer_decoder = TransformerDecoderStack(num_heads, d_model=hsz, pdrop=dropout,
                                                       scale=scale, layers=layers)
    self.proj_to_dsz = self._identity
    self.proj_to_hsz = self._identity
    if hsz != dsz:
        self.proj_to_hsz = pytorch_linear(dsz, hsz)
        self.proj_to_dsz = pytorch_linear(hsz, dsz)
        del self.proj_to_dsz.weight
        # Share storage with the up-projection's weight, transposed
        self.proj_to_dsz.weight = torch.nn.Parameter(self.proj_to_hsz.weight.transpose(0, 1),
                                                     requires_grad=True)
    self.preds = pytorch_linear(dsz, self.tgt_embeddings.get_vsz())
    do_weight_tying = bool(kwargs.get('tie_weights', False))
    if do_weight_tying:
        # The Linear's (vsz, dsz) weight matches the embedding table's shape,
        # so the Parameter can be shared directly, with no transpose
        self.preds.weight = self.tgt_embeddings.weight
def __init__(self, dsz, hsz=None, num_heads=4, layers=1, dropout=0.5, **kwargs):
    super(TransformerEncoderWrapper, self).__init__()
    if hsz is None:
        hsz = dsz
    self.proj = pytorch_linear(dsz, hsz) if hsz != dsz else self._identity
    self.transformer = TransformerEncoderStack(num_heads, d_model=hsz, pdrop=dropout,
                                               scale=True, layers=layers)
def __init__(self, dsz, hsz=None, num_heads=4, layers=1, dropout=0.5, activation='relu', rpr_k=None,
             layer_norm_eps=1e-6, layer_drop=0.0, scale=True, rpr_value_on=True, ra_type=None,
             d_k=None, d_ff=None, transformer_type=None, **kwargs):
    super().__init__()
    if hsz is None:
        hsz = dsz
    self.proj = pytorch_linear(dsz, hsz) if hsz != dsz else self._identity
    if d_ff is None:
        d_ff = 4 * hsz
    self.transformer = TransformerEncoderStack(num_heads, d_model=hsz, d_ff=d_ff, pdrop=dropout,
                                               scale=scale, layers=layers, rpr_k=rpr_k, d_k=d_k,
                                               activation=activation, layer_drop=layer_drop,
                                               layer_norm_eps=layer_norm_eps, rpr_value_on=rpr_value_on,
                                               ra_type=ra_type, transformer_type=transformer_type)
def __init__(self, tgt_embeddings, **kwargs): """Construct an RNN decoder. It provides the input size, the rest is up to the impl. The default implementation provides an RNN cell, followed by a linear projection, out to a softmax :param input_dim: The input size :param kwargs: :return: void """ super(RNNDecoder, self).__init__() self.hsz = kwargs['hsz'] self.arc_policy = create_seq2seq_arc_policy(**kwargs) self.tgt_embeddings = tgt_embeddings rnntype = kwargs['rnntype'] layers = kwargs['layers'] feed_input = kwargs.get('feed_input', True) dsz = tgt_embeddings.get_dsz() if feed_input: self.input_i = self._feed_input dsz += self.hsz else: self.input_i = self._basic_input pdrop = kwargs.get('dropout', 0.5) self.decoder_rnn = pytorch_rnn_cell(dsz, self.hsz, rnntype, layers, pdrop) self.dropout = torch.nn.Dropout(pdrop) self.init_attn(**kwargs) do_weight_tying = bool(kwargs.get('tie_weights', False)) is_valid_tying = self.hsz == self.tgt_embeddings.get_dsz() self.preds = pytorch_linear(self.hsz, self.tgt_embeddings.get_vsz()) if do_weight_tying: if is_valid_tying: tie_weight(self.preds, self.tgt_embeddings.embeddings) else: raise ValueError( "weight tying only valid when prediction projection \ layer's hidden size == embedding weight dimensions")
def __init__(self, name, **kwargs):
    super(CharConvEmbeddings, self).__init__()
    self.vsz = kwargs.get('vsz')
    self.dsz = kwargs.get('dsz')
    self.finetune = kwargs.get('finetune', True)
    weights = kwargs.get('weights')
    if weights is None:
        self.embeddings = nn.Embedding(self.vsz, self.dsz, padding_idx=0)
    else:
        self.embeddings = pytorch_embedding(weights)
    char_filtsz = kwargs.get('cfiltsz', [3])
    char_hsz = kwargs.get('wsz', 30)
    activation_type = kwargs.get('activation', 'tanh')
    pdrop = kwargs.get('pdrop', 0.5)
    self.char_comp = ParallelConv(self.dsz, char_hsz, char_filtsz, activation_type, pdrop)
    wchsz = self.char_comp.outsz
    self.linear = pytorch_linear(wchsz, wchsz)
    gating = kwargs.get('gating', 'skip')
    GatingConnection = SkipConnection if gating == 'skip' else Highway
    num_gates = kwargs.get('num_gates', 1)
    self.gating_seq = nn.Sequential(OrderedDict(
        [('gate-{}'.format(i), GatingConnection(wchsz)) for i in range(num_gates)]
    ))
def __init__(self, tgt_embeddings, **kwargs): """Construct an RNN decoder. It provides the input size, the rest is up to the impl. The default implementation provides an RNN cell, followed by a linear projection, out to a softmax :param input_dim: The input size :param kwargs: :return: void """ super(RNNDecoder, self).__init__() self.hsz = kwargs['hsz'] self.arc_policy = create_seq2seq_arc_policy(**kwargs) self.tgt_embeddings = tgt_embeddings rnntype = kwargs['rnntype'] layers = kwargs['layers'] feed_input = kwargs.get('feed_input', True) dsz = tgt_embeddings.get_dsz() if feed_input: self.input_i = self._feed_input dsz += self.hsz else: self.input_i = self._basic_input pdrop = kwargs.get('dropout', 0.5) self.decoder_rnn = pytorch_rnn_cell(dsz, self.hsz, rnntype, layers, pdrop) self.dropout = torch.nn.Dropout(pdrop) self.init_attn(**kwargs) do_weight_tying = bool(kwargs.get('tie_weights', False)) is_valid_tying = self.hsz == self.tgt_embeddings.get_dsz() self.preds = pytorch_linear(self.hsz, self.tgt_embeddings.get_vsz()) if do_weight_tying: if is_valid_tying: tie_weight(self.preds, self.tgt_embeddings.embeddings) else: raise ValueError("weight tying only valid when prediction projection \ layer's hidden size == embedding weight dimensions")
def __init__(self):
    super().__init__()
    self.tgt_embeddings = nn.Embedding(100, 10)  # vsz, dsz
    self.preds = pytorch_linear(10, 100)  # hsz, output_sz
    self.preds.weight = self.tgt_embeddings.weight  # tied weights: both are (100, 10)
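# Quick check (assuming the class above is instantiable as `TiedModel`, a name not
# given in the source): the assignment shares a single Parameter, so both modules
# see the same storage and gradients accumulate into one tensor.
model = TiedModel()
assert model.preds.weight is model.tgt_embeddings.weight
assert model.preds.weight.shape == (100, 10)  # nn.Linear(10, 100) stores (out, in)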
def __init__(self, dsz, hsz=None, num_heads=4, layers=1, dropout=0.5, **kwargs):
    super(TransformerEncoderWrapper, self).__init__()
    if hsz is None:
        hsz = dsz
    self.proj = pytorch_linear(dsz, hsz) if hsz != dsz else self._identity
    self.transformer = TransformerEncoderStack(num_heads, d_model=hsz, pdrop=dropout,
                                               scale=True, layers=layers)