def __init__(self, d_model, d_ff, dropout, activation, param_init, bottleneck_dim=0):
    super().__init__()

    self.bottleneck_dim = bottleneck_dim
    if bottleneck_dim > 0:
        # Factorize each projection through a low-rank bottleneck:
        # d_model -> bottleneck -> d_ff and d_ff -> bottleneck -> d_model
        self.w_1_e = nn.Linear(d_model, bottleneck_dim)
        self.w_1_d = nn.Linear(bottleneck_dim, d_ff)
        self.w_2_e = nn.Linear(d_ff, bottleneck_dim)
        self.w_2_d = nn.Linear(bottleneck_dim, d_model)
    else:
        # standard two-layer position-wise FFN
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
    self.dropout = nn.Dropout(p=dropout)

    if activation == 'relu':
        self.activation = torch.relu
    elif activation == 'gelu':
        self.activation = lambda x: gelu(x)
    elif activation == 'gelu_accurate':
        self.activation = lambda x: gelu_accurate(x)
    elif activation == 'glu':
        self.activation = LinearGLUBlock(d_ff)
    elif activation == 'swish':
        self.activation = Swish()
    else:
        raise NotImplementedError(activation)
    logger.info('FFN activation: %s' % activation)

    if param_init == 'xavier_uniform':
        self.reset_parameters()
    else:
        logger.info('Parameter initialization is skipped.')
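# ---------------------------------------------------------------------------
# Illustrative sketch (not from the original source): one plausible forward
# pass for the module above, assuming the standard position-wise FFN wiring.
# When bottleneck_dim > 0, each projection is factorized through the low-rank
# bottleneck; otherwise the usual two-layer FFN is applied.
#
#   def forward(self, xs):
#       if self.bottleneck_dim > 0:
#           return self.w_2_d(self.w_2_e(
#               self.dropout(self.activation(self.w_1_d(self.w_1_e(xs))))))
#       return self.w_2(self.dropout(self.activation(self.w_1(xs))))
# ---------------------------------------------------------------------------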
def __init__(self, d_in, d_ff, d_out, dropout, activation, param_init):
    super(PositionwiseFeedForward, self).__init__()

    # expansion (d_in -> d_ff) and contraction (d_ff -> d_out) projections
    self.w_1 = nn.Linear(d_in, d_ff)
    self.w_2 = nn.Linear(d_ff, d_out)
    self.dropout = nn.Dropout(p=dropout)

    if activation == 'relu':
        self.activation = torch.relu
    elif activation == 'gelu':
        self.activation = lambda x: gelu(x)
    elif activation == 'gelu_accurate':
        self.activation = lambda x: gelu_accurate(x)
    elif activation == 'glu':
        self.activation = LinearGLUBlock(d_ff)
    else:
        raise NotImplementedError(activation)
    logger.info('FFN activation: %s' % activation)

    if param_init == 'xavier_uniform':
        self.reset_parameters()
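# ---------------------------------------------------------------------------
# Construction sketch (illustrative; the hyperparameter values below are
# assumptions, not taken from the original configs). The super() call above
# names the class PositionwiseFeedForward, so a typical instantiation inside
# a Transformer layer could look like:
#
#   ffn = PositionwiseFeedForward(d_in=256, d_ff=2048, d_out=256,
#                                 dropout=0.1, activation='relu',
#                                 param_init='xavier_uniform')
#   ys = ffn(xs)  # xs: (batch, time, d_in) -> ys: (batch, time, d_out)
# ---------------------------------------------------------------------------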
def __init__(self, args, save_path=None):
    super(LMBase, self).__init__()
    logger.info(self.__class__.__name__)

    self.lm_type = args.lm_type
    self.save_path = save_path

    self.emb_dim = args.emb_dim
    self.rnn_type = args.lm_type
    assert args.lm_type in ['lstm', 'gru']
    self.n_units = args.n_units
    self.n_projs = args.n_projs
    self.n_layers = args.n_layers
    self.residual = args.residual
    self.n_units_cv = args.n_units_null_context
    self.lsm_prob = args.lsm_prob

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []
    self.embed_cache = None

    self.embed = nn.Embedding(self.vocab, args.emb_dim, padding_idx=self.pad)
    self.dropout_emb = nn.Dropout(p=args.dropout_in)

    rnn = nn.LSTM if args.lm_type == 'lstm' else nn.GRU
    self.rnn = nn.ModuleList()
    self.dropout = nn.Dropout(p=args.dropout_hidden)
    if args.n_projs > 0:
        self.proj = repeat(nn.Linear(args.n_units, args.n_projs), args.n_layers)
    # input size of the first layer: embedding (+ optional context vector)
    rnn_idim = args.emb_dim + args.n_units_null_context
    for _ in range(args.n_layers):
        self.rnn += [rnn(rnn_idim, args.n_units, 1, batch_first=True)]
        rnn_idim = args.n_units
        if args.n_projs > 0:
            rnn_idim = args.n_projs

    self.glu = None
    if args.use_glu:
        self.glu = LinearGLUBlock(rnn_idim)

    self._odim = rnn_idim

    self.adaptive_softmax = None
    self.output_proj = None
    self.output = None
    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            rnn_idim, self.vocab,
            # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
            cutoffs=[self.vocab // 25, self.vocab // 5],
            div_value=4.0)
    elif args.tie_embedding:
        # project back to emb_dim so the output weight can be shared with the embedding
        if rnn_idim != args.emb_dim:
            self.output_proj = nn.Linear(rnn_idim, args.emb_dim)
            rnn_idim = args.emb_dim
            self._odim = rnn_idim
        self.output = nn.Linear(rnn_idim, self.vocab)
        self.output.weight = self.embed.weight
    else:
        self.output = nn.Linear(rnn_idim, self.vocab)

    self.reset_parameters(args.param_init)
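# ---------------------------------------------------------------------------
# Construction sketch (illustrative only; the field names follow the
# attributes read from `args` above, but the values are assumptions). A
# minimal config for a 2-layer LSTM LM without projection layers, GLU,
# adaptive softmax, or tied embeddings could look like:
#
#   from argparse import Namespace
#   args = Namespace(
#       lm_type='lstm', emb_dim=512, n_units=1024, n_projs=0, n_layers=2,
#       residual=False, n_units_null_context=0, use_glu=False,
#       adaptive_softmax=False, tie_embedding=False,
#       lsm_prob=0.0, vocab=10000, dropout_in=0.2, dropout_hidden=0.2,
#       param_init=0.1)
#
# With n_projs == 0 and n_units_null_context == 0, the first RNN layer takes
# emb_dim-dimensional inputs, later layers take n_units-dimensional inputs,
# and the output layer maps n_units -> vocab.
# ---------------------------------------------------------------------------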