def __init__(self, input_dim, n_units, n_layers, bottleneck_dim, dropout, param_init=0.1):
    """Build the stack of projection layers for sequence summarization.

    Layer widths: input_dim -> n_units -> ... -> bottleneck_dim -> input_dim.
    """
    super(SequenceSummaryNetwork, self).__init__()

    self.n_layers = n_layers

    layers = [Linear(input_dim, n_units, bias=False, dropout=dropout)]
    for lth in range(1, n_layers - 1):
        # the second-to-last layer narrows down to the bottleneck
        out_dim = bottleneck_dim if lth == n_layers - 2 else n_units
        layers.append(Linear(n_units, out_dim, bias=False, dropout=dropout))
    # final layer restores the input dimensionality
    layers.append(Linear(bottleneck_dim, input_dim, bias=False, dropout=dropout))
    self.ssn = nn.ModuleList(layers)

    # Initialize parameters
    self.reset_parameters(param_init)
def __init__(self, enc_dim, dec_dim, attn_type, attn_dim, init_r, conv_out_channels=10, conv_kernel_size=100):
    """Energy function.

    Args:
        enc_dim (int): dimension of encoder outputs (keys)
        dec_dim (int): dimension of decoder state (queries)
        attn_type (str): 'location' or 'add'
        attn_dim (int): dimension of the attention space
        init_r (float): initial value for the scalar offset parameter `r`
        conv_out_channels (int): channels of the location-aware convolution
        conv_kernel_size (int): half-width of the location convolution window
    """
    super().__init__()

    self.attn_type = attn_type
    # cached projected keys / padding mask, filled lazily at score time
    self.key = None
    self.mask = None

    self.w_key = Linear(enc_dim, attn_dim, bias=False)
    self.w_query = Linear(dec_dim, attn_dim, bias=False)
    if attn_type == 'location':
        # location-aware attention: convolve the previous alignment
        self.w_conv = Linear(conv_out_channels, attn_dim, bias=False)
        # kernel covers conv_kernel_size frames on each side (symmetric window)
        self.conv = nn.Conv2d(in_channels=1,
                              out_channels=conv_out_channels,
                              kernel_size=(1, conv_kernel_size * 2 + 1),
                              stride=1,
                              padding=(0, conv_kernel_size),
                              bias=False)
    else:
        assert attn_type == 'add'
    # NOTE(review): b/v/r placed at method level since both attention types
    # appear to need the scoring vector — confirm against upstream source.
    self.b = nn.Parameter(torch.Tensor(attn_dim).normal_())
    self.v = nn.utils.weight_norm(nn.Linear(attn_dim, 1))
    # initialize the weight-norm magnitude to 1/sqrt(attn_dim)
    self.v.weight_g.data = torch.Tensor([1 / attn_dim]).sqrt()
    self.r = nn.Parameter(torch.Tensor([init_r]))
def __init__(self, key_dim, query_dim, attn_type, attn_dim, dropout=0, n_heads=4):
    """Set up the projections for multi-head attention.

    Supports 'scaled_dot' and 'add' (additive) scoring.
    """
    super(MultiheadAttentionMechanism, self).__init__()

    self.attn_type = attn_type
    assert attn_dim % n_heads == 0
    self.d_k = attn_dim // n_heads  # per-head dimension
    self.n_heads = n_heads

    # cached encoder-side tensors, filled at score time
    self.key = None
    self.value = None
    self.mask = None

    # attention dropout applied AFTER the softmax layer
    self.attn_dropout = nn.Dropout(p=dropout)

    if attn_type not in ('scaled_dot', 'add'):
        raise NotImplementedError(attn_type)

    # additive attention uses a biased key projection plus a scoring layer
    self.w_key = Linear(key_dim, attn_dim, bias=(attn_type == 'add'))
    self.w_value = Linear(key_dim, attn_dim, bias=False)
    self.w_query = Linear(query_dim, attn_dim, bias=False)
    if attn_type == 'add':
        self.v = Linear(attn_dim, n_heads, bias=False)

    self.w_out = Linear(attn_dim, key_dim)
def __init__(self, factor, n_units, n_dirs):
    """Subsample by concatenating `factor` adjacent frames."""
    super(ConcatSubsampler, self).__init__()

    self.factor = factor
    if factor > 1:
        odim = n_units * n_dirs
        # project the concatenated frames back to the original width
        self.proj = Linear(odim * factor, odim)
        self.batch_norm = nn.BatchNorm1d(odim)
def __init__(self, input_dim, in_channel, channels, kernel_sizes, dropout, bottleneck_dim=0, param_init=0.1):
    """Gated convolutional (GLU) encoder.

    Args:
        input_dim (int): dimension of the input features
        in_channel (int): number of channels of the input features
        channels (list): output channels of each GLU block
        kernel_sizes (list): kernel sizes per block (first element of each pair is used)
        dropout (float): dropout probability inside each GLU block
        bottleneck_dim (int): dimension of the bridge projection (0: disabled)
        param_init (float): range of uniform parameter initialization
    """
    super(GatedConvEncoder, self).__init__()

    self.in_channel = in_channel
    assert input_dim % in_channel == 0
    self.input_freq = input_dim // in_channel

    self.bridge = None

    assert len(channels) > 0
    assert len(channels) == len(kernel_sizes)

    layers = OrderedDict()
    for l in range(len(channels)):
        # BUG FIX: dropout was hard-coded to 0.2 here, silently ignoring
        # the `dropout` argument; pass the configured value through.
        layers['conv%d' % l] = GLUBlock(kernel_sizes[l][0], input_dim, channels[l],
                                        weight_norm=True,
                                        dropout=dropout)
        input_dim = channels[l]

    # weight normalization + GLU for the last fully-connected layer
    self.fc_glu = Linear(input_dim, input_dim * 2, weight_norm=True)

    self._output_dim = int(input_dim)

    if bottleneck_dim > 0:
        self.bridge = Linear(self._output_dim, bottleneck_dim)
        self._output_dim = bottleneck_dim

    self.layers = nn.Sequential(layers)

    # Initialize parameters
    self.reset_parameters(param_init)
def __init__(self, input_dim, in_channel, channels, kernel_sizes, strides, poolings, dropout, batch_norm=False, residual=False, bottleneck_dim=0, param_init=0.1):
    """Stack of 2-layer CNN blocks with an optional bridge projection."""
    super(ConvEncoder, self).__init__()
    logger = logging.getLogger("training")

    self.in_channel = in_channel
    assert input_dim % in_channel == 0
    self.input_freq = input_dim // in_channel
    self.residual = residual
    self.bridge = None

    assert len(channels) > 0
    assert len(channels) == len(kernel_sizes) == len(strides) == len(poolings)

    self.layers = nn.ModuleList()
    prev_ch, prev_freq = in_channel, self.input_freq
    for out_ch, ks, st, pl in zip(channels, kernel_sizes, strides, poolings):
        block = Conv2LBlock(input_dim=prev_freq,
                            in_channel=prev_ch,
                            out_channel=out_ch,
                            kernel_size=ks,
                            stride=st,
                            pooling=pl,
                            dropout=dropout,
                            batch_norm=batch_norm,
                            residual=residual)
        self.layers.append(block)
        prev_freq = block.input_dim  # frequency width after this block
        prev_ch = out_ch

    self._output_dim = int(prev_ch * prev_freq)

    if bottleneck_dim > 0:
        self.bridge = Linear(self._output_dim, bottleneck_dim)
        self._output_dim = bottleneck_dim

    # Initialize parameters
    self.reset_parameters(param_init)
def __init__(self, eos, blank, enc_n_units, vocab, dropout=0.0, lsm_prob=0.0, fc_list=None, param_init=0.1):
    """CTC output layer and loss wrapper.

    Args:
        eos (int): index of the end-of-sentence token
        blank (int): index of the blank token
        enc_n_units (int): dimension of encoder outputs
        vocab (int): output vocabulary size
        dropout (float): dropout probability for the FC layers
        lsm_prob (float): label smoothing probability
        fc_list (list): hidden sizes of FC layers inserted before the softmax
            (None/empty: a single projection from enc_n_units to vocab)
        param_init (float): parameter initialization range
    """
    super(CTC, self).__init__()

    self.eos = eos
    self.blank = blank
    self.vocab = vocab
    self.lsm_prob = lsm_prob

    self.space = -1  # TODO(hirofumi): fix layer

    # avoid the shared-mutable-default pitfall; semantics unchanged
    fc_list = fc_list or []

    # Fully-connected layers before the softmax
    if len(fc_list) > 0:
        fc_layers = OrderedDict()
        input_dim = enc_n_units
        for i, fc_dim in enumerate(fc_list):
            fc_layers['fc' + str(i)] = Linear(input_dim, fc_dim, dropout=dropout)
            input_dim = fc_dim
        fc_layers['fc' + str(len(fc_list))] = Linear(fc_list[-1], vocab)
        self.output = nn.Sequential(fc_layers)
    else:
        self.output = Linear(enc_n_units, vocab)

    import warpctc_pytorch
    self.warpctc_loss = warpctc_pytorch.CTCLoss(size_average=True)
def __init__(self, input_dim, in_channel, channels, kernel_sizes, dropout, bottleneck_dim=0):
    """Time-depth separable (TDS) convolutional encoder."""
    super(TDSEncoder, self).__init__()
    logger = logging.getLogger("training")

    self.in_channel = in_channel
    assert input_dim % in_channel == 0
    self.input_freq = input_dim // in_channel
    self.bridge = None

    assert len(channels) > 0
    assert len(channels) == len(kernel_sizes)

    blocks = OrderedDict()
    prev_ch = in_channel
    freq = self.input_freq  # frequency width is not changed by the blocks
    for lth, (ch, ks) in enumerate(zip(channels, kernel_sizes)):
        # channel change -> insert a subsampling block first
        if prev_ch != ch:
            blocks['subsample%d' % lth] = SubsampelBlock(in_channel=prev_ch,
                                                         out_channel=ch,
                                                         in_freq=freq,
                                                         dropout=dropout)
        # TDS convolution block
        blocks['tds%d_block%d' % (ch, lth)] = TDSBlock(channel=ch,
                                                       kernel_size=ks[0],
                                                       in_freq=freq,
                                                       dropout=dropout)
        prev_ch = ch

    self._output_dim = int(prev_ch * freq)

    if bottleneck_dim > 0:
        self.bridge = Linear(self._output_dim, bottleneck_dim)
        self._output_dim = bottleneck_dim

    self.layers = nn.Sequential(blocks)

    # Initialize parameters
    self.reset_parameters()
def __init__(self, enc_dim, conv_out_channels, conv_kernel_size, threshold=0.9):
    """Continuous integrate-and-fire (CIF) attention."""
    super(CIF, self).__init__()

    self.threshold = threshold
    self.channel = conv_out_channels
    self.n_heads = 1

    # symmetric window of conv_kernel_size frames on each side
    window = conv_kernel_size * 2 + 1
    self.conv = nn.Conv1d(in_channels=enc_dim,
                          out_channels=conv_out_channels,
                          kernel_size=window,
                          stride=1,
                          padding=conv_kernel_size)
    # scalar firing weight per frame
    self.proj = Linear(conv_out_channels, 1)
def __init__(self, key_dim, query_dim, attn_dim, window, init_r=-4):
    """Monotonic chunk-wise attention.

        "Monotonic Chunkwise Attention" (ICLR 2018)
        https://openreview.net/forum?id=Hko85plCW

        if window == 1, this is equivalent to Hard monotonic attention
            "Online and Linear-Time Attention by Enforcing Monotonic Alignment" (ICML 2017)
             http://arxiv.org/abs/1704.00784

    Args:
        key_dim (int): dimensions of key
        query_dim (int): dimensions of query
        attn_dim: (int) dimension of the attention layer
        window (int): chunk size
        init_r (int): initial value for parameter 'r' used in monotonic/chunk attention

    """
    super(MoChA, self).__init__()

    self.window = window
    self.n_heads = 1

    # Monotonic energy
    self.w_key_mono = Linear(key_dim, attn_dim, bias=True)
    self.w_query_mono = Linear(query_dim, attn_dim, bias=False)
    self.v_mono = Linear(attn_dim, 1, bias=False, weight_norm=True)
    self.r_mono = nn.Parameter(torch.Tensor([init_r]))

    # Chunk energy
    if window > 1:
        self.w_key_chunk = Linear(key_dim, attn_dim, bias=True)
        self.w_query_chunk = Linear(query_dim, attn_dim, bias=False)
        self.v_chunk = Linear(attn_dim, 1, bias=False, weight_norm=True)
        self.r_chunk = nn.Parameter(torch.Tensor([init_r]))

    # initialization: set the weight-norm magnitude to 1/sqrt(attn_dim)
    self.v_mono.fc.weight_g.data = torch.Tensor([1 / attn_dim]).sqrt()
    if window > 1:
        # BUG FIX: this previously re-initialized v_mono a second time,
        # leaving v_chunk's weight-norm magnitude at its default.
        self.v_chunk.fc.weight_g.data = torch.Tensor([1 / attn_dim]).sqrt()
def __init__(self, eos, unk, pad, blank, enc_n_units, rnn_type, n_units, n_projs, n_layers, bottleneck_dim, emb_dim, vocab, tie_embedding=False, attn_conv_kernel_size=0, dropout=0.0, dropout_emb=0.0, lsm_prob=0.0, ctc_weight=0.0, ctc_lsm_prob=0.0, ctc_fc_list=[], backward=False, lm_fusion=None, lm_fusion_type='cold', discourse_aware='', lm_init=None, global_weight=1.0, mtl_per_batch=False, param_init=0.1, replace_sos=False, soft_label_weight=0.0):
    """RNN decoder with CIF (continuous integrate-and-fire) attention.

    Builds (depending on weights): an optional CTC head, the CIF scorer,
    a stacked LSTM/GRU decoder with optional per-layer projections,
    optional cold/deep LM fusion layers, and the output projection.
    Optionally ties output weights to the embedding and initializes the
    decoder from a pre-trained RNNLM (`lm_init`).
    """
    super(CIFRNNDecoder, self).__init__()
    logger = logging.getLogger('training')

    self.eos = eos
    self.unk = unk
    self.pad = pad
    self.blank = blank
    self.vocab = vocab
    self.rnn_type = rnn_type
    assert rnn_type in ['lstm', 'gru']
    self.enc_n_units = enc_n_units
    self.dec_n_units = n_units
    self.n_projs = n_projs
    self.n_layers = n_layers
    self.lsm_prob = lsm_prob
    self.ctc_weight = ctc_weight
    self.bwd = backward
    self.lm_fusion_type = lm_fusion_type
    self.global_weight = global_weight
    self.mtl_per_batch = mtl_per_batch
    self.replace_sos = replace_sos
    self.soft_label_weight = soft_label_weight
    # weight of the CIF quantity (length) regularization term
    self.quantity_loss_weight = 1.0

    # for contextualization
    self.discourse_aware = discourse_aware
    self.dstate_prev = None

    # for cache
    self.prev_spk = ''
    self.total_step = 0
    self.dstates_final = None
    self.lmstate_final = None

    if ctc_weight > 0:
        # auxiliary CTC head on top of the encoder outputs
        self.ctc = CTC(eos=eos,
                       blank=blank,
                       enc_n_units=enc_n_units,
                       vocab=vocab,
                       dropout=dropout,
                       lsm_prob=ctc_lsm_prob,
                       fc_list=ctc_fc_list,
                       param_init=param_init)

    if ctc_weight < global_weight:
        # Attention layer
        self.score = CIF(enc_dim=self.enc_n_units,
                         conv_kernel_size=attn_conv_kernel_size,
                         conv_out_channels=self.enc_n_units)

        # Decoder
        self.rnn = nn.ModuleList()
        if self.n_projs > 0:
            self.proj = nn.ModuleList([Linear(n_units, n_projs) for _ in range(n_layers)])
        self.dropout = nn.ModuleList([nn.Dropout(p=dropout) for _ in range(n_layers)])
        rnn = nn.LSTM if rnn_type == 'lstm' else nn.GRU
        # first layer consumes [context; embedding]
        dec_odim = enc_n_units + emb_dim
        for l in range(n_layers):
            self.rnn += [rnn(dec_odim, n_units, 1)]
            dec_odim = n_units
            if self.n_projs > 0:
                dec_odim = n_projs

        # LM fusion
        if lm_fusion is not None:
            self.linear_dec_feat = Linear(dec_odim + enc_n_units, n_units)
            if lm_fusion_type in ['cold', 'deep']:
                self.linear_lm_feat = Linear(lm_fusion.n_units, n_units)
                self.linear_lm_gate = Linear(n_units * 2, n_units)
            elif lm_fusion_type == 'cold_prob':
                # fuse the LM's output distribution instead of its hidden state
                self.linear_lm_feat = Linear(lm_fusion.vocab, n_units)
                self.linear_lm_gate = Linear(n_units * 2, n_units)
            else:
                raise ValueError(lm_fusion_type)
            self.output_bn = Linear(n_units * 2, bottleneck_dim)

            # fix LM parameters
            for p in lm_fusion.parameters():
                p.requires_grad = False
        elif discourse_aware == 'hierarchical':
            raise NotImplementedError
        else:
            self.output_bn = Linear(dec_odim + enc_n_units, bottleneck_dim)

        self.embed = Embedding(vocab, emb_dim,
                               dropout=dropout_emb,
                               ignore_index=pad)
        self.output = Linear(bottleneck_dim, vocab)
        # NOTE: include bias even when tying weights

        # Optionally tie weights as in:
        #   "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        #   https://arxiv.org/abs/1608.05859
        # and
        #   "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        #   https://arxiv.org/abs/1611.01462
        if tie_embedding:
            # NOTE(review): the condition compares emb_dim with bottleneck_dim
            # but the message talks about n_units — one of the two looks wrong.
            if emb_dim != bottleneck_dim:
                raise ValueError(
                    'When using the tied flag, n_units must be equal to emb_dim.'
                )
            self.output.fc.weight = self.embed.embed.weight

    # Initialize parameters
    self.reset_parameters(param_init)

    # resister the external LM
    self.lm = lm_fusion

    # decoder initialization with pre-trained LM
    if lm_init is not None:
        assert lm_init.vocab == vocab
        assert lm_init.n_units == n_units
        assert lm_init.emb_dim == emb_dim
        logger.info('===== Initialize the decoder with pre-trained RNNLM')
        assert lm_init.n_projs == 0  # TODO(hirofumi): fix later
        assert lm_init.n_units_null_context == enc_n_units

        # RNN: copy each layer's parameters by name
        for l in range(lm_init.n_layers):
            for n, p in lm_init.rnn[l].named_parameters():
                assert getattr(self.rnn[l], n).size() == p.size()
                getattr(self.rnn[l], n).data = p.data
                logger.info('Overwrite %s' % n)

        # embedding
        assert self.embed.embed.weight.size() == lm_init.embed.embed.weight.size()
        self.embed.embed.weight.data = lm_init.embed.embed.weight.data
        logger.info('Overwrite %s' % 'embed.embed.weight')
def __init__(self, args, save_path=None):
    """Recurrent (LSTM/GRU) language model.

    Args:
        args: configuration namespace (lm_type, n_units, n_projs, n_layers,
            residual, use_glu, n_units_null_context, dropout rates,
            adaptive_softmax, tie_embedding, param_init, ...)
        save_path (str): directory for checkpoints
    """
    super(LMBase, self).__init__()
    logger = logging.getLogger('training')
    logger.info(self.__class__.__name__)

    self.save_path = save_path

    self.emb_dim = args.emb_dim
    self.rnn_type = args.lm_type
    assert args.lm_type in ['lstm', 'gru']
    self.n_units = args.n_units
    self.n_projs = args.n_projs
    self.n_layers = args.n_layers
    self.residual = args.residual
    self.use_glu = args.use_glu
    self.n_units_cv = args.n_units_null_context
    self.lsm_prob = args.lsm_prob

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []

    self.embed = Embedding(vocab=self.vocab,
                           emb_dim=args.emb_dim,
                           dropout=args.dropout_in,
                           ignore_index=self.pad)

    rnn = nn.LSTM if args.lm_type == 'lstm' else nn.GRU
    self.rnn = nn.ModuleList()
    self.dropout = nn.ModuleList(
        [nn.Dropout(p=args.dropout_hidden) for _ in range(args.n_layers)])
    if args.n_projs > 0:
        self.proj = nn.ModuleList([
            Linear(args.n_units, args.n_projs) for _ in range(args.n_layers)
        ])
    # first layer consumes [embedding; null context vector]
    rnn_idim = args.emb_dim + args.n_units_null_context
    for l in range(args.n_layers):
        self.rnn += [
            rnn(rnn_idim, args.n_units, 1,
                bias=True,
                batch_first=True,
                dropout=0,
                bidirectional=False)
        ]
        rnn_idim = args.n_units
        if args.n_projs > 0:
            rnn_idim = args.n_projs

    if self.use_glu:
        # gated linear unit on top of the last recurrent layer
        self.fc_glu = Linear(rnn_idim, rnn_idim * 2,
                             dropout=args.dropout_hidden)

    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            rnn_idim,
            self.vocab,
            # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
            cutoffs=[self.vocab // 25, self.vocab // 5],
            div_value=4.0)
        self.output = None
    else:
        self.adaptive_softmax = None
        self.output = Linear(rnn_idim, self.vocab,
                             dropout=args.dropout_out)
        # NOTE: include bias even when tying weights

    # Optionally tie weights as in:
    #   "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
    #   https://arxiv.org/abs/1608.05859
    # and
    #   "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
    #   https://arxiv.org/abs/1611.01462
    # NOTE(review): assumes adaptive_softmax is off; with adaptive_softmax
    # self.output is None and tying would fail — confirm configs never combine them.
    if args.tie_embedding:
        if args.n_units != args.emb_dim:
            raise ValueError(
                'When using the tied flag, n_units must be equal to emb_dim.'
            )
        self.output.fc.weight = self.embed.embed.weight

    # Initialize parameters
    self.reset_parameters(args.param_init)

    # Recurrent weights are orthogonalized
    if args.rec_weight_orthogonal:
        self.reset_parameters(args.param_init,
                              dist='orthogonal',
                              keys=['rnn', 'weight'])
def __init__(self, args, save_path=None):
    """Gated convolutional (GLU) language model.

    The architecture is selected by the suffix of `args.lm_type`
    ('custom', '8', '8B', '9', '13', '14', '14B'), following the GCNN-LM
    configurations of Dauphin et al.

    Args:
        args: configuration namespace (lm_type, emb_dim, n_units, n_layers,
            n_projs, kernel_size, dropout rates, adaptive_softmax,
            tie_embedding, param_init, ...)
        save_path (str): directory for checkpoints
    """
    super(LMBase, self).__init__()
    logger = logging.getLogger('training')
    logger.info(self.__class__.__name__)

    self.save_path = save_path

    self.emb_dim = args.emb_dim
    self.n_units = args.n_units
    self.n_layers = args.n_layers
    self.lsm_prob = args.lsm_prob

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []

    self.embed = Embedding(vocab=self.vocab,
                           emb_dim=args.emb_dim,
                           dropout=args.dropout_in,
                           ignore_index=self.pad)

    # e.g. 'gated_conv_14B' -> '14B'
    model_size = args.lm_type.replace('gated_conv_', '')

    # NOTE: 'bottlececk_dim' below is the (misspelled) keyword declared by
    # GLUBlock in this project; do not "fix" it here without changing GLUBlock.
    blocks = OrderedDict()
    if model_size == 'custom':
        blocks['conv1'] = GLUBlock(args.kernel_size, args.emb_dim, args.n_units,
                                   bottlececk_dim=args.n_projs,
                                   dropout=args.dropout_hidden)
        for l in range(args.n_layers - 1):
            blocks['conv%d' % (l + 2)] = GLUBlock(args.kernel_size, args.n_units, args.n_units,
                                                  bottlececk_dim=args.n_projs,
                                                  dropout=args.dropout_hidden)
        last_dim = args.n_units
    elif model_size == '8':
        blocks['conv1'] = GLUBlock(4, args.emb_dim, 900,
                                   dropout=args.dropout_hidden)
        for i in range(1, 8, 1):
            blocks['conv2-%d' % i] = GLUBlock(4, 900, 900,
                                              dropout=args.dropout_hidden)
        last_dim = 900
    elif model_size == '8B':
        blocks['conv1'] = GLUBlock(1, args.emb_dim, 512,
                                   dropout=args.dropout_hidden)
        for i in range(1, 4, 1):
            blocks['conv2-%d' % i] = GLUBlock(5, 512, 512,
                                              bottlececk_dim=128,
                                              dropout=args.dropout_hidden)
        for i in range(1, 4, 1):
            blocks['conv3-%d' % i] = GLUBlock(5, 512, 512,
                                              bottlececk_dim=256,
                                              dropout=args.dropout_hidden)
        blocks['conv4'] = GLUBlock(1, 512, 2048,
                                   bottlececk_dim=1024,
                                   dropout=args.dropout_hidden)
        last_dim = 2048
    elif model_size == '9':
        blocks['conv1'] = GLUBlock(4, args.emb_dim, 807,
                                   dropout=args.dropout_hidden)
        for i in range(1, 4, 1):
            blocks['conv2-%d-1' % i] = GLUBlock(4, 807, 807,
                                                dropout=args.dropout_hidden)
            blocks['conv2-%d-2' % i] = GLUBlock(4, 807, 807,
                                                dropout=args.dropout_hidden)
        last_dim = 807
    elif model_size == '13':
        blocks['conv1'] = GLUBlock(4, args.emb_dim, 1268,
                                   dropout=args.dropout_hidden)
        for i in range(1, 13, 1):
            blocks['conv2-%d' % i] = GLUBlock(4, 1268, 1268,
                                              dropout=args.dropout_hidden)
        last_dim = 1268
    elif model_size == '14':
        for i in range(1, 4, 1):
            blocks['conv1-%d' % i] = GLUBlock(6, args.emb_dim if i == 1 else 850, 850,
                                              dropout=args.dropout_hidden)
        blocks['conv2'] = GLUBlock(1, 850, 850,
                                   dropout=args.dropout_hidden)
        for i in range(1, 5, 1):
            blocks['conv3-%d' % i] = GLUBlock(5, 850, 850,
                                              dropout=args.dropout_hidden)
        blocks['conv4'] = GLUBlock(1, 850, 850,
                                   dropout=args.dropout_hidden)
        for i in range(1, 4, 1):
            blocks['conv5-%d' % i] = GLUBlock(4, 850, 850,
                                              dropout=args.dropout_hidden)
        blocks['conv6'] = GLUBlock(4, 850, 1024,
                                   dropout=args.dropout_hidden)
        blocks['conv7'] = GLUBlock(4, 1024, 2048,
                                   dropout=args.dropout_hidden)
        last_dim = 2048
    elif model_size == '14B':
        blocks['conv1'] = GLUBlock(5, args.emb_dim, 512,
                                   dropout=args.dropout_hidden)
        for i in range(1, 4, 1):
            blocks['conv2-%d' % i] = GLUBlock(5, 512, 512,
                                              bottlececk_dim=128,
                                              dropout=args.dropout_hidden)
        for i in range(1, 4, 1):
            blocks['conv3-%d' % i] = GLUBlock(5, 512 if i == 1 else 1024, 1024,
                                              bottlececk_dim=512,
                                              dropout=args.dropout_hidden)
        for i in range(1, 7, 1):
            blocks['conv4-%d' % i] = GLUBlock(5, 1024 if i == 1 else 2048, 2048,
                                              bottlececk_dim=1024,
                                              dropout=args.dropout_hidden)
        blocks['conv5'] = GLUBlock(5, 2048, 4096,
                                   bottlececk_dim=1024,
                                   dropout=args.dropout_hidden)
        last_dim = 4096
    else:
        raise NotImplementedError(model_size)

    self.blocks = nn.Sequential(blocks)

    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            last_dim,
            self.vocab,
            # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
            cutoffs=[self.vocab // 25, self.vocab // 5],
            div_value=4.0)
        self.output = None
    else:
        self.adaptive_softmax = None
        self.output = Linear(last_dim, self.vocab,
                             dropout=args.dropout_out)
        # NOTE: include bias even when tying weights

    # Optionally tie weights as in:
    #   "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
    #   https://arxiv.org/abs/1608.05859
    # and
    #   "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
    #   https://arxiv.org/abs/1611.01462
    # NOTE(review): tying fails when adaptive_softmax is on (self.output is None);
    # confirm configs never combine them.
    if args.tie_embedding:
        if args.n_units != args.emb_dim:
            raise ValueError(
                'When using the tied flag, n_units must be equal to emb_dim.'
            )
        self.output.fc.weight = self.embed.embed.weight

    # Initialize parameters
    self.reset_parameters(args.param_init)
def __init__(self, args, save_path=None):
    """Transformer language model (decoder blocks without source attention).

    Args:
        args: configuration namespace (d_model, d_ff, pe_type, n_layers,
            attn_type, attn_n_heads, dropout rates, layer_norm_eps,
            adaptive_softmax, tie_embedding, ...)
        save_path (str): directory for checkpoints
    """
    super(LMBase, self).__init__()
    logger = logging.getLogger('training')
    logger.info(self.__class__.__name__)

    self.save_path = save_path

    self.d_model = args.d_model
    self.d_ff = args.d_ff
    self.pe_type = args.pe_type
    self.n_layers = args.n_layers
    self.attn_n_heads = args.attn_n_heads
    self.lsm_prob = args.lsm_prob

    self.vocab = args.vocab
    self.eos = 2
    self.pad = 3
    # NOTE: reserved in advance
    # self.lsm_prob = lsm_prob

    # for cache
    self.cache_theta = 0.2  # smoothing parameter
    self.cache_lambda = 0.2  # cache weight
    self.cache_ids = []
    self.cache_keys = []
    self.cache_attn = []

    self.embed = Embedding(
        vocab=self.vocab,
        emb_dim=self.d_model,
        dropout=0,  # NOTE: do not apply dropout here
        ignore_index=self.pad)
    self.pos_enc = PositionalEncoding(args.d_model, args.dropout_in, args.pe_type)
    # src_attention=False: pure self-attention LM, no encoder to attend to
    self.layers = nn.ModuleList([
        TransformerDecoderBlock(args.d_model, args.d_ff,
                                args.attn_type, args.attn_n_heads,
                                args.dropout_hidden, args.dropout_att,
                                args.layer_norm_eps,
                                src_attention=False)
        for _ in range(self.n_layers)
    ])
    self.norm_out = nn.LayerNorm(args.d_model, eps=args.layer_norm_eps)

    if args.adaptive_softmax:
        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
            args.d_model,
            self.vocab,
            cutoffs=[round(self.vocab / 15), 3 * round(self.vocab / 15)],
            # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
            div_value=4.0)
        self.output = None
    else:
        self.adaptive_softmax = None
        self.output = Linear(self.d_model, self.vocab,
                             dropout=args.dropout_out)

    # Optionally tie weights as in:
    #   "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
    #   https://arxiv.org/abs/1608.05859
    # and
    #   "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
    #   https://arxiv.org/abs/1611.01462
    # NOTE(review): tying fails when adaptive_softmax is on (self.output is None);
    # confirm configs never combine them.
    if args.tie_embedding:
        self.output.fc.weight = self.embed.embed.weight

    # Initialize parameters
    self.reset_parameters()
def __init__(self, eos, unk, pad, blank, enc_n_units, attn_type, attn_n_heads, n_layers, d_model, d_ff, vocab, tie_embedding=False, pe_type='add', layer_norm_eps=1e-12, dropout=0.0, dropout_emb=0.0, dropout_att=0.0, lsm_prob=0.0, focal_loss_weight=0.0, focal_loss_gamma=2.0, ctc_weight=0.0, ctc_lsm_prob=0.0, ctc_fc_list=[], backward=False, global_weight=1.0, mtl_per_batch=False, adaptive_softmax=False):
    """Transformer decoder with an optional auxiliary CTC head.

    Builds the token embedding, positional encoding, a stack of
    TransformerDecoderBlock layers, and the output projection (or an
    adaptive softmax). Output weights may be tied to the embedding.
    """
    super(TransformerDecoder, self).__init__()

    self.eos = eos
    self.unk = unk
    self.pad = pad
    self.blank = blank
    # BUG FIX: `self.vocab` was read below (adaptive softmax cutoffs) but
    # never assigned, raising AttributeError when adaptive_softmax=True.
    self.vocab = vocab
    self.enc_n_units = enc_n_units
    self.d_model = d_model
    self.n_layers = n_layers
    self.attn_n_heads = attn_n_heads
    self.pe_type = pe_type
    self.lsm_prob = lsm_prob
    self.focal_loss_weight = focal_loss_weight
    self.focal_loss_gamma = focal_loss_gamma
    self.ctc_weight = ctc_weight
    self.bwd = backward
    self.global_weight = global_weight
    self.mtl_per_batch = mtl_per_batch

    if ctc_weight > 0:
        # auxiliary CTC head on top of the encoder outputs
        self.ctc = CTC(eos=eos,
                       blank=blank,
                       enc_n_units=enc_n_units,
                       vocab=vocab,
                       dropout=dropout,
                       lsm_prob=ctc_lsm_prob,
                       fc_list=ctc_fc_list,
                       param_init=0.1)

    if ctc_weight < global_weight:
        self.embed = Embedding(
            vocab,
            d_model,
            dropout=0,  # NOTE: do not apply dropout here
            ignore_index=pad)
        self.pos_enc = PositionalEncoding(d_model, dropout_emb, pe_type)
        self.layers = nn.ModuleList([
            TransformerDecoderBlock(d_model, d_ff, attn_type, attn_n_heads,
                                    dropout, dropout_att, layer_norm_eps)
            for _ in range(n_layers)
        ])
        self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

        if adaptive_softmax:
            self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                d_model,
                vocab,
                cutoffs=[round(self.vocab / 15), 3 * round(self.vocab / 15)],
                # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
                div_value=4.0)
            self.output = None
        else:
            self.adaptive_softmax = None
            self.output = Linear(d_model, vocab)

            # Optionally tie weights as in:
            #   "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
            #   https://arxiv.org/abs/1608.05859
            # and
            #   "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
            #   https://arxiv.org/abs/1611.01462
            if tie_embedding:
                self.output.fc.weight = self.embed.embed.weight

    # Initialize parameters
    self.reset_parameters()
def __init__(self, eos, unk, pad, blank, enc_n_units, rnn_type, n_units, n_projs, n_layers, residual, bottleneck_dim, emb_dim, vocab, tie_embedding=False, dropout=0.0, dropout_emb=0.0, lsm_prob=0.0, ctc_weight=0.0, ctc_lsm_prob=0.0, ctc_fc_list=[], lm_init=None, lmobj_weight=0.0, share_lm_softmax=False, global_weight=1.0, mtl_per_batch=False, param_init=0.1, start_pointing=False, end_pointing=True):
    """RNN transducer decoder (prediction network + joint network).

    Builds an optional CTC head, the warp-RNNT loss, the LSTM/GRU
    prediction network (fast single-module path when no projections and
    no residual connections), and the joint/output projections.
    Optionally initializes the prediction network from a pre-trained LM.
    """
    super(RNNTransducer, self).__init__()
    logger = logging.getLogger('training')

    self.eos = eos
    self.unk = unk
    self.pad = pad
    self.blank = blank
    self.vocab = vocab
    self.rnn_type = rnn_type
    assert rnn_type in ['lstm_transducer', 'gru_transducer']
    self.enc_n_units = enc_n_units
    self.dec_n_units = n_units
    self.n_projs = n_projs
    self.n_layers = n_layers
    self.residual = residual
    self.lsm_prob = lsm_prob
    self.ctc_weight = ctc_weight
    self.lmobj_weight = lmobj_weight
    self.share_lm_softmax = share_lm_softmax
    self.global_weight = global_weight
    self.mtl_per_batch = mtl_per_batch

    # VAD
    self.start_pointing = start_pointing
    self.end_pointing = end_pointing

    # for cache
    self.prev_spk = ''
    self.lmstate_final = None
    self.state_cache = OrderedDict()

    if ctc_weight > 0:
        # auxiliary CTC head on top of the encoder outputs
        self.ctc = CTC(eos=eos,
                       blank=blank,
                       enc_n_units=enc_n_units,
                       vocab=vocab,
                       dropout=dropout,
                       lsm_prob=ctc_lsm_prob,
                       fc_list=ctc_fc_list,
                       param_init=param_init)

    if ctc_weight < global_weight:
        import warprnnt_pytorch
        self.warprnnt_loss = warprnnt_pytorch.RNNTLoss()

        # for MTL with LM objective
        if lmobj_weight > 0:
            if share_lm_softmax:
                # NOTE(review): `self.output` is only assigned further below,
                # so this path raises AttributeError — likely needs to run
                # after the output layer is built. Confirm against upstream.
                self.output_lmobj = self.output  # share parameters
            else:
                self.output_lmobj = Linear(n_units, vocab)

        # Prediction network
        self.fast_impl = False
        rnn = nn.LSTM if rnn_type == 'lstm_transducer' else nn.GRU
        if n_projs == 0 and not residual:
            # single multi-layer RNN module (fast path)
            self.fast_impl = True
            self.rnn = rnn(emb_dim, n_units, n_layers,
                           bias=True,
                           batch_first=True,
                           dropout=dropout,
                           bidirectional=False)
            # NOTE: pytorch introduces a dropout layer on the outputs of each layer EXCEPT the last layer
            dec_idim = n_units
            self.dropout_top = nn.Dropout(p=dropout)
        else:
            self.rnn = nn.ModuleList()
            self.dropout = nn.ModuleList([nn.Dropout(p=dropout)
                                          for _ in range(n_layers)])
            if n_projs > 0:
                # NOTE(review): `dec_idim` is used here before it is assigned
                # (NameError when n_projs > 0); the projection input width is
                # presumably n_units. Confirm against upstream source.
                self.proj = nn.ModuleList([Linear(dec_idim, n_projs)
                                           for _ in range(n_layers)])
            dec_idim = emb_dim
            for l in range(n_layers):
                self.rnn += [rnn(dec_idim, n_units, 1,
                                 bias=True,
                                 batch_first=True,
                                 dropout=0,
                                 bidirectional=False)]
                dec_idim = n_projs if n_projs > 0 else n_units

        self.embed = Embedding(vocab, emb_dim,
                               dropout=dropout_emb,
                               ignore_index=pad)

        # joint network: add projected encoder and prediction outputs
        self.w_enc = Linear(enc_n_units, bottleneck_dim, bias=True)
        self.w_dec = Linear(dec_idim, bottleneck_dim, bias=False)
        self.output = Linear(bottleneck_dim, vocab)

    # Initialize parameters
    self.reset_parameters(param_init)

    # prediction network initialization with pre-trained LM
    if lm_init is not None:
        assert lm_init.vocab == vocab
        assert lm_init.n_units == n_units
        assert lm_init.n_projs == n_projs
        assert lm_init.n_layers == n_layers
        assert lm_init.residual == residual

        param_dict = dict(lm_init.named_parameters())
        for n, p in self.named_parameters():
            if n in param_dict.keys() and p.size() == param_dict[n].size():
                if 'output' in n:
                    continue  # do not copy the LM's softmax layer
                p.data = param_dict[n].data
                logger.info('Overwrite %s' % n)
def __init__(self, input_dim, rnn_type, n_units, n_projs, n_layers, dropout_in, dropout, subsample, subsample_type='drop', n_stacks=1, n_splices=1, last_proj_dim=0, conv_in_channel=1, conv_channels=0, conv_kernel_sizes=[], conv_strides=[], conv_poolings=[], conv_batch_norm=False, conv_residual=False, conv_bottleneck_dim=0, residual=False, n_layers_sub1=0, n_layers_sub2=0, nin=False, task_specific_layer=False, param_init=0.1):
    """(Hierarchical) RNN encoder with optional CNN front-end.

    Depending on `rnn_type` this builds: a CNN/TDS/gated-conv front-end,
    a fast single-module multi-layer (B)LSTM/(B)GRU, or a per-layer
    ModuleList with optional projections, subsampling, NiN layers and
    task-specific sub-network branches. `conv_channels`/`conv_*` strings
    use '_'-separated '(h,w)' specs, e.g. '(3,3)_(3,3)'.
    """
    super(RNNEncoder, self).__init__()
    logger = logging.getLogger("training")

    if len(subsample) > 0 and len(subsample) != n_layers:
        raise ValueError('subsample must be the same size as n_layers.')
    if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
        raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
    if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
        raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

    self.rnn_type = rnn_type
    self.bidirectional = True if rnn_type in ['blstm', 'bgru', 'conv_blstm', 'conv_bgru'] else False
    self.n_units = n_units
    self.n_dirs = 2 if self.bidirectional else 1
    self.n_layers = n_layers

    # Setting for hierarchical encoder
    self.n_layers_sub1 = n_layers_sub1
    self.n_layers_sub2 = n_layers_sub2
    self.task_specific_layer = task_specific_layer

    # Setting for bridge layers
    self.bridge = None
    self.bridge_sub1 = None
    self.bridge_sub2 = None

    # Setting for residual connections
    self.residual = residual
    if residual:
        # residual connections require identical time resolution per layer
        assert np.prod(subsample) == 1

    # Dropout for input-hidden connection
    self.dropout_in = nn.Dropout(p=dropout_in)

    # Setting for CNNs before RNNs
    if conv_channels and rnn_type not in ['blstm', 'lstm', 'bgru', 'gru']:
        # parse '_'-separated channel / '(h,w)' specs into lists
        channels = [int(c) for c in conv_channels.split('_')] if len(conv_channels) > 0 else []
        kernel_sizes = [[int(c.split(',')[0].replace('(', '')),
                         int(c.split(',')[1].replace(')', ''))]
                        for c in conv_kernel_sizes.split('_')] if len(conv_kernel_sizes) > 0 else []
        if rnn_type in ['tds', 'gated_conv']:
            # these front-ends take no stride/pooling specs
            strides = []
            poolings = []
        else:
            strides = [[int(c.split(',')[0].replace('(', '')),
                        int(c.split(',')[1].replace(')', ''))]
                       for c in conv_strides.split('_')] if len(conv_strides) > 0 else []
            poolings = [[int(c.split(',')[0].replace('(', '')),
                         int(c.split(',')[1].replace(')', ''))]
                        for c in conv_poolings.split('_')] if len(conv_poolings) > 0 else []
        if 'conv_' in rnn_type:
            # the CNN front-end already subsamples; disable RNN-level subsampling
            subsample = [1] * self.n_layers
            logger.warning('Subsampling is automatically ignored because CNN layers are used before RNN layers.')
    else:
        channels = []
        kernel_sizes = []
        strides = []
        poolings = []

    if len(channels) > 0:
        if rnn_type == 'tds':
            self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                   in_channel=conv_in_channel,
                                   channels=channels,
                                   kernel_sizes=kernel_sizes,
                                   dropout=dropout,
                                   bottleneck_dim=last_proj_dim)
        elif rnn_type == 'gated_conv':
            self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                         in_channel=conv_in_channel,
                                         channels=channels,
                                         kernel_sizes=kernel_sizes,
                                         dropout=dropout,
                                         bottleneck_dim=last_proj_dim,
                                         param_init=param_init)
        else:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=channels,
                                    kernel_sizes=kernel_sizes,
                                    strides=strides,
                                    poolings=poolings,
                                    dropout=0,
                                    batch_norm=conv_batch_norm,
                                    residual=conv_residual,
                                    bottleneck_dim=conv_bottleneck_dim,
                                    param_init=param_init)
        self._output_dim = self.conv.output_dim
    else:
        self._output_dim = input_dim * n_splices * n_stacks
        self.conv = None

    self.padding = Padding()

    if rnn_type not in ['conv', 'tds', 'gated_conv']:
        # Fast implementation without processes between each layer
        self.fast_impl = False
        if np.prod(subsample) == 1 and n_projs == 0 and not residual and n_layers_sub1 == 0 and not nin:
            # no per-layer processing needed -> one multi-layer RNN module
            self.fast_impl = True
            if 'lstm' in rnn_type:
                rnn = nn.LSTM
            elif 'gru' in rnn_type:
                rnn = nn.GRU
            else:
                raise ValueError('rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')
            self.rnn = rnn(self._output_dim, n_units, n_layers,
                           bias=True,
                           batch_first=True,
                           dropout=dropout,
                           bidirectional=self.bidirectional)
            # NOTE: pytorch introduces a dropout layer on the outputs of each layer EXCEPT the last layer
            self._output_dim = n_units * self.n_dirs
            self.dropout_top = nn.Dropout(p=dropout)
        else:
            self.rnn = nn.ModuleList()
            self.dropout = nn.ModuleList()
            self.proj = None
            if n_projs > 0:
                self.proj = nn.ModuleList()
            # subsample
            self.subsample = None
            if subsample_type == 'max_pool' and np.prod(subsample) > 1:
                self.subsample = nn.ModuleList([MaxpoolSubsampler(subsample[l])
                                                for l in range(n_layers)])
            elif subsample_type == 'concat' and np.prod(subsample) > 1:
                self.subsample = nn.ModuleList([ConcatSubsampler(subsample[l], n_units, self.n_dirs)
                                                for l in range(n_layers)])
            elif subsample_type == 'drop' and np.prod(subsample) > 1:
                self.subsample = nn.ModuleList([DropSubsampler(subsample[l])
                                                for l in range(n_layers)])
            # NiN
            self.nin = None
            if nin:
                self.nin = nn.ModuleList()

            for l in range(n_layers):
                if 'lstm' in rnn_type:
                    rnn_i = nn.LSTM
                elif 'gru' in rnn_type:
                    rnn_i = nn.GRU
                else:
                    raise ValueError('rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')

                self.rnn += [rnn_i(self._output_dim, n_units, 1,
                                   bias=True,
                                   batch_first=True,
                                   dropout=0,
                                   bidirectional=self.bidirectional)]
                self.dropout += [nn.Dropout(p=dropout)]
                self._output_dim = n_units * self.n_dirs

                # Projection layer
                if self.proj is not None:
                    if l != n_layers - 1:
                        self.proj += [Linear(n_units * self.n_dirs, n_projs)]
                        self._output_dim = n_projs

                # Task specific layer
                if l == n_layers_sub1 - 1 and task_specific_layer:
                    self.rnn_sub1 = rnn_i(self._output_dim, n_units, 1,
                                          bias=True,
                                          batch_first=True,
                                          dropout=0,
                                          bidirectional=self.bidirectional)
                    self.dropout_sub1 = nn.Dropout(p=dropout)
                    # NOTE(review): with the default last_proj_dim == 0 this
                    # builds Linear(n_units, 0) — confirm intended guard.
                    if last_proj_dim != self.output_dim:
                        self.bridge_sub1 = Linear(n_units, last_proj_dim)
                if l == n_layers_sub2 - 1 and task_specific_layer:
                    self.rnn_sub2 = rnn_i(self._output_dim, n_units, 1,
                                          bias=True,
                                          batch_first=True,
                                          dropout=0,
                                          bidirectional=self.bidirectional)
                    self.dropout_sub2 = nn.Dropout(p=dropout)
                    if last_proj_dim != self.output_dim:
                        self.bridge_sub2 = Linear(n_units, last_proj_dim)

                # Network in network
                if self.nin is not None:
                    if l != n_layers - 1:
                        self.nin += [NiN(self._output_dim)]
            # if n_layers_sub1 > 0 or n_layers_sub2 > 0:
            #     assert task_specific_layer

    # NOTE(review): with the default last_proj_dim == 0 this condition is
    # true and builds Linear(_output_dim, 0) — confirm intended guard.
    if last_proj_dim != self.output_dim:
        self.bridge = Linear(self._output_dim, last_proj_dim)
        self._output_dim = last_proj_dim

    # Initialize parameters
    self.reset_parameters(param_init)
def __init__(self, input_dim, attn_type, attn_n_heads, n_layers, d_model, d_ff,
             pe_type='add', layer_norm_eps=1e-12, dropout_in=0, dropout=0,
             dropout_att=0, last_proj_dim=0, n_stacks=1, n_splices=1,
             conv_in_channel=1, conv_channels=0, conv_kernel_sizes=[],
             conv_strides=[], conv_poolings=[], conv_batch_norm=False,
             conv_residual=False, conv_bottleneck_dim=0, param_init=0.1):
    """Build a Transformer encoder: an optional CNN front-end, an input
    embedding, positional encoding, a stack of self-attention blocks, and an
    optional output "bridge" projection.

    Args:
        input_dim (int): dimension of each input frame (before stacking/splicing)
        attn_type (str): self-attention type, forwarded to each encoder block
        attn_n_heads (int): number of attention heads per block
        n_layers (int): number of TransformerEncoderBlock layers
        d_model (int): model (embedding) dimension
        d_ff (int): inner dimension of the position-wise feed-forward layers
        pe_type (str): positional-encoding variant, forwarded to PositionalEncoding
        layer_norm_eps (float): epsilon for the output LayerNorm (and blocks)
        dropout_in (float): dropout forwarded to PositionalEncoding
        dropout (float): dropout forwarded to each encoder block
        dropout_att (float): attention dropout forwarded to each encoder block
        last_proj_dim (int): when it differs from output_dim, a final Linear
            bridge to this size is added
            # NOTE(review): with the default 0 this branch would build
            # Linear(_output_dim, 0) — presumably callers always pass either a
            # positive size or a value equal to output_dim; confirm.
        n_stacks (int): frame-stacking factor (must be 1 with a CNN front-end)
        n_splices (int): frame-splicing factor (must be 1 with a CNN front-end)
        conv_in_channel (int): input channels for the CNN front-end
        conv_channels: '_'-separated per-layer output channels of the CNN
            front-end
            # NOTE(review): the default is the int 0, but the truthy branch
            # calls .split('_') — callers are evidently expected to pass a
            # string; confirm.
        conv_kernel_sizes: '_'-separated '(h,w)' kernel sizes per CNN layer
        conv_strides: '_'-separated '(h,w)' strides per CNN layer
        conv_poolings: '_'-separated '(h,w)' pooling sizes per CNN layer
        conv_batch_norm (bool): batch normalization in the CNN front-end
        conv_residual (bool): residual connections in the CNN front-end
        conv_bottleneck_dim (int): accepted but unused here — the CNN
            bottleneck is hard-wired to d_model below
        param_init (float): forwarded to the CNN front-end only; the final
            reset_parameters() call takes no argument
    """
    super(TransformerEncoder, self).__init__()

    logger = logging.getLogger("training")

    self.d_model = d_model
    self.n_layers = n_layers
    self.attn_n_heads = attn_n_heads
    self.pe_type = pe_type

    # Setting for CNNs before RNNs
    # Parse the '_'-separated spec strings; '(h,w)' pairs become [h, w].
    if conv_channels:
        channels = [int(c) for c in conv_channels.split('_')] if len(conv_channels) > 0 else []
        kernel_sizes = [[int(c.split(',')[0].replace('(', '')),
                         int(c.split(',')[1].replace(')', ''))]
                        for c in conv_kernel_sizes.split('_')] if len(conv_kernel_sizes) > 0 else []
        strides = [[int(c.split(',')[0].replace('(', '')),
                    int(c.split(',')[1].replace(')', ''))]
                   for c in conv_strides.split('_')] if len(conv_strides) > 0 else []
        poolings = [[int(c.split(',')[0].replace('(', '')),
                     int(c.split(',')[1].replace(')', ''))]
                    for c in conv_poolings.split('_')] if len(conv_poolings) > 0 else []
    else:
        channels = []
        kernel_sizes = []
        strides = []
        poolings = []
        # NOTE(review): this warning fires on the branch where NO CNN layers
        # are configured, yet its text says CNN layers ARE used (and mentions
        # RNN layers in a Transformer encoder) — the placement or message
        # looks inverted/copied; confirm against the RNN-encoder counterpart.
        logger.warning(
            'Subsampling is automatically ignored because CNN layers are used before RNN layers.'
        )

    if len(channels) > 0:
        # The CNN front-end performs the subsampling itself, so frame
        # stacking/splicing must be disabled.
        assert n_stacks == 1 and n_splices == 1
        self.conv = ConvEncoder(input_dim,
                                in_channel=conv_in_channel,
                                channels=channels,
                                kernel_sizes=kernel_sizes,
                                strides=strides,
                                poolings=poolings,
                                dropout=0,
                                batch_norm=conv_batch_norm,
                                residual=conv_residual,
                                bottleneck_dim=d_model,  # projects straight to d_model
                                param_init=param_init)
        self._output_dim = self.conv.output_dim
    else:
        self._output_dim = input_dim * n_splices * n_stacks
        self.conv = None

        # Linear embedding of raw features into the model dimension; only
        # needed when no CNN front-end (whose bottleneck is d_model) exists.
        self.embed = Linear(self._output_dim, d_model)
        # NOTE: do not apply dropout here

    self.pos_enc = PositionalEncoding(d_model, dropout_in, pe_type)

    self.layers = nn.ModuleList([
        TransformerEncoderBlock(d_model, d_ff, attn_type, attn_n_heads,
                                dropout, dropout_att, layer_norm_eps)
        for l in range(n_layers)
    ])
    self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

    # Optional bridge to a task-specific output size.
    # NOTE(review): output_dim is presumably a property exposing _output_dim
    # (not visible in this chunk) — confirm. Also, in the no-conv path
    # _output_dim is still the raw input size here although the blocks emit
    # d_model; Linear(self._output_dim, last_proj_dim) may want d_model
    # instead — confirm against forward().
    if last_proj_dim != self.output_dim:
        self.bridge = Linear(self._output_dim, last_proj_dim)
        self._output_dim = last_proj_dim
    else:
        self.bridge = None
        self._output_dim = d_model

    # Initialize parameters
    self.reset_parameters()
def __init__(self, key_dim, query_dim, attn_type, attn_dim,
             sharpening_factor=1, sigmoid_smoothing=False,
             conv_out_channels=10, conv_kernel_size=100, dropout=0):
    """Single-head attention scorer supporting several energy functions.

    Args:
        key_dim (int): dimension of the keys (encoder outputs)
        query_dim (int): dimension of the query (decoder state)
        attn_type (str): one of 'add', 'location', 'dot', 'luong_dot',
            'luong_general', 'luong_concat' ('no' is rejected)
        attn_dim (int): internal projection dimension of the energy function
        sharpening_factor (float): stored for the scoring routine (applied
            outside this constructor)
        sigmoid_smoothing (bool): stored for the scoring routine (applied
            outside this constructor)
        conv_out_channels (int): channels of the location-aware convolution
        conv_kernel_size (int): half-width of the location-aware convolution
            window (full kernel is 2 * conv_kernel_size + 1)
        dropout (float): dropout probability on the attention weights
    """
    super(AttentionMechanism, self).__init__()

    self.attn_type = attn_type
    self.attn_dim = attn_dim
    self.sharpening_factor = sharpening_factor
    self.sigmoid_smoothing = sigmoid_smoothing
    self.n_heads = 1  # single-head variant

    # Cached projected keys and padding mask; reset externally per batch.
    self.key = None
    self.mask = None

    # Attention dropout, applied after the softmax layer.
    self.attn_dropout = nn.Dropout(p=dropout)

    if attn_type == 'no':
        # Attention-free seq2seq (last encoder state as the context vector)
        # is not implemented here.
        raise NotImplementedError
    elif attn_type in ('add', 'location'):
        # Additive (Bahdanau-style) energy; 'location' additionally convolves
        # the previous alignment and folds it into the energy.
        # Submodules are registered in the same order as before:
        # w_key, w_query, [w_conv, conv,] v.
        self.w_key = Linear(key_dim, attn_dim, bias=True)
        self.w_query = Linear(query_dim, attn_dim, bias=False)
        if attn_type == 'location':
            self.w_conv = Linear(conv_out_channels, attn_dim, bias=False)
            self.conv = nn.Conv2d(in_channels=1,
                                  out_channels=conv_out_channels,
                                  kernel_size=(1, conv_kernel_size * 2 + 1),
                                  stride=1,
                                  padding=(0, conv_kernel_size),
                                  bias=False)
        self.v = Linear(attn_dim, 1, bias=False)
    elif attn_type == 'dot':
        self.w_key = Linear(key_dim, attn_dim, bias=False)
        self.w_query = Linear(query_dim, attn_dim, bias=False)
    elif attn_type == 'luong_dot':
        # No additional parameters.
        pass
    elif attn_type == 'luong_general':
        self.w_key = Linear(key_dim, query_dim, bias=False)
    elif attn_type == 'luong_concat':
        self.w = Linear(key_dim + query_dim, attn_dim, bias=False)
        self.v = Linear(attn_dim, 1, bias=False)
    else:
        raise ValueError(attn_type)
def __init__(self, d_model, d_ff, dropout):
    """Position-wise feed-forward sublayer of a Transformer block.

    Args:
        d_model (int): input and output dimension
        d_ff (int): inner (expansion) dimension
        dropout (float): dropout probability
    """
    super(PositionwiseFeedForward, self).__init__()
    # Expansion then contraction projections; the non-linearity between them
    # is applied in forward() (not visible in this constructor).
    self.w_1 = Linear(d_model, d_ff)
    self.w_2 = Linear(d_ff, d_model)
    self.dropout = nn.Dropout(p=dropout)