def __init__(
    self,
    dictionary,
    char_embed_dim=32,
    word_embed_dim=512,
    convolutions=((128, 3), (128, 5)),
    dropout=0.1,
    num_highway_layers=0,
    preserve_word=True,
):
    super().__init__()
    self.dictionary = dictionary
    vocab_size = len(self.dictionary)
    self.embed_char_tokens = nn.Embedding(vocab_size, char_embed_dim)
    in_channels = convolutions[0][0]
    self.dropout = dropout
    self.convolutions = nn.ModuleList([
        ConvTBC(in_channels, out_channels * 2, kernel_size)
        for (out_channels, kernel_size) in convolutions
    ])
    self.fc_input = common_layers.Linear(char_embed_dim, in_channels)
    conv_output_dim = sum(out_dim for (out_dim, _) in convolutions)
    self.fc_output = common_layers.Linear(conv_output_dim, word_embed_dim)
    # Build independent highway layers; the original `[module] * n` would
    # reuse one module instance (and its parameters) n times.
    self.highway_layers = nn.ModuleList(
        [HighwayLayer(conv_output_dim) for _ in range(num_highway_layers)]
    )
    self.preserve_word = preserve_word

def __init__(self, dictionary, embed_dim=512, max_positions=1024,
             convolutions=((512, 3),) * 20, dropout=0.1):
    super().__init__(dictionary)
    self.dropout = dropout
    self.num_attention_layers = None

    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    self.embed_positions = PositionalEmbedding(
        max_positions,
        embed_dim,
        padding_idx,
        left_pad=LanguagePairDataset.LEFT_PAD_SOURCE,
    )

    in_channels = convolutions[0][0]
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    for (out_channels, kernel_size) in convolutions:
        self.projections.append(
            Linear(in_channels, out_channels)
            if in_channels != out_channels else None
        )
        self.convolutions.append(
            ConvTBC(in_channels, out_channels * 2, kernel_size, dropout=dropout)
        )
        in_channels = out_channels
    self.fc2 = Linear(in_channels, embed_dim)

def __init__(self, num_embeddings, embed_dim=512, max_positions=1024,
             convolutions=((512, 3),) * 20, dropout=0.1, padding_idx=1):
    super(Encoder, self).__init__()
    self.dropout = dropout
    self.num_attention_layers = None

    self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    self.embed_positions = Embedding(max_positions, embed_dim, padding_idx)

    in_channels = convolutions[0][0]
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    for (out_channels, kernel_size) in convolutions:
        pad = (kernel_size - 1) // 2
        self.projections.append(
            Linear(in_channels, out_channels)
            if in_channels != out_channels else None
        )
        self.convolutions.append(
            ConvTBC(in_channels, out_channels * 2, kernel_size,
                    padding=pad, dropout=dropout)
        )
        in_channels = out_channels
    self.fc2 = Linear(in_channels, embed_dim)

def test_convtbc(self):
    # ConvTBC weight layout: (ksz, in_channels, out_channels)
    conv_tbc = ConvTBC(4, 5, kernel_size=3, padding=1)
    # Conv1d weight layout: (out_channels, in_channels, ksz)
    conv1d = nn.Conv1d(4, 5, kernel_size=3, padding=1)

    conv_tbc.weight.data.copy_(conv1d.weight.data.transpose(0, 2))
    conv_tbc.bias.data.copy_(conv1d.bias.data)

    input_tbc = torch.randn(7, 2, 4, requires_grad=True)
    input1d = input_tbc.data.transpose(0, 1).transpose(1, 2)
    input1d.requires_grad = True

    output_tbc = conv_tbc(input_tbc)
    output1d = conv1d(input1d)

    self.assertAlmostEqual(
        output_tbc.data.transpose(0, 1).transpose(1, 2), output1d.data)

    grad_tbc = torch.randn(output_tbc.size())
    grad1d = grad_tbc.transpose(0, 1).transpose(1, 2).contiguous()

    output_tbc.backward(grad_tbc)
    output1d.backward(grad1d)

    self.assertAlmostEqual(
        conv_tbc.weight.grad.data.transpose(0, 2), conv1d.weight.grad.data)
    self.assertAlmostEqual(conv_tbc.bias.grad.data, conv1d.bias.grad.data)
    self.assertAlmostEqual(
        input_tbc.grad.data.transpose(0, 1).transpose(1, 2), input1d.grad.data)

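# Note: the test above calls self.assertAlmostEqual on tensors, which the
# stock unittest.TestCase cannot compare. The enclosing test class presumably
# provides a tensor-aware override along these lines (a sketch; the 1e-4
# tolerance is an assumption):
def assertAlmostEqual(self, t1, t2):
    self.assertEqual(t1.size(), t2.size(), "size mismatch")
    self.assertLess((t1 - t2).abs().max().item(), 1e-4)
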
def __init__(
    self,
    dictionary,
    embed_dim=512,
    embed_dict=None,
    max_positions=1024,
    convolutions=((512, 3),) * 20,
    dropout=0.1,
):
    super().__init__(dictionary)
    self.dropout = dropout
    self.num_attention_layers = None

    num_embeddings = len(dictionary)
    self.padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
    if embed_dict:
        self.embed_tokens = utils.load_embedding(
            embed_dict, self.dictionary, self.embed_tokens)

    self.embed_positions = PositionalEmbedding(
        max_positions,
        embed_dim,
        self.padding_idx,
    )

    convolutions = extend_conv_spec(convolutions)
    in_channels = convolutions[0][0]
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    self.residuals = []

    layer_in_channels = [in_channels]
    for (out_channels, kernel_size, residual) in convolutions:
        if residual == 0:
            residual_dim = out_channels
        else:
            residual_dim = layer_in_channels[-residual]
        self.projections.append(
            Linear(residual_dim, out_channels)
            if residual_dim != out_channels else None
        )
        if kernel_size % 2 == 1:
            padding = kernel_size // 2
        else:
            # Even kernels are presumably padded explicitly in forward().
            padding = 0
        self.convolutions.append(
            ConvTBC(in_channels, out_channels * 2, kernel_size,
                    dropout=dropout, padding=padding)
        )
        self.residuals.append(residual)
        in_channels = out_channels
        layer_in_channels.append(out_channels)
    self.fc2 = Linear(in_channels, embed_dim)

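# A hypothetical companion to the __init__ above, sketching how the residual
# bookkeeping (self.residuals, layer_in_channels) is typically consumed in the
# forward pass of fairseq-style conv encoders. The helper name is made up, and
# `import math` plus `import torch.nn.functional as F` are assumed; `x` is the
# (time, batch, channel) tensor produced by self.fc1.
def _apply_conv_stack(self, x):
    residuals = [x]
    for proj, conv, res_layer in zip(self.projections, self.convolutions,
                                     self.residuals):
        if res_layer > 0:
            # Pull the activation from `res_layer` layers back, projecting it
            # if its channel count differs from this layer's output.
            residual = residuals[-res_layer]
            if proj is not None:
                residual = proj(residual)
        else:
            residual = None
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.glu(conv(x), dim=2)  # conv doubled the channels; GLU halves them
        if residual is not None:
            x = (x + residual) * math.sqrt(0.5)  # keep activation variance stable
        residuals.append(x)
    return x
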
def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
    """Weight-normalized Conv1d layer"""
    from fairseq.modules import ConvTBC
    m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
    nn.init.normal_(m.weight, mean=0, std=std)
    nn.init.constant_(m.bias, 0)
    return nn.utils.weight_norm(m, dim=2)

def ConvTBC(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
    """Conv1d layer (note: unlike the variant above, no weight norm is applied)."""
    from fairseq.modules import ConvTBC
    m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
    m.weight.data.normal_(mean=0, std=std)
    m.bias.data.zero_()
    return m

def __init__(
    self,
    dictionary,
    embed_dim=512,
    max_positions=1024,
    convolutions=((512, 3),) * 20,
    dropout=0.1,
    attention=False,
    attention_nheads=1,
    left_pad=True,
):
    super().__init__(dictionary)
    self.dropout = dropout
    self.num_attention_layers = None
    self.left_pad = left_pad

    num_embeddings = len(dictionary)
    self.padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
    self.embed_positions = PositionalEmbedding(
        max_positions,
        embed_dim,
        self.padding_idx,
        left_pad=self.left_pad,
    )

    def expand_bool_array(val):
        if isinstance(val, bool):
            # expand True into [True, True, ...] and do the same with False
            return [val] * len(convolutions)
        return val

    attention = expand_bool_array(attention)

    in_channels = convolutions[0][0]
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    self.attention = nn.ModuleList()
    self.attproj = nn.ModuleList()
    for i, (out_channels, kernel_size) in enumerate(convolutions):
        self.projections.append(
            Linear(in_channels, out_channels)
            if in_channels != out_channels else None
        )
        self.convolutions.append(
            ConvTBC(in_channels, out_channels * 2, kernel_size, dropout=dropout)
        )
        self.attention.append(
            SelfAttention(out_channels, embed_dim, attention_nheads)
            if attention[i] else None
        )
        in_channels = out_channels
    self.fc2 = Linear(in_channels, embed_dim)

def ConvTBC(in_channels, out_channels, kernel_size, dilation=(1,), dropout=0,
            **kwargs):
    """Weight-normalized Conv1d layer"""
    from fairseq.modules import ConvTBC
    assert dilation[0] == 1, 'ConvTBC does not support dilation'
    m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
    m.weight.data.normal_(mean=0, std=std)
    m.bias.data.zero_()
    return nn.utils.weight_norm(m, dim=2)

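# A quick usage sketch of the weight-normalized factories above (assumes
# fairseq is installed so the inner `from fairseq.modules import ConvTBC`
# resolves). ConvTBC consumes (time, batch, channel) tensors, and call sites
# double out_channels so that GLU can gate and halve them again:
import torch
import torch.nn.functional as F

conv = ConvTBC(512, 2 * 512, kernel_size=3, dropout=0.1, padding=1)
x = torch.randn(50, 8, 512)   # (T, B, C): 50 timesteps, batch of 8
y = F.glu(conv(x), dim=2)     # gate over the channel dim -> (50, 8, 512)
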
def __init__(self, dictionary, vector_dict, embed_dim=512, max_positions=1024,
             convolutions=((512, 3),) * 20, dropout=0.1):
    super().__init__(dictionary)
    # `vector_dict` was referenced but never defined in the original; it is
    # threaded through as an argument here. It carries a pre-trained embedding
    # matrix (`.embedding`) and its dimensionality (`.embedding_dim`), which
    # override embed_dim and the conv spec.
    embed_dim = vector_dict.embedding_dim
    convolutions = ((vector_dict.embedding_dim, 3),) * 20
    self.dropout = dropout
    self.num_attention_layers = None
    self.embed_dim = embed_dim

    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    # Load the pre-trained vectors, keeping them trainable (freeze=False).
    self.embed_tokens = nn.Embedding.from_pretrained(
        torch.FloatTensor(vector_dict.embedding[:, :vector_dict.embedding_dim]),
        freeze=False,
    )
    # Alternative: initialize an Embedding and copy the weights in:
    # self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    # self.embed_tokens.weight.data.copy_(torch.from_numpy(vector_dict.embedding))
    # self.embed_tokens.weight.requires_grad = True
    self.embed_positions = PositionalEmbedding(
        max_positions,
        embed_dim,
        padding_idx,
        left_pad=LanguagePairDataset.LEFT_PAD_SOURCE,
    )

    in_channels = convolutions[0][0]
    # The +512 presumably accounts for extra features concatenated onto the
    # embeddings before fc1.
    self.fc1 = Linear(embed_dim + 512, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    for (out_channels, kernel_size) in convolutions:
        self.projections.append(
            Linear(in_channels, out_channels)
            if in_channels != out_channels else None
        )
        self.convolutions.append(
            ConvTBC(in_channels, out_channels * 2, kernel_size, dropout=dropout)
        )
        in_channels = out_channels
    self.fc2 = Linear(in_channels, embed_dim + 512)
    self.lay_norm = nn.LayerNorm(embed_dim)  # layer normalization in NGTU

def __init__(self, dictionary, embed_dim=512, max_positions=1024,
             convolutions=((512, 3),) * 20, dropout=0.1):
    # Each (512, 3) spec is one conv layer: 512 output channels (the vector
    # dim) with kernel width 3.
    super().__init__()
    self.dictionary = dictionary
    self.dropout = dropout
    self.num_attention_layers = None

    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    self.embed_positions = Embedding(max_positions, embed_dim, padding_idx)

    in_channels = convolutions[0][0]
    # Project the word-embedding dim to the conv channel dim in case they
    # differ; for 1d convolutions over text, the feature width is the
    # channel dim.
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    for (out_channels, kernel_size) in convolutions:
        pad = (kernel_size - 1) // 2
        self.projections.append(
            Linear(in_channels, out_channels)
            if in_channels != out_channels else None
        )
        self.convolutions.append(
            ConvTBC(in_channels, out_channels * 2, kernel_size,
                    padding=pad, dropout=dropout)
        )
        in_channels = out_channels
    self.fc2 = Linear(in_channels, embed_dim)

def __init__(self, dictionary, embed_dim=512, embed_dict=None,
             max_positions=1024, convolutions=((512, 3),) * 20, dropout=0.1,
             batch_norm=False, use_linear_se=False):
    super().__init__(dictionary)
    self.dropout = dropout
    self.num_attention_layers = None
    self.batch_norm = batch_norm

    num_embeddings = len(dictionary)
    self.padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
    if embed_dict:
        self.embed_tokens = utils.load_embedding(
            embed_dict, self.dictionary, self.embed_tokens)

    self.embed_positions = PositionalEmbedding(
        max_positions,
        embed_dim,
        self.padding_idx,
    )

    convolutions = extend_conv_spec_extended(convolutions)
    in_channels = convolutions[0][0]
    if use_linear_se:
        self.fc1 = LinearSE(embed_dim, in_channels, dropout=dropout)
    else:
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    self.inner_convolutions = nn.ModuleList()
    # self.se_layers = nn.ModuleList()
    self.residuals = []
    self.kernel_sizes = 0

    layer_in_channels = [in_channels]
    for idx, (out_channels, kernel_sizes, residual) in enumerate(convolutions):
        self.kernel_sizes = len(kernel_sizes)
        self.inner_convolutions.append(nn.ModuleList())
        if residual == 0:
            residual_dim = out_channels
        else:
            residual_dim = layer_in_channels[-residual]
        if use_linear_se:
            self.projections.append(
                LinearSE(residual_dim, out_channels)
                if residual_dim != out_channels else None
            )
        else:
            self.projections.append(
                Linear(residual_dim, out_channels)
                if residual_dim != out_channels else None
            )
        # One parallel ConvTBC per kernel size in this layer's spec.
        for kernel_size in kernel_sizes:
            if kernel_size % 2 == 1:
                padding = kernel_size // 2
            else:
                padding = 0
            self.inner_convolutions[idx].append(
                ConvTBC(in_channels, out_channels * 2, kernel_size,
                        dropout=dropout, padding=padding)
            )
        # TODO(naetherm): Combine the outputs of the convolutions into a
        # single instance via max pooling.
        # self.convolutions.append(
        #     torch.stack(self.inner_convolutions[idx], dim=0).sum(dim=0))
        # self.se_layers.append(SqueezeExcitationLayer(n_features=16))
        self.residuals.append(residual)
        in_channels = out_channels
        layer_in_channels.append(out_channels)

    self.mp2d = torch.nn.MaxPool2d(kernel_size=(self.kernel_sizes, 1))
    if use_linear_se:
        self.fc2 = LinearSE(in_channels, embed_dim)
    else:
        self.fc2 = Linear(in_channels, embed_dim)

def __init__(self, dictionary, args, encoder_embed_dim=512, embed_dict=None,
             max_positions=1024, convolutions=((512, 3),) * 20, dropout=0.1,
             left_pad=True):
    super().__init__(dictionary)
    self.elmo = Elmo(options_file, weight_file, args.num_output_repr,
                     dropout=args.elmo_dropout,
                     do_layer_norm=args.elmo_do_layer_norm)
    self.args = args
    if self.args.merge_mode == 'sum':
        # only needed in `sum` mode
        self.elmo_projection = Linear(args.elmo_repr_dim, encoder_embed_dim)
    self.id2token = {v: k for k, v in dictionary.indices.items()}
    self.dropout = dropout
    self.left_pad = left_pad
    self.num_attention_layers = None

    num_embeddings = len(dictionary)
    self.padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, args.token_embed_dim,
                                  self.padding_idx)
    if embed_dict:
        self.embed_tokens = utils.load_embedding(
            embed_dict, self.dictionary, self.embed_tokens)

    self.embed_positions = PositionalEmbedding(
        max_positions,
        args.token_embed_dim,
        self.padding_idx,
        left_pad=self.left_pad,
    )

    convolutions = extend_conv_spec(convolutions)
    in_channels = convolutions[0][0]
    self.fc1 = Linear(encoder_embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    self.residuals = []

    layer_in_channels = [in_channels]
    for (out_channels, kernel_size, residual) in convolutions:
        if residual == 0:
            residual_dim = out_channels
        else:
            residual_dim = layer_in_channels[-residual]
        self.projections.append(
            Linear(residual_dim, out_channels)
            if residual_dim != out_channels else None
        )
        if kernel_size % 2 == 1:
            padding = kernel_size // 2
        else:
            padding = 0
        self.convolutions.append(
            ConvTBC(in_channels, out_channels * 2, kernel_size,
                    dropout=dropout, padding=padding)
        )
        self.residuals.append(residual)
        in_channels = out_channels
        layer_in_channels.append(out_channels)

    if args.num_output_repr == 2 and args.merge_mode == 'concat':
        self.fc2 = Linear(in_channels + args.elmo_repr_dim, encoder_embed_dim)
    else:
        self.fc2 = Linear(in_channels, encoder_embed_dim)
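
# Several of the encoders above call extend_conv_spec(convolutions), which is
# not shown here. Judging from how its result is unpacked into
# (out_channels, kernel_size, residual) triples, it likely normalizes 2-tuples
# by appending a default residual distance of 1; a minimal sketch:
def extend_conv_spec(convolutions):
    """Normalize conv specs to (out_channels, kernel_size, residual) triples."""
    extended = []
    for spec in convolutions:
        if len(spec) == 3:
            extended.append(spec)
        elif len(spec) == 2:
            extended.append(spec + (1,))  # default: residual from previous layer
        else:
            raise ValueError(
                'invalid conv spec: {}; expected 2 or 3 elements'.format(spec))
    return tuple(extended)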