def __init__(self, in_dim, out_dim, convolutions=((256, 5, 1),) * 4, dropout=0.1):
    super(Converter, self).__init__()
    self.dropout = dropout
    self.in_dim = in_dim
    self.out_dim = out_dim

    # Non-causal convolutions
    in_channels = convolutions[0][0]
    self.fc1 = Linear(in_dim, in_channels)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    # ConvTBC has no dilation support, so fall back to Conv1d when any layer dilates
    Conv1dLayer = Conv1d if has_dilation(convolutions) else ConvTBC
    for (out_channels, kernel_size, dilation) in convolutions:
        pad = (kernel_size - 1) // 2 * dilation
        dilation = (dilation,)
        self.projections.append(
            Linear(in_channels, out_channels)
            if in_channels != out_channels else None)
        self.convolutions.append(
            Conv1dLayer(in_channels, out_channels * 2, kernel_size,
                        padding=pad, dilation=dilation, dropout=dropout))
        in_channels = out_channels
    self.fc2 = Linear(in_channels, out_dim)
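# Hedged sketch: the `out_channels * 2` above suggests gated (GLU-style) convolutions,
# where half the channels gate the other half. This standalone toy uses plain
# torch.nn.Conv1d, not this repo's Conv1d/ConvTBC wrappers, and illustrates the gating
# plus the optional residual projection; names and shapes are illustrative only.
import torch
import torch.nn as nn
import torch.nn.functional as F

def gated_conv_step(x, conv, projection=None):
    # x: (batch, in_channels, time)
    residual = x if projection is None else projection(x.transpose(1, 2)).transpose(1, 2)
    out = conv(x)                            # (batch, 2 * out_channels, time)
    out = F.glu(out, dim=1)                  # split channels, sigmoid-gate one half
    return (out + residual) * (0.5 ** 0.5)   # scaled residual, as in conv seq2seq models

conv = nn.Conv1d(64, 2 * 256, kernel_size=5, padding=2)
proj = nn.Linear(64, 256)
y = gated_conv_step(torch.randn(8, 64, 100), conv, proj)
print(y.shape)  # torch.Size([8, 256, 100])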
def __init__(self, conv_channels, embed_dim, dropout=0.1):
    super(AttentionLayer, self).__init__()
    # projects from output of convolution to embedding dimension
    self.in_projection = Linear(conv_channels, embed_dim)
    # projects from embedding dimension to convolution size
    self.out_projection = Linear(embed_dim, conv_channels)
    self.dropout = dropout
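# Hedged sketch of the dot-product attention such a layer typically computes
# (a generic standalone version, not necessarily this repo's exact forward()):
# the decoder state is projected to embed_dim, scored against the encoder keys,
# and the weighted sum of encoder values is projected back to conv_channels.
import torch
import torch.nn as nn
import torch.nn.functional as F

def dot_product_attention(query, keys, values, in_projection, out_projection):
    # query: (batch, T_dec, conv_channels); keys/values: (batch, T_enc, embed_dim)
    x = in_projection(query)                        # (batch, T_dec, embed_dim)
    scores = torch.bmm(x, keys.transpose(1, 2))     # (batch, T_dec, T_enc)
    attn = F.softmax(scores, dim=-1)
    x = torch.bmm(attn, values)                     # (batch, T_dec, embed_dim)
    return out_projection(x), attn                  # back to (batch, T_dec, conv_channels)

in_proj, out_proj = nn.Linear(128, 512), nn.Linear(512, 128)
out, attn = dot_product_attention(torch.randn(2, 7, 128),
                                  torch.randn(2, 20, 512),
                                  torch.randn(2, 20, 512),
                                  in_proj, out_proj)
print(out.shape, attn.shape)  # torch.Size([2, 7, 128]) torch.Size([2, 7, 20])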
def __init__(
    self,
    conv_layers_before=None,
    input_size=83,
    embed_dim=512,
    convolutions=((512, 3),) * 20,
    dropout=0.1,
):
    super(FConvEncoder, self).__init__(None)  # no src dictionary
    self.dropout = dropout
    self.num_attention_layers = None

    self.conv_layers_before = conv_layers_before
    self.fc0 = Linear(input_size, embed_dim, dropout=dropout) \
        if input_size != embed_dim else None

    convolutions = extend_conv_spec(convolutions)
    in_channels = convolutions[0][0]
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    self.residuals = []

    layer_in_channels = [in_channels]
    for out_channels, kernel_size, residual in convolutions:
        if residual == 0:
            residual_dim = out_channels
        else:
            residual_dim = layer_in_channels[-residual]
        self.projections.append(
            Linear(residual_dim, out_channels)
            if residual_dim != out_channels else None)
        if kernel_size % 2 == 1:
            padding = kernel_size // 2
        else:
            padding = 0
        self.convolutions.append(
            ConvTBC(in_channels, out_channels * 2, kernel_size,
                    dropout=dropout, padding=padding))
        self.residuals.append(residual)
        in_channels = out_channels
        layer_in_channels.append(out_channels)
    self.fc2 = Linear(in_channels, embed_dim)
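# extend_conv_spec() is not shown in this snippet; a plausible minimal version
# (mirroring fairseq's FConv models, where a missing residual hop defaults to 1)
# normalizes 2-tuples (channels, kernel_size) into 3-tuples
# (channels, kernel_size, residual). Treat this as an assumption, not the repo's
# verbatim helper.
def extend_conv_spec(convolutions):
    extended = []
    for spec in convolutions:
        if len(spec) == 3:
            extended.append(spec)
        elif len(spec) == 2:
            extended.append(spec + (1,))  # default: residual from 1 layer back
        else:
            raise ValueError(
                'invalid conv spec {}: expected 2 or 3 entries'.format(spec))
    return tuple(extended)

print(extend_conv_spec(((512, 3),) * 2))  # ((512, 3, 1), (512, 3, 1))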
def __init__(self,
             in_dim,
             out_dim,
             convolutions=((256, 5, 1),) * 6,
             deconvolutions=((256, 5, 1),) * 2,  # do upsampling
             dropout=0.1):
    super(Converter, self).__init__()
    self.dropout = dropout
    self.in_dim = in_dim
    self.out_dim = out_dim

    # Non-causal convolutions
    in_channels = convolutions[0][0]
    self.fc1 = Linear(in_dim, in_channels)

    # Convolutions
    self.convolutions = nn.ModuleList()
    self.deconvolutions = nn.ModuleList()
    for idx, (out_channels, kernel_size, dilation) in enumerate(convolutions):
        if idx < len(deconvolutions):
            self.deconvolutions.append(
                ConvTranspose1d(in_channels, out_channels,
                                kernel_size=2, padding=0, stride=2))
        pad = (kernel_size - 1) // 2 * dilation
        dilation = (dilation,)
        self.convolutions.append(
            Conv1d(in_channels, out_channels * 2, kernel_size,
                   padding=pad, dilation=dilation, dropout=dropout))
        in_channels = out_channels
    self.fc2 = Linear(in_channels, out_dim)
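# The ConvTranspose1d(kernel_size=2, stride=2, padding=0) layers above double the
# time resolution before the gated convolutions run (useful when the converter
# upsamples decoder-rate frames toward vocoder-rate frames). Minimal standalone
# shape check with plain torch; channel sizes are illustrative.
import torch
import torch.nn as nn

deconv = nn.ConvTranspose1d(256, 256, kernel_size=2, stride=2, padding=0)
x = torch.randn(4, 256, 50)   # (batch, channels, time)
print(deconv(x).shape)        # torch.Size([4, 256, 100]): time axis doubled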
def __init__(self, n_vocab, embed_dim, n_speakers, speaker_embed_dim,
             padding_idx=None, convolutions=((64, 5, 1),) * 7,
             max_positions=512, dropout=0.1):
    super(Encoder, self).__init__()
    self.dropout = dropout
    self.num_attention_layers = None

    # Text input embeddings
    self.embed_tokens = Embedding(n_vocab, embed_dim, padding_idx)

    # Text position embedding
    self.embed_text_positions = Embedding(max_positions, embed_dim, padding_idx)
    self.embed_text_positions.weight.data = position_encoding_init(
        max_positions, embed_dim)

    # Speaker embedding
    if n_speakers > 1:
        self.speaker_fc1 = Linear(speaker_embed_dim, embed_dim)
        self.speaker_fc2 = Linear(speaker_embed_dim, embed_dim)
    self.n_speakers = n_speakers

    # Non-causal convolutions
    in_channels = convolutions[0][0]
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.speaker_projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    # ConvTBC has no dilation support, so fall back to Conv1d when any layer dilates
    Conv1dLayer = Conv1d if has_dilation(convolutions) else ConvTBC
    for (out_channels, kernel_size, dilation) in convolutions:
        pad = (kernel_size - 1) // 2 * dilation
        dilation = (dilation,)
        self.projections.append(
            Linear(in_channels, out_channels)
            if in_channels != out_channels else None)
        self.speaker_projections.append(
            Linear(speaker_embed_dim, out_channels)
            if n_speakers > 1 else None)
        self.convolutions.append(
            Conv1dLayer(in_channels, out_channels * 2, kernel_size,
                        padding=pad, dilation=dilation, dropout=dropout))
        in_channels = out_channels
    self.fc2 = Linear(in_channels, embed_dim)
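# position_encoding_init() is defined elsewhere; this is a hedged sketch of the
# sinusoidal table it is expected to return, with a position_rate scaling factor
# (used by the decoder below to stretch or compress query vs. key positions).
# The repo's exact formula may differ slightly; this is the standard
# Transformer-style layout with row 0 reserved for padding.
import numpy as np
import torch

def position_encoding_init(n_position, d_model, position_rate=1.0):
    table = np.array([
        [position_rate * pos / np.power(10000, 2 * (i // 2) / d_model)
         for i in range(d_model)]
        if pos != 0 else np.zeros(d_model)
        for pos in range(n_position)])
    table[1:, 0::2] = np.sin(table[1:, 0::2])  # even dims: sine
    table[1:, 1::2] = np.cos(table[1:, 1::2])  # odd dims: cosine
    return torch.from_numpy(table).float()

print(position_encoding_init(512, 256).shape)  # torch.Size([512, 256])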
def __init__(self, embed_dim, n_speakers, speaker_embed_dim,
             in_dim=80, r=5,
             max_positions=512, padding_idx=None,
             convolutions=((128, 5, 1),) * 4,
             attention=True, dropout=0.1,
             use_memory_mask=False,
             force_monotonic_attention=True,
             query_position_rate=1.0,
             key_position_rate=1.29):
    super(Decoder, self).__init__()
    self.dropout = dropout
    self.in_dim = in_dim
    self.r = r

    in_channels = in_dim * r
    if isinstance(attention, bool):
        # expand True into [True, True, ...] and do the same with False
        attention = [attention] * len(convolutions)

    # Position encodings for query (decoder states) and keys (encoder states)
    self.embed_query_positions = Embedding(
        max_positions, convolutions[0][0], padding_idx)
    self.embed_query_positions.weight.data = position_encoding_init(
        max_positions, convolutions[0][0], position_rate=query_position_rate)
    self.embed_keys_positions = Embedding(max_positions, embed_dim, padding_idx)
    self.embed_keys_positions.weight.data = position_encoding_init(
        max_positions, embed_dim, position_rate=key_position_rate)

    self.fc1 = Linear(in_channels, convolutions[0][0], dropout=dropout)
    in_channels = convolutions[0][0]

    # Causal convolutions
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    self.attention = nn.ModuleList()
    # LinearizedConv1d enables fast incremental decoding; use Conv1d when dilation is needed
    Conv1dLayer = Conv1d if has_dilation(convolutions) else LinearizedConv1d
    for i, (out_channels, kernel_size, dilation) in enumerate(convolutions):
        pad = (kernel_size - 1) * dilation
        dilation = (dilation,)
        self.projections.append(
            Linear(in_channels, out_channels)
            if in_channels != out_channels else None)
        self.convolutions.append(
            Conv1dLayer(in_channels, out_channels * 2, kernel_size,
                        padding=pad, dilation=dilation, dropout=dropout))
        self.attention.append(
            AttentionLayer(out_channels, embed_dim, dropout=dropout)
            if attention[i] else None)
        in_channels = out_channels
    self.fc2 = Linear(in_channels, in_dim * r)

    # decoder states -> Done binary flag
    self.fc3 = Linear(in_channels, 1)

    self._is_inference_incremental = False
    self.max_decoder_steps = 200
    self.min_decoder_steps = 10
    self.use_memory_mask = use_memory_mask
    if isinstance(force_monotonic_attention, bool):
        self.force_monotonic_attention = \
            [force_monotonic_attention] * len(convolutions)
    else:
        self.force_monotonic_attention = force_monotonic_attention
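# The decoder pads by (kernel_size - 1) * dilation (applied on both sides via the
# conv's `padding` argument), so the extra trailing frames are typically sliced off
# in forward() to keep the convolution causal. A standalone check of the length
# arithmetic with a plain nn.Conv1d (the repo's wrapper is assumed to behave the
# same way for this purpose):
import torch
import torch.nn as nn

kernel_size, dilation, T = 5, 2, 30
pad = (kernel_size - 1) * dilation              # 8
conv = nn.Conv1d(16, 16, kernel_size, padding=pad, dilation=dilation)
y = conv(torch.randn(1, 16, T))
print(y.shape[-1])         # 38 == T + pad
y_causal = y[:, :, :T]     # drop trailing frames so step t depends only on inputs <= t
print(y_causal.shape[-1])  # 30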