def __init__(
    self, conv_layers_before=None, input_size=83, hidden_size=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, bidirectional=False,
    residual=False, left_pad=False, padding_value=0., src_bucketed=False,
    max_source_positions=DEFAULT_MAX_SOURCE_POSITIONS,
):
    super().__init__(None)  # no src dictionary
    self.conv_layers_before = conv_layers_before
    self.num_layers = num_layers
    self.dropout_in_module = FairseqDropout(dropout_in, module_name=self.__class__.__name__)
    self.dropout_out_module = FairseqDropout(dropout_out, module_name=self.__class__.__name__)
    self.bidirectional = bidirectional
    self.hidden_size = hidden_size
    self.residual = residual
    self.max_source_positions = max_source_positions

    self.lstm = nn.ModuleList([
        LSTM(
            input_size=input_size if layer == 0 else 2 * hidden_size if self.bidirectional else hidden_size,
            hidden_size=hidden_size,
            bidirectional=bidirectional,
        )
        for layer in range(num_layers)
    ])
    self.left_pad = left_pad
    self.padding_value = padding_value
    self.src_bucketed = src_bucketed

    self.output_units = hidden_size
    if bidirectional:
        self.output_units *= 2
def __init__(self, args, no_encoder_attn=False, kernel_size=0):
    super().__init__()
    self.embed_dim = args.decoder_embed_dim
    self.conv_dim = args.decoder_conv_dim
    if args.decoder_glu:
        self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.decoder_conv_type == "lightweight":
        self.conv = LightweightConv(
            self.conv_dim, kernel_size, padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    elif args.decoder_conv_type == "dynamic":
        self.conv = DynamicConv(
            self.conv_dim, kernel_size, padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim)

    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.relu_dropout_module = FairseqDropout(args.relu_dropout, module_name=self.__class__.__name__)
    self.input_dropout_module = FairseqDropout(args.input_dropout, module_name=self.__class__.__name__)
    self.normalize_before = args.decoder_normalize_before

    self.conv_layer_norm = LayerNorm(self.embed_dim)

    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        self.encoder_attn = MultiheadAttention(
            self.embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

    self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
    self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

    self.final_layer_norm = LayerNorm(self.embed_dim)
    self.need_attn = True
def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, num_layers=1,
    dropout_in=0.1, dropout_out=0.1, bidirectional=False, left_pad_source=True,
    pretrained_embed=None, padding_idx=None,
    max_source_positions=DEFAULT_MAX_SOURCE_POSITIONS, rnn_type="gru",
):
    super().__init__(dictionary)
    self.num_layers = num_layers
    self.dropout_in_module = FairseqDropout(dropout_in, module_name=self.__class__.__name__)
    self.dropout_out_module = FairseqDropout(dropout_out, module_name=self.__class__.__name__)
    self.bidirectional = bidirectional
    self.hidden_size = hidden_size
    self.max_source_positions = max_source_positions

    num_embeddings = len(dictionary)
    self.padding_idx = padding_idx if padding_idx is not None else dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = torch.nn.Embedding(num_embeddings, embed_dim, self.padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.rnn_type = rnn_type
    if rnn_type == "gru":
        self.hidden = GRU(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=self.dropout_out_module.p if num_layers > 1 else 0.0,
            bidirectional=bidirectional,
        )
    elif rnn_type == "lstm":
        self.hidden = LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=self.dropout_out_module.p if num_layers > 1 else 0.0,
            bidirectional=bidirectional,
        )
    else:
        # fail early instead of leaving self.hidden undefined
        raise ValueError("unsupported rnn_type: {}".format(rnn_type))
    self.left_pad_source = left_pad_source

    self.output_units = hidden_size
    if bidirectional:
        self.bidir_dense = torch.nn.Linear(2, 1)
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))

    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.encoder_layerdrop = args.encoder_layerdrop

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

    self.embed_positions = (
        PositionalEmbedding(
            args.max_source_positions, embed_dim, self.padding_idx,
            learned=args.encoder_learned_pos,
        )
        if not args.no_token_positional_embeddings else None
    )

    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None

    if not args.adaptive_input and args.quant_noise_pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        self.quant_noise = None

    if self.encoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend(
        [self.build_encoder_layer(args) for i in range(args.encoder_layers)]
    )
    self.num_layers = len(self.layers)

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
def __init__(self, args, kernel_size=0):
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.conv_dim = args.encoder_conv_dim
    padding_l = (
        kernel_size // 2
        if kernel_size % 2 == 1
        else ((kernel_size - 1) // 2, kernel_size // 2)
    )

    if args.encoder_glu:
        self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim)
        self.act = nn.GLU()
    else:
        self.linear1 = Linear(self.embed_dim, self.conv_dim)
        self.act = None
    if args.encoder_conv_type == "lightweight":
        self.conv = LightweightConv(
            self.conv_dim, kernel_size, padding_l=padding_l,
            weight_softmax=args.weight_softmax,
            num_heads=args.encoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    elif args.encoder_conv_type == "dynamic":
        self.conv = DynamicConv(
            self.conv_dim, kernel_size, padding_l=padding_l,
            weight_softmax=args.weight_softmax,
            num_heads=args.encoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
    else:
        raise NotImplementedError
    self.linear2 = Linear(self.conv_dim, self.embed_dim)

    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.relu_dropout_module = FairseqDropout(args.relu_dropout, module_name=self.__class__.__name__)
    self.input_dropout_module = FairseqDropout(args.input_dropout, module_name=self.__class__.__name__)
    self.normalize_before = args.encoder_normalize_before
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
    self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)])
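# Worked example of the padding_l expression above (a standalone helper of our
# own that mirrors the same arithmetic, not part of the original module): odd
# kernels pad symmetrically, even kernels split the padding asymmetrically so
# the convolution output length still matches the input length.
def conv_padding_demo(kernel_size):
    return kernel_size // 2 if kernel_size % 2 == 1 else ((kernel_size - 1) // 2, kernel_size // 2)

assert conv_padding_demo(3) == 1        # 1 frame of padding on each side
assert conv_padding_demo(4) == (1, 2)   # 1 on the left, 2 on the right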
def __init__(
    self, rnn_type: Union[str, Namespace], dictionary, embed_dim=512,
    hidden_size=512, num_layers=1, dropout_in=0.1, dropout_out=0.1,
    bidirectional=False, left_pad=True, pretrained_embed=None,
    padding_idx=None, max_source_positions=DEFAULT_MAX_SOURCE_POSITIONS,
):
    super().__init__(dictionary)
    rnn_type = rnn_type.rnn_type if isinstance(rnn_type, Namespace) else rnn_type
    self.rnn_type = rnn_type.lower().strip()
    # convenience flag for later checks, so the LSTM cell state can be handled separately
    self.is_lstm = self.rnn_type == 'lstm'
    self.num_layers = num_layers
    self.dropout_in_module = FairseqDropout(dropout_in, module_name=self.__class__.__name__)
    self.dropout_out_module = FairseqDropout(dropout_out, module_name=self.__class__.__name__)
    self.bidirectional = bidirectional
    self.hidden_size = hidden_size
    self.max_source_positions = max_source_positions

    num_embeddings = len(dictionary)
    self.padding_idx = padding_idx if padding_idx is not None else dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = JqEmbedding(num_embeddings, embed_dim, self.padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    JQRNN, _ = get_rnn_cell(self.rnn_type)  # returns the RNN constructor
    self.rnn = JQRNN(
        input_size=embed_dim,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=self.dropout_out_module.p if num_layers > 1 else 0.0,
        bidirectional=bidirectional,
    )
    self.left_pad = left_pad

    self.output_units = hidden_size
    if bidirectional:
        self.output_units *= 2
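# get_rnn_cell is a project-specific helper that is not shown here. A minimal
# sketch of the behaviour assumed above (returning an RNN constructor plus the
# matching cell class for the given type string); the actual helper may differ:
import torch.nn as nn

def get_rnn_cell_sketch(rnn_type):
    table = {
        "lstm": (nn.LSTM, nn.LSTMCell),
        "gru": (nn.GRU, nn.GRUCell),
        "rnn": (nn.RNN, nn.RNNCell),
    }
    return table[rnn_type.lower().strip()]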
def __init__(
    self, input_size, output_size, hidden_sizes=256, kernel_sizes=3, strides=1,
    dilations=3, num_layers=1, dropout_in=0.0, dropout_out=0.0, residual=False,
    chunk_width=None, chunk_left_context=0, training_stage=True,
):
    super().__init__(None)  # no src dictionary
    self.num_layers = num_layers
    if isinstance(hidden_sizes, int):
        hidden_sizes = [hidden_sizes] * num_layers
    else:
        assert len(hidden_sizes) == num_layers
    if isinstance(kernel_sizes, int):
        kernel_sizes = [kernel_sizes] * num_layers
    else:
        assert len(kernel_sizes) == num_layers
    if isinstance(strides, int):
        strides = [strides] * num_layers
    else:
        assert len(strides) == num_layers
    if isinstance(dilations, int):
        dilations = [dilations] * num_layers
    else:
        assert len(dilations) == num_layers
    self.dropout_in_module = FairseqDropout(dropout_in, module_name=self.__class__.__name__)
    self.dropout_out_module = FairseqDropout(dropout_out, module_name=self.__class__.__name__)
    self.residual = residual

    self.tdnn = nn.ModuleList([
        TdnnBNReLU(
            in_channels=input_size if layer == 0 else hidden_sizes[layer - 1],
            out_channels=hidden_sizes[layer],
            kernel_size=kernel_sizes[layer],
            stride=strides[layer],
            dilation=dilations[layer],
        )
        for layer in range(num_layers)
    ])

    receptive_field_radius = sum(layer.padding for layer in self.tdnn)
    assert chunk_width is None or (chunk_width > 0 and chunk_left_context >= receptive_field_radius)
    if (
        chunk_width is not None
        and chunk_width > 0
        and chunk_left_context > receptive_field_radius
    ):
        logger.warning("chunk_{{left,right}}_context can be reduced to {}".format(receptive_field_radius))
    self.out_chunk_begin = self.output_lengths(chunk_left_context + 1) - 1
    self.out_chunk_end = (
        self.output_lengths(chunk_left_context + chunk_width)
        if chunk_width is not None
        else None
    )
    self.training_stage = training_stage

    self.fc_out = Linear(hidden_sizes[-1], output_size, dropout=self.dropout_out_module.p)
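# Hedged worked example for the chunked-context check above, assuming each
# TdnnBNReLU layer's `padding` attribute equals dilation * (kernel_size - 1) // 2
# (a common "same"-style choice; the real module may compute it differently).
# With the defaults kernel_sizes=3, dilations=3, num_layers=1:
kernel_sizes_demo, dilations_demo = [3], [3]
receptive_field_radius_demo = sum(
    d * (k - 1) // 2 for k, d in zip(kernel_sizes_demo, dilations_demo)
)
assert receptive_field_radius_demo == 3  # chunk_left_context must then be >= 3 frames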
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))

    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

    self.embed_positions = (
        PositionalEmbedding(
            args.max_source_positions, embed_dim, self.padding_idx,
            learned=args.encoder_learned_pos,
        )
        if not args.no_token_positional_embeddings else None
    )

    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None

    self.upsample_scale = args.upsample_scale
    self.upsampler = nn.Linear(embed_dim, self.upsample_scale * embed_dim)
def __init__(self, args):
    super().__init__(None)
    self.dropout_module = FairseqDropout(p=args.dropout, module_name=self.__class__.__name__)
    self.embed_scale = math.sqrt(args.encoder_embed_dim)
    if args.no_scale_embedding:
        self.embed_scale = 1.0
    self.padding_idx = 1

    self.subsample = Conv1dSubsampler(
        args.input_feat_per_channel * args.input_channels,
        args.conv_channels,
        args.encoder_embed_dim,
        [int(k) for k in args.conv_kernel_sizes.split(",")],
    )

    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, args.encoder_embed_dim, self.padding_idx
    )

    self.transformer_layers = nn.ModuleList(
        [TransformerEncoderLayer(args) for _ in range(args.encoder_layers)]
    )
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(args.encoder_embed_dim)
    else:
        self.layer_norm = None
def __init__(self, args):
    super().__init__()
    self.conv1 = nn.Sequential(
        nn.Conv1d(
            args.encoder_embed_dim,
            args.var_pred_hidden_dim,
            kernel_size=args.var_pred_kernel_size,
            padding=(args.var_pred_kernel_size - 1) // 2,
        ),
        nn.ReLU(),
    )
    self.ln1 = nn.LayerNorm(args.var_pred_hidden_dim)
    self.dropout_module = FairseqDropout(p=args.var_pred_dropout, module_name=self.__class__.__name__)
    self.conv2 = nn.Sequential(
        nn.Conv1d(
            args.var_pred_hidden_dim,
            args.var_pred_hidden_dim,
            kernel_size=args.var_pred_kernel_size,
            # note: a fixed padding of 1 preserves the sequence length only when
            # var_pred_kernel_size == 3, unlike conv1 above which pads by (k - 1) // 2
            padding=1,
        ),
        nn.ReLU(),
    )
    self.ln2 = nn.LayerNorm(args.var_pred_hidden_dim)
    self.proj = nn.Linear(args.var_pred_hidden_dim, 1)
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))

    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.dropword_module = FairseqFeatureDropout(args.word_dropout, module_name=self.__class__.__name__)
    self.encoder_layerdrop = args.encoder_layerdrop

    embed_dim = embed_tokens.embedding_dim
    assert embed_dim == args.encoder_embed_dim, 'encoder embedding dim mismatch.'
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

    self.embed_positions = (
        PositionalEmbedding(
            args.max_source_positions, embed_dim, self.padding_idx,
            learned=args.encoder_learned_pos,
        )
        if not args.no_token_positional_embeddings else None
    )

    assert not args.layernorm_embedding or not args.encoder_normalize_before
    if args.layernorm_embedding:
        self.layernorm_embedding = LayerNorm(embed_dim)
        self.layernorm_porjected_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None
        self.layernorm_porjected_embedding = None

    self.proj_len = args.projection_length
    self.dynamic_projection = not args.fix_projection_length
    self.projected_embeddings = Parameter(torch.Tensor(self.proj_len, embed_dim))
    nn.init.normal_(self.projected_embeddings, mean=0., std=embed_dim ** -0.5)
    if not args.no_token_positional_embeddings and not args.encoder_learned_pos:
        projected_positions = get_sinusoidal_positional_embedding(self.proj_len, embed_dim)
    else:
        projected_positions = None
    self.register_buffer('projected_positions', projected_positions)

    if self.encoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([self.build_encoder_layer(i, args) for i in range(args.encoder_layers)])
    self.num_layers = len(self.layers)

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
        self.proj_layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
        self.proj_layer_norm = None
def __init__(self, args, dictionary, embed_tokens):
    super().__init__(dictionary)
    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = (
        PositionalEmbedding(
            args.max_source_positions, embed_dim, self.padding_idx,
            learned=args.encoder_learned_pos,
        )
        if not args.no_token_positional_embeddings else None
    )

    self.layers = nn.ModuleList([])
    self.layers.extend([
        LightConvEncoderLayer(args, kernel_size=args.encoder_kernel_size_list[i])
        for i in range(args.encoder_layers)
    ])
    self.register_buffer("version", torch.Tensor([2]))
    self.normalize = args.encoder_normalize_before
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
def __init__(
    self,
    args,  # passing args alone supplies all the command-line parameters needed here; its exact contents are not listed
    dictionary,
    embed_tokens,
):
    self.args = args
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))

    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.encoder_layerdrop = args.encoder_layerdrop

    embed_dim = embed_tokens.embedding_dim  # embedding dimensionality after the tokens have been encoded
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
    # self.src_word_emb = embed_tokens  # changed

    self.embed_positions = (
        PositionalEmbedding(
            args.max_source_positions, embed_dim, self.padding_idx,
            learned=args.encoder_learned_pos,
        )
        if not args.no_token_positional_embeddings else None
    )

    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None

    if not args.adaptive_input and args.quant_noise_pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        self.quant_noise = None

    if self.encoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([self.build_encoder_layer(args) for i in range(args.encoder_layers)])
    self.num_layers = len(self.layers)

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
def __init__(self, embed_dim, proj_dim, dropout):
    super().__init__()
    self.layer_norm = LayerNorm(embed_dim)
    self.down_proj = nn.Linear(embed_dim, proj_dim)
    self.up_proj = nn.Linear(proj_dim, embed_dim)
    self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)
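# A hedged sketch of how a residual bottleneck module with these submodules is
# typically applied. The real forward() is not shown above and may differ, e.g.
# in the activation or in pre- vs post-normalization; this is only an illustration.
import torch.nn.functional as F

def bottleneck_forward_sketch(module, x):
    residual = x
    x = module.layer_norm(x)                        # pre-norm
    x = F.relu(module.down_proj(x))                 # project down to proj_dim
    x = module.dropout_module(module.up_proj(x))    # project back up, then dropout
    return residual + x                             # residual connection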
def __init__(self, args, dictionary, embed_tokens, embed_bytes):
    self.args = args
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))

    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.encoder_layerdrop = args.encoder_layerdrop

    embed_dim = embed_bytes.embedding_dim
    # assume the padding index is the same for byte and token sequences
    self.padding_idx = embed_bytes.padding_idx
    self.max_source_positions = args.max_source_positions

    self.embed_tokens = embed_tokens
    if self.args.input_combine != 'drop_bytes':
        self.embed_bytes = embed_bytes

    if self.args.input_combine == 'cnn':
        self.byte_combine = ByteCombineCNN(embed_dim, embed_dim)
    elif self.args.input_combine == 'drop_bytes':
        self.byte_combine = None
    else:  # input_combine == 'sum'
        self.byte_combine = ByteCombineSUM()

    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None

    if not args.adaptive_input and args.quant_noise_pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        self.quant_noise = None

    if self.encoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([self.build_encoder_layer(args) for _ in range(args.encoder_layers)])
    self.num_layers = len(self.layers)

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, num_layers=1,
    dropout_in=0.1, dropout_out=0.1, bidirectional=False, left_pad=True,
    pretrained_embed=None, padding_idx=None,
    max_source_positions=DEFAULT_MAX_SOURCE_POSITIONS,
):
    super().__init__(dictionary)
    self.num_layers = num_layers
    self.dropout_in_module = FairseqDropout(dropout_in, module_name=self.__class__.__name__)
    self.dropout_out_module = FairseqDropout(dropout_out, module_name=self.__class__.__name__)
    self.bidirectional = bidirectional
    self.hidden_size = hidden_size
    self.max_source_positions = max_source_positions

    num_embeddings = len(dictionary)
    self.padding_idx = padding_idx if padding_idx is not None else dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim // 2, self.padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.lstm = LSTM(
        input_size=embed_dim,  # TODO(hainan): adding the delta field
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=self.dropout_out_module.p if num_layers > 1 else 0.0,
        bidirectional=bidirectional,
    )
    self.left_pad = left_pad

    self.output_units = hidden_size
    if bidirectional:
        self.output_units *= 2
def __init__(self, args, embed_dim):
    super(TransformerEncoder, self).__init__()
    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.encoder_layerdrop = args.encoder_layerdrop

    # embed_dim = embed_tokens.embedding_dim
    # self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = args.max_source_positions

    # self.embed_tokens = embed_tokens
    # self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)
    # self.embed_positions = (
    #     PositionalEmbedding(
    #         args.max_source_positions,
    #         embed_dim,
    #         self.padding_idx,
    #         learned=args.encoder_learned_pos,
    #     )
    #     if not args.no_token_positional_embeddings
    #     else None
    # )
    self.embed_positions = None

    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None

    if not args.adaptive_input and args.quant_noise_pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        self.quant_noise = None

    if self.encoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend(
        [self.build_encoder_layer(args) for i in range(args.encoder_layers)]
    )
    self.num_layers = len(self.layers)

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
def __init__(self, args, conv_layers_before=None, input_size=83, transformer_context=None):
    self.args = args
    super(TransformerEncoder, self).__init__(None)  # no src dictionary
    self.register_buffer("version", torch.Tensor([3]))

    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.encoder_layerdrop = args.encoder_layerdrop

    embed_dim = args.encoder_embed_dim
    self.max_source_positions = args.max_source_positions

    self.conv_layers_before = conv_layers_before
    self.fc0 = Linear(input_size, embed_dim) if input_size != embed_dim else None

    self.embed_positions = (
        PositionalEmbedding(
            self.output_lengths(self.max_source_positions),
            embed_dim,
            0,
            learned=args.encoder_learned_pos,
        )
        if not args.no_token_positional_embeddings else None
    )

    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None

    if not args.adaptive_input and args.quant_noise_pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        self.quant_noise = None

    if self.encoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([self.build_encoder_layer(args) for i in range(args.encoder_layers)])
    self.num_layers = len(self.layers)

    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None

    self.transformer_context = transformer_context
def __init__(self, cfg, dictionary, embed_tokens, return_fc=False):
    self.cfg = cfg
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))

    self.dropout_module = FairseqDropout(
        cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__)
    )
    self.encoder_layerdrop = cfg.encoder.layerdrop
    self.return_fc = return_fc

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = cfg.max_source_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim)

    self.embed_positions = (
        PositionalEmbedding(
            cfg.max_source_positions, embed_dim, self.padding_idx,
            learned=cfg.encoder.learned_pos,
        )
        if not cfg.no_token_positional_embeddings else None
    )

    if cfg.layernorm_embedding:
        self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export)
    else:
        self.layernorm_embedding = None

    if not cfg.adaptive_input and cfg.quant_noise.pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            cfg.quant_noise.pq,
            cfg.quant_noise.pq_block_size,
        )
    else:
        self.quant_noise = None

    if self.encoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([self.build_encoder_layer(cfg) for i in range(cfg.encoder.layers)])
    self.num_layers = len(self.layers)

    if cfg.encoder.normalize_before:
        self.layer_norm = LayerNorm(embed_dim, export=cfg.export)
    else:
        self.layer_norm = None
def __init__(self, args, src_dict, embed_speaker):
    super().__init__(src_dict)
    self.args = args
    self.padding_idx = src_dict.pad()
    self.n_frames_per_step = args.n_frames_per_step
    self.out_dim = args.output_frame_dim * args.n_frames_per_step

    self.embed_speaker = embed_speaker
    self.spk_emb_proj = None
    if embed_speaker is not None:
        self.spk_emb_proj = nn.Linear(
            args.encoder_embed_dim + args.speaker_embed_dim, args.encoder_embed_dim
        )

    self.dropout_module = FairseqDropout(p=args.dropout, module_name=self.__class__.__name__)
    self.embed_tokens = Embedding(
        len(src_dict), args.encoder_embed_dim, padding_idx=self.padding_idx
    )

    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, args.encoder_embed_dim, self.padding_idx
    )
    self.pos_emb_alpha = nn.Parameter(torch.ones(1))
    self.dec_pos_emb_alpha = nn.Parameter(torch.ones(1))

    self.encoder_fft_layers = nn.ModuleList(
        FFTLayer(
            args.encoder_embed_dim, args.encoder_attention_heads,
            args.fft_hidden_dim, args.fft_kernel_size,
            dropout=args.dropout, attention_dropout=args.attention_dropout,
        )
        for _ in range(args.encoder_layers)
    )

    self.var_adaptor = VarianceAdaptor(args)

    self.decoder_fft_layers = nn.ModuleList(
        FFTLayer(
            args.decoder_embed_dim, args.decoder_attention_heads,
            args.fft_hidden_dim, args.fft_kernel_size,
            dropout=args.dropout, attention_dropout=args.attention_dropout,
        )
        for _ in range(args.decoder_layers)
    )

    self.out_proj = nn.Linear(args.decoder_embed_dim, self.out_dim)

    self.postnet = None
    if args.add_postnet:
        self.postnet = Postnet(
            self.out_dim,
            args.postnet_conv_dim,
            args.postnet_conv_kernel_size,
            args.postnet_layers,
            args.postnet_dropout,
        )

    self.apply(model_init)
def __init__(
    self, dictionary, embed_dim=512, max_positions=1024,
    convolutions=((512, 3),) * 20, dropout=0.1, attention=False,
    attention_nheads=1,
):
    super().__init__(dictionary)
    self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)
    self.num_attention_layers = None

    num_embeddings = len(dictionary)
    self.padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
    self.embed_positions = PositionalEmbedding(
        max_positions,
        embed_dim,
        self.padding_idx,
    )

    def expand_bool_array(val):
        if isinstance(val, bool):
            # expand True into [True, True, ...] and do the same with False
            return [val] * len(convolutions)
        return val

    attention = expand_bool_array(attention)

    in_channels = convolutions[0][0]
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    self.attention = nn.ModuleList()
    self.attproj = nn.ModuleList()
    for i, (out_channels, kernel_size) in enumerate(convolutions):
        self.projections.append(
            Linear(in_channels, out_channels) if in_channels != out_channels else None
        )
        self.convolutions.append(
            ConvTBC(in_channels, out_channels * 2, kernel_size, dropout=dropout)
        )
        self.attention.append(
            SelfAttention(out_channels, embed_dim, attention_nheads) if attention[i] else None
        )
        in_channels = out_channels

    self.fc2 = Linear(in_channels, embed_dim)
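# The nested expand_bool_array helper above simply broadcasts a single flag to
# one entry per convolutional layer; a standalone equivalent (our own, for
# illustration only) behaves like this:
def expand_bool_array_demo(val, num_layers):
    return [val] * num_layers if isinstance(val, bool) else val

assert expand_bool_array_demo(False, 3) == [False, False, False]
assert expand_bool_array_demo([True, False], 2) == [True, False]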
def __init__(self, in_dim, hidden_dim, kernel_size, dropout):
    super().__init__()
    self.ffn = nn.Sequential(
        nn.Conv1d(in_dim, hidden_dim, kernel_size=kernel_size, padding=(kernel_size - 1) // 2),
        nn.ReLU(),
        nn.Conv1d(hidden_dim, in_dim, kernel_size=kernel_size, padding=(kernel_size - 1) // 2),
    )
    self.layer_norm = LayerNorm(in_dim)
    self.dropout = self.dropout_module = FairseqDropout(
        p=dropout, module_name=self.__class__.__name__
    )
def __init__(
    self, dictionary, embed_dim=512, embed_dict=None, max_positions=1024,
    convolutions=((512, 3),) * 20, dropout=0.1,
):
    super().__init__(dictionary)
    self.dropout_module = FairseqDropout(dropout, module_name=self.__class__.__name__)
    self.num_attention_layers = None

    num_embeddings = len(dictionary)
    self.padding_idx = dictionary.pad()
    self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
    if embed_dict:
        self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)

    self.embed_positions = PositionalEmbedding(
        max_positions,
        embed_dim,
        self.padding_idx,
    )

    convolutions = extend_conv_spec(convolutions)
    in_channels = convolutions[0][0]
    self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
    self.projections = nn.ModuleList()
    self.convolutions = nn.ModuleList()
    self.residuals = []

    layer_in_channels = [in_channels]
    for _, (out_channels, kernel_size, residual) in enumerate(convolutions):
        if residual == 0:
            residual_dim = out_channels
        else:
            residual_dim = layer_in_channels[-residual]
        self.projections.append(
            Linear(residual_dim, out_channels) if residual_dim != out_channels else None
        )
        if kernel_size % 2 == 1:
            padding = kernel_size // 2
        else:
            padding = 0
        self.convolutions.append(
            ConvTBC(in_channels, out_channels * 2, kernel_size, dropout=dropout, padding=padding)
        )
        self.residuals.append(residual)
        in_channels = out_channels
        layer_in_channels.append(out_channels)
    self.fc2 = Linear(in_channels, embed_dim)
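# Hedged illustration of extend_conv_spec as used above (our own stand-in, not
# the fairseq function itself): 2-tuples (out_channels, kernel_size) are assumed
# to gain a default residual distance of 1, and 3-tuples pass through unchanged.
def extend_conv_spec_demo(convolutions):
    return tuple(spec if len(spec) == 3 else spec + (1,) for spec in convolutions)

assert extend_conv_spec_demo(((512, 3),) * 2) == ((512, 3, 1), (512, 3, 1))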
def __init__(self, args, src_dict, padding_idx=1):
    super().__init__(None)
    self._future_mask = torch.empty(0)

    self.args = args
    self.padding_idx = src_dict.pad() if src_dict else padding_idx
    self.n_frames_per_step = args.n_frames_per_step
    self.out_dim = args.output_frame_dim * args.n_frames_per_step

    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, args.decoder_embed_dim, self.padding_idx
    )
    self.pos_emb_alpha = nn.Parameter(torch.ones(1))
    self.prenet = nn.Sequential(
        Prenet(self.out_dim, args.prenet_layers, args.prenet_dim, args.prenet_dropout),
        nn.Linear(args.prenet_dim, args.decoder_embed_dim),
    )

    self.n_transformer_layers = args.decoder_transformer_layers
    self.transformer_layers = nn.ModuleList(
        TransformerDecoderLayer(args) for _ in range(self.n_transformer_layers)
    )
    if args.decoder_normalize_before:
        self.layer_norm = LayerNorm(args.decoder_embed_dim)
    else:
        self.layer_norm = None

    self.feat_proj = nn.Linear(args.decoder_embed_dim, self.out_dim)
    self.eos_proj = nn.Linear(args.decoder_embed_dim, 1)

    self.postnet = Postnet(
        self.out_dim,
        args.postnet_conv_dim,
        args.postnet_conv_kernel_size,
        args.postnet_layers,
        args.postnet_dropout,
    )

    self.ctc_proj = None
    if getattr(args, "ctc_weight", 0.0) > 0.0:
        self.ctc_proj = nn.Linear(self.out_dim, len(src_dict))

    self.apply(decoder_init)
def __init__(self, args, src_dict, embed_speaker):
    super().__init__(src_dict)
    self.padding_idx = src_dict.pad()
    self.embed_speaker = embed_speaker
    self.spk_emb_proj = None
    if embed_speaker is not None:
        self.spk_emb_proj = nn.Linear(
            args.encoder_embed_dim + args.speaker_embed_dim, args.encoder_embed_dim
        )

    self.dropout_module = FairseqDropout(p=args.dropout, module_name=self.__class__.__name__)
    self.embed_tokens = nn.Embedding(
        len(src_dict), args.encoder_embed_dim, padding_idx=self.padding_idx
    )

    assert args.encoder_conv_kernel_size % 2 == 1
    self.prenet = nn.ModuleList(
        nn.Sequential(
            nn.Conv1d(
                args.encoder_embed_dim,
                args.encoder_embed_dim,
                kernel_size=args.encoder_conv_kernel_size,
                padding=((args.encoder_conv_kernel_size - 1) // 2),
            ),
            nn.BatchNorm1d(args.encoder_embed_dim),
            nn.ReLU(),
            nn.Dropout(args.encoder_dropout),
        )
        for _ in range(args.encoder_conv_layers)
    )
    self.prenet_proj = nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim)
    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, args.encoder_embed_dim, self.padding_idx
    )
    self.pos_emb_alpha = nn.Parameter(torch.ones(1))

    self.transformer_layers = nn.ModuleList(
        TransformerEncoderLayer(args) for _ in range(args.encoder_transformer_layers)
    )
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(args.encoder_embed_dim)
    else:
        self.layer_norm = None

    self.apply(encoder_init)
def __init__(self, args, dictionary):
    super().__init__(dictionary)
    self.dropout_module = FairseqDropout(p=args.dropout, module_name=self.__class__.__name__)
    self.embed_scale = math.sqrt(args.encoder_embed_dim)
    if args.no_scale_embedding:
        self.embed_scale = 1.0
    self.padding_idx = 1

    self.subsample = Conv1dSubsampler(
        args.input_feat_per_channel * args.input_channels,
        args.conv_channels,
        args.encoder_embed_dim,
        [int(k) for k in args.conv_kernel_sizes.split(",")],
    )

    self.embed_positions = PositionalEmbedding(
        args.max_source_positions, args.encoder_embed_dim, self.padding_idx
    )

    self.transformer_layers = nn.ModuleList(
        [TransformerEncoderLayer(args) for _ in range(args.encoder_layers)]
    )
    if args.encoder_normalize_before:
        self.layer_norm = LayerNorm(args.encoder_embed_dim)
    else:
        self.layer_norm = None

    # ctc
    self.ctc_compress_out = args.ctc_compress_out
    if self.ctc_compress_out:
        self.ctc_fc = nn.Linear(args.encoder_embed_dim, len(dictionary))
        assert args.criterion == "ctc_multi_loss"
        self.ctc_layer = args.ctc_encoder_layer
        self.ctc_compress_method = getattr(CTCCompressStrategy, args.ctc_compress_strategy)
def __init__(self, args):
    super().__init__(None)  # no dictionary
    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.max_source_positions = args.max_source_positions

    self.embed_positions = (
        PositionalEmbedding(
            args.max_source_positions,
            args.encoder_embed_dim,
            padding_idx=0,
            learned=args.encoder_learned_pos,
        )
        if not args.no_token_positional_embeddings else None
    )

    if args.encoder_layers < 2:
        raise ValueError(
            "encoder_layers should be at least 2, got: {}".format(args.encoder_layers)
        )

    vgg_config = json.loads(getattr(args, "vgg_config", "{}") or "{}")
    logger.info(vgg_config)
    self.vggblocks = nn.ModuleList([
        VGGNet(
            **vgg_config[0]
            # in_channels=3,
            # subsample=3,
            # conv_kernel_size=5,
            # activation_fn="gelu"
        )
    ])
    for i in range(1, args.encoder_layers):
        self.vggblocks.append(
            VGGNet(
                **vgg_config[i]
                # in_channels=64,
                # subsample=2,
                # out_channels=args.encoder_embed_dim if i==args.encoder_layers-1 else 64,
                # activation_fn="gelu"
            )
        )
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    self.args = args
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)

    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.decoder_layerdrop = args.decoder_layerdrop
    self.only_drop_topk = args.only_drop_topk
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.embed_dim = embed_dim
    self.output_embed_dim = args.decoder_output_dim

    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

    if not args.adaptive_input and args.quant_noise_pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        self.quant_noise = None

    self.project_in_dim = (
        Linear(input_embed_dim, embed_dim, bias=False)
        if embed_dim != input_embed_dim else None
    )
    self.embed_positions = (
        PositionalEmbedding(
            args.max_target_positions, embed_dim, self.padding_idx,
            learned=args.decoder_learned_pos,
        )
        if not args.no_token_positional_embeddings else None
    )

    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None

    self.cross_self_attention = getattr(args, "cross_self_attention", False)

    if self.decoder_layerdrop > 0.0:
        if self.only_drop_topk > 0:
            self.layers = PartLayerDropModuleList(
                p=self.decoder_layerdrop,
                top_k=self.only_drop_topk,
                layer_num=args.decoder_layers,
            )
        else:
            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_decoder_layer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.num_layers = len(self.layers)

    if args.decoder_normalize_before and not getattr(args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None

    self.project_out_dim = (
        Linear(embed_dim, self.output_embed_dim, bias=False)
        if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
        else None
    )

    self.adaptive_softmax = None
    self.output_projection = None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif self.share_input_output_embed:
        self.output_projection = nn.Linear(
            self.embed_tokens.weight.shape[1],
            self.embed_tokens.weight.shape[0],
            bias=False,
        )
        self.output_projection.weight = self.embed_tokens.weight
    else:
        self.output_projection = nn.Linear(self.output_embed_dim, len(dictionary), bias=False)
        nn.init.normal_(self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout_module = FairseqDropout(args.dropout, module_name=self.__class__.__name__)
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    output_embed_dim = args.decoder_output_dim

    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

    self.project_in_dim = (
        Linear(input_embed_dim, embed_dim, bias=False)
        if embed_dim != input_embed_dim else None
    )

    self.embed_positions = (
        PositionalEmbedding(
            args.max_target_positions, embed_dim, padding_idx,
            learned=args.decoder_learned_pos,
        )
        if not args.no_token_positional_embeddings else None
    )

    self.layers = nn.ModuleList([])
    self.layers.extend([
        LightConvDecoderLayer(args, no_encoder_attn, kernel_size=args.decoder_kernel_size_list[i])
        for i in range(args.decoder_layers)
    ])

    self.adaptive_softmax = None

    self.project_out_dim = (
        Linear(embed_dim, output_embed_dim, bias=False)
        if embed_dim != output_embed_dim and not args.tie_adaptive_weights
        else None
    )

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            output_embed_dim,
            utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)
    self.register_buffer("version", torch.Tensor([2]))

    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
def __init__(
    self, dictionary, embed_dim=512, hidden_size=512, out_embed_dim=512,
    num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True,
    encoder_output_units=512, pretrained_embed=None,
    share_input_output_embed=False, adaptive_softmax_cutoff=None,
    max_target_positions=DEFAULT_MAX_TARGET_POSITIONS, residuals=False,
):
    super().__init__(dictionary)
    self.dropout_in_module = FairseqDropout(dropout_in, module_name=self.__class__.__name__)
    self.dropout_out_module = FairseqDropout(dropout_out, module_name=self.__class__.__name__)
    self.hidden_size = hidden_size
    self.share_input_output_embed = share_input_output_embed
    self.need_attn = True
    self.max_target_positions = max_target_positions
    self.residuals = residuals
    self.num_layers = num_layers

    self.adaptive_softmax = None
    num_embeddings = len(dictionary)
    padding_idx = dictionary.pad()
    if pretrained_embed is None:
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
    else:
        self.embed_tokens = pretrained_embed

    self.encoder_output_units = encoder_output_units
    if encoder_output_units != hidden_size and encoder_output_units != 0:
        self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size)
        self.encoder_cell_proj = Linear(encoder_output_units, hidden_size)
    else:
        self.encoder_hidden_proj = self.encoder_cell_proj = None

    # disable input feeding if there is no encoder
    # input feeding is described in arxiv.org/abs/1508.04025
    input_feed_size = 0 if encoder_output_units == 0 else hidden_size
    self.layers = nn.ModuleList(
        [
            LSTMCell(
                input_size=input_feed_size + embed_dim if layer == 0 else hidden_size,
                hidden_size=hidden_size,
            )
            for layer in range(num_layers)
        ]
    )

    if attention:
        # TODO make bias configurable
        self.attention = AttentionLayer(hidden_size, encoder_output_units, hidden_size, bias=False)
    else:
        self.attention = None

    if hidden_size != out_embed_dim:
        self.additional_fc = Linear(hidden_size, out_embed_dim)

    if adaptive_softmax_cutoff is not None:
        # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        self.adaptive_softmax = AdaptiveSoftmax(
            num_embeddings, hidden_size, adaptive_softmax_cutoff, dropout=dropout_out
        )
    elif not self.share_input_output_embed:
        self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
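# Worked example of the input-feeding size used for the first LSTMCell above
# (our own arithmetic, using the defaults embed_dim=512, hidden_size=512 and a
# non-empty encoder). With encoder_output_units == 0 input feeding is disabled
# (input_feed_size = 0) and layer 0 sees only the 512-dim token embedding.
embed_dim_demo, hidden_size_demo, encoder_output_units_demo = 512, 512, 512
input_feed_size_demo = 0 if encoder_output_units_demo == 0 else hidden_size_demo
assert input_feed_size_demo + embed_dim_demo == 1024  # layer-0 input features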