def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False,
             left_pad=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    output_embed_dim = args.decoder_output_dim

    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

    self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False, uniform=False) \
        if embed_dim != input_embed_dim else None

    self.embed_positions = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])

    self.adaptive_softmax = None

    self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False, uniform=False) \
        if embed_dim != output_embed_dim else None

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)

    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
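
# The constructor above scales token embeddings by sqrt(embed_dim) and only
# projects when the token embedding width differs from the model width. A
# minimal, self-contained sketch of that pattern in plain PyTorch (the dims
# here are illustrative, not the ones fairseq uses):
import math
import torch
import torch.nn as nn

input_embed_dim, embed_dim = 256, 512  # illustrative values

embed_tokens = nn.Embedding(1000, input_embed_dim, padding_idx=1)
# Project only when the token embedding width differs from the model width.
project_in_dim = (nn.Linear(input_embed_dim, embed_dim, bias=False)
                  if embed_dim != input_embed_dim else None)
embed_scale = math.sqrt(embed_dim)

tokens = torch.randint(2, 1000, (8, 20))       # (batch, tgt_len)
x = embed_scale * embed_tokens(tokens)         # scale before projecting
if project_in_dim is not None:
    x = project_in_dim(x)                      # (batch, tgt_len, embed_dim)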
def build_model(cls, args, task):
    """Build a new model instance."""
    # make sure that all args are properly defaulted
    # (in case there are any new ones)
    base_architecture(args)

    if args.encoder_layers != args.decoder_layers:
        raise ValueError('--encoder-layers must match --decoder-layers')

    def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        embed_dict = utils.parse_embedding(embed_path)
        utils.print_embed_overlap(embed_dict, dictionary)
        return utils.load_embedding(embed_dict, dictionary, embed_tokens)

    if args.encoder_embed_path:
        pretrained_encoder_embed = load_pretrained_embedding_from_file(
            args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim)
    else:
        num_embeddings = len(task.source_dictionary)
        pretrained_encoder_embed = Embedding(
            num_embeddings, args.encoder_embed_dim, task.source_dictionary.pad())

    if args.share_all_embeddings:
        # double-check that all parameter combinations are valid
        if task.source_dictionary != task.target_dictionary:
            raise ValueError('--share-all-embeddings requires a joint dictionary')
        if args.decoder_embed_path and (args.decoder_embed_path != args.encoder_embed_path):
            raise ValueError(
                '--share-all-embeddings not compatible with --decoder-embed-path')
        if args.encoder_embed_dim != args.decoder_embed_dim:
            raise ValueError(
                '--share-all-embeddings requires --encoder-embed-dim to '
                'match --decoder-embed-dim')
        pretrained_decoder_embed = pretrained_encoder_embed
        args.share_decoder_input_output_embed = True
    else:
        # separate decoder input embeddings
        pretrained_decoder_embed = None
        if args.decoder_embed_path:
            pretrained_decoder_embed = load_pretrained_embedding_from_file(
                args.decoder_embed_path, task.target_dictionary, args.decoder_embed_dim)

    # one last double-check of parameter combinations
    if args.share_decoder_input_output_embed and (
            args.decoder_embed_dim != args.decoder_out_embed_dim):
        raise ValueError(
            '--share-decoder-input-output-embed requires '
            '--decoder-embed-dim to match --decoder-out-embed-dim')

    encoder = LSTMEncoder(
        dictionary=task.source_dictionary,
        embed_dim=args.encoder_embed_dim,
        hidden_size=args.encoder_hidden_size,
        num_layers=args.encoder_layers,
        dropout_in=args.encoder_dropout_in,
        dropout_out=args.encoder_dropout_out,
        bidirectional=args.encoder_bidirectional,
        pretrained_embed=pretrained_encoder_embed,
    )
    decoder = TGDecoder(
        dictionary=task.target_dictionary,
        embed_dim=args.decoder_embed_dim,
        hidden_size=args.decoder_hidden_size,
        out_embed_dim=args.decoder_out_embed_dim,
        num_layers=args.decoder_layers,
        dropout_in=args.decoder_dropout_in,
        dropout_out=args.decoder_dropout_out,
        attention=options.eval_bool(args.decoder_attention),
        encoder_embed_dim=args.encoder_embed_dim,
        encoder_output_units=encoder.output_units,
        pretrained_embed=pretrained_decoder_embed,
        share_input_output_embed=args.share_decoder_input_output_embed,
        adaptive_softmax_cutoff=(
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
            if args.criterion == 'adaptive_loss' else None),
    )
    return cls(encoder, decoder)
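
# For readers outside fairseq: the load_pretrained_embedding_from_file helper
# above amounts to copying vectors from a word-to-vector map into nn.Embedding
# rows. A self-contained sketch with a toy in-memory vocabulary (illustrative
# names; fairseq's utils.parse_embedding/load_embedding do the equivalent over
# an embedding text file):
import torch
import torch.nn as nn

vocab = {'<pad>': 0, 'hello': 1, 'world': 2}
embed_dict = {'hello': torch.tensor([0.1, 0.2]),
              'world': torch.tensor([0.3, 0.4])}

embed_tokens = nn.Embedding(len(vocab), 2, padding_idx=0)
with torch.no_grad():
    for word, idx in vocab.items():
        if word in embed_dict:  # words missing from the file keep random init
            embed_tokens.weight[idx] = embed_dict[word]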
def add_args(parser):
    """Add model-specific arguments to the parser."""
    parser.add_argument('--dropout', type=float, metavar='D',
                        help='dropout probability')
    parser.add_argument('--attention-dropout', type=float, metavar='D',
                        help='dropout probability for attention weights')
    parser.add_argument('--relu-dropout', type=float, metavar='D',
                        help='dropout probability after ReLU in FFN')
    parser.add_argument('--input-dropout', type=float, metavar='D',
                        help='dropout probability of the inputs')
    parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                        help='path to pre-trained encoder embedding')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                        help='encoder embedding dimension')
    parser.add_argument('--encoder-conv-dim', type=int, metavar='N',
                        help='encoder convolution dimension')
    parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                        help='encoder embedding dimension for FFN')
    parser.add_argument('--encoder-layers', type=int, metavar='N',
                        help='num encoder layers')
    parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                        help='num encoder attention heads or LightConv/DynamicConv heads')
    parser.add_argument('--encoder-normalize-before', action='store_true',
                        help='apply layernorm before each encoder block')
    parser.add_argument('--encoder-learned-pos', action='store_true',
                        help='use learned positional embeddings in the encoder')
    parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                        help='path to pre-trained decoder embedding')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                        help='decoder embedding dimension')
    parser.add_argument('--decoder-conv-dim', type=int, metavar='N',
                        help='decoder convolution dimension')
    parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                        help='decoder embedding dimension for FFN')
    parser.add_argument('--decoder-layers', type=int, metavar='N',
                        help='num decoder layers')
    parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                        help='num decoder attention heads or LightConv/DynamicConv heads')
    parser.add_argument('--decoder-learned-pos', action='store_true',
                        help='use learned positional embeddings in the decoder')
    parser.add_argument('--decoder-normalize-before', action='store_true',
                        help='apply layernorm before each decoder block')
    parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                        help='share decoder input and output embeddings')
    parser.add_argument('--share-all-embeddings', action='store_true',
                        help='share encoder, decoder and output embeddings'
                             ' (requires shared dictionary and embed dim)')
    parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                        help='comma separated list of adaptive softmax cutoff points. '
                             'Must be used with adaptive_loss criterion')
    parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                        help='sets adaptive softmax dropout for the tail projections')

    # LightConv and DynamicConv arguments
    parser.add_argument('--encoder-kernel-size-list',
                        type=lambda x: options.eval_str_list(x, int),
                        help='list of kernel size (default: "[3,7,15,31,31,31,31]")')
    parser.add_argument('--decoder-kernel-size-list',
                        type=lambda x: options.eval_str_list(x, int),
                        help='list of kernel size (default: "[3,7,15,31,31,31]")')
    parser.add_argument('--encoder-glu', type=options.eval_bool,
                        help='glu after in proj')
    parser.add_argument('--decoder-glu', type=options.eval_bool,
                        help='glu after in proj')
    parser.add_argument('--encoder-conv-type', default='dynamic', type=str,
                        choices=['dynamic', 'lightweight'],
                        help='type of convolution')
    parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
                        choices=['dynamic', 'lightweight'],
                        help='type of convolution')
    parser.add_argument('--weight-softmax', default=True, type=options.eval_bool)
    parser.add_argument('--weight-dropout', type=float, metavar='D',
                        help='dropout probability for conv weights')
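
# The kernel-size-list flags above are parsed by fairseq's options.eval_str_list,
# which eval's the string and maps the element type over it. A short sketch of
# the behavior (assumes a fairseq version with this helper installed):
from fairseq import options

sizes = options.eval_str_list('[3,7,15,31,31,31]', int)
print(sizes)  # [3, 7, 15, 31, 31, 31]

# A bare comma-separated string works too, since the string is eval'd first
# (it becomes a tuple) and then converted to a list of ints.
print(options.eval_str_list('3,7,15', int))  # [3, 7, 15]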
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    # TODO
    parser.add_argument('--activation-fn',
                        choices=utils.get_available_activation_fns(),
                        help='activation function to use')
    parser.add_argument('--dropout', type=float, metavar='D',
                        help='dropout probability')
    parser.add_argument('--attention-dropout', type=float, metavar='D',
                        help='dropout probability for attention weights')
    parser.add_argument('--activation-dropout', '--relu-dropout',
                        type=float, metavar='D',
                        help='dropout probability after activation in FFN.')
    parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                        help='path to pre-trained encoder embedding')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                        help='encoder embedding dimension')
    parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                        help='encoder embedding dimension for FFN')
    parser.add_argument('--encoder-layers', type=int, metavar='N',
                        help='num encoder layers')
    parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                        help='num encoder attention heads')
    parser.add_argument('--encoder-normalize-before', action='store_true',
                        help='apply layernorm before each encoder block')
    parser.add_argument('--encoder-learned-pos', action='store_true',
                        help='use learned positional embeddings in the encoder')
    parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                        help='path to pre-trained decoder embedding')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                        help='decoder embedding dimension')
    parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                        help='decoder embedding dimension for FFN')
    parser.add_argument('--decoder-layers', type=int, metavar='N',
                        help='num decoder layers')
    parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                        help='num decoder attention heads')
    parser.add_argument('--decoder-learned-pos', action='store_true',
                        help='use learned positional embeddings in the decoder')
    parser.add_argument('--decoder-normalize-before', action='store_true',
                        help='apply layernorm before each decoder block')
    parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                        help='decoder output dimension (extra linear layer '
                             'if different from decoder embed dim)')
    parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                        help='share decoder input and output embeddings')
    parser.add_argument('--share-all-embeddings', action='store_true',
                        help='share encoder, decoder and output embeddings'
                             ' (requires shared dictionary and embed dim)')
    parser.add_argument('--no-token-positional-embeddings', default=False,
                        action='store_true',
                        help='if set, disables positional embeddings (outside self attention)')
    parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                        help='comma separated list of adaptive softmax cutoff points. '
                             'Must be used with adaptive_loss criterion')
    parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                        help='sets adaptive softmax dropout for the tail projections')
    parser.add_argument('--layernorm-embedding', action='store_true',
                        help='add layernorm to embedding')
    parser.add_argument('--no-scale-embedding', action='store_true',
                        help='if True, dont scale embeddings')
    parser.add_argument('--checkpoint-activations', action='store_true',
                        help='checkpoint activations at each layer, which saves GPU '
                             'memory usage at the cost of some additional compute')
    # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019)
    parser.add_argument('--no-cross-attention', default=False, action='store_true',
                        help='do not perform cross-attention')
    parser.add_argument('--cross-self-attention', default=False, action='store_true',
                        help='perform cross+self-attention')
    # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
    parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0,
                        help='LayerDrop probability for encoder')
    parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0,
                        help='LayerDrop probability for decoder')
    parser.add_argument('--encoder-layers-to-keep', default=None,
                        help='which layers to *keep* when pruning as a comma-separated list')
    parser.add_argument('--decoder-layers-to-keep', default=None,
                        help='which layers to *keep* when pruning as a comma-separated list')
    # args for "Training with Quantization Noise for Extreme Model Compression"
    # ({Fan*, Stock*} et al., 2020)
    parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0,
                        help='iterative PQ quantization noise at training time')
    parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8,
                        help='block size of quantization noise at training time')
    parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0,
                        help='scalar quantization noise and scalar quantization at training time')
    # for prime
    parser.add_argument('--use_att', type=str, nargs='+',
                        default=['es', 'ds', 'dc'],
                        help='')
    parser.add_argument('--kernel_size', type=int, default=0,
                        help='static kernel size; 0 means no static kernel')
    parser.add_argument('--attn_dynamic_type', type=int, default=0,
                        help='0: not used, 1: use static kernel (k>0) or depth kernel (k==0), '
                             '2: use dynamic kernel')
    parser.add_argument('--attn_cat_relu', type=int, default=0)
    parser.add_argument('--attn_wide_kernels',
                        type=lambda x: options.eval_str_list(x, int),
                        help='list of kernel size (default: "[3,15]") for wide and gate')
    parser.add_argument('--weight-dropout', type=float, metavar='D',
                        help='dropout probability for conv weights')
    parser.add_argument('--dynamic_gate', type=int, default=1, help='0,1')
    parser.add_argument('--dynamic_depth_kernels',
                        type=lambda x: options.eval_str_list(x, int),
                        help='list of kernel size (default: "[3,3,3,7,7,7,7,7,7,15,15,15]"), '
                             'for ffn or attn')
    parser.add_argument('--dynamic_padding', type=int, default=0,
                        help='padding before dynamic conv')
    parser.add_argument('--attn_dynamic_cat', type=int, default=1)
    parser.add_argument('--input_dropout', type=float, default=0, help='')
    parser.add_argument('--init_method', type=str, default='km',
                        help='xavier,km,xi,fixup')
    parser.add_argument('--lnv', type=str, default='origin',
                        help='layernorm,adanorm')
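
# The --encoder-layerdrop/--decoder-layerdrop flags above feed the LayerDrop
# trick from Fan et al. (2019): skip whole layers at random during training.
# A minimal toy sketch of the idea, independent of fairseq's
# LayerDropModuleList (names here are illustrative):
import torch
import torch.nn as nn

class ToyLayerDropList(nn.ModuleList):
    """Skip each layer with probability p at training time."""
    def __init__(self, p, layers):
        super().__init__(layers)
        self.p = p

    def run(self, x):
        for layer in self:
            # At eval time every layer runs; at train time each layer
            # survives with probability 1 - p.
            if not self.training or torch.rand(1).item() >= self.p:
                x = layer(x)
        return x

layers = ToyLayerDropList(0.2, [nn.Linear(16, 16) for _ in range(6)])
out = layers.run(torch.randn(4, 16))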
def add_args(parser):
    """Add model-specific arguments to the parser."""
    parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
                        help='dropout probability')
    parser.add_argument('--attention-dropout', default=0., type=float, metavar='D',
                        help='dropout probability for attention weights')
    parser.add_argument('--relu-dropout', default=0., type=float, metavar='D',
                        help='dropout probability after ReLU in FFN')
    parser.add_argument('--input-dropout', type=float, metavar='D',
                        help='dropout probability of the inputs')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                        help='decoder embedding dimension')
    parser.add_argument('--decoder-output-dim', type=int, metavar='N',
                        help='decoder output dimension')
    parser.add_argument('--decoder-input-dim', type=int, metavar='N',
                        help='decoder input dimension')
    parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                        help='decoder embedding dimension for FFN')
    parser.add_argument('--decoder-layers', type=int, metavar='N',
                        help='num decoder layers')
    parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                        help='num decoder attention heads or LightConv/DynamicConv heads')
    parser.add_argument('--decoder-normalize-before', default=False, action='store_true',
                        help='apply layernorm before each decoder block')
    parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                        help='comma separated list of adaptive softmax cutoff points. '
                             'Must be used with adaptive_loss criterion')
    parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                        help='sets adaptive softmax dropout for the tail projections')
    parser.add_argument('--adaptive-softmax-factor', type=float, metavar='N',
                        help='adaptive softmax factor')
    parser.add_argument('--no-token-positional-embeddings', default=False,
                        action='store_true',
                        help='if set, disables positional embeddings (outside self attention)')
    parser.add_argument('--share-decoder-input-output-embed', default=False,
                        action='store_true',
                        help='share decoder input and output embeddings')
    parser.add_argument('--character-embeddings', default=False, action='store_true',
                        help='if set, uses character embedding convolutions to produce token embeddings')
    parser.add_argument('--character-filters', type=str, metavar='LIST',
                        default='[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]',
                        help='convolutional filters for character embeddings')
    parser.add_argument('--character-embedding-dim', type=int, metavar='N', default=4,
                        help='size of character embeddings')
    parser.add_argument('--char-embedder-highway-layers', type=int, metavar='N', default=2,
                        help='number of highway layers for character token embedder')
    parser.add_argument('--adaptive-input', default=False, action='store_true',
                        help='if set, uses adaptive input')
    parser.add_argument('--adaptive-input-factor', type=float, metavar='N',
                        help='adaptive input factor')
    parser.add_argument('--adaptive-input-cutoff', metavar='EXPR',
                        help='comma separated list of adaptive input cutoff points.')
    parser.add_argument('--tie-adaptive-weights', action='store_true',
                        help='if set, ties the weights of adaptive softmax and adaptive input')
    parser.add_argument('--tie-adaptive-proj', action='store_true',
                        help='if set, ties the projection weights of adaptive softmax '
                             'and adaptive input')
    parser.add_argument('--decoder-learned-pos', action='store_true',
                        help='use learned positional embeddings in the decoder')

    # LightConv and DynamicConv arguments
    parser.add_argument('--decoder-kernel-size-list',
                        type=lambda x: options.eval_str_list(x, int),
                        help='list of kernel size (default: "[3,7,15,31,31,31]")')
    parser.add_argument('--decoder-glu', type=options.eval_bool,
                        help='glu after in proj')
    parser.add_argument('--decoder-conv-type', default='dynamic', type=str,
                        choices=['dynamic', 'lightweight'],
                        help='type of convolution')
    parser.add_argument('--weight-softmax', default=True, type=options.eval_bool)
    parser.add_argument('--weight-dropout', type=float, metavar='D',
                        help='dropout probability for conv weights')
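
# End-to-end usage of an add_args function like the one above. In fairseq these
# are @staticmethods on the model class; here it is shown as a plain function.
# A sketch assuming fairseq is installed (for the options.eval_* types):
import argparse

parser = argparse.ArgumentParser()
add_args(parser)  # the function defined above
args = parser.parse_args([
    '--decoder-embed-dim', '512',
    '--decoder-layers', '6',
    '--decoder-kernel-size-list', '[3,7,15,31,31,31]',
])
print(args.decoder_kernel_size_list)  # [3, 7, 15, 31, 31, 31]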
def build_model(cls, args, task):
    """Build a new model instance."""
    # make sure all arguments are present in older models
    base_lm_architecture(args)

    if getattr(args, 'max_target_positions', None) is None:
        args.max_target_positions = getattr(args, 'tokens_per_sample',
                                            DEFAULT_MAX_TARGET_POSITIONS)

    if args.character_embeddings:
        embed_tokens = CharacterTokenEmbedder(
            task.source_dictionary,
            eval(args.character_filters),
            args.character_embedding_dim,
            args.decoder_embed_dim,
            args.char_embedder_highway_layers,
        )
    elif args.adaptive_input:
        embed_tokens = AdaptiveInput(
            len(task.source_dictionary),
            task.source_dictionary.pad(),
            args.decoder_input_dim,
            args.adaptive_input_factor,
            args.decoder_embed_dim,
            options.eval_str_list(args.adaptive_input_cutoff, type=int),
        )
    else:
        if hasattr(task, 'vqvae_model'):
            vocab_size = args.codebook_size
            assert args.decoder_input_dim == task.vqvae_model.bottom_quantizer.dim
            # NOTE: computed but not used below
            code_embed_init = task.vqvae_model.bottom_quantizer.embed.data.transpose(0, 1)
            embed_tokens = Embedding(vocab_size + 1, args.decoder_input_dim,
                                     padding_idx=None, weight=None)
        else:
            embed_tokens = Embedding(len(task.source_dictionary),
                                     args.decoder_input_dim,
                                     task.source_dictionary.pad())

    if hasattr(task, 'vqvae_model'):
        decoder = TransformerDecoder(args, task.target_dictionary, embed_tokens,
                                     no_encoder_attn=True,
                                     pad_idx=task.padding_idx)
    else:
        if args.tie_adaptive_weights:
            assert args.adaptive_input
            assert args.adaptive_input_factor == args.adaptive_softmax_factor
            assert args.adaptive_softmax_cutoff == args.adaptive_input_cutoff, \
                '{} != {}'.format(args.adaptive_softmax_cutoff, args.adaptive_input_cutoff)
            assert args.decoder_input_dim == args.decoder_output_dim
        decoder = TransformerDecoder(
            args, task.target_dictionary, embed_tokens, no_encoder_attn=True,
        )
    return TransformerLanguageModel(decoder)
def add_args(parser):
    """Add model-specific arguments to the parser."""
    parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                        help='path to pre-trained encoder embedding')
    parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                        help='encoder embedding dimension')
    parser.add_argument('--encoder-learned-pos', action='store_true',
                        help='use learned positional embeddings in the encoder')
    parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                        help='path to pre-trained decoder embedding')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                        help='decoder embedding dimension')
    parser.add_argument('--decoder-learned-pos', action='store_true',
                        help='use learned positional embeddings in the decoder')
    parser.add_argument('--decoder-normalize-before', action='store_true',
                        help='apply layernorm before each decoder block')
    parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                        help='share decoder input and output embeddings')
    parser.add_argument('--share-all-embeddings', action='store_true',
                        help='share encoder, decoder and output embeddings'
                             ' (requires shared dictionary and embed dim)')
    parser.add_argument('--dropout', type=float, metavar='D',
                        help='dropout probability')
    parser.add_argument('--attention-dropout', type=float, metavar='D',
                        help='dropout probability for attention weights')
    parser.add_argument('--relu-dropout', type=float, metavar='D',
                        help='dropout probability after ReLU in FFN')
    parser.add_argument('--decoder-layers', type=int, metavar='N',
                        help='num layers')
    parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                        help='embedding dimension for FFN')
    parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                        help='num attention heads')
    parser.add_argument('--kernel-size-list',
                        type=lambda x: options.eval_str_list(x, int),
                        help='list of kernel size (default: None)')
    parser.add_argument('--language-embeddings', action='store_true',
                        help='use language embeddings')
    # for the Transformer-XL integration; I still believe the numbers
    # cannot really adapt -- Christine (7-2-2020)
    parser.add_argument('--d_head', type=int, default=50,
                        help='head dimension')
    parser.add_argument('--d_inner', type=int, default=1000,
                        help='inner dimension in FF')
    parser.add_argument('--pre_lnorm', action='store_true',
                        help='apply LayerNorm to the input instead of the output')
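
# --pre_lnorm (and the *-normalize-before flags elsewhere in this file) toggle
# pre- vs. post-layernorm around each sublayer. A minimal sketch of the two
# residual orderings, with a generic linear sublayer standing in for
# attention/FFN (illustrative, not the integration's actual layer code):
import torch
import torch.nn as nn

dim = 16
norm = nn.LayerNorm(dim)
sublayer = nn.Linear(dim, dim)  # stand-in for attention or FFN
x = torch.randn(4, dim)

post_ln = norm(x + sublayer(x))   # original Transformer ordering
pre_ln = x + sublayer(norm(x))    # --pre_lnorm / normalize-before ordering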
def _fairseq_opt_wrapper(opt, skip_pretrained_embedding_loading=False):
    """
    Marshals a dict into an argparse.Namespace object for API compatibility.

    Also does some necessary post-processing needed for fairseq. Optionally
    can override pretrained embedding options, which is useful if we're just
    loading a model from a checkpoint.

    :param opt: dict. ParlAI options passed around from everywhere.
    :param skip_pretrained_embedding_loading: bool. Don't preload word embeddings.

    :return: an argparse.Namespace object for use in fairseq-py.
    """
    args = argparse.Namespace()

    # first set args according to ParlAI options
    for key in opt:
        if opt[key] is not None:
            setattr(args, key, opt[key])

    # at this point the user *must* have specified an arch
    if not hasattr(args, "arch"):
        raise ValueError("--arch/-a must be specified")
    # fill in default options from the model
    models.ARCH_CONFIG_REGISTRY[args.arch](args)

    # post-processing of args. See
    # https://github.com/pytorch/fairseq/blob/v0.5.0/fairseq/options.py#L95
    if hasattr(args, "lr"):
        args.lr = options.eval_str_list(args.lr, type=float)
    if hasattr(args, "update_freq"):
        args.update_freq = options.eval_str_list(args.update_freq, int)
    if hasattr(args, "max_sentences_valid"):
        args.max_sentences_valid = args.max_sentences
    if getattr(args, "truncate") == -1:
        # some torch agents use positional embeddings, which must have a max length
        setattr(args, "truncate", 1024)
    if not hasattr(args, "max_source_positions"):
        # fairseq uses a different name for this CLI parameter
        # Sometimes it's set in model defaults, but not for all models
        setattr(args, "max_source_positions", getattr(args, "truncate"))
        # if we don't have source lengths, we don't have target lengths
        setattr(args, "max_target_positions", getattr(args, "truncate"))

    # handle modelzoo if possible
    for k in ("encoder_embed_path", "decoder_embed_path"):
        if getattr(args, k, None) is None:
            # not an argument for this model, pretrained embeddings don't matter
            continue
        elif skip_pretrained_embedding_loading:
            # if we want to skip pretrained, then hide the option from fairseq
            setattr(args, k, None)
        else:
            # otherwise we may need to modelzoo adjust the path for fairseq
            setattr(args, k, modelzoo_path(opt.get("datapath"), getattr(args, k)))

    # Here we hardcode a few options that we currently do not support
    # turn off distributed training
    args.distributed_world_size = 1
    args.distributed_rank = 0

    return args, vars(args)
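
# The core marshalling step is plain stdlib. A self-contained sketch of the
# dict-to-Namespace conversion (toy option names, no fairseq/ParlAI required):
import argparse

opt = {"arch": "transformer", "lr": "0.25", "unused": None}

args = argparse.Namespace()
for key, value in opt.items():
    if value is not None:  # None means "not set"; leave it off the namespace
        setattr(args, key, value)

assert args.arch == "transformer"
assert not hasattr(args, "unused")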
def build_model(cls, args, task):
    """Build a new model instance."""
    # make sure that all args are properly defaulted
    # (in case there are any new ones)
    base_architecture(args)

    max_source_positions = getattr(args, "max_source_positions",
                                   DEFAULT_MAX_SOURCE_POSITIONS)
    max_target_positions = getattr(args, "max_target_positions",
                                   DEFAULT_MAX_TARGET_POSITIONS)

    def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        embed_dict = utils.parse_embedding(embed_path)
        utils.print_embed_overlap(embed_dict, dictionary)
        return utils.load_embedding(embed_dict, dictionary, embed_tokens)

    # separate decoder input embeddings
    pretrained_decoder_embed = None
    if args.decoder_embed_path:
        pretrained_decoder_embed = load_pretrained_embedding_from_file(
            args.decoder_embed_path, task.target_dictionary, args.decoder_embed_dim)
    # one last double-check of parameter combinations
    if args.share_decoder_input_output_embed and (
            args.decoder_embed_dim != args.decoder_out_embed_dim):
        raise ValueError(
            "--share-decoder-input-output-embed requires "
            "--decoder-embed-dim to match --decoder-out-embed-dim")

    if args.decoder_freeze_embed:
        pretrained_decoder_embed.weight.requires_grad = False

    out_channels = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_channels, type=int)
    kernel_sizes = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_kernel_sizes, type=int)
    strides = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_strides, type=int)
    logger.info("input feature dimension: {}, channels: {}".format(
        task.feat_dim, task.feat_in_channels))
    assert task.feat_dim % task.feat_in_channels == 0
    conv_layers = ConvBNReLU(
        out_channels, kernel_sizes, strides, in_channels=task.feat_in_channels,
    ) if out_channels is not None else None

    rnn_encoder_input_size = task.feat_dim // task.feat_in_channels
    if conv_layers is not None:
        for stride in strides:
            if isinstance(stride, (list, tuple)):
                assert len(stride) > 0
                s = stride[1] if len(stride) > 1 else stride[0]
            else:
                assert isinstance(stride, int)
                s = stride
            rnn_encoder_input_size = (rnn_encoder_input_size + s - 1) // s
        rnn_encoder_input_size *= out_channels[-1]
    else:
        rnn_encoder_input_size = task.feat_dim

    scheduled_sampling_rate_scheduler = ScheduledSamplingRateScheduler(
        args.scheduled_sampling_probs,
        args.start_scheduled_sampling_epoch,
    )

    encoder = SpeechLSTMEncoder(
        conv_layers_before=conv_layers,
        input_size=rnn_encoder_input_size,
        hidden_size=args.encoder_rnn_hidden_size,
        num_layers=args.encoder_rnn_layers,
        dropout_in=args.encoder_rnn_dropout_in,
        dropout_out=args.encoder_rnn_dropout_out,
        bidirectional=args.encoder_rnn_bidirectional,
        residual=args.encoder_rnn_residual,
        max_source_positions=max_source_positions,
    )
    decoder = SpeechLSTMDecoder(
        dictionary=task.target_dictionary,
        embed_dim=args.decoder_embed_dim,
        hidden_size=args.decoder_hidden_size,
        out_embed_dim=args.decoder_out_embed_dim,
        num_layers=args.decoder_layers,
        dropout_in=args.decoder_dropout_in,
        dropout_out=args.decoder_dropout_out,
        encoder_output_units=encoder.output_units,
        attn_type=args.attention_type,
        attn_dim=args.attention_dim,
        need_attn=args.need_attention,
        residual=args.decoder_rnn_residual,
        pretrained_embed=pretrained_decoder_embed,
        share_input_output_embed=args.share_decoder_input_output_embed,
        adaptive_softmax_cutoff=(
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
            if args.criterion == "adaptive_loss" else None),
        max_target_positions=max_target_positions,
        scheduled_sampling_rate_scheduler=scheduled_sampling_rate_scheduler,
    )

    pretrained_lm = None
    if args.pretrained_lm_checkpoint:
        logger.info("loading pretrained LM from {}".format(
            args.pretrained_lm_checkpoint))
        pretrained_lm = checkpoint_utils.load_model_ensemble(
            args.pretrained_lm_checkpoint, task=task)[0][0]
        pretrained_lm.make_generation_fast_()
        # freeze pretrained model
        for param in pretrained_lm.parameters():
            param.requires_grad = False

    return cls(encoder, decoder, pretrained_lm)
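
# The feature-size bookkeeping above is worth seeing on its own: each conv
# stride ceil-divides the frequency dimension, and the result is multiplied by
# the last channel count to get the RNN input size. A self-contained sketch
# with made-up values (the real ones come from task/args):
feat_dim, in_channels = 83, 1            # e.g. 80 fbank + 3 pitch features
out_channels = [64, 64]                  # per-conv output channels
strides = [(2, 2), (2, 2)]               # (time, freq) stride per conv layer

size = feat_dim // in_channels
for stride in strides:
    s = stride[1] if isinstance(stride, (list, tuple)) else stride
    size = (size + s - 1) // s           # ceil division by the freq stride
size *= out_channels[-1]

print(size)  # 83 -> 42 -> 21 freq bins, times 64 channels = 1344 RNN inputs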
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    self.args = args
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)

    self.dropout = args.dropout
    self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.embed_dim = embed_dim
    self.output_embed_dim = args.decoder_output_dim

    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

    self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                           if embed_dim != input_embed_dim else None)

    self.embed_positions = (PositionalEmbedding(
        args.max_target_positions,
        embed_dim,
        self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None)

    self.cross_self_attention = getattr(args, "cross_self_attention", False)
    self.layer_wise_attention = getattr(args, "layer_wise_attention", False)

    self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_decoder_layer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.num_layers = len(self.layers)

    self.adaptive_softmax = None

    self.project_out_dim = (Linear(embed_dim, self.output_embed_dim, bias=False)
                            if embed_dim != self.output_embed_dim
                            and not args.tie_adaptive_weights else None)

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dictionary), self.output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5)

    if args.decoder_normalize_before and not getattr(
            args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None
    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None

    # prior/latent heads (mixture-of-softmaxes style)
    self.n_experts = 1
    self.nhidlast = self.embed_dim
    self.ninp = self.embed_dim
    self.ntoken = 9744  # NOTE: hardcoded vocabulary size
    self.prior = nn.Linear(self.nhidlast, self.n_experts, bias=False)
    self.latent = nn.Sequential(
        nn.Linear(self.nhidlast, self.n_experts * self.ninp), nn.Tanh())
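
# The prior/latent tail above mirrors the mixture-of-softmaxes construction of
# Yang et al. (2018). A minimal, self-contained sketch of how such heads can
# combine into token probabilities (n_experts > 1 here to make the mixture
# visible; all names and sizes are illustrative, not this model's forward):
import torch
import torch.nn as nn
import torch.nn.functional as F

ntoken, nhid, ninp, n_experts = 100, 32, 32, 3
prior = nn.Linear(nhid, n_experts, bias=False)
latent = nn.Sequential(nn.Linear(nhid, n_experts * ninp), nn.Tanh())
decoder_out = nn.Linear(ninp, ntoken)  # shared output embedding

h = torch.randn(8, nhid)                          # decoder hidden states
pi = F.softmax(prior(h), dim=-1)                  # (8, n_experts) mixture weights
z = latent(h).view(-1, ninp)                      # (8 * n_experts, ninp)
p = F.softmax(decoder_out(z), dim=-1).view(8, n_experts, ntoken)
prob = torch.einsum('be,bev->bv', pi, p)          # mixture over experts
assert torch.allclose(prob.sum(-1), torch.ones(8))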
def build_single_decoder(args, src_dict, dst_dict, ngram_decoder=None,
                         project_output=True, is_lm=False):
    if args.adaptive_softmax_cutoff is not None:
        project_output = False
    attention_type = args.attention_type
    encoder_hidden_dim = args.encoder_hidden_dim
    if is_lm:
        attention_type = "no"
        encoder_hidden_dim = 0
    if ngram_decoder:
        if args.ngram_activation_type == "relu":
            activation_fn = nn.ReLU
        elif args.ngram_activation_type == "tanh":
            activation_fn = nn.Tanh
        else:
            raise Exception("ngram_activation_type '%s' not implemented"
                            % args.ngram_activation_type)
        decoder = NGramDecoder(
            src_dict=src_dict,
            dst_dict=dst_dict,
            n=ngram_decoder,
            encoder_hidden_dim=encoder_hidden_dim,
            embed_dim=args.decoder_embed_dim,
            freeze_embed=args.decoder_freeze_embed,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            hidden_dim=args.decoder_hidden_dim,
            attention_type=attention_type,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            residual_level=args.residual_level,
            activation_fn=activation_fn,
            project_output=project_output,
            pretrained_embed=args.decoder_pretrained_embed,
            projection_pretrained_embed=args.decoder_out_pretrained_embed,
        )
    else:
        decoder = RNNDecoder(
            src_dict=src_dict,
            dst_dict=dst_dict,
            vocab_reduction_params=args.vocab_reduction_params,
            encoder_hidden_dim=encoder_hidden_dim,
            embed_dim=args.decoder_embed_dim,
            freeze_embed=args.decoder_freeze_embed,
            out_embed_dim=args.decoder_out_embed_dim,
            cell_type=args.cell_type,
            num_layers=args.decoder_layers,
            hidden_dim=args.decoder_hidden_dim,
            attention_type=attention_type,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            residual_level=args.residual_level,
            averaging_encoder=args.averaging_encoder,
            project_output=project_output,
            pretrained_embed=args.decoder_pretrained_embed,
            projection_pretrained_embed=args.decoder_out_pretrained_embed,
            tie_embeddings=args.decoder_tie_embeddings,
            att_weighted_src_embeds=args.att_weighted_src_embeds,
            src_embed_dim=args.encoder_embed_dim,
            att_weighted_activation_type=args.att_weighted_activation_type,
        )

    # Allow using adaptive softmax with the RNN decoder
    decoder.adaptive_softmax = None
    if args.adaptive_softmax_cutoff is not None:
        decoder.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict),
            args.decoder_out_embed_dim or args.decoder_hidden_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout,
        )
    return decoder
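
# For a standalone picture of what the adaptive softmax buys, PyTorch ships an
# equivalent module. This sketch uses torch's nn.AdaptiveLogSoftmaxWithLoss,
# not fairseq's AdaptiveSoftmax, but the cutoff semantics are the same idea:
# frequent words stay in the full-width head, rarer bands get smaller tails.
import torch
import torch.nn as nn

vocab, hidden = 50000, 512
crit = nn.AdaptiveLogSoftmaxWithLoss(hidden, vocab, cutoffs=[1000, 10000])

h = torch.randn(32, hidden)                # decoder outputs
targets = torch.randint(0, vocab, (32,))
out = crit(h, targets)
print(out.loss)                            # scalar NLL over the batch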
def add_args(parser):
    """Add model-specific arguments to the parser."""
    parser.add_argument("--dropout", type=float, metavar="D",
                        help="dropout probability")
    parser.add_argument("--attention-dropout", type=float, metavar="D",
                        help="dropout probability for attention weights")
    parser.add_argument("--relu-dropout", type=float, metavar="D",
                        help="dropout probability after ReLU in FFN")
    parser.add_argument("--input-dropout", type=float, metavar="D",
                        help="dropout probability of the inputs")
    parser.add_argument("--encoder-embed-path", type=str, metavar="STR",
                        help="path to pre-trained encoder embedding")
    parser.add_argument("--encoder-embed-dim", type=int, metavar="N",
                        help="encoder embedding dimension")
    parser.add_argument("--encoder-conv-dim", type=int, metavar="N",
                        help="encoder convolution dimension")
    parser.add_argument("--encoder-ffn-embed-dim", type=int, metavar="N",
                        help="encoder embedding dimension for FFN")
    parser.add_argument("--encoder-layers", type=int, metavar="N",
                        help="num encoder layers")
    parser.add_argument("--encoder-attention-heads", type=int, metavar="N",
                        help="num encoder attention heads or LightConv/DynamicConv heads")
    parser.add_argument("--encoder-normalize-before", action="store_true",
                        help="apply layernorm before each encoder block")
    parser.add_argument("--encoder-learned-pos", action="store_true",
                        help="use learned positional embeddings in the encoder")
    parser.add_argument("--decoder-embed-path", type=str, metavar="STR",
                        help="path to pre-trained decoder embedding")
    parser.add_argument("--decoder-embed-dim", type=int, metavar="N",
                        help="decoder embedding dimension")
    parser.add_argument("--decoder-conv-dim", type=int, metavar="N",
                        help="decoder convolution dimension")
    parser.add_argument("--decoder-ffn-embed-dim", type=int, metavar="N",
                        help="decoder embedding dimension for FFN")
    parser.add_argument("--decoder-layers", type=int, metavar="N",
                        help="num decoder layers")
    parser.add_argument("--decoder-attention-heads", type=int, metavar="N",
                        help="num decoder attention heads or LightConv/DynamicConv heads")
    parser.add_argument("--decoder-learned-pos", action="store_true",
                        help="use learned positional embeddings in the decoder")
    parser.add_argument("--decoder-normalize-before", action="store_true",
                        help="apply layernorm before each decoder block")
    parser.add_argument("--share-decoder-input-output-embed", action="store_true",
                        help="share decoder input and output embeddings")
    parser.add_argument("--share-all-embeddings", action="store_true",
                        help="share encoder, decoder and output embeddings"
                             " (requires shared dictionary and embed dim)")
    parser.add_argument("--adaptive-softmax-cutoff", metavar="EXPR",
                        help="comma separated list of adaptive softmax cutoff points. "
                             "Must be used with adaptive_loss criterion")
    parser.add_argument("--adaptive-softmax-dropout", type=float, metavar="D",
                        help="sets adaptive softmax dropout for the tail projections")

    # LightConv and DynamicConv arguments
    parser.add_argument("--encoder-kernel-size-list",
                        type=lambda x: options.eval_str_list(x, int),
                        help='list of kernel size (default: "[3,7,15,31,31,31,31]")')
    parser.add_argument("--decoder-kernel-size-list",
                        type=lambda x: options.eval_str_list(x, int),
                        help='list of kernel size (default: "[3,7,15,31,31,31]")')
    parser.add_argument("--encoder-glu", type=options.eval_bool,
                        help="glu after in proj")
    parser.add_argument("--decoder-glu", type=options.eval_bool,
                        help="glu after in proj")
    parser.add_argument("--encoder-conv-type", default="dynamic", type=str,
                        choices=["dynamic", "lightweight"],
                        help="type of convolution")
    parser.add_argument("--decoder-conv-type", default="dynamic", type=str,
                        choices=["dynamic", "lightweight"],
                        help="type of convolution")
    parser.add_argument("--weight-softmax", default=True, type=options.eval_bool)
    parser.add_argument("--weight-dropout", type=float, metavar="D",
                        help="dropout probability for conv weights")
def __init__(self, **kwargs):
    """Build a new model instance."""
    super().__init__()
    # make sure that all args are properly defaulted
    # (in case there are any new ones)
    args = Parameters()
    args.update(**kwargs)
    args.criterion = ''
    lstm_luong_wmt_en_de(args)

    if args.encoder_layers != args.decoder_layers:
        raise ValueError('--encoder-layers must match --decoder-layers')

    max_source_positions = getattr(args, 'max_source_positions',
                                   DEFAULT_MAX_SOURCE_POSITIONS)
    max_target_positions = getattr(args, 'max_target_positions',
                                   DEFAULT_MAX_TARGET_POSITIONS)

    src_dict, tgt_dict = kwargs["vocab_src"], kwargs["vocab_tgt"]
    pretrained_encoder_embed = None
    pretrained_decoder_embed = None

    # one last double-check of parameter combinations
    if args.share_decoder_input_output_embed and (
            args.decoder_embed_dim != args.decoder_out_embed_dim):
        raise ValueError(
            '--share-decoder-input-output-embed requires '
            '--decoder-embed-dim to match --decoder-out-embed-dim')

    # guard the freeze flags: no pretrained embeddings are loaded in this
    # constructor, so the embeddings may be None here
    if args.encoder_freeze_embed and pretrained_encoder_embed is not None:
        pretrained_encoder_embed.weight.requires_grad = False
    if args.decoder_freeze_embed and pretrained_decoder_embed is not None:
        pretrained_decoder_embed.weight.requires_grad = False

    self.encoder = LSTMEncoder(
        dictionary=src_dict,
        embed_dim=args.encoder_embed_dim,
        hidden_size=args.encoder_hidden_size,
        num_layers=args.encoder_layers,
        dropout_in=args.encoder_dropout_in,
        dropout_out=args.encoder_dropout_out,
        bidirectional=args.encoder_bidirectional,
        pretrained_embed=pretrained_encoder_embed,
        max_source_positions=max_source_positions)
    self.decoder = LSTMDecoder(
        dictionary=tgt_dict,
        embed_dim=args.decoder_embed_dim,
        hidden_size=args.decoder_hidden_size,
        out_embed_dim=args.decoder_out_embed_dim,
        num_layers=args.decoder_layers,
        dropout_in=args.decoder_dropout_in,
        dropout_out=args.decoder_dropout_out,
        attention=options.eval_bool(args.decoder_attention),
        encoder_output_units=self.encoder.output_units,
        pretrained_embed=pretrained_decoder_embed,
        share_input_output_embed=args.share_decoder_input_output_embed,
        adaptive_softmax_cutoff=(
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int)
            if args.criterion == 'adaptive_loss' else None),
        max_target_positions=max_target_positions,
        residuals=False)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False,
             left_pad=False, final_norm=True):
    super().__init__(dictionary)
    self.dropout = args.dropout
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    output_embed_dim = args.decoder_output_dim

    self.ordinary_sinpos = args.ordinary_sinpos
    self.represent_length_by_lrpe = args.represent_length_by_lrpe
    self.represent_length_by_ldpe = args.represent_length_by_ldpe

    padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)

    self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False, uniform=False) \
        if embed_dim != input_embed_dim else None

    self.embed_positions_original = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings and self.ordinary_sinpos else None
    self.embed_positions_lrpe = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings and self.represent_length_by_lrpe else None
    self.embed_positions_ldpe = PositionalEmbedding(
        args.max_target_positions, embed_dim, padding_idx,
        left_pad=left_pad,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings and self.represent_length_by_ldpe else None

    self.layers = nn.ModuleList([])
    self.layers.extend([
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])

    self.adaptive_softmax = None

    self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False, uniform=False) \
        if embed_dim != output_embed_dim else None

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)

    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = args.decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = LayerNorm(embed_dim)
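
# This variant keeps up to three positional tables (ordinary sinusoidal, plus
# LRPE/LDPE length encodings). For reference, a minimal sinusoidal table of
# the kind PositionalEmbedding builds in its non-learned mode -- a sketch of
# the standard formulation, not fairseq's exact layout:
import math
import torch

def sinusoidal_table(max_pos, dim):
    # Even columns get sin, odd columns get cos, with wavelengths forming a
    # geometric series up to 10000 * 2*pi.
    pos = torch.arange(max_pos, dtype=torch.float).unsqueeze(1)
    inv_freq = torch.exp(torch.arange(0, dim, 2, dtype=torch.float)
                         * -(math.log(10000.0) / dim))
    table = torch.zeros(max_pos, dim)
    table[:, 0::2] = torch.sin(pos * inv_freq)
    table[:, 1::2] = torch.cos(pos * inv_freq)
    return table

pe = sinusoidal_table(1024, 512)  # (positions, embed_dim)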
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument('--dropout', type=float, metavar='D',
                        help='dropout probability')
    parser.add_argument('--encoder-conv-channels', type=str, metavar='EXPR',
                        help="list of encoder convolution's out channels")
    parser.add_argument('--encoder-conv-kernel-sizes', type=str, metavar='EXPR',
                        help="list of encoder convolution's kernel sizes")
    parser.add_argument('--encoder-conv-strides', type=str, metavar='EXPR',
                        help="list of encoder convolution's strides")
    parser.add_argument('--encoder-rnn-hidden-size', type=int, metavar='N',
                        help="encoder rnn's hidden size")
    parser.add_argument('--encoder-rnn-layers', type=int, metavar='N',
                        help='number of rnn encoder layers')
    parser.add_argument('--encoder-rnn-bidirectional',
                        type=lambda x: options.eval_bool(x),
                        help='make all rnn layers of encoder bidirectional')
    parser.add_argument('--encoder-rnn-residual',
                        type=lambda x: options.eval_bool(x),
                        help='create residual connections for rnn encoder '
                             'layers (starting from the 2nd layer), i.e., the actual '
                             'output of such layer is the sum of its input and output')
    parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                        help='decoder embedding dimension')
    parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                        help='path to pre-trained decoder embedding')
    parser.add_argument('--decoder-freeze-embed', action='store_true',
                        help='freeze decoder embeddings')
    parser.add_argument('--decoder-hidden-size', type=int, metavar='N',
                        help='decoder hidden size')
    parser.add_argument('--decoder-layers', type=int, metavar='N',
                        help='number of decoder layers')
    parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
                        help='decoder output embedding dimension')
    parser.add_argument('--decoder-rnn-residual',
                        type=lambda x: options.eval_bool(x),
                        help='create residual connections for rnn decoder '
                             'layers (starting from the 2nd layer), i.e., the actual '
                             'output of such layer is the sum of its input and output')
    parser.add_argument('--attention-type', type=str, metavar='STR',
                        choices=['bahdanau', 'luong'],
                        help='attention type')
    parser.add_argument('--attention-dim', type=int, metavar='N',
                        help='attention dimension')
    parser.add_argument('--need-attention', action='store_true',
                        help='need to return attention tensor for the caller')
    parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                        help='comma separated list of adaptive softmax cutoff points. '
                             'Must be used with adaptive_loss criterion')
    parser.add_argument('--share-decoder-input-output-embed',
                        type=lambda x: options.eval_bool(x),
                        help='share decoder input and output embeddings')
    parser.add_argument('--pretrained-lm-checkpoint', type=str, metavar='STR',
                        help='path to load checkpoint from pretrained language model (LM), '
                             'which will be present and kept fixed during training.')

    # Granular dropout settings (if not specified these default to --dropout)
    parser.add_argument('--encoder-rnn-dropout-in', type=float, metavar='D',
                        help="dropout probability for encoder rnn's input")
    parser.add_argument('--encoder-rnn-dropout-out', type=float, metavar='D',
                        help="dropout probability for encoder rnn's output")
    parser.add_argument('--decoder-dropout-in', type=float, metavar='D',
                        help='dropout probability for decoder input embedding')
    parser.add_argument('--decoder-dropout-out', type=float, metavar='D',
                        help='dropout probability for decoder output')

    # Scheduled sampling options
    parser.add_argument('--scheduled-sampling-probs',
                        type=lambda p: options.eval_str_list(p),
                        metavar='P_1,P_2,...,P_N', default=[1.0],
                        help='scheduled sampling probabilities of sampling the truth '
                             'labels for N epochs starting from '
                             '--start-scheduled-sampling-epoch; '
                             'all later epochs use P_N')
    parser.add_argument('--start-scheduled-sampling-epoch', type=int,
                        metavar='N', default=1,
                        help='start scheduled sampling from the specified epoch')
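
# The two scheduled-sampling flags define a per-epoch probability of feeding
# ground-truth tokens. A self-contained sketch of how a rate scheduler can
# resolve the probability for a given epoch (illustrative, not espresso's
# exact ScheduledSamplingRateScheduler):
def sampling_prob(probs, start_epoch, epoch):
    """Return the truth-sampling probability for `epoch` (1-indexed)."""
    if epoch < start_epoch:
        return 1.0                        # before the schedule: always truth
    i = epoch - start_epoch
    return probs[min(i, len(probs) - 1)]  # clamp: later epochs reuse P_N

probs = [0.9, 0.8, 0.7]
for epoch in range(1, 7):
    print(epoch, sampling_prob(probs, start_epoch=2, epoch=epoch))
# 1 -> 1.0, 2 -> 0.9, 3 -> 0.8, 4..6 -> 0.7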
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
    self.args = args
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    self._future_mask = torch.empty(0)

    self.dropout_module = FairseqDropout(
        args.dropout, module_name=self.__class__.__name__)
    self.decoder_layerdrop = args.decoder_layerdrop
    self.share_input_output_embed = args.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = args.decoder_embed_dim
    self.embed_dim = embed_dim
    self.output_embed_dim = args.decoder_output_dim

    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = args.max_target_positions

    self.embed_tokens = embed_tokens
    self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

    if not args.adaptive_input and args.quant_noise_pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        self.quant_noise = None

    self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                           if embed_dim != input_embed_dim else None)

    self.embed_positions = (PositionalEmbedding(
        args.max_target_positions,
        embed_dim,
        self.padding_idx,
        learned=args.decoder_learned_pos,
    ) if not args.no_token_positional_embeddings else None)

    if getattr(args, "layernorm_embedding", False):
        self.layernorm_embedding = LayerNorm(embed_dim)
    else:
        self.layernorm_embedding = None

    self.cross_self_attention = getattr(args, "cross_self_attention", False)

    if self.decoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend([
        self.build_decoder_layer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ])
    self.num_layers = len(self.layers)

    if args.decoder_normalize_before and not getattr(
            args, "no_decoder_final_norm", False):
        self.layer_norm = LayerNorm(embed_dim)
    else:
        self.layer_norm = None

    self.project_out_dim = (Linear(embed_dim, self.output_embed_dim, bias=False)
                            if embed_dim != self.output_embed_dim
                            and not args.tie_adaptive_weights else None)

    self.adaptive_softmax = None
    self.output_projection = None
    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dictionary),
            self.output_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.adaptive_softmax_dropout,
            adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
            factor=args.adaptive_softmax_factor,
            tie_proj=args.tie_adaptive_proj,
        )
    elif self.share_input_output_embed:
        self.output_projection = nn.Linear(
            self.embed_tokens.weight.shape[1],
            self.embed_tokens.weight.shape[0],
            bias=False,
        )
        self.output_projection.weight = self.embed_tokens.weight
    else:
        self.output_projection = nn.Linear(
            self.output_embed_dim, len(dictionary), bias=False)
        nn.init.normal_(self.output_projection.weight, mean=0,
                        std=self.output_embed_dim ** -0.5)
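
# The share_input_output_embed branch above is plain weight tying: the output
# projection reuses the input embedding matrix. A minimal standalone sketch:
import torch
import torch.nn as nn

vocab, dim = 1000, 64
embed_tokens = nn.Embedding(vocab, dim)
output_projection = nn.Linear(dim, vocab, bias=False)
output_projection.weight = embed_tokens.weight  # tie: one shared Parameter

h = torch.randn(8, dim)
logits = output_projection(h)                   # (8, vocab)
# Gradients from the logits now flow into the same tensor the embedding uses.
assert output_projection.weight.data_ptr() == embed_tokens.weight.data_ptr()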
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument("--dropout", type=float, metavar="D",
                        help="dropout probability")
    parser.add_argument("--encoder-conv-channels", type=str, metavar="EXPR",
                        help="list of encoder convolution's out channels")
    parser.add_argument("--encoder-conv-kernel-sizes", type=str, metavar="EXPR",
                        help="list of encoder convolution's kernel sizes")
    parser.add_argument("--encoder-conv-strides", type=str, metavar="EXPR",
                        help="list of encoder convolution's strides")
    parser.add_argument("--encoder-rnn-hidden-size", type=int, metavar="N",
                        help="encoder rnn's hidden size")
    parser.add_argument("--encoder-rnn-layers", type=int, metavar="N",
                        help="number of rnn encoder layers")
    parser.add_argument("--encoder-rnn-bidirectional",
                        type=lambda x: options.eval_bool(x),
                        help="make all rnn layers of encoder bidirectional")
    parser.add_argument("--encoder-rnn-residual",
                        type=lambda x: options.eval_bool(x),
                        help="create residual connections for rnn encoder "
                             "layers (starting from the 2nd layer), i.e., the actual "
                             "output of such layer is the sum of its input and output")
    parser.add_argument("--decoder-embed-dim", type=int, metavar="N",
                        help="decoder embedding dimension")
    parser.add_argument("--decoder-embed-path", type=str, metavar="STR",
                        help="path to pre-trained decoder embedding")
    parser.add_argument("--decoder-freeze-embed", action="store_true",
                        help="freeze decoder embeddings")
    parser.add_argument("--decoder-hidden-size", type=int, metavar="N",
                        help="decoder hidden size")
    parser.add_argument("--decoder-layers", type=int, metavar="N",
                        help="number of decoder layers")
    parser.add_argument("--decoder-out-embed-dim", type=int, metavar="N",
                        help="decoder output embedding dimension")
    parser.add_argument("--decoder-rnn-residual",
                        type=lambda x: options.eval_bool(x),
                        help="create residual connections for rnn decoder "
                             "layers (starting from the 2nd layer), i.e., the actual "
                             "output of such layer is the sum of its input and output")
    parser.add_argument("--attention-type", type=str, metavar="STR",
                        choices=["bahdanau", "luong"],
                        help="attention type")
    parser.add_argument("--attention-dim", type=int, metavar="N",
                        help="attention dimension")
    parser.add_argument("--need-attention", action="store_true",
                        help="need to return attention tensor for the caller")
    parser.add_argument("--adaptive-softmax-cutoff", metavar="EXPR",
                        help="comma separated list of adaptive softmax cutoff points. "
                             "Must be used with adaptive_loss criterion")
    parser.add_argument("--share-decoder-input-output-embed",
                        type=lambda x: options.eval_bool(x),
                        help="share decoder input and output embeddings")
    parser.add_argument("--pretrained-lm-checkpoint", type=str, metavar="STR",
                        help="path to load checkpoint from pretrained language model (LM), "
                             "which will be present and kept fixed during training.")

    # Granular dropout settings (if not specified these default to --dropout)
    parser.add_argument("--encoder-rnn-dropout-in", type=float, metavar="D",
                        help="dropout probability for encoder rnn's input")
    parser.add_argument("--encoder-rnn-dropout-out", type=float, metavar="D",
                        help="dropout probability for encoder rnn's output")
    parser.add_argument("--decoder-dropout-in", type=float, metavar="D",
                        help="dropout probability for decoder input embedding")
    parser.add_argument("--decoder-dropout-out", type=float, metavar="D",
                        help="dropout probability for decoder output")

    # Scheduled sampling options
    parser.add_argument("--scheduled-sampling-probs",
                        type=lambda p: options.eval_str_list(p),
                        metavar="P_1,P_2,...,P_N", default=[1.0],
                        help="scheduled sampling probabilities of sampling the truth "
                             "labels for N epochs starting from "
                             "--start-scheduled-sampling-epoch; "
                             "all later epochs use P_N")
    parser.add_argument("--start-scheduled-sampling-epoch", type=int,
                        metavar="N", default=1,
                        help="start scheduled sampling from the specified epoch")
def __init__(self, args, src_dict, dst_dict, embed_tokens):
    super().__init__(dst_dict)
    self.dropout = args.dropout

    self.decoder_layerdrop = 0
    if hasattr(args, "decoder_layerdrop") and args.decoder_layerdrop > 0:
        self.decoder_layerdrop = args.decoder_layerdrop

    self.share_input_output_embed = args.share_decoder_input_output_embed

    embed_dim = embed_tokens.embedding_dim
    padding_idx = embed_tokens.padding_idx

    self.embed_tokens = embed_tokens
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = fairseq_transformer.PositionalEmbedding(
        1024, embed_dim, padding_idx, learned=args.decoder_learned_pos)

    self.aan = args.aan
    decoder_layer_class = (AANDecoderLayer if self.aan
                           else fairseq_transformer.TransformerDecoderLayer)

    self.layers = nn.ModuleList([])
    self.layers.extend(
        [decoder_layer_class(args) for i in range(args.decoder_layers)])

    if hasattr(args, "decoder_layers_to_keep") and args.decoder_layers_to_keep:
        layers_to_keep = sorted(
            int(x) for x in args.decoder_layers_to_keep.split(","))
        self.decoder_layers_to_keep = {
            layer_id: layer_idx
            for layer_idx, layer_id in enumerate(layers_to_keep)
        }

    self.adaptive_softmax = None
    self.bottleneck_layer = None
    out_embed_dim = embed_dim
    if args.decoder_out_embed_dim is not None:
        assert (
            not args.share_all_embeddings
            and not args.share_decoder_input_output_embed
        ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!"
        self.bottleneck_layer = fairseq_transformer.Linear(
            embed_dim, args.decoder_out_embed_dim)
        out_embed_dim = args.decoder_out_embed_dim

    if args.adaptive_softmax_cutoff is not None:
        self.adaptive_softmax = AdaptiveSoftmax(
            len(dst_dict),
            out_embed_dim,
            options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
            dropout=args.dropout,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(len(dst_dict), out_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim ** -0.5)

    self.vocab_reduction_module = None
    if args.vocab_reduction_params:
        assert (
            self.adaptive_softmax is None
        ), "vocabulary reduction not compatible with adaptive softmax!"
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16)

    self.onnx_trace = False

    # Use quantizable nn.Linear for output projection instead of F.linear
    self.output_projection = None
    if self.vocab_reduction_module is None:
        if self.share_input_output_embed:
            self.output_projection = nn.Linear(
                self.embed_tokens.weight.shape[1],
                self.embed_tokens.weight.shape[0])
            self.output_projection.weight = self.embed_tokens.weight
        else:
            self.output_projection = nn.Linear(
                self.embed_out.shape[1], self.embed_out.shape[0])
            self.output_projection.weight = self.embed_out
def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off parser.add_argument('--activation-fn', choices=utils.get_available_activation_fns(), help='activation function to use') parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument( '--activation-dropout', '--relu-dropout', type=float, metavar='D', help='dropout probability after activation in FFN.') parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding') parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', help='encoder embedding dimension for FFN') parser.add_argument('--encoder-layers', type=int, metavar='N', help='num encoder layers') parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block') parser.add_argument('--decoder-final-norm', default=False, action='store_true', help='apply layernorm before each decoder block') parser.add_argument( '--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder') parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads') parser.add_argument( '--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings' ' (requires shared dictionary and embed dim)') parser.add_argument( '--no-token-positional-embeddings', default=False, action='store_true', help= 'if set, disables positional embeddings (outside self attention)') parser.add_argument( '--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. ' 'Must be used with adaptive_loss criterion'), parser.add_argument( '--adaptive-softmax-dropout', type=float, metavar='D', help='sets adaptive softmax dropout for the tail projections') parser.add_argument('--use_att', type=str, nargs='+', default=[ 'es', 'ds', 'dc', ], help='') parser.add_argument('--combine', type=int, default=0, help='0 as usual 1 combine residual') parser.add_argument('--kernel_size', type=int, default=0, help='do not set static kernel') parser.add_argument( '--attn_dynamic_type', type=int, default=0, help= '0: no use,1 use static kernel(k>0) or depth kernel(k==0) 2. 
parser.add_argument('--attn_cat_relu', type=int, default=0) parser.add_argument('--attn_wide_kernels', type=lambda x: options.eval_str_list(x, int), help='list of kernel sizes (default: "[3,15]") for wide and gate') parser.add_argument('--weight-dropout', type=float, metavar='D', help='dropout probability for conv weights') parser.add_argument('--dynamic_gate', type=int, default=1, help='use dynamic gate (0 or 1)') parser.add_argument('--dynamic_depth_kernels', type=lambda x: options.eval_str_list(x, int), help='list of kernel sizes (default: "[3,3,3,7,7,7,7,7,7,15,15,15]") for ffn or attn') parser.add_argument('--dynamic_padding', type=int, default=0, help='padding before dynamic conv') parser.add_argument('--attn_dynamic_cat', type=int, default=1) parser.add_argument('--bm', type=int, default=0, help='whether to use transformer_bm') parser.add_argument('--bm_in_a', type=float, default=3, help='sqrt(6/(1+a)); -1 for xavier') parser.add_argument('--bm_out_a', type=float, default=0, help='sqrt(6/(1+a)); -1 for xavier') parser.add_argument('--bm_fc3', type=float, default=1, help='') parser.add_argument('--bm_fc4', type=float, default=1, help='') parser.add_argument('--input_dropout', type=float, default=0, help='') parser.add_argument('--init_method', type=str, default='km', help='one of: xavier, km, xi, fixup') parser.add_argument('--lnv', type=str, default='origin', help='layernorm variant (default: origin; e.g. layernorm, adanorm)')
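# A minimal sketch of how the list-valued flags above (--attn_wide_kernels,
# --dynamic_depth_kernels) are parsed: argparse passes the raw string to the
# type callable, and fairseq's options.eval_str_list turns '[3,15]' into
# [3, 15]. eval_int_list below is a simplified stand-in for that helper, not
# the fairseq implementation.
import argparse
import ast

def eval_int_list(x):
    v = ast.literal_eval(x)  # '[3,15]' -> [3, 15]; '3' -> 3
    return [int(v)] if isinstance(v, int) else [int(i) for i in v]

parser = argparse.ArgumentParser()
parser.add_argument('--attn_wide_kernels', type=eval_int_list, default=[3, 15])
args = parser.parse_args(['--attn_wide_kernels', '[5,31]'])
print(args.attn_wide_kernels)  # [5, 31]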
def build_model(cls, args, task): """Build a new model instance.""" # make sure that all args are properly defaulted (in case there are any new ones) base_architecture(args) if args.encoder_layers_to_keep: args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) max_source_positions = getattr(args, 'max_source_positions', DEFAULT_MAX_SOURCE_POSITIONS) max_target_positions = getattr(args, 'max_target_positions', DEFAULT_MAX_TARGET_POSITIONS) encoder = TransformerEncoder(args, task.source_dictionary, args.word_encoder_embed_dim, args.encoder_embed_dim) decoder = LSTMDecoder( dictionary=task.target_dictionary, embed_dim=args.decoder_embed_dim, hidden_size=args.decoder_hidden_size, out_embed_dim=args.decoder_out_embed_dim, num_layers=args.decoder_layers, dropout_in=args.decoder_dropout_in, dropout_out=args.decoder_dropout_out, attention=options.eval_bool(args.decoder_attention), encoder_output_units=encoder.output_units, pretrained_embed=None, share_input_output_embed=args.share_decoder_input_output_embed, adaptive_softmax_cutoff=(options.eval_str_list(args.adaptive_softmax_cutoff, type=int) if args.criterion == 'adaptive_loss' else None), max_target_positions=max_target_positions ) return cls(args, encoder, decoder)
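# The adaptive_softmax_cutoff plumbing in the build_model above only takes
# effect under the adaptive_loss criterion; any other criterion hands the
# decoder None, which means a full softmax. A toy version of that gating,
# with argparse.Namespace standing in for the parsed fairseq args:
from argparse import Namespace

def resolve_cutoff(args):
    if args.criterion == 'adaptive_loss':
        return [int(x) for x in args.adaptive_softmax_cutoff.split(',')]
    return None

print(resolve_cutoff(Namespace(criterion='adaptive_loss', adaptive_softmax_cutoff='10000,50000')))  # [10000, 50000]
print(resolve_cutoff(Namespace(criterion='cross_entropy', adaptive_softmax_cutoff='10000,50000')))  # None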
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ LightConvDecoderLayer(args, no_encoder_attn, kernel_size=args.decoder_kernel_size_list[i]) for i in range(args.decoder_layers) ]) self.decoder_dynamic_combination = args.decoder_dynamic_combination self.decoder_linear_combination = args.decoder_linear_combination assert not (self.decoder_dynamic_combination and self.decoder_linear_combination) if self.decoder_linear_combination or self.decoder_dynamic_combination: self.weight_ffn = nn.Sequential( nn.Linear(embed_dim, args.decoder_ffn_embed_dim), nn.ReLU(), nn.Linear(args.decoder_ffn_embed_dim, embed_dim), ) if self.decoder_dynamic_combination: self.proj = nn.ModuleList([ nn.Sequential( nn.Linear(embed_dim * args.decoder_layers, embed_dim * 2), nn.ReLU(), nn.Linear(embed_dim * 2, embed_dim)) for _ in range(args.decoder_layers) ]) if self.decoder_linear_combination: self.weights = nn.ParameterList([ nn.Parameter(torch.randn(1, 1, embed_dim), requires_grad=True) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \ if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim)
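# A shape-level sketch (assumed, since the forward is not shown here) of the
# decoder_linear_combination path set up above: each layer output (T x B x C)
# is scaled by its learned per-channel weight and the results are summed into
# one state that feeds the output projection.
import torch
import torch.nn as nn

T, B, C, L = 7, 2, 16, 4
layer_outputs = [torch.randn(T, B, C) for _ in range(L)]
weights = nn.ParameterList([nn.Parameter(torch.randn(1, 1, C)) for _ in range(L)])

combined = sum(w * h for w, h in zip(weights, layer_outputs))  # broadcasts over T and B
print(combined.shape)  # torch.Size([7, 2, 16])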
def __init__(self, args, dictionary, embed_tokens, lang2idx2idx, M, N, no_encoder_attn=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim # define a dict of lang vocab id to its index in syntactic matrix self.lang2idx2idx = torch.LongTensor(lang2idx2idx) # define semantic and syntactic matrices no_langs = len([i for i in self.lang2idx2idx if i>-1]) self.M = M self.N = N padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim)
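# The lang2idx2idx buffer built above is a dense lookup table indexed by
# vocabulary id: the value is that language token's row in the syntactic
# matrices M/N, or -1 for ordinary tokens. A toy version with a 10-token
# vocabulary whose ids 4 and 5 are language tokens (layout assumed):
import torch

lang2idx2idx = torch.full((10,), -1, dtype=torch.long)
lang2idx2idx[4] = 0  # vocab id 4 -> row 0 of M/N
lang2idx2idx[5] = 1  # vocab id 5 -> row 1 of M/N
no_langs = int((lang2idx2idx > -1).sum())  # same count as the list comprehension above
print(no_langs)  # 2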
def parse_args_and_adversary(parser, input_args=None): """This does the same thing as fairseq.options.parse_args_and_arch but for the criterion and adversary only""" # The parser doesn't know about adversary/criterion-specific args, so # we parse twice. First we parse the adversary/criterion, then we # parse a second time after adding the *-specific arguments. # If input_args is given, we will parse those args instead of sys.argv. args, _ = parser.parse_known_args(input_args) # Add model-specific args to parser. if hasattr(args, "arch"): model_specific_group = parser.add_argument_group( "Model-specific configuration", # Only include attributes which are explicitly given as command-line # arguments or which have default values. argument_default=argparse.SUPPRESS, ) ARCH_MODEL_REGISTRY[args.arch].add_args(model_specific_group) # Add adversary-specific args to parser. adversary_specific_group = parser.add_argument_group( f'Arguments for adversary "{args.adversary}"', # Only include attributes which are explicitly given as command-line # arguments or which have default values. argument_default=argparse.SUPPRESS, ) ADVERSARY_REGISTRY[args.adversary].add_args(adversary_specific_group) # Add adversarial criterion-specific args to parser. adv_criterion_specific_group = parser.add_argument_group( f'Arguments for criterion "{args.adv_criterion}"', # Only include attributes which are explicitly given as command-line # arguments or which have default values. argument_default=argparse.SUPPRESS, ) CRITERION_REGISTRY[args.adv_criterion].add_args( adv_criterion_specific_group) if hasattr(args, "criterion"): # Add criterion-specific args to parser. criterion_specific_group = parser.add_argument_group( f'Arguments for criterion "{args.criterion}"', # Only include attributes which are explicitly given as command-line # arguments or which have default values. argument_default=argparse.SUPPRESS, ) CRITERION_REGISTRY[args.criterion].add_args(criterion_specific_group) # Add other *-specific args to parser. if hasattr(args, "optimizer"): OPTIMIZER_REGISTRY[args.optimizer].add_args(parser) if hasattr(args, "lr_scheduler"): LR_SCHEDULER_REGISTRY[args.lr_scheduler].add_args(parser) if hasattr(args, "task"): TASK_REGISTRY[args.task].add_args(parser) # Parse a second time. args = parser.parse_args(input_args) # Post-process args. if hasattr(args, "lr"): args.lr = eval_str_list(args.lr, type=float) if hasattr(args, "update_freq"): args.update_freq = eval_str_list(args.update_freq, type=int) if hasattr(args, "max_sentences_valid") and args.max_sentences_valid is None: args.max_sentences_valid = args.max_sentences # The following line is a hack to be able to use the cross_entropy # criterion without polluting the command line with unnecessary arguments if not hasattr(args, "sentence_avg"): args.sentence_avg = False # this is another hack to ignore the multilingual case if not hasattr(args, "multiling_source_lang"): args.multiling_source_lang = None # Apply architecture configuration. if hasattr(args, "arch"): ARCH_CONFIG_REGISTRY[args.arch](args) return args
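# The two-pass argparse pattern used by parse_args_and_adversary above,
# reduced to its core: parse_known_args discovers which component was chosen,
# that component's flags are then registered, and the command line is parsed
# again. The registry here is a plain dict standing in for fairseq's
# ADVERSARY_REGISTRY; 'fgsm' and --eps are invented for the example.
import argparse

def add_fgsm_args(group):
    group.add_argument('--eps', type=float, default=0.1)

ADVERSARY_REGISTRY = {'fgsm': add_fgsm_args}  # hypothetical registry

argv = ['--adversary', 'fgsm', '--eps', '0.3']
parser = argparse.ArgumentParser()
parser.add_argument('--adversary', default='fgsm')
args, _ = parser.parse_known_args(argv)  # first pass: --eps is still unknown

group = parser.add_argument_group(f'Arguments for adversary "{args.adversary}"', argument_default=argparse.SUPPRESS)
ADVERSARY_REGISTRY[args.adversary](group)

args = parser.parse_args(argv)  # second pass: all flags known
print(args.eps)  # 0.3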
def build_model(cls, args, task): """Build a new model instance.""" # make sure all arguments are present in older models base_lm_architecture(args) if args.decoder_layers_to_keep: args.decoder_layers = len(args.decoder_layers_to_keep.split(",")) if getattr(args, "max_target_positions", None) is None: args.max_target_positions = getattr(args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS) if args.character_embeddings: embed_tokens = CharacterTokenEmbedder( task.source_dictionary, eval(args.character_filters), args.character_embedding_dim, args.decoder_embed_dim, args.char_embedder_highway_layers, ) elif args.adaptive_input: embed_tokens = AdaptiveInput( len(task.source_dictionary), task.source_dictionary.pad(), args.decoder_input_dim, args.adaptive_input_factor, args.decoder_embed_dim, options.eval_str_list(args.adaptive_input_cutoff, type=int), args.quant_noise_pq, args.quant_noise_pq_block_size, ) else: embed_tokens = cls.build_embedding(args, task.source_dictionary, args.decoder_input_dim) if args.tie_adaptive_weights: assert args.adaptive_input assert args.adaptive_input_factor == args.adaptive_softmax_factor assert (args.adaptive_softmax_cutoff == args.adaptive_input_cutoff ), "{} != {}".format(args.adaptive_softmax_cutoff, args.adaptive_input_cutoff) assert args.decoder_input_dim == args.decoder_output_dim decoder = TransformerDecoder(args, task.target_dictionary, embed_tokens, no_encoder_attn=True) if getattr(args, "lm_path", None): print('load Transformer_LM from {}'.format(args.lm_path)) state = checkpoint_utils.load_checkpoint_to_cpu(args.lm_path) lm_args = state["args"] lm_args.data = args.data assert getattr(lm_args, "lm_path", None) is None task = tasks.setup_task(lm_args) decoder = task.build_model(lm_args) print('restore Transformer_LM from {}'.format(args.lm_path)) decoder.load_state_dict(state["model"], strict=True) decoder.dim_output = len(task.dictionary) return cls(decoder)
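# A sketch of the *_layers_to_keep convention used in the build_model above
# (and in the decoder __init__ earlier): the flag is a comma-separated list
# of original layer ids, the model is rebuilt with only that many layers, and
# an {original_id: new_index} map supports re-homing checkpoint weights.
layers_to_keep = sorted(int(x) for x in "0,3,5".split(","))
decoder_layers = len(layers_to_keep)
remap = {layer_id: idx for idx, layer_id in enumerate(layers_to_keep)}
print(decoder_layers, remap)  # 3 {0: 0, 3: 1, 5: 2}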
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, left_pad=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed self.dictionary = dictionary input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False, uniform=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, left_pad=left_pad, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False, uniform=False) if embed_dim != output_embed_dim else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim) self.head_nums = args.decoder_attention_heads # complementary self.head_dim = embed_dim // args.decoder_attention_heads self.attn_out = Linear(self.head_dim, self.head_dim) self.re_fc_1 = Linear(self.head_dim, self.head_dim) self.re_fc_2 = Linear(self.head_dim, self.head_dim) self.re_layer_norm_1 = LayerNorm(self.head_dim) self.re_layer_norm_2 = LayerNorm(self.head_dim) self.re_embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.head_dim)) nn.init.normal_(self.re_embed_out, mean=0, std=self.head_dim ** -0.5)
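# A shape-level sketch of the auxiliary "complementary" head whose modules
# are created at the end of the __init__ above: a head_dim-sized state runs
# through two FC + LayerNorm steps and is scored against the separate
# re_embed_out table. The residual wiring here is an assumption; only the
# module shapes come from the code above.
import torch
import torch.nn as nn
import torch.nn.functional as F

vocab, embed_dim, heads = 100, 16, 4
head_dim = embed_dim // heads

re_fc_1 = nn.Linear(head_dim, head_dim)
re_fc_2 = nn.Linear(head_dim, head_dim)
re_ln_1 = nn.LayerNorm(head_dim)
re_ln_2 = nn.LayerNorm(head_dim)
re_embed_out = nn.Parameter(torch.randn(vocab, head_dim) * head_dim ** -0.5)

x = torch.randn(2, 5, head_dim)
x = re_ln_1(x + re_fc_1(x))
x = re_ln_2(x + re_fc_2(x))
logits = F.linear(x, re_embed_out)
print(logits.shape)  # torch.Size([2, 5, 100])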
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ transformer_with_copyDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.copy_attention = MultiheadOnlyAttention( embed_dim, 1, dropout=args.attention_dropout, ) self.copy_or_generate = nn.Sequential(nn.Linear(embed_dim, 1), nn.Sigmoid()) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \ if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim)
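# The copy_or_generate module above is a per-position gate in (0, 1). In a
# pointer-generator style decoder the final distribution mixes generation and
# copy probabilities; the exact mixing lives in the forward (not shown), so
# treat the formula below as an assumed but standard form.
import torch
import torch.nn as nn

embed_dim, vocab = 16, 100
copy_or_generate = nn.Sequential(nn.Linear(embed_dim, 1), nn.Sigmoid())

h = torch.randn(2, 5, embed_dim)
p_copy = copy_or_generate(h)  # (2, 5, 1), gate per target position
p_vocab = torch.softmax(torch.randn(2, 5, vocab), dim=-1)  # generator distribution
p_attn = torch.softmax(torch.randn(2, 5, vocab), dim=-1)   # copy distribution (attn scattered to vocab in practice)
mixed = p_copy * p_attn + (1 - p_copy) * p_vocab
print(torch.allclose(mixed.sum(-1), torch.ones(2, 5)))  # True: still a distribution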
def __init__(self, args, dictionary, embed_tokens, embed_scale=None, no_encoder_attn=False, left_pad=False, final_norm=True, remove_head=False): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim self.embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( self.embed_dim) if embed_scale is None else embed_scale self.project_in_dim = nn.Linear( input_embed_dim, self.embed_dim, bias=False) if self.embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, self.embed_dim, self.padding_idx, # learned=args.decoder_learned_pos, ) if not args.no_dec_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = nn.Linear(self.embed_dim, output_embed_dim, bias=False) \ if self.embed_dim != output_embed_dim and not args.tie_adaptive_weights else None # self.load_softmax = not getattr(args, 'remove_head', False) self.load_softmax = not remove_head if self.load_softmax: if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), output_embed_dim)) # nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = BertLayerNorm(self.embed_dim)
def __init__(self, args, dictionary, embed_tokens, problinkinput, problinkweight, no_encoder_attn=False, left_pad=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed self.problinkinput = problinkinput self.problinkweight = problinkweight input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, left_pad=left_pad, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerBayesDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.layers_cls = nn.ModuleList([]) self.layers_cls.extend([ DACls.build_bayesclassifier(args, args.decoder_embed_dim) for i in range(args.decoder_layers) ]) self.encoder_out_cls = DACls.build_bayesclassifier( args, args.encoder_embed_dim) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \ if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim)
def __init__( self, args, src_dict, dst_dict, embed_tokens, no_encoder_attn=False, left_pad=False, final_norm=True, ): super().__init__(dst_dict) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None) self.embed_positions = fairseq_transformer.PositionalEmbedding( 1024, embed_dim, padding_idx, learned=args.decoder_learned_pos) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerAANDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.bottleneck_layer = None out_embed_dim = embed_dim if args.decoder_out_embed_dim is not None: assert ( not args.share_all_embeddings and not args.share_decoder_input_output_embed ), "--decoder-out-embed-dim is incompatible with sharing output embeddings!" self.bottleneck_layer = Linear(embed_dim, args.decoder_out_embed_dim) out_embed_dim = args.decoder_out_embed_dim if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dst_dict), out_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dst_dict), out_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=out_embed_dim**-0.5) self.register_buffer("version", torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim) self.vocab_reduction_module = None if args.vocab_reduction_params: assert ( self.adaptive_softmax is None ), "vocabulary reduction not compatible with adaptive softmax!" self.vocab_reduction_module = vocab_reduction.VocabReduction( src_dict, dst_dict, args.vocab_reduction_params, fp16=args.fp16) self.onnx_trace = False
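# A sketch of the bottleneck path configured above: with
# --decoder-out-embed-dim set (and output embeddings unshared), decoder
# states are projected down to out_embed_dim before being scored against
# embed_out. Dimensions are illustrative only.
import torch
import torch.nn as nn
import torch.nn.functional as F

embed_dim, out_embed_dim, vocab = 16, 8, 100
bottleneck_layer = nn.Linear(embed_dim, out_embed_dim)
embed_out = torch.randn(vocab, out_embed_dim) * out_embed_dim ** -0.5

h = torch.randn(2, 5, embed_dim)
logits = F.linear(bottleneck_layer(h), embed_out)
print(logits.shape)  # torch.Size([2, 5, 100])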
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt( embed_dim) self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.cross_self_attention = getattr(args, 'cross_self_attention', False) self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if args.decoder_normalize_before and not getattr( args, 'no_decoder_final_norm', False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, 'layernorm_embedding', False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None
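# The decoder_layerdrop stored above is consumed in the forward (not shown):
# during training each layer is skipped with probability p, and at inference
# every layer runs (LayerDrop, Fan et al., 2019). A minimal sketch of that
# loop, with identity-plus-one "layers" for brevity:
import torch

def forward_with_layerdrop(x, layers, p, training=True):
    for layer in layers:
        if training and torch.rand(()).item() < p:
            continue  # drop this whole layer for this forward pass
        x = layer(x)
    return x

layers = [lambda t: t + 1 for _ in range(6)]
out = forward_with_layerdrop(torch.zeros(1), layers, p=0.5)
print(out)  # anywhere from 0 to 6 layers applied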
def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding') parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') parser.add_argument( '--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder') parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument( '--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings' ' (requires shared dictionary and embed dim)') parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--attention-dropout', type=float, metavar='D', help='dropout probability for attention weights') parser.add_argument('--relu-dropout', type=float, metavar='D', help='dropout probability after ReLU in FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num layers') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='embedding dimension for FFN') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num attention heads') parser.add_argument('--kernel-size-list', type=lambda x: options.eval_str_list(x, int), help='list of kernel size (default: None)') parser.add_argument('--language-embeddings', action='store_true', help='use language embeddings')