def __init__(self, config, dataset):
    ''' Initialize the Transformer '''
    super(NewTransformer, self).__init__()
    self.dataset = dataset
    self.embedding = TokenEmbedding(
        dataset.vocab_size,
        config.embedding_size,
        padding_idx=self.padding_idx
    )
    self.position_embedding = PositionEmbedding(config.embedding_size)
    self.dropout = nn.Dropout(config.dropout_p, inplace=True)

    # Unique attention offsets/stds gathered across the encoder, decoder,
    # and encoder-decoder attention configurations
    self.attn_ofs_uniq = list(set(
        config.enc_attn_offset + config.dec_attn_offset +
        config.enc_dec_attn_offset))
    self.attn_std_uniq = list(set(
        config.enc_attn_std + config.dec_attn_std +
        config.enc_dec_attn_std))

    # Allow for overriding the encoders and decoders in derived classes
    self.encoders = self.create_encoders(config)
    self.decoders = self.create_decoders(config)

    self.label_smoothing = LabelSmoothingLoss(
        config.label_smoothing or 0,
        ignore_index=self.padding_idx,
        reduction='none'
    )
    self.cross_entropy = nn.CrossEntropyLoss(
        ignore_index=self.padding_idx,
        reduction='none'
    )
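# A minimal, hypothetical sketch of the "unique attention attributes" bookkeeping
# above: the per-layer offset/std lists from the config are concatenated and
# deduplicated so downstream modules can index into one shared collection. The
# list values below are made up for illustration, and list(set(...)) does not
# preserve any particular order.
enc_attn_offset, dec_attn_offset, enc_dec_attn_offset = [0, 1], [0], [0, 2]
enc_attn_std, dec_attn_std, enc_dec_attn_std = [0.5, 1.0], [1.0], [1.0, 2.0]

attn_ofs_uniq = list(set(enc_attn_offset + dec_attn_offset + enc_dec_attn_offset))
attn_std_uniq = list(set(enc_attn_std + dec_attn_std + enc_dec_attn_std))
print(sorted(attn_ofs_uniq))  # [0, 1, 2]
print(sorted(attn_std_uniq))  # [0.5, 1.0, 2.0]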
def __init__(self, config, dataset):
    ''' Initialize the NPLM '''
    super(NPLM, self).__init__()
    self.dataset = dataset
    self.adaptive = config.adaptive

    # ngm: number of tokens concatenated with their full embeddings
    # wsz: window size to average over for the long-term context
    self.ngm, self.wsz = config.context_config
    self.long_term_block = (0 if self.ngm > 0 and self.wsz == -1
                            else (config.batch_length - self.ngm) // self.wsz)
    self.dim_concat_embs = (self.ngm * config.embedding_size +
                            self.long_term_block * config.embedding_size)

    self.embedding = TokenEmbedding(
        dataset.vocab_size,
        config.embedding_size,
        config.model_size,
        config.cutoffs,
        emb_std=config.emb_std,
        proj_std=config.proj_std,
        div_val=config.div_val,
        padding_idx=self.padding_idx,
        do_proj=config.do_proj
    )

    if self.adaptive:
        self.adaptive_softmax = AdaptiveSoftmax(
            self.dataset.vocab_size,
            config.embedding_size,
            config.embedding_size,
            config.cutoffs,
            div_val=config.div_val
        )
        self.tie_weights = config.tie_weights
        self.tie_projs = config.tie_projs
        if self.tie_weights:
            # Share the output (softmax) weights with the input embeddings
            for i in range(len(self.adaptive_softmax.out_layers)):
                self.adaptive_softmax.out_layers[i].weight = \
                    self.embedding.emb_layers[i].weight
        if self.tie_projs:
            # Share the output projections with the embedding projections
            for i in range(1, len(self.adaptive_softmax.out_projs)):
                if config.div_val == 1 and config.model_size != config.embedding_size:
                    self.adaptive_softmax.out_projs[i] = self.embedding.emb_projs[0]
                elif config.div_val != 1:
                    self.adaptive_softmax.out_projs[i] = self.embedding.emb_projs[i]

    self.layers = self.create_layers(config)
    self.position_embedding = PositionEmbedding(config.model_size)  # only used in Transformer-N

    self.label_smoothing = LabelSmoothingLoss(
        config.label_smoothing or 0,
        ignore_index=self.padding_idx,
        reduction='none'
    )
    self.cross_entropy = nn.CrossEntropyLoss(
        ignore_index=self.padding_idx,
        reduction='none'
    )
    self.dropout = nn.Dropout(config.dropout_p, inplace=True)
    self.config = config
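# A worked example (with made-up numbers) of the context bookkeeping above: the
# ngm most recent tokens keep their full embeddings, and the remaining
# batch_length - ngm positions are grouped into windows of size wsz whose
# embeddings are averaged, yielding long_term_block extra context vectors. All
# values below are hypothetical; only the arithmetic mirrors the code.
batch_length, embedding_size = 36, 300
ngm, wsz = 4, 8  # config.context_config
long_term_block = 0 if ngm > 0 and wsz == -1 else (batch_length - ngm) // wsz
dim_concat_embs = ngm * embedding_size + long_term_block * embedding_size
print(long_term_block)   # 4 averaged long-term blocks
print(dim_concat_embs)   # (4 + 4) * 300 = 2400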
def __init__(self, config, dataset):
    ''' Initialize the Transformer '''
    super(ProbeNewTransformer, self).__init__()
    self.dataset = dataset
    self.span = config.span
    self.embedding = TokenEmbedding(
        dataset.vocab_size,
        config.embedding_size,
        padding_idx=self.padding_idx
    )
    self.position_embedding = PositionEmbedding(config.embedding_size)
    self.dropout = nn.Dropout(config.dropout_p, inplace=True)

    # Allow for overriding the encoders and decoders in derived classes
    self.encoders = self.create_encoders(config)
    self.decoders = self.create_decoders(config)

    self.label_smoothing = LabelSmoothingLoss(
        config.label_smoothing or 0,
        ignore_index=self.padding_idx,
        reduction='none'
    )
    self.cross_entropy = nn.CrossEntropyLoss(
        ignore_index=self.padding_idx,
        reduction='none'
    )
def __init__(self, config, dataset):
    ''' Initialize the Transformer '''
    super(Transformer, self).__init__()
    self.dataset = dataset
    self.config = config
    self.adaptive = config.adaptive
    self.embedding = TokenEmbedding(
        dataset.vocab_size,
        config.embedding_size,
        config.model_size,
        config.cutoffs,
        emb_std=config.emb_std,
        proj_std=config.proj_std,
        div_val=config.div_val,
        padding_idx=self.padding_idx,
        do_proj=config.do_proj
    )

    if self.adaptive:
        self.adaptive_softmax = AdaptiveSoftmax(
            self.dataset.vocab_size,
            config.embedding_size,
            config.model_size,
            config.cutoffs,
            div_val=config.div_val
        )
        self.tie_weights = config.tie_weights
        self.tie_projs = config.tie_projs
        if self.tie_weights:
            # Share the output (softmax) weights with the input embeddings
            for i in range(len(self.adaptive_softmax.out_layers)):
                self.adaptive_softmax.out_layers[i].weight = \
                    self.embedding.emb_layers[i].weight
        if self.tie_projs:
            # Share the output projections with the embedding projections
            for i in range(1, len(self.adaptive_softmax.out_projs)):
                if config.div_val == 1 and config.model_size != config.embedding_size:
                    self.adaptive_softmax.out_projs[i] = self.embedding.emb_projs[0]
                elif config.div_val != 1:
                    self.adaptive_softmax.out_projs[i] = self.embedding.emb_projs[i]

    self.position_embedding = PositionEmbedding(config.embedding_size)
    self.dropout = nn.Dropout(config.dropout_p, inplace=True)

    # Broadcast a single no-attention flag to every layer, then sanity-check
    if len(config.no_attention) == 1:
        config.no_attention = config.no_attention * config.num_layers
    assert len(config.no_attention) == config.num_layers

    self.layers = self.create_layers(config)

    self.label_smoothing = LabelSmoothingLoss(
        config.label_smoothing or 0,
        ignore_index=self.padding_idx,
        reduction='none'
    )
    self.cross_entropy = nn.CrossEntropyLoss(
        ignore_index=self.padding_idx,
        reduction='none'
    )
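# A minimal sketch of what the tie_weights loop above relies on: assigning one
# module's weight Parameter to another makes both modules share a single tensor,
# so the input embedding and output softmax are updated together. nn.Embedding
# and nn.Linear stand in here for the repo's emb_layers / out_layers, and the
# sizes are made up.
import torch
import torch.nn as nn

vocab_size, model_size = 1000, 64
emb_layer = nn.Embedding(vocab_size, model_size)
out_layer = nn.Linear(model_size, vocab_size, bias=False)

out_layer.weight = emb_layer.weight          # tie: both modules point at one Parameter
assert out_layer.weight.data_ptr() == emb_layer.weight.data_ptr()

logits = out_layer(emb_layer(torch.tensor([1, 2, 3])))
logits.sum().backward()                      # gradients from both uses accumulate
print(emb_layer.weight.grad.shape)           # torch.Size([1000, 64])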
def __init__(self, config, dataset):
    ''' Initialize the Transformer '''
    super(InterleaveFixedPosEmbEncoderOnlyTransformer, self).__init__()
    self.dataset = dataset
    self.embedding = TokenEmbedding(
        dataset.vocab_size,
        config.embedding_size,
        padding_idx=self.padding_idx
    )
    self.position_embedding = PositionEmbedding(config.embedding_size)
    self.num_layers = config.num_layers

    # One learned positional embedding table per pair of encoder layers
    encoder_positional_embedding_list = []
    for i in range(self.num_layers // 2):
        position_embedding_encoder = LearnedPositionalEmbedding(
            dataset.max_input_length, config.embedding_size, self.padding_idx)
        nn.init.normal_(position_embedding_encoder.weight,
                        mean=0, std=config.embedding_size ** -0.5)
        if self.padding_idx is not None:
            nn.init.constant_(
                position_embedding_encoder.weight[self.padding_idx], 0)
        encoder_positional_embedding_list.append(position_embedding_encoder)
    self.encoder_positional_embeddings = nn.ModuleList(
        encoder_positional_embedding_list)

    # Single learned positional embedding table for the decoder
    self.position_embedding_decoder = LearnedPositionalEmbedding(
        dataset.max_target_length, config.embedding_size, self.padding_idx)
    nn.init.normal_(self.position_embedding_decoder.weight,
                    mean=0, std=config.embedding_size ** -0.5)
    if self.padding_idx is not None:
        nn.init.constant_(
            self.position_embedding_decoder.weight[self.padding_idx], 0)

    self.dropout = nn.Dropout(config.dropout_p, inplace=True)

    # Unique attention offsets/stds gathered across the encoder, decoder,
    # and encoder-decoder attention configurations
    self.attn_ofs_uniq = list(set(
        config.enc_attn_offset + config.dec_attn_offset +
        config.enc_dec_attn_offset))
    self.attn_std_uniq = list(set(
        config.enc_attn_std + config.dec_attn_std +
        config.enc_dec_attn_std))

    # Allow for overriding the encoders and decoders in derived classes
    self.encoders = self.create_encoders(config)
    self.decoders = self.create_decoders(config)

    self.label_smoothing = LabelSmoothingLoss(
        config.label_smoothing or 0,
        ignore_index=self.padding_idx,
        reduction='none'
    )
    self.cross_entropy = nn.CrossEntropyLoss(
        ignore_index=self.padding_idx,
        reduction='none'
    )
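# A small illustration of the initialization pattern used above, with hypothetical
# sizes and nn.Embedding standing in for the repo's LearnedPositionalEmbedding:
# each of the num_layers // 2 encoder position tables is drawn from
# N(0, embedding_size ** -0.5) and its padding row is zeroed so pad positions
# contribute nothing.
import torch.nn as nn

max_input_length, embedding_size, padding_idx, num_layers = 128, 64, 0, 6

tables = []
for _ in range(num_layers // 2):              # one table per pair of encoder layers
    table = nn.Embedding(max_input_length, embedding_size, padding_idx=padding_idx)
    nn.init.normal_(table.weight, mean=0, std=embedding_size ** -0.5)
    nn.init.constant_(table.weight[padding_idx], 0)
    tables.append(table)
encoder_positional_embeddings = nn.ModuleList(tables)
print(len(encoder_positional_embeddings))     # 3 learned position tables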