def __init__(self, SRC: data.Field, TGT: data.Field):
    """Build the full Transformer: label-smoothing criterion, encoder/decoder
    stacks, embeddings, generator, and beam-search knobs, all sized from ``cfg``.

    :param SRC: the trained torchtext.data.Field object containing the source side vocabulary
    :param TGT: the trained torchtext.data.Field object containing the target side vocabulary
    """
    super(Transformer, self).__init__()
    self.SRC = SRC
    self.TGT = TGT
    # #################################### Parameter Initialization ################################################
    # All hyper-parameters come from the global cfg object; cast explicitly
    # since cfg values may be strings (presumably parsed from a config file — confirm).
    d_model = int(cfg.transformer_d_model)
    h = int(cfg.transformer_h)
    dropout = float(cfg.transformer_dropout)
    d_ff = int(cfg.transformer_d_ff)
    max_len = int(cfg.transformer_max_len)
    N = int(cfg.transformer_N)
    loss_smoothing = float(cfg.transformer_loss_smoothing)
    # #################################### Loss Function Initialization ############################################
    # Label smoothing over the target vocabulary; the pad index is excluded from the loss.
    self.criterion = LabelSmoothing(
        size=len(TGT.vocab), padding_idx=TGT.vocab.stoi[cfg.pad_token], smoothing=loss_smoothing)
    # #################################### ENCODER INITIALIZATION ##################################################
    # c(x) deep-copies a prototype module so each layer gets its own parameters
    # instead of sharing one attn/ff instance across the stack.
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    encoder_layer = EncoderLayer(d_model, c(attn), c(ff), dropout)
    self.enc_layers = clones(encoder_layer, N)
    # Final normalization applied after the encoder stack.
    self.enc_norm = LayerNorm(encoder_layer.size)
    # #################################### DECODER INITIALIZATION ##################################################
    position = PositionalEncoding(d_model, dropout, max_len)
    # Decoder layers carry two attention modules: masked self-attention and
    # encoder-decoder (source) attention, hence two c(attn) copies.
    decoder_layer = DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout)
    self.dec_layers = clones(decoder_layer, N)
    self.dec_norm = LayerNorm(decoder_layer.size)
    # #################################### EMBEDDINGS INITIALIZATION ###############################################
    # Token embedding followed by (a private deep copy of) positional encoding.
    self.src_embed = nn.Sequential(Embeddings(d_model, len(SRC.vocab)), c(position))
    self.tgt_embed = nn.Sequential(Embeddings(d_model, len(TGT.vocab)), c(position))
    # #################################### GENERATOR INITIALIZATION ################################################
    # Projects decoder output back to target-vocabulary logits/probabilities.
    self.generator = Generator(d_model, len(TGT.vocab))
    # #################################### BEAM SEARCH PARAMETERS ##################################################
    # Greedy decoding by default; callers flip this flag to enable beam search.
    self.beam_search_decoding = False
    self.beam_size = int(cfg.beam_size)
    self.beam_search_length_norm_factor = float(
        cfg.beam_search_length_norm_factor)
    self.beam_search_coverage_penalty_factor = float(
        cfg.beam_search_coverage_penalty_factor)
def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
    """One decoder block: masked self-attention, encoder-decoder attention,
    and a position-wise feed-forward net, each wrapped in a residual sublayer.

    :param size: model width (d_model), kept on the instance for outer norms
    :param self_attn: attention module attending over the decoder's own inputs
    :param src_attn: attention module attending over the encoder memory
    :param feed_forward: position-wise feed-forward module
    :param dropout: dropout rate handed to each SublayerConnection
    """
    super(DecoderLayer, self).__init__()
    self.size = size
    # Register the three sub-modules in the same order they are applied.
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward = feed_forward
    # One residual-plus-norm wrapper per sub-module above — three in total.
    self.sublayer = clones(SublayerConnection(size, dropout), 3)
def __init__(self, h, d_model, dropout=0.1):
    """
    Implements Figure 2 (right) of the paper (https://arxiv.org/pdf/1706.03762.pdf)

    :param h: number of attention heads
    :param d_model: model width; must be evenly divisible by ``h``
    :param dropout: dropout probability for the attention module
    :raises ValueError: if ``d_model`` is not divisible by ``h``
    """
    super(MultiHeadedAttention, self).__init__()
    # Validate with an explicit raise rather than `assert`, which is
    # silently stripped when Python runs with optimizations (-O).
    if d_model % h != 0:
        raise ValueError(
            "d_model ({}) must be divisible by the number of heads ({})".format(d_model, h))
    # We assume d_v always equals d_k
    self.d_k = d_model // h
    self.h = h
    # Four identical projections: query, key, value, and the final output.
    self.linears = clones(nn.Linear(d_model, d_model), 4)
    # Placeholder — presumably caches the attention weights computed in
    # forward(); confirm against the attention implementation.
    self.attn = None
    self.dropout = nn.Dropout(p=dropout)
def __init__(self, use_left_over_vector=False, value_from_token_embedding=False):
    """Set up the multi-head aspect augmentation layer.

    :param use_left_over_vector: whether the left-over aspect vector takes part
        in the aspect attention
    :param value_from_token_embedding: whether attention values come from the
        token embeddings rather than the aspect vectors
    """
    super(MultiHeadAspectAugmentationLayer, self).__init__()
    self.d_model = int(cfg.transformer_d_model)
    drop_rate = float(cfg.transformer_dropout)
    # Two residual-plus-norm wrappers sized to the model width.
    self.sublayer = clones(SublayerConnection(self.d_model, drop_rate), 2)
    # Collaborators left unset here; presumably populated later by the
    # training/setup code — confirm against the callers.
    self.aspect_attn = None
    self.bert_lm = None
    self.aspect_vectors = None
    self.bert_weights_for_average_pooling = None
    self.use_left_over_vector = use_left_over_vector
    negation = "" if use_left_over_vector else " not"
    print("Aspect multi-head attention will{} use the left-over aspect vector".format(negation))
    self.softmax = nn.Softmax(dim=-1)
    self.value_from_token_embedding = value_from_token_embedding