def __init__(self,
             word_embeddings: TextFieldEmbedder,
             encoder: Seq2VecEncoder,
             vocab: Vocabulary,
             positive_label: int = 4) -> None:
    super().__init__(vocab)
    # We need the embeddings to convert word IDs to their vector representations.
    self.word_embeddings = word_embeddings
    # Bottleneck projection from the embedding dimension to the encoder input dimension.
    self.linear_bn = torch.nn.Linear(
        in_features=word_embeddings.get_output_dim(),
        out_features=encoder.get_input_dim())
    self.encoder = encoder
    # After converting a sequence of vectors to a single vector, we feed it into
    # a fully connected linear layer to reduce the dimension to the total number of labels.
    self.linear = torch.nn.Linear(
        in_features=encoder.get_output_dim(),
        out_features=vocab.get_vocab_size('labels'))
    # Monitor the metrics: accuracy, plus precision/recall/F1 for label 4 (very positive).
    self.accuracy = CategoricalAccuracy()
    self.f1_measure = F1Measure(positive_label)
    # We use cross-entropy loss because this is a classification task.
    # Note that PyTorch's CrossEntropyLoss combines softmax and negative log-likelihood,
    # which makes it unnecessary to add a separate softmax layer.
    self.loss_function = torch.nn.CrossEntropyLoss()
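For context, a constructor like this is typically wired up as follows; a minimal sketch, assuming standard AllenNLP components — the LstmClassifier name and the 300/128 dimensions are illustrative assumptions, not from the original.

# Illustrative wiring only; class name and dimensions are assumptions.
import torch
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper

vocab = Vocabulary()  # in practice, built from the training data
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=300)
word_embeddings = BasicTextFieldEmbedder({'tokens': token_embedding})
# linear_bn projects the 300-dim embeddings down to the 128-dim encoder input.
encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(128, 128, batch_first=True))
model = LstmClassifier(word_embeddings, encoder, vocab, positive_label=4)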
def __init__(self,
             word_embedder: TextFieldEmbedder,
             position_embedder: TextFieldEmbedder,
             polarities: list,
             vocab: Vocabulary,
             configuration: dict):
    super().__init__(vocab)
    self.configuration = configuration
    self.word_embedder = word_embedder
    self.position_embedder = position_embedder
    self.polarities = polarities
    self.polarity_num = len(polarities)
    self.sentiment_loss = nn.CrossEntropyLoss()
    self._accuracy = metrics.CategoricalAccuracy()
    word_embedding_dim = word_embedder.get_output_dim()
    lstm_input_size = word_embedding_dim
    num_layers = 3
    # Bidirectional LSTM whose concatenated hidden states match the word embedding dimension.
    self.lstm = torch.nn.LSTM(lstm_input_size, word_embedding_dim // 2,
                              batch_first=True, bidirectional=True,
                              num_layers=num_layers, dropout=0.5)
    sentiment_fc_input_size = word_embedding_dim
    self.sentiment_fc = nn.Sequential(
        nn.Linear(sentiment_fc_input_size, sentiment_fc_input_size),
        nn.ReLU(),
        nn.Linear(sentiment_fc_input_size, self.polarity_num))
    self.dropout_after_embedding_layer = nn.Dropout(0.5)
    self.dropout_after_lstm_layer = nn.Dropout(0.5)
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             n_grams: int,
             n_kernels: int,
             conv_out_dim: int):
    super(Conv_KNRM, self).__init__()
    self.word_embeddings = word_embeddings
    # Static kernel mean & width variables for the Gaussian kernels.
    self.mu = Variable(torch.cuda.FloatTensor(self.kernel_mus(n_kernels)),
                       requires_grad=False).view(1, 1, 1, n_kernels)
    self.sigma = Variable(torch.cuda.FloatTensor(self.kernel_sigmas(n_kernels)),
                          requires_grad=False).view(1, 1, 1, n_kernels)
    # One 1-D convolution per n-gram size; right-padding keeps the sequence length constant.
    self.convolutions = []
    for i in range(1, n_grams + 1):
        self.convolutions.append(
            nn.Sequential(
                nn.ConstantPad1d((0, i - 1), 0),
                nn.Conv1d(kernel_size=i,
                          in_channels=word_embeddings.get_output_dim(),
                          out_channels=conv_out_dim),
                nn.ReLU()))
    # Register the convolutions as part of the model.
    self.convolutions = nn.ModuleList(self.convolutions)
    # This does not really do "attention" - just a plain cosine matrix calculation
    # (without learnable weights).
    self.cosine_module = CosineMatrixAttention()
    # n_kernels * n_grams * n_grams, because we concatenate the kernel scores of every
    # query/document n-gram pair (e.g. 3x3 pairs for n_grams=3) before the dense layer.
    self.dense = nn.Linear(n_kernels * n_grams * n_grams, 1, bias=False)
    # Init with small weights, otherwise the dense output is way too high.
    torch.nn.init.uniform_(self.dense.weight, -0.014, 0.014)  # inits taken from matchzoo
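The constructor above calls self.kernel_mus and self.kernel_sigmas, which are not shown. A minimal sketch of these helpers, assuming the usual K-NRM/matchzoo kernel spacing: one exact-match kernel at mu = 1.0 with near-zero width, plus soft-match kernels evenly spaced over (-1, 1).

def kernel_mus(self, n_kernels: int) -> list:
    """Kernel means: an exact-match kernel at 1.0, the rest evenly spaced in (-1, 1)."""
    mus = [1.0]
    if n_kernels == 1:
        return mus
    bin_size = 2.0 / (n_kernels - 1)
    mus.append(1.0 - bin_size / 2)  # center of the first soft-match kernel
    for i in range(1, n_kernels - 1):
        mus.append(mus[i] - bin_size)
    return mus

def kernel_sigmas(self, n_kernels: int) -> list:
    """Kernel widths: near-zero for the exact-match kernel, 0.1 for the others."""
    sigmas = [0.001]
    if n_kernels == 1:
        return sigmas
    return sigmas + [0.1] * (n_kernels - 1)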
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.out = torch.nn.Linear(
        in_features=self.word_embeddings.get_output_dim() * 4,
        out_features=vocab.get_vocab_size('labels'))
    self.accuracy = CategoricalAccuracy()
    self.f_score_0 = F1Measure(positive_label=0)
    self.f_score_1 = F1Measure(positive_label=1)
    self.f_score_2 = F1Measure(positive_label=2)
    self.loss = CrossEntropyLoss()
    self.attention = BilinearAttention(word_embeddings.get_output_dim() * 3,
                                       word_embeddings.get_output_dim())
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             encoder: Seq2VecEncoder,
             dropout_p: float,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.embedding2input = FeedForward(
        input_dim=word_embeddings.get_output_dim(),
        num_layers=1,
        hidden_dims=encoder.get_input_dim(),
        activations=Activation.by_name('relu')(),
        dropout=dropout_p)
    self.encoder = encoder
    self.hidden2intermediate = FeedForward(
        input_dim=encoder.get_output_dim(),
        num_layers=1,
        hidden_dims=encoder.get_output_dim() // 2,
        activations=Activation.by_name('relu')(),
        dropout=dropout_p)
    self.intermediate2tag = nn.Linear(
        in_features=encoder.get_output_dim() // 2,
        out_features=vocab.get_vocab_size('labels'))
    self.accuracy = CategoricalAccuracy()
    self.loss_function = torch.nn.CrossEntropyLoss()
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             encoder: Seq2VecEncoder,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    self.word_embedding = word_embeddings
    self.encoder = encoder
    self.hidden2out = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size("labels"))
    self.accuracy = MicroMetrics(vocab)
    self.lstm = nn.LSTM(input_size=word_embeddings.get_output_dim(),
                        hidden_size=128, num_layers=1, batch_first=True)
    self.label_index_to_label = self.vocab.get_index_to_token_vocabulary('labels')
def __init__(self,
             word_embedder: TextFieldEmbedder,
             attribute_embedder: Embedding,
             content_encoder: Seq2SeqEncoder,
             vocab: Vocabulary,
             max_decoding_steps: int = 20,
             beam_size: int = None,
             scheduled_sampling_ratio: float = 0.) -> None:
    super().__init__(vocab)
    self.scheduled_sampling_ratio = scheduled_sampling_ratio

    # We need the start symbol to provide as the input at the first timestep of decoding,
    # and the end symbol as a way to indicate the end of the decoded sequence.
    self.start_index = self.vocab.get_token_index(START_SYMBOL, 'tokens')
    self.end_index = self.vocab.get_token_index(END_SYMBOL, 'tokens')

    # TODO: not sure if we need this
    self.bleu = None

    # At prediction time, we use a beam search to find the most likely sequence of target tokens.
    beam_size = beam_size or 1
    self.max_decoding_steps = max_decoding_steps
    self.beam_search = BeamSearch(self.end_index,
                                  max_steps=max_decoding_steps,
                                  beam_size=beam_size)

    # Dense embedding of source and target vocab tokens and attribute.
    self.word_embedder = word_embedder
    self.attribute_embedder = attribute_embedder

    # Encodes the sequence of source embeddings into a sequence of hidden states.
    self.content_encoder = content_encoder

    num_classes = self.vocab.get_vocab_size('tokens')

    # TODO: not sure if we need this
    self.attention = None

    # Dense embedding of vocab words in the target space.
    embedding_dim = word_embedder.get_output_dim()
    self.target_embedder = Embedding(num_classes, embedding_dim)

    # The decoder output dim needs to be the same as the encoder output dim since we
    # initialize the hidden state of the decoder with the final hidden state of the encoder.
    self.encoder_output_dim = self.content_encoder.get_output_dim() + embedding_dim
    self.decoder_output_dim = self.encoder_output_dim
    self.decoder_input_dim = embedding_dim
    self.decoder_cell = LSTMCell(self.decoder_input_dim, self.decoder_output_dim)
    self.output_projection_layer = Linear(self.decoder_output_dim, num_classes)
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.bert_seq_encoder = PytorchSeq2VecWrapper(
        LSTM(word_embeddings.get_output_dim(),
             word_embeddings.get_output_dim() // 2,
             batch_first=True, bidirectional=True))
    self.out = torch.nn.Linear(
        in_features=word_embeddings.get_output_dim() * 4,
        out_features=vocab.get_vocab_size('labels'))
    self.accuracy = CategoricalAccuracy()
    self.f_score_0 = F1Measure(positive_label=0)
    self.f_score_1 = F1Measure(positive_label=1)
    self.f_score_2 = F1Measure(positive_label=2)
    self.loss = CrossEntropyLoss()
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             p_encoder: Seq2SeqEncoder,
             q_encoder: Seq2SeqEncoder,
             a_encoder: Seq2SeqEncoder,
             vocab: Vocabulary,
             embedding_dropout: float = 0.0,
             encoder_dropout: float = 0.0) -> None:
    # We have to pass the vocabulary to the constructor.
    super().__init__(vocab)
    self.word_embeddings = word_embeddings

    if embedding_dropout > 0:
        self.embedding_dropout = torch.nn.Dropout(p=embedding_dropout)
    else:
        self.embedding_dropout = lambda x: x

    if encoder_dropout > 0:
        self.encoder_dropout = torch.nn.Dropout(p=encoder_dropout)
    else:
        self.encoder_dropout = lambda x: x

    embedding_dim = word_embeddings.get_output_dim()
    self.p_q_match = SequenceAttention(input_dim=embedding_dim)
    self.a_p_match = SequenceAttention(input_dim=embedding_dim)
    self.a_q_match = SequenceAttention(input_dim=embedding_dim)

    # Our model has different encoders for each of the fields (passage,
    # answer and question).
    self.p_encoder = p_encoder
    self.q_encoder = q_encoder
    self.a_encoder = a_encoder

    # Attention layers: passage-question, question-self, answer-self.
    self.p_q_attn = BilinearAttention(
        vector_dim=self.q_encoder.get_output_dim(),
        matrix_dim=self.p_encoder.get_output_dim())
    self.q_self_attn = LinearSelfAttention(
        input_dim=self.q_encoder.get_output_dim())
    self.a_self_attn = LinearSelfAttention(
        input_dim=self.a_encoder.get_output_dim())

    self.p_a_bilinear = torch.nn.Linear(
        in_features=self.p_encoder.get_output_dim(),
        out_features=self.a_encoder.get_output_dim())
    self.q_a_bilinear = torch.nn.Linear(
        in_features=self.q_encoder.get_output_dim(),
        out_features=self.a_encoder.get_output_dim())
def __init__(self,
             word_embedder: TextFieldEmbedder,
             aspect_embedder: TextFieldEmbedder,
             categories: list,
             polarities: list,
             vocab: Vocabulary,
             configuration: dict):
    super().__init__(vocab)
    self.configuration = configuration
    self.word_embedder = word_embedder
    self.aspect_embedder = aspect_embedder
    self.categories = categories
    self.polarities = polarities
    self.category_num = len(categories)
    self.polarity_num = len(polarities)
    self.sentiment_loss = nn.CrossEntropyLoss()
    self._accuracy = metrics.CategoricalAccuracy()
    word_embedding_dim = word_embedder.get_output_dim()
    aspect_word_embedding_dim = aspect_embedder.get_output_dim()
    # AE-LSTM and ATAE-LSTM concatenate the aspect embedding to every word embedding.
    if self.configuration['model_name'] in ['ae-lstm', 'atae-lstm']:
        lstm_input_size = word_embedding_dim + aspect_word_embedding_dim
    else:
        lstm_input_size = word_embedding_dim
    num_layers = 1
    hidden_size = 300
    self.lstm = torch.nn.LSTM(lstm_input_size, hidden_size, batch_first=True,
                              bidirectional=False, num_layers=num_layers)
    # AT-LSTM and ATAE-LSTM add an attention layer over the LSTM hidden states.
    if self.configuration['model_name'] in ['at-lstm', 'atae-lstm']:
        attention_input_size = word_embedding_dim + aspect_word_embedding_dim
        self.sentiment_attention = AttentionInHtt(attention_input_size, lstm_input_size)
        self.sentiment_fc = nn.Sequential(
            nn.Linear(hidden_size * 2, self.polarity_num))
    else:
        self.sentiment_attention = None
        self.sentiment_fc = nn.Sequential(
            nn.Linear(hidden_size, self.polarity_num))
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             vocab: Vocabulary,
             loss: str,
             hinge_margin: float) -> None:
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.out = torch.nn.Linear(
        in_features=word_embeddings.get_output_dim(),
        out_features=1)
    self.accuracy = BooleanAccuracy()
    self.loss_name = loss
    if loss == 'hinge':
        self.loss = MarginRankingLoss(margin=hinge_margin, reduction='mean')
    else:
        self.loss = BCEWithLogitsLoss(reduction='mean')
    self.sigmoid = torch.nn.Sigmoid()
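The two losses expect differently shaped inputs, which is easy to trip over downstream. A minimal sketch of the difference (tensor names are illustrative, not from the original model):

import torch
from torch.nn import MarginRankingLoss, BCEWithLogitsLoss

pos_scores = torch.randn(8)  # scores for relevant examples
neg_scores = torch.randn(8)  # scores for irrelevant examples

# MarginRankingLoss takes paired scores plus a target of +1 meaning
# "the first input should be ranked higher than the second".
hinge = MarginRankingLoss(margin=1.0, reduction='mean')
loss_hinge = hinge(pos_scores, neg_scores, torch.ones(8))

# BCEWithLogitsLoss takes raw logits and 0/1 float labels; it applies the
# sigmoid internally, which is why the model above keeps a separate
# torch.nn.Sigmoid() only for producing probabilities at prediction time.
bce = BCEWithLogitsLoss(reduction='mean')
logits = torch.cat([pos_scores, neg_scores])
labels = torch.cat([torch.ones(8), torch.zeros(8)])
loss_bce = bce(logits, labels)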
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             sentence_encoder: Seq2VecEncoder,
             claim_encoder: Seq2SeqEncoder,
             attention: Attention,
             max_steps: int = 100,
             beam_size: int = 5,
             beta: float = 1.0) -> None:
    super(Seq2SeqClaimRank, self).__init__(vocab)
    self.text_field_embedder = text_field_embedder
    self.sentence_encoder = sentence_encoder
    self.claim_encoder = TimeDistributed(claim_encoder)  # handles the additional sequence dim
    self.claim_encoder_dim = claim_encoder.get_output_dim()
    self.attention = attention
    self.decoder_embedding_dim = text_field_embedder.get_output_dim()
    self.max_steps = max_steps
    self.beam_size = beam_size
    self.beta = beta
    # self.target_embedder = torch.nn.Embedding(vocab.get_vocab_size(), decoder_embedding_dim)

    # Since we are using the sentence encoding as the initial hidden state of the decoder,
    # the decoder hidden dim must match the sentence encoder hidden dim.
    self.decoder_output_dim = sentence_encoder.get_output_dim()
    self.decoder_0_cell = torch.nn.LSTMCell(
        self.decoder_embedding_dim + self.claim_encoder_dim, self.decoder_output_dim)
    self.decoder_1_cell = torch.nn.LSTMCell(
        self.decoder_output_dim, self.decoder_output_dim)

    # When projecting out, we use attention to combine the claim embeddings into a single
    # context embedding, which is concatenated with the decoder cell output before being
    # fed to the projection layer. Hence the expected input size is:
    #   decoder output dim + claim encoder output dim
    projection_input_dim = self.decoder_output_dim + self.claim_encoder_dim
    self.output_projection_layer = torch.nn.Linear(projection_input_dim,
                                                   vocab.get_vocab_size())

    self._start_index = self.vocab.get_token_index('<s>')
    self._end_index = self.vocab.get_token_index('</s>')
    self.beam_search = BeamSearch(self._end_index, max_steps=max_steps, beam_size=beam_size)
    pad_index = vocab.get_token_index(vocab._padding_token)
    self.bleu = BLEU(exclude_indices={pad_index, self._start_index, self._end_index})
    self.avg_reconstruction_loss = Average()
    self.avg_claim_scoring_loss = Average()
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             n_grams: int,
             n_kernels: int,
             conv_out_dim: int):
    super(Conv_KNRM, self).__init__()
    self.word_embeddings = word_embeddings

    # Static kernel mean & width variables for the Gaussian kernels.
    self.mu = Variable(torch.FloatTensor(self.kernel_mus(n_kernels)),
                       requires_grad=False).view(1, 1, 1, n_kernels)
    self.sigma = Variable(torch.FloatTensor(self.kernel_sigmas(n_kernels)),
                          requires_grad=False).view(1, 1, 1, n_kernels)

    # One 1-D CNN layer per n-gram type, each followed by a ReLU activation.
    self.convolutions = []
    for i in range(1, n_grams + 1):
        self.convolutions.append(nn.Sequential(
            nn.ConstantPad1d((0, i - 1), 0),
            # The kernel size of the convolutional layer matches the current
            # i-gram (uni, bi, tri, ...) in the loop.
            nn.Conv1d(kernel_size=i,
                      in_channels=word_embeddings.get_output_dim(),
                      out_channels=conv_out_dim),
            nn.ReLU()))
    # Register the convolutions as part of the model.
    self.convolutions = nn.ModuleList(self.convolutions)

    # Cosine similarity matrix.
    self.cosine_module = CosineMatrixAttention()

    # Final linear scoring layer. The input size is the number of soft-TF features:
    # n_kernels * n_grams * n_grams, i.e. one kernel vector for every combination of
    # query and document n-gram representations in the match matrices; the output is
    # a single score. A bias is used, matching the formula in the paper (it is True
    # by default, but stated explicitly to be sure).
    self.transform = nn.Linear(in_features=n_kernels * n_grams * n_grams,
                               out_features=1, bias=True)
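For reference, self.mu and self.sigma feed the standard K-NRM kernel-pooling step in the forward pass; a minimal sketch of that step under the usual formulation (tensor names are illustrative):

# sim: cosine match matrix of shape (batch, q_len, d_len, 1), e.g. the output of
# self.cosine_module with a trailing dimension added; mu/sigma broadcast over it.
kernel_scores = torch.exp(-((sim - self.mu) ** 2) / (2 * self.sigma ** 2))
soft_tf = kernel_scores.sum(dim=2)                         # pool over document terms
features = torch.log(soft_tf.clamp(min=1e-10)).sum(dim=1)  # pool over query terms
# Concatenating these n_kernels-dim feature vectors over all n-gram pairs yields
# the n_kernels * n_grams * n_grams input expected by the final linear layer.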
def __init__(self,
             word_embedder: TextFieldEmbedder,
             position_embedder: TextFieldEmbedder,
             polarities: list,
             vocab: Vocabulary,
             configuration: dict):
    super().__init__(vocab)
    self.configuration = configuration
    self.word_embedder = word_embedder
    self.position_embedder = position_embedder
    self.polarities = polarities
    self.polarity_num = len(polarities)
    self.sentiment_loss = nn.CrossEntropyLoss()
    self._accuracy = metrics.CategoricalAccuracy()
    word_embedding_dim = word_embedder.get_output_dim()
    lstm_input_size = word_embedding_dim
    sentiment_fc_input_size = lstm_input_size
    self.sentiment_fc = nn.Sequential(
        nn.Linear(sentiment_fc_input_size, sentiment_fc_input_size),
        nn.ReLU(),
        nn.Linear(sentiment_fc_input_size, self.polarity_num))
    self.dropout_after_embedding_layer = nn.Dropout(0.5)
    self.dropout_after_lstm_layer = nn.Dropout(0.5)
    self.gnn_for_sentiment = GAT(word_embedding_dim, word_embedding_dim,
                                 word_embedding_dim, 4, self.configuration)
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             dropout_p: float,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.embedding2input = FeedForward(
        input_dim=word_embeddings.get_output_dim(),
        num_layers=1,
        hidden_dims=encoder.get_input_dim(),
        activations=Activation.by_name('relu')(),
        dropout=dropout_p)
    self.encoder = encoder
    self.hidden2intermediate = FeedForward(
        input_dim=encoder.get_output_dim(),
        num_layers=1,
        hidden_dims=encoder.get_output_dim() // 2,
        activations=Activation.by_name('relu')(),
        dropout=dropout_p)
    self.intermediate2tag = nn.Linear(
        in_features=encoder.get_output_dim() // 2,
        out_features=vocab.get_vocab_size('labels'))
    # Micro-averaged F-beta over all labels except 'O' and 'OR'.
    label_vocab = vocab.get_token_to_index_vocabulary('labels').copy()
    for label in ('O', 'OR'):
        label_vocab.pop(label)
    labels_for_metric = list(label_vocab.values())
    self.accuracy = CustomFBetaMeasure(beta=1.0, average='micro',
                                       labels=labels_for_metric)
def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             emb_to_enc_proj: FeedForward = None,
             feedforward: FeedForward = None,
             dropout: float = 0.0,
             num_tags: int = 2,
             use_crf: bool = False):
    super().__init__(vocab)
    self.embedder = embedder
    self.emb_to_enc_proj = None
    if emb_to_enc_proj is not None:
        self.emb_to_enc_proj = emb_to_enc_proj
    self.encoder = encoder
    # Either the embedder output feeds the encoder directly, or the projection bridges the two.
    assert (embedder.get_output_dim() == encoder.get_input_dim()
            or emb_to_enc_proj is not None
            and emb_to_enc_proj.get_output_dim() == encoder.get_input_dim())
    self.feedforward = None
    pre_output_dim = encoder.get_output_dim()
    if feedforward is not None:
        assert feedforward.get_input_dim() == encoder.get_output_dim()
        self.feedforward = feedforward
        pre_output_dim = self.feedforward.get_output_dim()
    self.hidden2tag = torch.nn.Linear(in_features=pre_output_dim,
                                      out_features=num_tags)
    self.dropout = torch.nn.Dropout(dropout)
    self.accuracy = CategoricalAccuracy()
    self.f1 = F1Measure(1)
    self.use_crf = use_crf
    if use_crf:
        self.crf = ConditionalRandomField(num_tags,
                                          include_start_end_transitions=True)
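When use_crf is set, the forward pass would typically train on the CRF's negative log-likelihood and predict with Viterbi decoding. A minimal sketch of that forward-pass fragment, assuming AllenNLP's ConditionalRandomField API (forward returns the summed log-likelihood; viterbi_tags returns (tag_sequence, score) pairs):

# logits: (batch, seq_len, num_tags); tags: (batch, seq_len); mask: (batch, seq_len)
if self.use_crf:
    log_likelihood = self.crf(logits, tags, mask)  # summed over the batch
    loss = -log_likelihood
    best_paths = self.crf.viterbi_tags(logits, mask)  # [(tag_seq, viterbi_score), ...]
    predicted_tags = [tag_seq for tag_seq, _ in best_paths]
else:
    loss = torch.nn.functional.cross_entropy(
        logits.view(-1, logits.size(-1)), tags.view(-1))
    predicted_tags = logits.argmax(dim=-1)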
def __init__(self,
             word_embedder: TextFieldEmbedder,
             aspect_embedder: TextFieldEmbedder,
             categories: list,
             polarities: list,
             vocab: Vocabulary,
             configuration: dict):
    super().__init__(vocab)
    self.configuration = configuration
    self.word_embedder = word_embedder
    self.aspect_embedder = aspect_embedder
    self.categories = categories
    self.polarities = polarities
    self.category_num = len(categories)
    self.polarity_num = len(polarities)
    self.category_loss = nn.BCEWithLogitsLoss()
    self.sentiment_loss = nn.CrossEntropyLoss()
    self._accuracy = metrics.CategoricalAccuracy()
    word_embedding_dim = word_embedder.get_output_dim()
    lstm_input_size = word_embedding_dim
    num_layers = 1
    hidden_size = 32
    self.aspect_gru = torch.nn.GRU(lstm_input_size, hidden_size,
                                   batch_first=True, bidirectional=True,
                                   num_layers=num_layers)
    self.sentiment_gru = torch.nn.GRU(lstm_input_size, hidden_size,
                                      batch_first=True, bidirectional=True,
                                      num_layers=num_layers)
    self.aspect_attention = AttentionInHtt(hidden_size * 3, hidden_size)
    self.sentiment_attention = AttentionInHtt(hidden_size * 5, hidden_size,
                                              softmax=False)
    self.sentiment_fc = nn.Sequential(
        nn.Linear(hidden_size * 3, self.polarity_num))
def __init__(self, vocab, embedder: TextFieldEmbedder, max_target_positions,
             dropout, share_decoder_input_output_embed, decoder_output_dim,
             decoder_conv_dim, decoder_glu, decoder_conv_type, weight_softmax,
             decoder_attention_heads, weight_dropout, relu_dropout,
             input_dropout, decoder_normalize_before, attention_dropout,
             decoder_ffn_embed_dim, decoder_kernel_size_list,
             adaptive_softmax_cutoff=None, tie_adaptive_weights=False,
             adaptive_softmax_dropout=0, tie_adaptive_proj=False,
             adaptive_softmax_factor=0, decoder_layers=6, final_norm=True,
             padding_idx=0, namespace='target_tokens', vocab_size=None,
             section_attn=False, swap=False):
    super().__init__()
    self.vocab = vocab
    vocab_size = vocab_size or vocab.get_vocab_size(namespace)
    self.dropout = dropout
    self.share_input_output_embed = share_decoder_input_output_embed

    input_embed_dim = embedder.get_output_dim()
    embed_dim = input_embed_dim
    output_embed_dim = input_embed_dim

    self.padding_idx = padding_idx
    self.max_target_positions = max_target_positions
    self.embedder = embedder

    self.project_in_dim = GehringLinear(
        input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None

    self.layers = nn.ModuleList([])
    self.layers.extend([
        DynamicConvDecoderLayer(embed_dim, decoder_conv_dim, decoder_glu,
                                decoder_conv_type, weight_softmax,
                                decoder_attention_heads, weight_dropout,
                                dropout, relu_dropout, input_dropout,
                                decoder_normalize_before, attention_dropout,
                                decoder_ffn_embed_dim, swap,
                                kernel_size=decoder_kernel_size_list[i])
        for i in range(decoder_layers)
    ])

    self.adaptive_softmax = None

    self.project_out_dim = GehringLinear(embed_dim, output_embed_dim, bias=False) \
        if embed_dim != output_embed_dim and not tie_adaptive_weights else None

    if adaptive_softmax_cutoff is not None:
        adaptive_inputs = None
        if isinstance(embedder, AdaptiveEmbedding):
            adaptive_inputs = embedder
        elif hasattr(embedder, 'token_embedder_adaptive'):
            adaptive_inputs = embedder.token_embedder_adaptive
        elif tie_adaptive_weights:
            raise ValueError('Cannot locate adaptive_inputs.')
        self.adaptive_softmax = AdaptiveSoftmax(
            vocab_size,
            output_embed_dim,
            eval_str_list(adaptive_softmax_cutoff, type=int),
            dropout=adaptive_softmax_dropout,
            adaptive_inputs=adaptive_inputs,
            factor=adaptive_softmax_factor,
            tie_proj=tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(vocab_size, output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5)

    self.register_buffer('version', torch.Tensor([2]))
    self.normalize = decoder_normalize_before and final_norm
    if self.normalize:
        self.layer_norm = nn.LayerNorm(embed_dim)
def __init__(self, vocab, embedder: TextFieldEmbedder, num_layers, hidden_size,
             dropout, share_decoder_input_output_embed, vocab_size=None,
             adaptive_softmax_cutoff=None, tie_adaptive_weights=False,
             adaptive_softmax_dropout=0, tie_adaptive_proj=False,
             adaptive_softmax_factor=0, article_embed_size=1024,
             image_embed_size=2048, namespace='target_tokens'):
    super().__init__()
    self.vocab = vocab
    self.hidden_size = hidden_size
    vocab_size = vocab_size or vocab.get_vocab_size(namespace)
    self.dropout = dropout
    self.share_input_output_embed = share_decoder_input_output_embed

    input_embed_dim = embedder.get_output_dim()
    embed_dim = input_embed_dim
    output_embed_dim = input_embed_dim

    self.layers = nn.ModuleList([])
    self.h = nn.ParameterList([])
    self.c = nn.ParameterList([])
    for layer in range(num_layers):
        input_size = hidden_size + embed_dim if layer == 0 else hidden_size
        rnn = LSTMCell(input_size=input_size, hidden_size=hidden_size)
        self.layers.append(rnn)
        # Learnable initial hidden and cell states for each layer.
        self.h.append(nn.Parameter(torch.zeros(1, hidden_size)))
        self.c.append(nn.Parameter(torch.zeros(1, hidden_size)))

    self.image_attention = AttentionLayer(
        hidden_size, image_embed_size, hidden_size, bias=True)
    self.article_attention = AttentionLayer(
        hidden_size, article_embed_size, hidden_size, bias=True)
    self.attn_proj = GehringLinear(hidden_size * 2, hidden_size)

    self.embedder = embedder
    self.adaptive_softmax = None

    self.project_out_dim = GehringLinear(hidden_size, output_embed_dim, bias=False) \
        if hidden_size != output_embed_dim else None

    if adaptive_softmax_cutoff is not None:
        adaptive_inputs = None
        if isinstance(embedder, AdaptiveEmbedding):
            adaptive_inputs = embedder
        elif hasattr(embedder, 'token_embedder_adaptive'):
            adaptive_inputs = embedder.token_embedder_adaptive
        elif tie_adaptive_weights:
            raise ValueError('Cannot locate adaptive_inputs.')
        self.adaptive_softmax = AdaptiveSoftmax(
            vocab_size,
            output_embed_dim,
            eval_str_list(adaptive_softmax_cutoff, type=int),
            dropout=adaptive_softmax_dropout,
            adaptive_inputs=adaptive_inputs,
            factor=adaptive_softmax_factor,
            tie_proj=tie_adaptive_proj,
        )
    elif not self.share_input_output_embed:
        self.embed_out = nn.Parameter(
            torch.Tensor(vocab_size, output_embed_dim))
        nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5)