def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             target_namespace: str = "tokens",
             target_embedding_dim: int = None,
             attention_function: SimilarityFunction = None,
             scheduled_sampling_ratio: float = 0.0) -> None:
    super(SimpleSeq2Seq, self).__init__(vocab)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
    # we're using attention with ``DotProductSimilarity``, this is needed.
    self._decoder_output_dim = self._encoder.get_output_dim()
    target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    if self._attention_function:
        self._decoder_attention = Attention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
    else:
        self._decoder_input_dim = target_embedding_dim
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
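# A minimal, hypothetical sketch (not part of the model above) of how the decoder
# input is assembled when attention is enabled, illustrating why
# _decoder_input_dim == encoder_output_dim + target_embedding_dim. All names and
# shapes here are illustrative assumptions.
import torch

batch_size, src_len, enc_dim, emb_dim = 4, 7, 256, 128
encoder_outputs = torch.randn(batch_size, src_len, enc_dim)
decoder_hidden = torch.randn(batch_size, enc_dim)  # decoder_output_dim == encoder_output_dim
prev_target_embedding = torch.randn(batch_size, emb_dim)

# Dot-product attention scores over source positions, then a weighted average.
scores = torch.bmm(encoder_outputs, decoder_hidden.unsqueeze(-1)).squeeze(-1)  # (batch, src_len)
weights = torch.softmax(scores, dim=-1)
attended = torch.bmm(weights.unsqueeze(1), encoder_outputs).squeeze(1)  # (batch, enc_dim)

decoder_input = torch.cat([attended, prev_target_embedding], dim=-1)  # (batch, enc_dim + emb_dim)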
def __init__(self,
             args,
             vocab_indexer,
             vocab,
             decoder_hidden_size=600,
             emb_size=128,
             num_classes=None,
             start_idx=1,
             end_idx=2,
             padding_idx=0,
             typ='lstm',
             max_decoding_steps=120,
             sampling_scheme: str = "first_word",
             line_separator_symbol: str = "<eos>",
             reverse_each_line: bool = False,
             n_lines_per_sample: int = 14,
             tie_weights: bool = True,
             dropout_ratio: float = 0.3,
             phoneme_embeddings_dim: int = 128,
             encoder_type: str = None,
             encoder_namespace: str = None,
             encoder_input_size: int = 100,
             encoder_hidden_size: int = 100,
             encoder_n_layers: int = 1,
             n_lines_to_gen: int = 4):
    super(VanillaLM, self).__init__()
    self.args = args
    self.vocab_indexer = vocab_indexer
    self.vocab = vocab
    self._scheduled_sampling_ratio = 0.0
    self._max_decoding_steps = max_decoding_steps
    decoder_input_size = emb_size
    self._decoder_input_dim = decoder_input_size
    self._decoder_output_dim = decoder_hidden_size
    self._target_embedder = nn.Embedding(num_classes, emb_size)
    # TODO: not clear why this uses phoneme_embeddings_dim.
    self._context_embedder = nn.Embedding(num_classes, phoneme_embeddings_dim)
    self.padding_idx = padding_idx
    self.start_idx = start_idx
    self.end_idx = end_idx
    self.type = typ
    self.use_cuda = args.use_cuda
    decoder_embedding_dim = emb_size
    self._target_embedding_dim = decoder_embedding_dim
    assert self.type == "lstm", "Incorrect decoder type"
    self._lm_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._intermediate_projection_layer = Linear(self._decoder_output_dim,
                                                 self._target_embedding_dim)
    self._activation = torch.tanh
    self._num_classes = num_classes
    self._output_projection_layer = Linear(self._target_embedding_dim, self._num_classes)
    self._dropout_ratio = dropout_ratio
    self._dropout = nn.Dropout(p=dropout_ratio, inplace=False)
    self._lockdropout = LockedDropout()
    self._encoder_type = encoder_type
    if self._encoder_type is not None:
        self._encoder_input_size = encoder_input_size
        self._encoder_hidden_size = encoder_hidden_size
        self._encoder_namespace = encoder_namespace
        self._encoder = nn.LSTM(input_size=self._encoder_input_size,
                                hidden_size=self._encoder_hidden_size,
                                batch_first=True,
                                bias=False,
                                num_layers=encoder_n_layers,
                                bidirectional=False)
    if tie_weights:
        # Tying requires the output projection's input dim to equal emb_size;
        # both weight matrices are then (num_classes, emb_size).
        self._output_projection_layer.weight = self._target_embedder.weight
    # In the config, make these options consistent with those in the reader.
    self._sampling_scheme = sampling_scheme  # "first_sentence" or "first_word"
    self.line_separator = line_separator_symbol
    self.reverse_each_line = reverse_each_line
    self.n_lines_per_sample = n_lines_per_sample
    self._n_lines_to_gen = n_lines_to_gen
    self._attention = False
    self.END_SYMBOL = line_separator_symbol
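# A minimal sketch of the weight tying used above. Tying is only shape-valid
# because _intermediate_projection_layer first maps the decoder hidden state down
# to emb_size; the tied matrices are both (num_classes, emb_size). Dimensions
# here are illustrative assumptions.
import torch.nn as nn

num_classes, emb_size, hidden_size = 1000, 128, 600
embedder = nn.Embedding(num_classes, emb_size)
to_emb = nn.Linear(hidden_size, emb_size)    # mirrors _intermediate_projection_layer
to_vocab = nn.Linear(emb_size, num_classes)  # mirrors _output_projection_layer
to_vocab.weight = embedder.weight            # shares one (num_classes, emb_size) matrix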
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             attention: Attention,
             schema_path: str = None,
             missing_alignment_int: int = 0,
             indexfield_padding_index: int = -1,
             beam_size: int = None,
             target_namespace: str = "tokens",
             target_embedding_dim: int = None,
             scheduled_sampling_ratio: float = 0.,
             use_bleu: bool = True,
             emb_dropout: float = 0.0,
             dec_dropout: float = 0.0,
             attn_loss_lambda: float = 0.5,
             token_based_metric: Metric = None) -> None:
    super(AttnSupSeq2Seq, self).__init__(vocab)
    self._target_namespace = target_namespace
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    self._indexfield_padding_index = indexfield_padding_index
    self._missing_alignment_int = missing_alignment_int
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    if use_bleu:
        pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)  # pylint: disable=protected-access
        self._bleu = BLEU(exclude_indices={pad_index, self._end_index, self._start_index})
    else:
        self._bleu = None
    if token_based_metric:
        self._token_based_metric = token_based_metric
    else:
        self._token_based_metric = TokenSequenceAccuracy()
    # Log the attention supervision CE loss as a metric.
    self._attn_sup_loss = Average()
    self._sql_metrics = schema_path is not None
    if self._sql_metrics:
        # SQL-specific metrics: match between the templates free of schema constants,
        # and match between the schema constants.
        self._schema_free_match = GlobalTemplAccuracy(schema_path=schema_path)
        self._kb_match = KnowledgeBaseConstsAccuracy(schema_path=schema_path)
    # At prediction time, we use a beam search to find the most likely sequence of target tokens.
    beam_size = beam_size or 1
    self._max_decoding_steps = max_decoding_steps
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)
    # Dense embedding of source vocab tokens.
    self._source_embedder = source_embedder
    self._emb_dropout = Dropout(p=emb_dropout)
    self._dec_dropout = Dropout(p=dec_dropout)
    self._attn_loss_lambda = attn_loss_lambda
    # Encodes the sequence of source embeddings into a sequence of hidden states.
    self._encoder = encoder
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # Attention mechanism applied to the encoder output for each step.
    self._attention = attention
    self._attention._normalize = False
    # Dense embedding of vocab words in the target space.
    target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    self._encoder_output_dim = self._encoder.get_output_dim()
    self._decoder_output_dim = self._encoder_output_dim
    # A weighted average over encoder outputs will be concatenated to the previous target embedding
    # to form the input to the decoder at each time step.
    self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
    # We'll use an LSTM cell as the recurrent cell that produces a hidden state
    # for the decoder at each time step.
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    # We project the hidden state from the decoder into the output vocabulary space
    # in order to get log probabilities of each target token, at each time step.
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
def __init__(
    self,
    vocab: Vocabulary,
    input_dim: int,
    decoder_hidden_size: int,
    max_decoding_steps: int,
    output_proj_input_dim: int,
    target_namespace: str = "targets",
    target_embedding_dim: int = None,
    attention: str = "none",
    dropout: float = 0.0,
    scheduled_sampling_ratio: float = 0.0,
) -> None:
    super(Seq2SeqDecoder, self).__init__(vocab)
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self._unk_index = self.vocab.get_token_index("@@UNKNOWN@@", self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # The decoder hidden state is initialized from the final encoder states; if the
    # encoder output dim differs from the decoder hidden size, we project it down.
    self._encoder_output_dim = input_dim
    self._decoder_hidden_dim = decoder_hidden_size
    if self._encoder_output_dim != self._decoder_hidden_dim:
        self._projection_encoder_out = Linear(self._encoder_output_dim, self._decoder_hidden_dim)
    else:
        self._projection_encoder_out = lambda x: x
    self._decoder_output_dim = self._decoder_hidden_dim
    self._output_proj_input_dim = output_proj_input_dim
    self._target_embedding_dim = target_embedding_dim
    self._target_embedder = Embedding(num_classes, self._target_embedding_dim)
    # Used to get an initial hidden state from the encoder states.
    self._sent_pooler = Pooler(project=True, d_inp=input_dim, d_proj=decoder_hidden_size)
    if attention == "Bahdanau":
        self._decoder_attention = BahdanauAttention(decoder_hidden_size + target_embedding_dim,
                                                    input_dim)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = input_dim + target_embedding_dim
    elif attention == "bilinear":
        self._decoder_attention = BilinearAttention(decoder_hidden_size + target_embedding_dim,
                                                    input_dim)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = input_dim + target_embedding_dim
    elif attention == "none":
        self._decoder_attention = None
        self._decoder_input_dim = target_embedding_dim
    else:
        raise Exception("attention not implemented {}".format(attention))
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_hidden_dim)
    # Allow for a bottleneck layer between encoder outputs and the distribution over vocab.
    # The bottleneck layer consists of a linear transform and helps to reduce
    # the number of parameters.
    if self._output_proj_input_dim != self._decoder_output_dim:
        self._projection_bottleneck = Linear(self._decoder_output_dim, self._output_proj_input_dim)
    else:
        self._projection_bottleneck = lambda x: x
    self._output_projection_layer = Linear(self._output_proj_input_dim, num_classes)
    self._dropout = torch.nn.Dropout(p=dropout)
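# Design note (a sketch, not the module's required API): the lambda fall-throughs
# above work, but nn.Identity() keeps the model picklable and makes the no-op
# projection visible in the module tree.
import torch.nn as nn

decoder_output_dim, output_proj_input_dim = 256, 256
projection_bottleneck = (
    nn.Linear(decoder_output_dim, output_proj_input_dim)
    if output_proj_input_dim != decoder_output_dim
    else nn.Identity()
)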
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             attention: Attention,
             beam_size: int,
             max_decoding_steps: int,
             target_embedding_dim: int = 30,
             copy_token: str = "@COPY@",
             source_namespace: str = "source_tokens",
             target_namespace: str = "target_tokens",
             metric: Metric = BLEU()) -> None:
    super(CopyNet, self).__init__(vocab)
    self._metric = metric
    self._source_namespace = source_namespace
    self._target_namespace = target_namespace
    self._src_start_index = self.vocab.get_token_index(START_SYMBOL, self._source_namespace)
    self._src_end_index = self.vocab.get_token_index(END_SYMBOL, self._source_namespace)
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self._oov_index = self.vocab.get_token_index(self.vocab._oov_token, self._target_namespace)  # pylint: disable=protected-access
    self._pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)  # pylint: disable=protected-access
    self._copy_index = self.vocab.get_token_index(copy_token, self._target_namespace)
    if self._copy_index == self._oov_index:
        raise ConfigurationError(f"Special copy token {copy_token} missing from target vocab namespace. "
                                 f"You can ensure this token is added to the target namespace with the "
                                 f"vocabulary parameter 'tokens_to_add'.")
    self._target_vocab_size = self.vocab.get_vocab_size(self._target_namespace)
    # Encoding modules.
    self._source_embedder = source_embedder
    self._encoder = encoder
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    # We arbitrarily set the decoder's input dimension to be the same as the output dimension.
    self.encoder_output_dim = self._encoder.get_output_dim()
    self.decoder_output_dim = self.encoder_output_dim
    self.decoder_input_dim = self.decoder_output_dim
    target_vocab_size = self.vocab.get_vocab_size(self._target_namespace)
    # The decoder input will be a function of the embedding of the previous predicted token,
    # an attended encoder hidden state called the "attentive read", and another
    # weighted sum of the encoder hidden state called the "selective read".
    # While the weights for the attentive read are calculated by an `Attention` module,
    # the weights for the selective read are simply the predicted probabilities
    # corresponding to each token in the source sentence from the previous timestep.
    self._target_embedder = Embedding(target_vocab_size, target_embedding_dim)
    self._attention = attention
    self._input_projection_layer = Linear(
        target_embedding_dim + self.encoder_output_dim * 2,
        self.decoder_input_dim)
    # We then run the projected decoder input through an LSTM cell to produce
    # the next hidden state.
    self._decoder_cell = LSTMCell(self.decoder_input_dim, self.decoder_output_dim)
    # We create a "generation" score for each token in the target vocab
    # with a linear projection of the decoder hidden state.
    self._output_generation_layer = Linear(self.decoder_output_dim, target_vocab_size)
    # We create a "copying" score for each source token by applying a non-linearity
    # (tanh) to a linear projection of the encoded hidden state for that token,
    # and then taking the dot product of the result with the decoder hidden state.
    self._output_copying_layer = Linear(self.encoder_output_dim, self.decoder_output_dim)
    # At prediction time, we'll use a beam search to find the best target sequence.
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)
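# A minimal sketch of the "selective read" described in the comments above: the
# weights are the previous step's copy probabilities restricted to source positions
# whose token matches the last prediction. Tensors here are illustrative
# assumptions, not the model's actual API.
import torch

batch, src_len, enc_dim = 2, 5, 32
encoder_outputs = torch.randn(batch, src_len, enc_dim)
copy_probs = torch.rand(batch, src_len)              # copy distribution from the previous timestep
source_ids = torch.randint(0, 10, (batch, src_len))  # source token ids
last_pred = torch.tensor([3, 7])                     # token predicted at the previous timestep

mask = (source_ids == last_pred.unsqueeze(-1)).float()
selective_weights = copy_probs * mask  # zero wherever the source token doesn't match
selective_read = torch.bmm(selective_weights.unsqueeze(1), encoder_outputs).squeeze(1)  # (batch, enc_dim)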
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             attention: Attention = None,
             attention_function: SimilarityFunction = None,
             beam_size: int = None,
             target_namespace: str = "tokens",
             target_embedding_dim: int = None,
             scheduled_sampling_ratio: float = 0.,
             use_bleu: bool = True,
             emb_dropout: float = 0.5) -> None:
    super(Seq2Seq, self).__init__(vocab)
    self._target_namespace = target_namespace
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    if use_bleu:
        pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)  # pylint: disable=protected-access
        self._bleu = BLEU(exclude_indices={pad_index, self._end_index, self._start_index})
    else:
        self._bleu = None
    self._token_based_metric = TokenSequenceAccuracy()
    # At prediction time, we use a beam search to find the most likely sequence of target tokens.
    beam_size = beam_size or 1
    self._max_decoding_steps = max_decoding_steps
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)
    # Dense embedding of source vocab tokens.
    self._source_embedder = source_embedder
    self._emb_dropout = Dropout(p=emb_dropout)
    # Encodes the sequence of source embeddings into a sequence of hidden states.
    self._encoder = encoder
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # Attention mechanism applied to the encoder output for each step.
    if attention:
        if attention_function:
            raise ConfigurationError("You can only specify an attention module or an "
                                     "attention function, but not both.")
        self._attention = attention
    elif attention_function:
        self._attention = LegacyAttention(attention_function)
    else:
        self._attention = None
    # Dense embedding of vocab words in the target space.
    target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    self._encoder_output_dim = self._encoder.get_output_dim()
    self._decoder_output_dim = self._encoder_output_dim
    if self._attention:
        # If using attention, a weighted average over encoder outputs will be concatenated
        # to the previous target embedding to form the input to the decoder at each
        # time step.
        self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
    else:
        # Otherwise, the input to the decoder is just the previous target embedding.
        self._decoder_input_dim = target_embedding_dim
    # We'll use an LSTM cell as the recurrent cell that produces a hidden state
    # for the decoder at each time step.
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    # We project the hidden state from the decoder into the output vocabulary space
    # in order to get log probabilities of each target token, at each time step.
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             target_embedder: TextFieldEmbedder,
             source_encoder: Seq2VecEncoder,
             target_encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             attention: Attention = None,
             beam_size: int = None,
             target_namespace: str = "tokens",
             scheduled_sampling_ratio: float = 0.,
             use_bleu: bool = True) -> None:
    super(AssociativeSeq2SeqHiddenDiff, self).__init__(vocab)
    self._target_namespace = target_namespace
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    if use_bleu:
        pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)  # pylint: disable=protected-access
        self._bleu = BLEU(exclude_indices={pad_index, self._end_index, self._start_index})
    else:
        self._bleu = None
    # At prediction time, we use a beam search to find the most likely sequence of target tokens.
    beam_size = beam_size or 1
    self._max_decoding_steps = max_decoding_steps
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)
    # Dense embedding of source vocab tokens.
    self._source_embedder = source_embedder
    # Encodes the sequence of source embeddings into a sequence of hidden states.
    self._source_encoder = source_encoder
    self._target_encoder = target_encoder
    self._encoder_output_dim = self._target_encoder.get_output_dim()
    self._decoder_output_dim = self._encoder_output_dim
    target_embedding_dim = source_embedder.get_output_dim()
    if attention:
        self._attention = attention
        self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
    else:
        self._attention = None
        self._decoder_input_dim = target_embedding_dim + self._source_encoder.get_output_dim()
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    self._target_embedder = target_embedder
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
def __init__(self,
             vocab: Vocabulary,
             source_embedder_1: TextFieldEmbedder,
             source_encoder_1: Seq2SeqEncoder,
             beam_size: int,
             max_decoding_steps: int,
             decoder_output_dim: int,
             target_embedding_dim: int = 30,
             namespace: str = "tokens",
             tensor_based_metric: Metric = None,
             align_embeddings: bool = True,
             source_embedder_2: TextFieldEmbedder = None,
             source_encoder_2: Seq2SeqEncoder = None) -> None:
    super().__init__(vocab)
    self._source_embedder_1 = source_embedder_1
    self._source_embedder_2 = source_embedder_2 or self._source_embedder_1
    self._source_encoder_1 = source_encoder_1
    self._source_encoder_2 = source_encoder_2 or self._source_encoder_1
    self._source_namespace = namespace
    self._target_namespace = namespace
    self.encoder_output_dim_1 = self._source_encoder_1.get_output_dim()
    self.encoder_output_dim_2 = self._source_encoder_2.get_output_dim()
    self.cated_encoder_out_dim = self.encoder_output_dim_1 + self.encoder_output_dim_2
    self.decoder_output_dim = decoder_output_dim
    # TODO: AllenNLP's AdditiveAttention implementation may not have a bias term.
    self._attention_1 = AdditiveAttention(self.decoder_output_dim, self.encoder_output_dim_1)
    self._attention_2 = AdditiveAttention(self.decoder_output_dim, self.encoder_output_dim_2)
    if not align_embeddings:
        self.target_embedding_dim = target_embedding_dim
        self._target_vocab_size = self.vocab.get_vocab_size(namespace=self._target_namespace)
        self._target_embedder = Embedding(self._target_vocab_size, target_embedding_dim)
    else:
        self._target_embedder = self._source_embedder_1._token_embedders["tokens"]
        self._target_vocab_size = self.vocab.get_vocab_size(namespace=self._target_namespace)
        self.target_embedding_dim = self._target_embedder.get_output_dim()
    self.decoder_input_dim = (self.encoder_output_dim_1 + self.encoder_output_dim_2 +
                              self.target_embedding_dim)
    self._decoder_cell = LSTMCell(self.decoder_input_dim, self.decoder_output_dim)
    # Projects the concatenated final hidden states of the two encoders into the
    # decoder's initial state.
    self._encoder_out_projection_layer = torch.nn.Linear(
        in_features=self.cated_encoder_out_dim,
        out_features=self.decoder_output_dim)  # TODO: bias - True or False?
    # Soft gating parameters, used to compute lambda.
    self._gate_projection_layer = torch.nn.Linear(
        in_features=self.decoder_output_dim + self.decoder_input_dim,
        out_features=1,
        bias=False)
    self._start_index = self.vocab.get_token_index(START_SYMBOL, namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, namespace)
    self._pad_index = self.vocab.get_token_index(self.vocab._padding_token, namespace)
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)
    self._tensor_based_metric = tensor_based_metric or \
        BLEU(exclude_indices={self._pad_index, self._end_index, self._start_index})
def __init__(
    self,
    vocab: Vocabulary,
    source_embedder: TextFieldEmbedder,
    encoder: Seq2SeqEncoder,
    vecoder: Seq2VecEncoder,
    sen_encoder: Seq2VecEncoder,
    max_decoding_steps: int = 32,
    attention: Attention = None,
    beam_size: int = None,
    target_namespace: str = "tokens",
    scheduled_sampling_ratio: float = 0.5,
) -> None:
    super().__init__(vocab)
    self._target_namespace = target_namespace
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self.pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)
    self._max_decoding_steps = max_decoding_steps
    self.vocab = vocab

    # Dimensions and knowledge-graph data.
    self.sen_num = 10
    # with open('../data/0510/cy/kg_and_train.pk', 'rb') as f:
    with open('cy/openkg.pk', 'rb') as f:
        self.kg_mat = torch.tensor(pickle.load(f)).float()
    self.symp_mat = torch.nn.Parameter(self.kg_mat).cuda()
    self.evovl_mat = torch.zeros(len(self.kg_mat), len(self.kg_mat)).cuda()
    # with open('../data/0510/cy/comp_topic2num.pk', 'rb') as f:
    with open('cy/comp_topic2num.pk', 'rb') as f:
        self.word_idx = pickle.load(f)
    self.idx_word = {v: k for k, v in self.word_idx.items()}
    self.vocab_to_idx = {}
    self.idx_to_vocab_list = []
    self.vocab_list = []
    for word, k in self.word_idx.items():
        self.vocab_to_idx[vocab.get_token_index(word.strip())] = k
        self.idx_to_vocab_list.append(vocab.get_token_index(word.strip()))
    self.symp_size = len(self.symp_mat) + self.sen_num
    self.topic = len(self.symp_mat)
    self._encoder = encoder
    self._vecoder = vecoder
    self._sen_encoder = sen_encoder
    self.outfeature = self._sen_encoder.get_output_dim()

    # Graph parameters.
    self.symp_state = torch.nn.Parameter(torch.Tensor(self.symp_size, self.outfeature))
    torch.nn.init.xavier_uniform_(self.symp_state, gain=1.414)
    self.predict_layer = torch.nn.Parameter(torch.Tensor(self.symp_size, self.outfeature))
    self.predict_bias = torch.nn.Parameter(torch.Tensor(self.symp_size))
    torch.nn.init.kaiming_uniform_(self.predict_layer)
    torch.nn.init.uniform_(self.predict_bias, -1 / self.symp_size**0.5, 1 / self.symp_size**0.5)
    self.attn_one = GATAttention(self.outfeature, self.outfeature, 1)
    self.attn_two = GATAttention(self.outfeature, self.outfeature, 1)
    self.attn_three = GATAttention(self.outfeature, self.outfeature, 1)

    # Metrics.
    self.kd_metric = KD_Metric()
    self.bleu_aver = NLTK_BLEU(ngram_weights=(0.25, 0.25, 0.25, 0.25))
    self.bleu1 = NLTK_BLEU(ngram_weights=(1, 0, 0, 0))
    self.bleu2 = NLTK_BLEU(ngram_weights=(0, 1, 0, 0))
    self.bleu4 = NLTK_BLEU(ngram_weights=(0, 0, 0, 1))
    self.topic_acc = Average()

    # Modules.
    self._source_embedder = source_embedder
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    target_embedding_dim = source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    # 600; consider replacing both of these dims with outfeature.
    self._encoder_output_dim = self._encoder.get_output_dim()
    self._decoder_output_dim = self._encoder_output_dim * 2
    self._decoder_input_dim = target_embedding_dim
    self._attention = None
    if attention:
        self._attention = attention
        self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
    # Try fusing that embedding in here?
    self.before_linear = Linear(2 * self.outfeature, self.outfeature)
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._output_projection_layer = Linear(self.outfeature * 2, num_classes)
    self.linear_all = Linear(self.outfeature * 3 + self._decoder_input_dim, 1)
    self.attention_linear = Linear(self.outfeature, self.outfeature)
    self.decoder_linear = Linear(self.outfeature * 2, self.outfeature)
    self.get_attn = Linear(self.outfeature, 1, bias=False)
    self.topic_acc = MyAverage()
    self.topic_rec = MyAverage()
    self.topic_f1 = F1()
    self.dink1 = Distinct1()
    self.dink2 = Distinct2()
    self.last_sen = 2
def __init__(
    self,
    vocab: Vocabulary,
    source_embedder: TextFieldEmbedder,      # just Embedding layer
    encoder1: Seq2SeqEncoder,                # user encoder
    encoder2: Seq2SeqEncoder,                # system encoder
    attention: Attention,                    # decoding attention
    max_decoding_steps: int = 200,           # max timesteps of decoder
    beam_size: int = 3,                      # beam search parameter
    target_namespace: str = "target_tokens", # two separate vocabularies
    target_embedding_dim: int = None,        # target word embedding dimension
    scheduled_sampling_ratio: float = 0.,    # maybe unnecessary
    projection_dim: int = None,
    use_coverage: bool = False,              # coverage penalty, optional
    coverage_loss_weight: float = None,
    domain_lambda: float = 0.5,              # penalty weight in the final loss; needs tuning
    initializer: InitializerApplicator = InitializerApplicator()
) -> None:
    super(SPNet, self).__init__(vocab)
    # General variables.
    # target_namespace: target_tokens; source_namespace: tokens.
    self._target_namespace = target_namespace
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self._source_unk_index = self.vocab.get_token_index(DEFAULT_OOV_TOKEN)
    self._target_unk_index = self.vocab.get_token_index(DEFAULT_OOV_TOKEN, self._target_namespace)
    self._source_vocab_size = self.vocab.get_vocab_size()
    self._target_vocab_size = self.vocab.get_vocab_size(self._target_namespace)

    # Encoder setting.
    self._source_embedder = source_embedder
    self._encoder1 = encoder1
    self._encoder2 = encoder2
    # We assume that the two encoders have the same hidden state size.
    self._encoder_output_dim = self._encoder1.get_output_dim()

    # Decoder setting.
    self._target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
    self._num_classes = self.vocab.get_vocab_size(self._target_namespace)
    self._target_embedder = Embedding(self._num_classes, self._target_embedding_dim)
    self._decoder_input_dim = self._encoder_output_dim * 2  # defaults to the decoder_output_dim
    # Input projection of the decoder: [context_attn, target_emb] -> [decoder_input_dim].
    self._input_projection_layer = Linear(
        self._target_embedding_dim + self._encoder_output_dim * 2,
        self._decoder_input_dim)
    self._decoder_output_dim = self._encoder_output_dim * 2
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._projection_dim = projection_dim or self._source_embedder.get_output_dim()
    self._output_projection_layer = Linear(self._decoder_output_dim, self._num_classes)
    self._p_gen_layer = Linear(
        self._encoder_output_dim * 2 + self._decoder_output_dim * 2 + self._decoder_input_dim, 1)
    self._attention = attention

    # Coverage penalty setting.
    self._use_coverage = use_coverage
    self._coverage_loss_weight = coverage_loss_weight
    self._eps = 1e-45

    # Decoding strategy setting.
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    self._max_decoding_steps = max_decoding_steps
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)

    # Multitasking: domain classification.
    self._domain_penalty = domain_lambda  # penalty term, 0.5 by default
    self._classifier_params = Params({
        "input_dim": self._decoder_output_dim,
        "hidden_dims": [128, 7],
        "activations": ["relu", "linear"],
        "dropout": [0.2, 0.0],
        "num_layers": 2
    })
    self._domain_classifier = FeedForward.from_params(self._classifier_params)
    initializer(self)
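# A minimal sketch of the generation-probability gate implied by _p_gen_layer:
# p_gen blends the vocabulary distribution with the copy (attention) distribution,
# as in pointer-generator networks. The exact feature concatenation is an
# assumption; dimensions are illustrative.
import torch
import torch.nn as nn

context = torch.randn(2, 512)     # attended context (encoder_output_dim * 2)
dec_state = torch.randn(2, 1024)  # decoder hidden and cell states, concatenated
dec_input = torch.randn(2, 512)   # current decoder input

p_gen_layer = nn.Linear(512 + 1024 + 512, 1)
p_gen = torch.sigmoid(p_gen_layer(torch.cat([context, dec_state, dec_input], dim=-1)))

vocab_dist = torch.softmax(torch.randn(2, 1000), dim=-1)
copy_dist = torch.softmax(torch.randn(2, 1000), dim=-1)  # already scattered onto vocab ids
final_dist = p_gen * vocab_dist + (1 - p_gen) * copy_dist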
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             target_namespace: str = "tokens",
             target_embedder: TextFieldEmbedder = None,
             attention_function: SimilarityFunction = None,
             scheduled_sampling_ratio: float = 0.25) -> None:
    super(PointerGeneratorPattern, self).__init__(vocab)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    self._pattern_pos = [
        '@@np@@', '@@ns@@', '@@ni@@', '@@nz@@', '@@m@@', '@@i@@',
        '@@id@@', '@@t@@', '@@j@@'
    ]
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    self._target_embedder = target_embedder or source_embedder
    # NOTE: attention is applied to the decoder output, not the decoder input.
    self._decoder_input_dim = self._target_embedder.get_output_dim()
    # The decoder uses a unidirectional LSTM while the encoder uses a BiLSTM.
    self._decoder_hidden_dim = self._encoder.get_output_dim()
    # Decoder h0/c0 projection layers from the final encoder output.
    self.decode_h0_projection_layer = Linear(self._encoder.get_output_dim(),
                                             self._decoder_hidden_dim)
    self.decode_c0_projection_layer = Linear(self._encoder.get_output_dim(),
                                             self._decoder_hidden_dim)
    self._decoder_attention = Attention(self._attention_function)
    # The output of attention, a weighted average over encoder outputs, will be
    # concatenated to the decoder hidden state at each time step: V[s_t, h*_t] + b.
    self._decoder_output_dim = self._decoder_hidden_dim + self._encoder.get_output_dim()  # [s_t, h*_t]
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_hidden_dim)
    self._output_attention_layer = Linear(self._decoder_output_dim,
                                          self._decoder_hidden_dim)  # V[s_t, h*_t] + b
    self._output_projection_layer = Linear(self._decoder_hidden_dim, num_classes)  # num_classes -> V'
    # Generation probability.
    self._pointer_gen_layer = Linear(
        self._decoder_hidden_dim + self._encoder.get_output_dim() + self._decoder_input_dim, 1)
    # Metrics.
    self.metrics = {
        "ROUGE-1": Rouge(1),
        "ROUGE-2": Rouge(2),
    }
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             target_namespace: str,
             encoder: Seq2SeqEncoder,
             decoder: Dict,
             max_decoding_steps: int,
             target_embedding_dim: int = None,
             attention: Dict = None,
             beam_size: int = None,
             scheduled_sampling_ratio: float = 0.,
             use_bleu: bool = True,
             visualize_attention: bool = True) -> None:
    super(NmtSeq2Seq, self).__init__(vocab)
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    self._target_namespace = target_namespace
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    if use_bleu:
        pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)  # pylint: disable=protected-access
        self._bleu = BLEU(exclude_indices={pad_index, self._end_index, self._start_index})
    else:
        self._bleu = None
    # At prediction time, we use a beam search to find the most likely sequence of target tokens.
    beam_size = beam_size or 1
    self._max_decoding_steps = max_decoding_steps
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)
    # Dense embedding of source vocab tokens.
    self._source_embedder = source_embedder
    # Encodes the sequence of source embeddings into a sequence of hidden states.
    self._encoder = encoder
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # Attention mechanism params applied to the encoder output for each step.
    self._attention = attention
    self._visualize_attention = visualize_attention
    # Dense embedding of vocab words in the target space.
    target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    self._encoder_output_dim = self._encoder.get_output_dim()
    self._decoder_input_dim = decoder["input_size"]
    # If using attention, make sure the .jsonnet params reflect this architecture:
    # input_to_decoder_rnn = [prev_word + attended_context_vector].
    self._decoder_output_dim = decoder["hidden_size"]
    # We'll use an RNN cell as the recurrent cell that produces a hidden state
    # for the decoder at each time step.
    decoder_cell_type = decoder["type"]
    if decoder_cell_type == "gru":
        self._decoder_cell = GRUCell(self._decoder_input_dim, self._decoder_output_dim)
    elif decoder_cell_type == "lstm":
        self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    else:
        raise ValueError("Decoder cell of type {} not supported yet!".format(decoder_cell_type))
    # We project the hidden state from the decoder into the output vocabulary space
    # in order to get log probabilities of each target token, at each time step.
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             extra_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             extra_encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             beam_size: int = None,
             target_namespace: str = "tokens",
             target_embedding_dim: int = None,
             scheduled_sampling_ratio: float = 0.,
             use_bleu: bool = True) -> None:
    super(InformedSeq2Seq, self).__init__(vocab)
    self._target_namespace = target_namespace
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    if use_bleu:
        pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)  # pylint: disable=protected-access
        self._bleu = BLEU(exclude_indices={pad_index, self._end_index, self._start_index})
    else:
        self._bleu = None
    # At prediction time, we use a beam search to find the most likely sequence of target tokens.
    beam_size = beam_size or 1
    self._max_decoding_steps = max_decoding_steps
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)
    # Dense embedding of source vocab tokens.
    self._source_embedder = source_embedder
    self._extra_embedder = extra_embedder
    # Encodes the sequence of source embeddings into a sequence of hidden states.
    self._encoder = encoder
    self._extra_encoder = extra_encoder
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # Dense embedding of vocab words in the target space.
    # TODO: target_embedding_dim should be the size of the concatenated vector.
    target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    # TODO: encoder_output_dim should be the size of the concatenated vector.
    self._encoder_output_dim = self._encoder.get_output_dim()
    self._decoder_output_dim = self._encoder_output_dim
    self._decoder_input_dim = target_embedding_dim
    # We'll use an LSTM cell as the recurrent cell that produces a hidden state
    # for the decoder at each time step.
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    # We project the hidden state from the decoder into the output vocabulary space
    # in order to get log probabilities of each target token, at each time step.
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
def __init__(self,
             vocab: Vocabulary,
             source_embedding: Embedding,
             target_embedding: Embedding,
             encoder: Seq2SeqEncoder,
             target_namespace: str,
             max_decoding_steps: int,
             attention: Attention = None,
             attention_function: SimilarityFunction = None,
             beam_size: int = None,
             scheduled_sampling_ratio: float = 0.) -> None:
    super().__init__(vocab)
    self._target_namespace = target_namespace
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    # Dense embedding of source vocab tokens.
    self._source_embedding = source_embedding
    # Encodes the sequence of source embeddings into a sequence of hidden states.
    self._encoder = encoder
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # Attention mechanism applied to the encoder output for each step.
    if attention:
        if attention_function:
            raise ConfigurationError("You can only specify an attention module or an "
                                     "attention function, but not both.")
        self._attention = attention
    elif attention_function:
        self._attention = LegacyAttention(attention_function)
    else:
        self._attention = None
    # Dense embedding of vocab words in the target space.
    self._target_embedding = target_embedding
    target_embedding_dim = self._target_embedding.get_output_dim()
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    self._encoder_output_dim = self._encoder.get_output_dim()
    self._decoder_output_dim = self._encoder_output_dim
    if self._attention:
        # If using attention, a weighted average over encoder output will be concatenated
        # to the previous target embedding to form the input to the decoder at each
        # time step.
        self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
    else:
        # Otherwise, the input to the decoder is just the previous target embedding.
        self._decoder_input_dim = target_embedding_dim
    # We'll use an LSTM cell as the recurrent cell that produces a hidden state
    # for the decoder at each time step.
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    # We project the hidden state from the decoder into the output vocabulary space
    # in order to get log probabilities of each target token, at each time step.
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
    # At prediction time, we can use a beam search to find the most likely sequence of target tokens.
    # If the beam_size parameter is not given, we'll just use a greedy search (equivalent to beam_size = 1).
    self._max_decoding_steps = max_decoding_steps
    if beam_size is not None:
        self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)
    else:
        self._beam_search = None
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             attention: Attention,
             max_decoding_steps: int,
             beam_size: int = None,
             target_namespace: str = "tokens",
             target_embedding_dim: int = None,
             scheduled_sampling_ratio: float = 0.,
             projection_dim: int = None,
             use_coverage: bool = False,
             coverage_shift: float = 0.,
             coverage_loss_weight: float = None,
             embed_attn_to_output: bool = False) -> None:
    super(PointerGeneratorNetwork, self).__init__(vocab)
    self._target_namespace = target_namespace
    self._start_index = self.vocab.get_token_index(START_SYMBOL, target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, target_namespace)
    self._unk_index = self.vocab.get_token_index(DEFAULT_OOV_TOKEN, target_namespace)
    self._vocab_size = self.vocab.get_vocab_size(target_namespace)
    assert self._vocab_size > 2, \
        "Target vocabulary is empty. Make sure 'target_namespace' option of the model is correct."
    # Encoder
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._encoder_output_dim = self._encoder.get_output_dim()
    # Decoder
    self._target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
    self._num_classes = self.vocab.get_vocab_size(target_namespace)
    self._target_embedder = Embedding(self._num_classes, self._target_embedding_dim)
    self._decoder_input_dim = self._encoder_output_dim + self._target_embedding_dim
    self._decoder_output_dim = self._encoder_output_dim
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._projection_dim = projection_dim or self._source_embedder.get_output_dim()
    hidden_projection_dim = (self._decoder_output_dim if not embed_attn_to_output
                             else self._decoder_output_dim * 2)
    self._hidden_projection_layer = Linear(hidden_projection_dim, self._projection_dim)
    self._output_projection_layer = Linear(self._projection_dim, self._num_classes)
    self._p_gen_layer = Linear(self._decoder_output_dim * 3 + self._decoder_input_dim, 1)
    self._attention = attention
    self._use_coverage = use_coverage
    self._coverage_loss_weight = coverage_loss_weight
    self._eps = 1e-31
    self._embed_attn_to_output = embed_attn_to_output
    self._coverage_shift = coverage_shift
    # Metrics
    self._p_gen_sum = 0.0
    self._p_gen_iterations = 0
    self._coverage_loss_sum = 0.0
    self._coverage_iterations = 0
    # Decoding
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    self._max_decoding_steps = max_decoding_steps
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps,
                                   beam_size=beam_size or 1)
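# A minimal sketch of the coverage mechanism that use_coverage and
# coverage_loss_weight enable: coverage is the running sum of past attention
# distributions, and the loss penalizes re-attending to already-covered source
# positions (per See et al., 2017; shapes are illustrative assumptions).
import torch

src_len, num_steps = 6, 4
coverage = torch.zeros(src_len)
coverage_loss = torch.tensor(0.0)
for _ in range(num_steps):
    attn = torch.softmax(torch.randn(src_len), dim=-1)
    coverage_loss = coverage_loss + torch.minimum(attn, coverage).sum()
    coverage = coverage + attn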
def __init__(self,
             vocab: Vocabulary,
             bert_model: BertQA,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             attention: Attention,
             beam_size: int,
             max_decoding_steps: int,
             target_embedding_dim: int = 30,
             copy_token: str = "@COPY@",
             source_namespace: str = "source_tokens",
             target_namespace: str = "target_tokens",
             tensor_based_metric: Metric = None,
             token_based_metric: Metric = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             dropout: float = 0.0) -> None:
    super().__init__(vocab)
    self.bert_model = bert_model
    self._source_namespace = source_namespace
    self._target_namespace = target_namespace
    self._src_start_index = self.vocab.get_token_index(START_SYMBOL, self._source_namespace)
    self._src_end_index = self.vocab.get_token_index(END_SYMBOL, self._source_namespace)
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self._oov_index = self.vocab.get_token_index(self.vocab._oov_token, self._target_namespace)  # pylint: disable=protected-access
    self._pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)  # pylint: disable=protected-access
    self._copy_index = self.vocab.add_token_to_namespace(copy_token, self._target_namespace)
    self._tensor_based_metric = tensor_based_metric or \
        BLEU(exclude_indices={self._pad_index, self._end_index, self._start_index})
    self._token_based_metric = token_based_metric
    self._action_accuracy = CategoricalAccuracy()
    self._target_vocab_size = self.vocab.get_vocab_size(self._target_namespace)
    # Encoding modules.
    self._source_embedder = source_embedder
    self._encoder = encoder
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    # We arbitrarily set the decoder's input dimension to be the same as the output dimension.
    self.encoder_output_dim = self._encoder.get_output_dim()
    self.decoder_output_dim = self.encoder_output_dim
    self.decoder_input_dim = self.decoder_output_dim
    embedding_dim = self.bert_model._text_field_embedder.get_output_dim()
    self._action_predictor = Linear(embedding_dim, 4)
    self._init_decoder_projection = Linear(self.encoder_output_dim, self.decoder_output_dim)
    target_vocab_size = self.vocab.get_vocab_size(self._target_namespace)
    # The decoder input will be a function of the embedding of the previous predicted token,
    # an attended encoder hidden state called the "attentive read", and another
    # weighted sum of the encoder hidden state called the "selective read".
    # While the weights for the attentive read are calculated by an `Attention` module,
    # the weights for the selective read are simply the predicted probabilities
    # corresponding to each token in the source sentence that matches the target
    # token from the previous timestep.
    self._target_embedder = Embedding(target_vocab_size, target_embedding_dim)
    self._attention = attention
    self._input_projection_layer = Linear(
        target_embedding_dim + self.encoder_output_dim * 2,
        self.decoder_input_dim)
    # We then run the projected decoder input through an LSTM cell to produce
    # the next hidden state.
    self._decoder_cell = LSTMCell(self.decoder_input_dim, self.decoder_output_dim)
    # We create a "generation" score for each token in the target vocab
    # with a linear projection of the decoder hidden state.
    self._output_generation_layer = Linear(self.decoder_output_dim, target_vocab_size)
    # We create a "copying" score for each source token by applying a non-linearity
    # (tanh) to a linear projection of the encoded hidden state for that token,
    # and then taking the dot product of the result with the decoder hidden state.
    self._output_copying_layer = Linear(self.encoder_output_dim, self.decoder_output_dim)
    # At prediction time, we'll use a beam search to find the best target sequence.
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)
    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
    initializer(self)
def __init__(
    self,
    task: str,
    vocab: Vocabulary,
    input_dim: int,
    max_decoding_steps: int,
    loss_weight: float = 1.0,
    attention: Attention = None,
    beam_size: int = None,
    target_namespace: str = "target_tokens",
    target_embedding_dim: int = None,
    scheduled_sampling_ratio: float = 0.0,
    use_bleu: bool = True,
    bleu_ngram_weights: Iterable[float] = (0.25, 0.25, 0.25, 0.25),
    target_decoder_layers: int = 1,
    **kwargs,
) -> None:
    super().__init__(vocab, **kwargs)
    self.task = task
    self.vocab = vocab
    self.loss_weight = loss_weight
    self._target_namespace = task + '_target_words'
    self._target_decoder_layers = target_decoder_layers
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    if use_bleu:
        pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)
        self._bleu = BLEU(bleu_ngram_weights,
                          exclude_indices={pad_index, self._end_index, self._start_index})
    else:
        self._bleu = None
    self.metrics = {"bleu": self._bleu}
    # At prediction time, we use a beam search to find the most likely sequence of target tokens.
    beam_size = beam_size or 1
    self._max_decoding_steps = max_decoding_steps
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)
    num_classes = self.vocab.get_vocab_size(namespace=self._target_namespace)
    # Attention mechanism applied to the encoder output for each step.
    self._attention = attention
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    self._encoder_output_dim = input_dim
    self._decoder_output_dim = self._encoder_output_dim
    target_embedding_dim = target_embedding_dim or self._encoder_output_dim
    # Dense embedding of vocab words in the target space.
    self._target_embedder = Embedding(num_embeddings=num_classes,
                                      embedding_dim=target_embedding_dim)
    if self._attention:
        # If using attention, a weighted average over encoder outputs will be concatenated
        # to the previous target embedding to form the input to the decoder at each
        # time step.
        self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
    else:
        # Otherwise, the input to the decoder is just the previous target embedding.
        self._decoder_input_dim = target_embedding_dim
    # We'll use an LSTM cell as the recurrent cell that produces a hidden state
    # for the decoder at each time step.
    if self._target_decoder_layers > 1:
        self._decoder_cell = LSTM(
            self._decoder_input_dim,
            self._decoder_output_dim,
            self._target_decoder_layers,
        )
    else:
        self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    # We project the hidden state from the decoder into the output vocabulary space
    # in order to get log probabilities of each target token, at each time step.
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
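# A minimal sketch of why the multi-layer branch above uses nn.LSTM rather than
# LSTMCell: LSTMCell is single-layer, so a stacked decoder steps an nn.LSTM one
# timestep at a time with a length-1 sequence. Shapes are illustrative assumptions.
import torch
import torch.nn as nn

num_layers, batch, input_dim, hidden_dim = 2, 3, 128, 256
decoder = nn.LSTM(input_dim, hidden_dim, num_layers)
h = torch.zeros(num_layers, batch, hidden_dim)
c = torch.zeros(num_layers, batch, hidden_dim)

step_input = torch.randn(batch, input_dim).unsqueeze(0)  # (seq_len=1, batch, input_dim)
output, (h, c) = decoder(step_input, (h, c))             # output: (1, batch, hidden_dim)
decoder_hidden = output.squeeze(0)                       # this timestep's top-layer hidden state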
def __init__(
    self,
    vocab: Vocabulary,
    source_embedder: TextFieldEmbedder,
    encoder: Seq2SeqEncoder,
    max_decoding_steps: int,
    target_namespace: str = "tokens",
    target_embedding_dim: int = None,
    attention_function: SimilarityFunction = None,
    scheduled_sampling_ratio: float = 0.0,
    weight_function="softmax",
    gumbel_tau: float = 0.66,
    gumbel_hard: bool = True,
    gumbel_eps: float = 1e-10,
    infer_with: str = "distribution",
    self_feed_with: str = "argmax_distribution",
) -> None:
    super(Rnn2RnnDifferentiableNll, self).__init__(vocab)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
    # we're using attention with ``DotProductSimilarity``, this is needed.
    self._decoder_output_dim = self._encoder.get_output_dim()
    target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    if self._attention_function:
        self._decoder_attention = LegacyAttention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
    else:
        self._decoder_input_dim = target_embedding_dim
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
    self._weights_calculation_function = weight_function
    self._gumbel_tau = gumbel_tau
    self._gumbel_hard = gumbel_hard
    self._gumbel_eps = gumbel_eps
    if self_feed_with not in {"distribution", "argmax_logits", "argmax_distribution",
                              "detach_distribution"}:
        raise ValueError("Allowed values for self_feed_with are "
                         "{distribution, argmax_logits, argmax_distribution, detach_distribution}")
    if infer_with not in {"distribution", "argmax_logits", "argmax_distribution"}:
        raise ValueError("Allowed values for infer_with are "
                         "{distribution, argmax_logits, argmax_distribution}")
    self._infer_with = infer_with
    self._self_feed_with = self_feed_with
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             max_decoding_steps: int,
             seq_metrics: Metric,
             attention: Attention,
             beam_size: int = None,
             source_namespace: str = 'source_tokens',
             target_namespace: str = "tokens",
             target_embedding_dim: int = None,
             scheduled_sampling_ratio: float = 0.,
             use_bleu: bool = False,
             encoder_input_dropout: float = 0.0,
             encoder_output_dropout: float = 0.0,
             dropout: float = 0.0,
             feed_output_attention_to_decoder: bool = False,
             keep_decoder_output_dim_same_as_encoder: bool = True,
             initializer: InitializerApplicator = InitializerApplicator()) -> None:
    super(RecombinationSeq2SeqWithCopy, self).__init__(vocab)
    self._source_namespace = source_namespace
    self._target_namespace = target_namespace
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self._pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)  # pylint: disable=protected-access
    # Evaluation metrics.
    if use_bleu:
        pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace)  # pylint: disable=protected-access
        self._bleu = BLEU(exclude_indices={pad_index, self._end_index, self._start_index})
    else:
        self._bleu = None
    self._seq_metric = seq_metrics
    # At prediction time, we use a beam search to find the most likely sequence of target tokens.
    beam_size = beam_size or 1
    self._max_decoding_steps = max_decoding_steps
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)
    # Dense embedding of source vocab tokens.
    self._source_embedder = source_embedder
    # Encoder: encodes the sequence of source embeddings into a sequence of hidden states.
    self._encoder = encoder
    self._encoder_output_dim = self._encoder.get_output_dim()
    # Attention mechanism applied to the encoder output for each step.
    self._attention = attention
    # Decoder: dense embedding of vocab words in the target space.
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    self._num_classes = num_classes
    target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    self._feed_output_attention_to_decoder = feed_output_attention_to_decoder
    if self._feed_output_attention_to_decoder:
        # If using attention, a weighted average over encoder outputs will be concatenated
        # to the previous target embedding to form the input to the decoder at each
        # time step.
        self._decoder_input_dim = self._encoder_output_dim + target_embedding_dim
    else:
        # Otherwise, the input to the decoder is just the previous target embedding.
        self._decoder_input_dim = target_embedding_dim
    # TODO: relax this assumption.
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
self._keep_decoder_output_dim_same_as_encoder = keep_decoder_output_dim_same_as_encoder if not self._keep_decoder_output_dim_same_as_encoder: self._decoder_output_dim = int(self._encoder_output_dim / 2) if encoder.is_bidirectional() \ else self._encoder_output_dim else: self._decoder_output_dim = self._encoder_output_dim self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim) self._transform_decoder_init_state = torch.nn.Sequential( torch.nn.Linear(self._encoder_output_dim, self._decoder_output_dim), torch.nn.Tanh() ) # Generate Score self._output_projection_layer = Linear(self._decoder_output_dim + self._encoder_output_dim, num_classes) # Dropout Layers self._encoder_input_dropout = torch.nn.Dropout(p=encoder_input_dropout) self._encoder_output_dropout = torch.nn.Dropout(p=encoder_output_dropout) self._output_dropout = torch.nn.Dropout(p=dropout) self._embedded_dropout = torch.nn.Dropout(p=dropout) initializer(self)
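# Editor's note (sketch): how the Linear + Tanh bridge defined above is typically applied
# to seed the decoder from the encoder. `init_decoder_state` is a hypothetical helper,
# not part of the model above.
import torch

def init_decoder_state(final_encoder_output: torch.Tensor,
                       transform_decoder_init_state: torch.nn.Module,
                       decoder_output_dim: int):
    # (batch_size, decoder_output_dim): project (and squash) the encoder's final state,
    # which matters when the encoder is bidirectional and the dimensions differ.
    decoder_hidden = transform_decoder_init_state(final_encoder_output)
    # The LSTMCell memory starts at zero; only the hidden state is inherited.
    decoder_context = final_encoder_output.new_zeros(
        final_encoder_output.size(0), decoder_output_dim)
    return decoder_hidden, decoder_context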
def __init__(self, vocab: Vocabulary, source_embedder: TextFieldEmbedder, encoder: Seq2SeqEncoder, attention: Attention, beam_size: int, max_decoding_steps: int, binary_pred_feature_dim: int = 0, language_flag_dim: int = 0, number_of_languages: int = 2, target_embedding_dim: int = 100, copy_token: str = "@COPY@", source_namespace: str = "source_tokens", target_namespace: str = "target_tokens", tensor_based_metric: Metric = None, token_based_metric: Metric = None) -> None: super().__init__(vocab) self._source_namespace = source_namespace self._target_namespace = target_namespace self._src_start_index = self.vocab.get_token_index( START_SYMBOL, self._source_namespace) self._src_end_index = self.vocab.get_token_index( END_SYMBOL, self._source_namespace) self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace) self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace) self._oov_index = self.vocab.get_token_index(self.vocab._oov_token, self._target_namespace) # pylint: disable=protected-access self._pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace) # pylint: disable=protected-access self._copy_index = self.vocab.add_token_to_namespace( copy_token, self._target_namespace) self._tensor_based_metric = tensor_based_metric or \ BLEU(exclude_indices={self._pad_index, self._end_index, self._start_index}) self._token_based_metric = token_based_metric self._target_vocab_size = self.vocab.get_vocab_size( self._target_namespace) # There are exactly two features for the verb predicate embedding (Verb or Non-Verb). if binary_pred_feature_dim > 0: self._binary_feature_embedding = Embedding( 2, binary_pred_feature_dim) else: self._binary_feature_embedding = None # Language token embeddings. if language_flag_dim > 0: self._language_embedding = Embedding(number_of_languages, language_flag_dim) else: self._language_embedding = None # Encoding modules self._source_embedder = source_embedder self._encoder = encoder # Decoder output dim needs to be the same as the encoder output dim since we initialize the # hidden state of the decoder with the final hidden state of the encoder. # We arbitrarily set the decoder's input dimension to be the same as the output dimension. self.encoder_output_dim = self._encoder.get_output_dim() self.decoder_output_dim = self.encoder_output_dim self.decoder_input_dim = self.decoder_output_dim target_vocab_size = self.vocab.get_vocab_size(self._target_namespace) # The decoder input will be a function of the embedding of the previous predicted token, # an attended encoder hidden state called the "attentive read", and another # weighted sum of the encoder hidden state called the "selective read". # While the weights for the attentive read are calculated by an `Attention` module, # the weights for the selective read are simply the predicted probabilities # corresponding to each token in the source sentence that matches the target # token from the previous timestep. self._target_embedder = Embedding(target_vocab_size, target_embedding_dim) self._attention = attention self._input_projection_layer = Linear( target_embedding_dim + language_flag_dim + self.encoder_output_dim * 2, self.decoder_input_dim) self._language_dec_indicator = None self._beam_size = beam_size # We then run the projected decoder input through an LSTM cell to produce # the next hidden state.
self._decoder_cell = LSTMCell(self.decoder_input_dim, self.decoder_output_dim) # We create a "generation" score for each token in the target vocab # with a linear projection of the decoder hidden state. self._output_generation_layer = Linear(self.decoder_output_dim, target_vocab_size) # We create a "copying" score for each source token by applying a non-linearity # (tanh) to a linear projection of the encoded hidden state for that token, # and then taking the dot product of the result with the decoder hidden state. self._output_copying_layer = Linear(self.encoder_output_dim, self.decoder_output_dim) # At prediction time, we'll use a beam search to find the best target sequence. self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)
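# Editor's note (sketch): how the two projections above combine at one decoding step,
# CopyNet-style. Variable names are illustrative; this is not the model's actual method.
import torch

def generation_and_copy_scores(decoder_hidden: torch.Tensor,
                               encoder_outputs: torch.Tensor,
                               output_generation_layer: torch.nn.Module,
                               output_copying_layer: torch.nn.Module):
    # (batch_size, target_vocab_size): one generation score per target-vocab token.
    generation_scores = output_generation_layer(decoder_hidden)
    # (batch_size, source_len, decoder_output_dim): tanh of a projection of each
    # encoded source token.
    copy_projection = torch.tanh(output_copying_layer(encoder_outputs))
    # (batch_size, source_len): dot product of each projected source state with the
    # decoder hidden state gives one copy score per source position.
    copy_scores = copy_projection.bmm(decoder_hidden.unsqueeze(-1)).squeeze(-1)
    return generation_scores, copy_scores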
def __init__(self, vocab: Vocabulary, source_embedder: TextFieldEmbedder, encoder: Seq2SeqEncoder, max_decoding_steps: int, target_namespace: str = "tokens", target_embedder: TextFieldEmbedder = None, attention_function: SimilarityFunction = None, scheduled_sampling_ratio: float = 0.25, pointer_gen: bool = True, language_model: bool = True, max_oovs: int = None) -> None: super(PointerGenerator, self).__init__(vocab) self._source_embedder = source_embedder self._encoder = encoder self._max_decoding_steps = max_decoding_steps self._target_namespace = target_namespace self._attention_function = attention_function self._scheduled_sampling_ratio = scheduled_sampling_ratio self._pointer_gen = pointer_gen self._language_model = language_model if self._pointer_gen: self._max_oovs = max_oovs self.vocab.set_max_oovs(self._max_oovs) # We need the start symbol to provide as the input at the first timestep of decoding, and # end symbol as a way to indicate the end of the decoded sequence. self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace) self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace) num_classes = self.vocab.get_vocab_size(self._target_namespace) self._target_embedder = target_embedder or source_embedder # NOTE: attention is applied to the decoder output, not to the decoder input. self._decoder_input_dim = self._target_embedder.get_output_dim() # The decoder is a unidirectional LSTM while the encoder is bidirectional, so the decoder # hidden size is half the encoder output size. self._decoder_hidden_dim = self._encoder.get_output_dim() // 2 # Projection layers producing the decoder's initial h0 and c0 from the final encoder output. self.decode_h0_projection_layer = Linear(self._encoder.get_output_dim(), self._decoder_hidden_dim) self.decode_c0_projection_layer = Linear(self._encoder.get_output_dim(), self._decoder_hidden_dim) self._decoder_attention = Attention(self._attention_function) # The output of attention, a weighted average over encoder outputs, will be # concatenated to the decoder hidden state at each time step: V[s_t, h*_t] + b self._decoder_output_dim = self._decoder_hidden_dim + self._encoder.get_output_dim() # [s_t, h*_t] # TODO (pradeep): Do not hardcode decoder cell type. self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_hidden_dim) self._output_attention_layer = Linear(self._decoder_output_dim, self._decoder_hidden_dim) # V[s_t, h*_t] + b self._output_projection_layer = Linear(self._decoder_hidden_dim, num_classes) # num_classes -> V' # Generation probability p_gen. if self._pointer_gen: self._pointer_gen_layer = Linear(self._decoder_hidden_dim + self._encoder.get_output_dim() + self._decoder_input_dim, 1) # metrics self.metrics = { "ROUGE-1": Rouge(1), "ROUGE-2": Rouge(2), }
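# Editor's note (sketch): the mixture implied by `_pointer_gen_layer` above, following
# the pointer-generator recipe: p_gen = sigmoid(W[s_t; h*_t; x_t] + b) interpolates
# between generating from the vocabulary and copying from the attention distribution.
# All names below are illustrative.
import torch

def pointer_generator_mixture(p_gen_layer: torch.nn.Module,
                              s_t: torch.Tensor,        # decoder hidden state
                              h_star_t: torch.Tensor,   # attention context
                              x_t: torch.Tensor,        # decoder input embedding
                              vocab_dist: torch.Tensor,
                              attn_dist: torch.Tensor,
                              source_token_ids: torch.LongTensor,
                              extended_vocab_size: int) -> torch.Tensor:
    # (batch_size, 1): probability of generating from the fixed vocabulary.
    p_gen = torch.sigmoid(p_gen_layer(torch.cat((s_t, h_star_t, x_t), dim=-1)))
    # The extended vocabulary holds source OOVs; scatter copy mass onto source token ids.
    final_dist = vocab_dist.new_zeros(vocab_dist.size(0), extended_vocab_size)
    final_dist[:, :vocab_dist.size(1)] = p_gen * vocab_dist
    final_dist.scatter_add_(1, source_token_ids, (1 - p_gen) * attn_dist)
    return final_dist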
class SimpleSeq2Seq(Model): """ This ``SimpleSeq2Seq`` class is a :class:`Model` which takes a sequence, encodes it, and then uses the encoded representations to decode another sequence. You can use this as the basis for a neural machine translation system, an abstractive summarization system, or any other common seq2seq problem. The model here is simple, but should be a decent starting place for implementing recent models for these tasks. This ``SimpleSeq2Seq`` model takes an encoder (:class:`Seq2SeqEncoder`) as an input, and implements the functionality of the decoder. In this implementation, the decoder uses the encoder's outputs in two ways. The hidden state of the decoder is initialized with the output from the final time-step of the encoder, and when using attention, a weighted average of the outputs from the encoder is concatenated to the inputs of the decoder at every timestep. Parameters ---------- vocab : ``Vocabulary``, required Vocabulary containing source and target vocabularies. They may be under the same namespace (``tokens``) or the target tokens can have a different namespace, in which case it needs to be specified as ``target_namespace``. source_embedder : ``TextFieldEmbedder``, required Embedder for source side sequences encoder : ``Seq2SeqEncoder``, required The encoder of the "encoder/decoder" model max_decoding_steps : int, required Maximum length of decoded sequences target_namespace : str, optional (default = 'tokens') If the target side vocabulary is different from the source side's, you need to specify the target's namespace here. If not, we'll assume it is "tokens", which is also the default choice for the source side, and this might cause them to share vocabularies. target_embedding_dim : int, optional (default = source_embedding_dim) You can specify an embedding dimensionality for the target side. If not, we'll use the same value as the source embedder's. attention_function : ``SimilarityFunction``, optional (default = None) If you want to use attention to get a dynamic summary of the encoder outputs at each step of decoding, this is the function used to compute similarity between the decoder hidden state and encoder outputs. scheduled_sampling_ratio : float, optional (default = 0.0) At each timestep during training, we sample a random number between 0 and 1, and if it is not less than this value, we use the ground truth labels for the whole batch. Else, we use the predictions from the previous time step for the whole batch. If this value is 0.0 (default), this corresponds to teacher forcing, and if it is 1.0, it corresponds to not using target side ground truth labels. See the following paper for more information: Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks. Bengio et al., 2015.
""" def __init__(self, vocab: Vocabulary, source_embedder: TextFieldEmbedder, encoder: Seq2SeqEncoder, max_decoding_steps: int, target_namespace: str = "target_tags", target_embedding_dim: int = None, attention_function: SimilarityFunction = None, scheduled_sampling_ratio: float = 0.0, initializer: InitializerApplicator = InitializerApplicator(), regularizer: Optional[RegularizerApplicator] = None) -> None: super(SimpleSeq2Seq, self).__init__(vocab, regularizer) self._source_embedder = source_embedder self._encoder = encoder self._max_decoding_steps = max_decoding_steps self._target_namespace = target_namespace self._attention_function = attention_function self._scheduled_sampling_ratio = scheduled_sampling_ratio # We need the start symbol to provide as the input at the first timestep of decoding, and # end symbol as a way to indicate the end of the decoded sequence. self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace) self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace) num_classes = self.vocab.get_vocab_size(self._target_namespace) # Decoder output dim needs to be the same as the encoder output dim since we initialize the # hidden state of the decoder with that of the final hidden states of the encoder. Also, if # we're using attention with ``DotProductSimilarity``, this is needed. self._decoder_output_dim = self._encoder.get_output_dim() target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim( ) self._target_embedder = Embedding(num_classes, target_embedding_dim) if self._attention_function: self._decoder_attention = Attention(self._attention_function) # The output of attention, a weighted average over encoder outputs, will be # concatenated to the input vector of the decoder at each time step. self._decoder_input_dim = self._encoder.get_output_dim( ) + target_embedding_dim else: self._decoder_input_dim = target_embedding_dim # TODO (pradeep): Do not hardcode decoder cell type. 
self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim) # self._decoder_cell = GRUCell(self._decoder_input_dim, self._decoder_output_dim, bias=False) self._output_projection_layer = Linear(self._decoder_output_dim, num_classes) self.metrics = { "accuracy": CategoricalAccuracy(), "accuracy3": CategoricalAccuracy(top_k=3) } self.span_metric = SpanBasedF1Measure( vocab, tag_namespace=target_namespace, ignore_classes=[START_SYMBOL[2:], END_SYMBOL[2:]]) initializer(self) # Initialize forget gate encoder_parameters = self._encoder.state_dict() for pname in encoder_parameters: if 'bias_' in pname: print(pname) b = encoder_parameters[pname] l = len(b) b[l // 4:l // 2] = 1.0 decoder_parameters = self._decoder_cell.state_dict() for pname in decoder_parameters: if 'bias_' in pname: print(pname) b = decoder_parameters[pname] l = len(b) b[l // 4:l // 2] = 1.0 def _examine_source_indices(self, preindices): if not isinstance(preindices, numpy.ndarray): preindices = preindices.data.cpu().numpy() all_predicted_tokens = [] for indices in preindices: predicted_tokens = [ self.vocab.get_token_from_index(x, namespace="source_tokens") for x in list(indices) ] all_predicted_tokens.append(predicted_tokens) return all_predicted_tokens def _examine_target_indices(self, preindices): if not isinstance(preindices, numpy.ndarray): preindices = preindices.data.cpu().numpy() all_predicted_tokens = [] for indices in preindices: indices = list(indices) # Collect indices till the first end_symbol # if self._end_index in indices: # indices = indices[:indices.index(self._end_index)] predicted_tokens = [ self.vocab.get_token_from_index( x, namespace=self._target_namespace) for x in indices ] all_predicted_tokens.append(predicted_tokens) return all_predicted_tokens def _print_source_target_triplets(self, src, tgt, true_tgt): src = self._examine_source_indices(src) true_tgt = self._examine_target_indices(true_tgt) tgt = self._examine_target_indices(tgt) for i in [0, int(len(src) / 2), -1]: print('Source: ', ' '.join(src[i])) print('Target: ', ' '.join(tgt[i])) print('True target: ', ' '.join(true_tgt[i][1:])) print('') @overrides def forward( self, # type: ignore source_tokens: Dict[str, torch.LongTensor], target_tokens: Dict[str, torch.LongTensor] = None ) -> Dict[str, torch.Tensor]: # pylint: disable=arguments-differ """ Decoder logic for producing the entire target sequence. Parameters ---------- source_tokens : Dict[str, torch.LongTensor] The output of ``TextField.as_array()`` applied on the source ``TextField``. This will be passed through a ``TextFieldEmbedder`` and then through an encoder. target_tokens : Dict[str, torch.LongTensor], optional (default = None) Output of ``Textfield.as_array()`` applied on target ``TextField``. We assume that the target tokens are also represented as a ``TextField``. """ # embed() # (batch_size, input_sequence_length, encoder_output_dim) embedded_input = self._source_embedder(source_tokens) batch_size, _, _ = embedded_input.size() source_mask = get_text_field_mask(source_tokens) encoder_outputs = self._encoder(embedded_input, source_mask) final_encoder_output = encoder_outputs[:, -1] # (batch_size, encoder_output_dim) if target_tokens: targets = target_tokens["tokens"] target_sequence_length = targets.size()[1] # The last input from the target is either padding or the end symbol. Either way, we # don't have to process it. 
num_decoding_steps = target_sequence_length - 1 else: num_decoding_steps = self._max_decoding_steps decoder_hidden = final_encoder_output decoder_context = Variable(encoder_outputs.data.new().resize_( batch_size, self._decoder_output_dim).fill_(0)) last_predictions = None step_logits = [] step_probabilities = [] step_predictions = [] for timestep in range(num_decoding_steps): if self.training and all( torch.rand(1) >= self._scheduled_sampling_ratio): input_choices = targets[:, timestep] else: if timestep == 0: # For the first timestep, when we do not have targets, we input start symbols. # (batch_size,) input_choices = Variable( source_mask.data.new().resize_(batch_size).fill_( self._start_index)) else: input_choices = last_predictions decoder_input = self._prepare_decode_step_input( input_choices, decoder_hidden, encoder_outputs, source_mask) decoder_hidden, decoder_context = self._decoder_cell( decoder_input, (decoder_hidden, decoder_context)) # (batch_size, num_classes) output_projections = self._output_projection_layer(decoder_hidden) # list of (batch_size, 1, num_classes) step_logits.append(output_projections.unsqueeze(1)) class_probabilities = F.softmax(output_projections, dim=-1) _, predicted_classes = torch.max(class_probabilities, 1) step_probabilities.append(class_probabilities.unsqueeze(1)) last_predictions = predicted_classes # (batch_size, 1) step_predictions.append(last_predictions.unsqueeze(1)) # step_logits is a list containing tensors of shape (batch_size, 1, num_classes) # This is (batch_size, num_decoding_steps, num_classes) logits = torch.cat(step_logits, 1) class_probabilities = torch.cat(step_probabilities, 1) all_predictions = torch.cat(step_predictions, 1) output_dict = { "logits": logits, "class_probabilities": class_probabilities, "predictions": all_predictions } if target_tokens: target_mask = get_text_field_mask(target_tokens) loss = self._get_loss(logits, targets, target_mask) output_dict["loss"] = loss # TODO: Define metrics relevant_targets = targets[:, 1:].contiguous() # (batch_size, num_decoding_steps) relevant_mask = target_mask[:, 1:].contiguous() for metric in self.metrics.values(): metric(logits, relevant_targets, relevant_mask.float()) class_probabilities = logits * 0. for i, instance_tags in enumerate( all_predictions.cpu().data.numpy()): for j, tag_id in enumerate(instance_tags): class_probabilities[i, j, tag_id] = 1 self.span_metric(class_probabilities, relevant_targets, relevant_mask) self._print_source_target_triplets(source_tokens['tokens'], all_predictions, target_tokens['tokens']) return output_dict def _prepare_decode_step_input( self, input_indices: torch.LongTensor, decoder_hidden_state: torch.LongTensor = None, encoder_outputs: torch.LongTensor = None, encoder_outputs_mask: torch.LongTensor = None) -> torch.LongTensor: """ Given the input indices for the current timestep of the decoder, and all the encoder outputs, compute the input at the current timestep. Note: This method is agnostic to whether the indices are gold indices or the predictions made by the decoder at the last timestep. So, this can be used even if we're doing some kind of scheduled sampling. If we're not using attention, the output of this method is just an embedding of the input indices. If we are, the output will be a concatenation of the embedding and an attended average of the encoder inputs. Parameters ---------- input_indices : torch.LongTensor Indices of either the gold inputs to the decoder or the predicted labels from the previous timestep.
decoder_hidden_state : torch.LongTensor, optional (not needed if no attention) Output from the decoder at the last time step. Needed only if using attention. encoder_outputs : torch.LongTensor, optional (not needed if no attention) Encoder outputs from all time steps. Needed only if using attention. encoder_outputs_mask : torch.LongTensor, optional (not needed if no attention) Masks on encoder outputs. Needed only if using attention. """ # input_indices : (batch_size,) since we are processing these one timestep at a time. # (batch_size, target_embedding_dim) embedded_input = self._target_embedder(input_indices) if self._attention_function: # encoder_outputs : (batch_size, input_sequence_length, encoder_output_dim) # Ensuring mask is also a FloatTensor. Or else the multiplication within attention will # complain. encoder_outputs_mask = encoder_outputs_mask.float() # (batch_size, input_sequence_length) input_weights = self._decoder_attention(decoder_hidden_state, encoder_outputs, encoder_outputs_mask) # (batch_size, encoder_output_dim) attended_input = weighted_sum(encoder_outputs, input_weights) # (batch_size, encoder_output_dim + target_embedding_dim) return torch.cat((attended_input, embedded_input), -1) else: return embedded_input @staticmethod def _get_loss(logits: torch.LongTensor, targets: torch.LongTensor, target_mask: torch.LongTensor) -> torch.LongTensor: """ Takes logits (unnormalized outputs from the decoder) of size (batch_size, num_decoding_steps, num_classes), target indices of size (batch_size, num_decoding_steps + 1) and corresponding masks of size (batch_size, num_decoding_steps + 1), and computes cross entropy loss while taking the mask into account. The length of ``targets`` is expected to be greater than that of ``logits`` because the decoder does not need to compute the output corresponding to the last timestep of ``targets``. This method aligns the inputs appropriately to compute the loss. During training, we want the logit corresponding to timestep i to be similar to the target token from timestep i + 1. That is, the targets should be shifted by one timestep for appropriate comparison. Consider a single example where the target has 3 words, and padding is to 7 tokens. The complete sequence would correspond to <S> w1 w2 w3 <E> <P> <P> and the mask would be 1 1 1 1 1 0 0 and let the logits be l1 l2 l3 l4 l5 l6 We actually need to compare: the sequence w1 w2 w3 <E> <P> <P> with masks 1 1 1 1 0 0 against l1 l2 l3 l4 l5 l6 (where the input was) <S> w1 w2 w3 <E> <P> """ relevant_targets = targets[:, 1:].contiguous() # (batch_size, num_decoding_steps) relevant_mask = target_mask[:, 1:].contiguous() # (batch_size, num_decoding_steps) loss = sequence_cross_entropy_with_logits(logits, relevant_targets, relevant_mask) return loss @overrides def decode( self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: """ This method overrides ``Model.decode``, which gets called after ``Model.forward``, at test time, to finalize predictions. The logic for the decoder part of the encoder-decoder lives within the ``forward`` method. This method trims the output predictions to the first end symbol, replaces indices with corresponding tokens, and adds a field called ``predicted_tokens`` to the ``output_dict``.
""" predicted_indices = output_dict["predictions"] if not isinstance(predicted_indices, numpy.ndarray): predicted_indices = predicted_indices.data.cpu().numpy() all_predicted_tokens = [] for indices in predicted_indices: indices = list(indices) # Collect indices till the first end_symbol if self._end_index in indices: indices = indices[:indices.index(self._end_index)] predicted_tokens = [ self.vocab.get_token_from_index( x, namespace=self._target_namespace) for x in indices ] all_predicted_tokens.append(predicted_tokens) output_dict["predicted_tokens"] = all_predicted_tokens return output_dict @overrides def get_metrics(self, reset: bool = False) -> Dict[str, float]: accs = { metric_name: metric.get_metric(reset) for metric_name, metric in self.metrics.items() } metric_dict = self.span_metric.get_metric(reset=reset) f1 = {x: y for x, y in metric_dict.items() if "overall" in x} return {**f1, **accs} @classmethod def from_params(cls, vocab, params: Params) -> 'SimpleSeq2Seq': source_embedder_params = params.pop("source_embedder") source_embedder = TextFieldEmbedder.from_params( vocab, source_embedder_params) encoder = Seq2SeqEncoder.from_params(params.pop("encoder")) max_decoding_steps = params.pop("max_decoding_steps") target_namespace = params.pop("target_namespace", "target_tags") # If no attention function is specified, we should not use attention, not attention with # default similarity function. attention_function_type = params.pop("attention_function", None) if attention_function_type is not None: attention_function = SimilarityFunction.from_params( attention_function_type) else: attention_function = None scheduled_sampling_ratio = params.pop_float("scheduled_sampling_ratio", 0.0) initializer = InitializerApplicator.from_params( params.pop('initializer', [])) regularizer = RegularizerApplicator.from_params( params.pop('regularizer', [])) return cls(vocab, source_embedder=source_embedder, encoder=encoder, max_decoding_steps=max_decoding_steps, target_namespace=target_namespace, attention_function=attention_function, scheduled_sampling_ratio=scheduled_sampling_ratio, initializer=initializer, regularizer=regularizer)
def __init__( self, vocab: Vocabulary, source_embedder: TextFieldEmbedder, encoder: Seq2VecEncoder, kg_encoder: Seq2VecEncoder, max_decoding_steps: int = 64, attention: Attention = None, target_namespace: str = "tokens", scheduled_sampling_ratio: float = 0.4, ) -> None: super().__init__(vocab) self._target_namespace = target_namespace self._scheduled_sampling_ratio = scheduled_sampling_ratio self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace) self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace) self.pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace) self.hidden_dim = 300 self._max_decoding_steps = max_decoding_steps # Metrics self.kd_metric = KD_Metric() self.bleu_aver = NLTK_BLEU(ngram_weights=(0.25, 0.25, 0.25, 0.25)) self.bleu1 = NLTK_BLEU(ngram_weights=(1, 0, 0, 0)) self.bleu2 = NLTK_BLEU(ngram_weights=(0, 1, 0, 0)) self.bleu4 = NLTK_BLEU(ngram_weights=(0, 0, 0, 1)) self.topic_acc = Average() self.distinct1 = Distinct1() self.distinct2 = Distinct2() # Modules self._source_embedder = source_embedder num_classes = self.vocab.get_vocab_size(self._target_namespace) target_embedding_dim = source_embedder.get_output_dim() self._target_embedder = Embedding(num_classes, target_embedding_dim) self._encoder = encoder self._kg_encoder = kg_encoder self._encoder_output_dim = self._encoder.get_output_dim() self._decoder_output_dim = self._encoder_output_dim # total_entiy is expected to be a module-level constant giving the number of entities. self._decoder_input_dim = self.hidden_dim * 2 + total_entiy self._attention = None if attention: self._attention = attention self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim self._decoder_cell = LSTMCell(self.hidden_dim * 2, self._decoder_output_dim) self._output_projection_layer = Linear(self.hidden_dim, num_classes) # with open('cy/comp_topic2num.pk', 'rb') as f: with open('fd/word2idx.pk', 'rb') as f: self.word_idx = pickle.load(f) self.vocab_to_idx = {} self.idx_to_vocab_list = [] for word, k in self.word_idx.items(): self.vocab_to_idx[vocab.get_token_index(word.strip())] = k self.idx_to_vocab_list.append(vocab.get_token_index(word.strip())) self.entity_size = total_entiy self.entity_embedding = torch.nn.Parameter( torch.Tensor(self.entity_size, self.hidden_dim)) torch.nn.init.xavier_uniform_(self.entity_embedding, gain=1.414) self.entity_linear = Linear(self.hidden_dim * 2, self.entity_size) self.gen_linear = Linear(self.hidden_dim, 1) self.clac_num = 0
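# Editor's note (speculative sketch, inferred only from the layer shapes above):
# `entity_linear` maps a concatenation of two hidden-size vectors (e.g. the decoder
# state and a knowledge-graph summary) to one score per entity, while `gen_linear`
# gates between ordinary vocabulary words and entity words. Names are illustrative,
# not the model's actual method.
import torch

def entity_distribution(entity_linear: torch.nn.Module,
                        gen_linear: torch.nn.Module,
                        decoder_hidden: torch.Tensor,
                        kg_summary: torch.Tensor):
    # (batch_size, entity_size): one score per known entity.
    entity_scores = entity_linear(torch.cat((decoder_hidden, kg_summary), dim=-1))
    # (batch_size, 1): probability of generating a plain vocabulary word.
    p_vocab = torch.sigmoid(gen_linear(decoder_hidden))
    return torch.softmax(entity_scores, dim=-1), p_vocab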
def __init__(self, vocab: Vocabulary, source_embedder: TextFieldEmbedder, encoder: Seq2SeqEncoder, beam_search: Lazy[BeamSearch] = Lazy(BeamSearch), attention: Attention = None, target_namespace: str = "tokens", target_embedding_dim: int = None, scheduled_sampling_ratio: float = 0.0, use_bleu: bool = True, bleu_ngram_weights: Iterable[float] = (0.25, 0.25, 0.25, 0.25), target_pretrain_file: str = None, target_decoder_layers: int = 1, **kwargs) -> None: super().__init__(vocab) self._target_namespace = target_namespace self._target_decoder_layers = target_decoder_layers self._scheduled_sampling_ratio = scheduled_sampling_ratio # We need the start symbol to provide as the input at the first timestep of decoding, and # end symbol as a way to indicate the end of the decoded sequence. self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace) self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace) if use_bleu: pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace) self._bleu = BLEU( bleu_ngram_weights, exclude_indices={ pad_index, self._end_index, self._start_index }, ) else: self._bleu = None # At prediction time, we'll use a beam search to find the best target sequence. # For backwards compatibility, check if beam_size or max_decoding_steps were passed in as # kwargs. If so, update the BeamSearch object before constructing and raise a DeprecationWarning deprecation_warning = ( "The parameter {} has been deprecated." " Provide this parameter as argument to beam_search instead.") beam_search_extras = {} if "beam_size" in kwargs: beam_search_extras["beam_size"] = kwargs["beam_size"] warnings.warn(deprecation_warning.format("beam_size"), DeprecationWarning) if "max_decoding_steps" in kwargs: beam_search_extras["max_steps"] = kwargs["max_decoding_steps"] warnings.warn(deprecation_warning.format("max_decoding_steps"), DeprecationWarning) self._beam_search = beam_search.construct(end_index=self._end_index, vocab=self.vocab, **beam_search_extras) # Dense embedding of source vocab tokens. self._source_embedder = source_embedder # Encodes the sequence of source embeddings into a sequence of hidden states. self._encoder = encoder num_classes = self.vocab.get_vocab_size(self._target_namespace) # Attention mechanism applied to the encoder output for each step. self._attention = attention # Dense embedding of vocab words in the target space. target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim( ) if not target_pretrain_file: self._target_embedder = Embedding( num_embeddings=num_classes, embedding_dim=target_embedding_dim) else: self._target_embedder = Embedding( embedding_dim=target_embedding_dim, pretrained_file=target_pretrain_file, vocab_namespace=self._target_namespace, vocab=self.vocab, ) # Decoder output dim needs to be the same as the encoder output dim since we initialize the # hidden state of the decoder with the final hidden state of the encoder. self._encoder_output_dim = self._encoder.get_output_dim() self._decoder_output_dim = self._encoder_output_dim if self._attention: # If using attention, a weighted average over encoder outputs will be concatenated # to the previous target embedding to form the input to the decoder at each # time step. self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim else: # Otherwise, the input to the decoder is just the previous target embedding. 
self._decoder_input_dim = target_embedding_dim # We'll use an LSTM cell as the recurrent cell that produces a hidden state # for the decoder at each time step. # TODO (pradeep): Do not hardcode decoder cell type. if self._target_decoder_layers > 1: self._decoder_cell = LSTM( self._decoder_input_dim, self._decoder_output_dim, self._target_decoder_layers, ) else: self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim) # We project the hidden state from the decoder into the output vocabulary space # in order to get log probabilities of each target token, at each time step. self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
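# Editor's note (sketch): one decoding step must treat the stacked nn.LSTM above
# differently from an LSTMCell: nn.LSTM expects a singleton time dimension and state
# tensors with a leading num_layers dimension. `decoder_step` is a hypothetical helper.
import torch

def decoder_step(decoder_cell, decoder_input: torch.Tensor, state, multi_layer: bool):
    if multi_layer:
        # nn.LSTM: input is (seq_len=1, batch_size, input_dim); state is a pair of
        # (num_layers, batch_size, hidden_dim) tensors.
        output, state = decoder_cell(decoder_input.unsqueeze(0), state)
        return output.squeeze(0), state
    # LSTMCell: input is (batch_size, input_dim); state is a (hidden, context) pair
    # of (batch_size, hidden_dim) tensors.
    hidden, context = decoder_cell(decoder_input, state)
    return hidden, (hidden, context)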
def __init__( self, vocab: Vocabulary, source_embedder: TextFieldEmbedder, encoder: Seq2SeqEncoder, max_decoding_steps: int, attention: Attention = None, beam_size: int = None, target_namespace: str = "tokens", target_embedding_dim: int = None, scheduled_sampling_ratio: float = 0.0, use_bleu: bool = True, bleu_ngram_weights: Iterable[float] = (0.25, 0.25, 0.25, 0.25), target_pretrain_file: str = None, target_decoder_layers: int = 1, ) -> None: super().__init__(vocab) self._target_namespace = target_namespace self._target_decoder_layers = target_decoder_layers self._scheduled_sampling_ratio = scheduled_sampling_ratio # We need the start symbol to provide as the input at the first timestep of decoding, and # end symbol as a way to indicate the end of the decoded sequence. self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace) self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace) if use_bleu: pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace) self._bleu = BLEU( bleu_ngram_weights, exclude_indices={ pad_index, self._end_index, self._start_index }, ) else: self._bleu = None # At prediction time, we use a beam search to find the most likely sequence of target tokens. beam_size = beam_size or 1 self._max_decoding_steps = max_decoding_steps self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size) # Dense embedding of source vocab tokens. self._source_embedder = source_embedder # Encodes the sequence of source embeddings into a sequence of hidden states. self._encoder = encoder num_classes = self.vocab.get_vocab_size(self._target_namespace) # Attention mechanism applied to the encoder output for each step. self._attention = attention # Dense embedding of vocab words in the target space. target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim( ) if not target_pretrain_file: self._target_embedder = Embedding( num_embeddings=num_classes, embedding_dim=target_embedding_dim) else: self._target_embedder = Embedding( embedding_dim=target_embedding_dim, pretrained_file=target_pretrain_file, vocab_namespace=self._target_namespace, vocab=self.vocab, ) # Decoder output dim needs to be the same as the encoder output dim since we initialize the # hidden state of the decoder with the final hidden state of the encoder. self._encoder_output_dim = self._encoder.get_output_dim() self._decoder_output_dim = self._encoder_output_dim if self._attention: # If using attention, a weighted average over encoder outputs will be concatenated # to the previous target embedding to form the input to the decoder at each # time step. self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim else: # Otherwise, the input to the decoder is just the previous target embedding. self._decoder_input_dim = target_embedding_dim # We'll use an LSTM cell as the recurrent cell that produces a hidden state # for the decoder at each time step. # TODO (pradeep): Do not hardcode decoder cell type. if self._target_decoder_layers > 1: self._decoder_cell = LSTM( self._decoder_input_dim, self._decoder_output_dim, self._target_decoder_layers, ) else: self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim) # We project the hidden state from the decoder into the output vocabulary space # in order to get log probabilities of each target token, at each time step. self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
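# Editor's note (sketch): how the BLEU metric configured above is typically updated and
# reported in AllenNLP seq2seq models. The `output_dict` keys follow the usual layout
# and are assumptions here, not guaranteed by the code above.
#
#     # In forward(), after beam search, score the best beam against the gold targets:
#     if self._bleu and target_tokens:
#         best_predictions = output_dict["predictions"][:, 0, :]
#         self._bleu(best_predictions, target_tokens["tokens"])
#
#     # In get_metrics(), read the accumulated score:
#     def get_metrics(self, reset: bool = False) -> Dict[str, float]:
#         return self._bleu.get_metric(reset=reset) if self._bleu else {}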
def __init__( self, vocab: Vocabulary, attention: Attention, beam_size: int, max_decoding_steps: int, target_embedding_dim: int = 30, copy_token: str = "@COPY@", source_namespace: str = "bert", target_namespace: str = "target_tokens", tensor_based_metric: Metric = None, token_based_metric: Metric = None, initializer: InitializerApplicator = InitializerApplicator(), ) -> None: super().__init__(vocab) self._source_namespace = source_namespace self._target_namespace = target_namespace self._src_start_index = self.vocab.get_token_index( START_SYMBOL, self._source_namespace) self._src_end_index = self.vocab.get_token_index( END_SYMBOL, self._source_namespace) self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace) self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace) self._oov_index = self.vocab.get_token_index(self.vocab._oov_token, self._target_namespace) self._pad_index = self.vocab.get_token_index(self.vocab._padding_token, self._target_namespace) self._copy_index = self.vocab.add_token_to_namespace( copy_token, self._target_namespace) self._tensor_based_metric = tensor_based_metric or BLEU( exclude_indices={ self._pad_index, self._end_index, self._start_index }) self._token_based_metric = token_based_metric self._target_vocab_size = self.vocab.get_vocab_size( self._target_namespace) # Encoding modules. bert_token_embedding = PretrainedBertEmbedder('bert-base-uncased', requires_grad=True) self._source_embedder = bert_token_embedding self._encoder = PassThroughEncoder( input_dim=self._source_embedder.get_output_dim()) # Decoder output dim needs to be the same as the encoder output dim since we initialize the # hidden state of the decoder with the final hidden state of the encoder. # We arbitrarily set the decoder's input dimension to be the same as the output dimension. self.encoder_output_dim = self._encoder.get_output_dim() self.decoder_output_dim = self.encoder_output_dim self.decoder_input_dim = self.decoder_output_dim target_vocab_size = self.vocab.get_vocab_size(self._target_namespace) # The decoder input will be a function of the embedding of the previous predicted token, # an attended encoder hidden state called the "attentive read", and another # weighted sum of the encoder hidden state called the "selective read". # While the weights for the attentive read are calculated by an `Attention` module, # the weights for the selective read are simply the predicted probabilities # corresponding to each token in the source sentence that matches the target # token from the previous timestep. self._target_embedder = Embedding(target_vocab_size, target_embedding_dim) self._attention = attention self._input_projection_layer = Linear( target_embedding_dim + self.encoder_output_dim * 2, self.decoder_input_dim) # We then run the projected decoder input through an LSTM cell to produce # the next hidden state. self._decoder_cell = LSTMCell(self.decoder_input_dim, self.decoder_output_dim) # We create a "generation" score for each token in the target vocab # with a linear projection of the decoder hidden state. self._output_generation_layer = Linear(self.decoder_output_dim, target_vocab_size) # We create a "copying" score for each source token by applying a non-linearity # (tanh) to a linear projection of the encoded hidden state for that token, # and then taking the dot product of the result with the decoder hidden state. 
self._output_copying_layer = Linear(self.encoder_output_dim, self.decoder_output_dim) # At prediction time, we'll use a beam search to find the best target sequence. self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size) initializer(self)
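# Editor's note (sketch): the "selective read" described in the comments above. Its
# weights are last step's copy probabilities, restricted to source positions whose
# token matches the previously emitted token, renormalized, and used to average the
# encoder states. Names are illustrative.
import torch

def selective_read(last_predictions: torch.LongTensor,
                   source_token_ids: torch.LongTensor,
                   copy_probs: torch.Tensor,
                   encoder_outputs: torch.Tensor) -> torch.Tensor:
    # (batch_size, source_len): 1 where the source token equals last step's output.
    matches = (source_token_ids == last_predictions.unsqueeze(-1)).float()
    weights = copy_probs * matches
    weights = weights / weights.sum(dim=-1, keepdim=True).clamp(min=1e-13)
    # (batch_size, encoder_output_dim): weighted average of encoder hidden states.
    return weights.unsqueeze(1).bmm(encoder_outputs).squeeze(1)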
def __init__(self, vocab: Vocabulary, source_embedder: TextFieldEmbedder, encoder: Seq2SeqEncoder, attention: Attention, max_decoding_steps: int, beam_size: int = None, target_namespace: str = "tokens", target_embedding_dim: int = None, scheduled_sampling_ratio: float = 0., projection_dim: int = None, use_coverage: bool = False, coverage_loss_weight: float = None) -> None: super(PointerGeneratorNetwork, self).__init__(vocab) self._target_namespace = target_namespace self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace) self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace) self._source_unk_index = self.vocab.get_token_index(DEFAULT_OOV_TOKEN) self._target_unk_index = self.vocab.get_token_index( DEFAULT_OOV_TOKEN, self._target_namespace) self._source_vocab_size = self.vocab.get_vocab_size() self._target_vocab_size = self.vocab.get_vocab_size( self._target_namespace) # Encoder self._source_embedder = source_embedder self._encoder = encoder self._encoder_output_dim = self._encoder.get_output_dim() # Decoder self._target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim( ) self._num_classes = self.vocab.get_vocab_size(self._target_namespace) self._target_embedder = Embedding(self._num_classes, self._target_embedding_dim) self._decoder_input_dim = self._encoder_output_dim + self._target_embedding_dim self._decoder_output_dim = self._encoder_output_dim self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim) self._projection_dim = projection_dim or self._source_embedder.get_output_dim( ) self._hidden_projection_layer = Linear(self._decoder_output_dim, self._projection_dim) self._output_projection_layer = Linear(self._projection_dim, self._num_classes) self._p_gen_layer = Linear( self._decoder_output_dim * 3 + self._decoder_input_dim, 1) self._attention = attention self._use_coverage = use_coverage self._coverage_loss_weight = coverage_loss_weight self._eps = 1e-31 # Decoding self._scheduled_sampling_ratio = scheduled_sampling_ratio self._max_decoding_steps = max_decoding_steps self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size or 1)
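# Editor's note (sketch): the coverage mechanism implied by `use_coverage` and
# `coverage_loss_weight` above (See et al., 2017). Coverage is the running sum of past
# attention distributions; the loss penalizes re-attending to already-covered positions.
import torch

def coverage_step(attn_dist: torch.Tensor, coverage: torch.Tensor):
    # (batch_size,): per-step coverage loss, sum_i min(a_t_i, c_t_i).
    step_loss = torch.min(attn_dist, coverage).sum(dim=-1)
    # Coverage for the next step accumulates this step's attention.
    return step_loss, coverage + attn_dist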