def rnn_decoder(decoder_params):
    decoder_embedding_layer = DropoutEmbeddings(
        ntokens=decoder_params.ntokens,
        emb_size=decoder_params.emb_size,
    )
    if decoder_params.attention:
        # the attention decoder needs double the input size to accommodate
        # the concatenation of the embedding with the attention context
        decoder_rnn = RNNLayers(input_size=decoder_params.emb_size * 2,
                                output_size=decoder_params.emb_size,
                                nhid=decoder_params.nhid,
                                bidir=False,
                                nlayers=decoder_params.nlayers,
                                cell_type="gru")
        projection_layer = AttentionProjection(output_size=decoder_params.ntokens,
                                               input_size=decoder_params.emb_size,
                                               att_nhid=decoder_params.att_hid,
                                               tie_encoder=None,
                                               dropout=0.0)
        decoder = AttentionDecoder(decoder_layer=decoder_rnn,
                                   embedding_layer=decoder_embedding_layer,
                                   projection_layer=projection_layer,
                                   pad_token=1,
                                   eos_token=2,
                                   max_tokens=decoder_params.max_tokens)
    else:
        decoder_rnn = RNNLayers(input_size=decoder_params.emb_size,
                                output_size=decoder_params.emb_size,
                                nhid=decoder_params.nhid,
                                bidir=False,
                                nlayers=decoder_params.nlayers,
                                cell_type="gru")
        projection_layer = Projection(output_size=decoder_params.ntokens,
                                      input_size=decoder_params.emb_size,
                                      dropout=0.0,
                                      tie_encoder=None)
        decoder = Decoder(
            decoder_layer=decoder_rnn,
            projection_layer=projection_layer,
            embedding_layer=decoder_embedding_layer,
            pad_token=0,
            eos_token=1,
            max_tokens=decoder_params.max_tokens,
        )
    decoder = to_gpu(decoder)
    decoder.reset(decoder_params.batch_size)
    return decoder, decoder_params
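rnn_decoder only reads attributes off decoder_params, so any simple namespace object works. A minimal, hypothetical usage sketch follows; the field names are inferred from the attribute accesses above, the values are illustrative, and SimpleNamespace is not part of the original code:

# Hypothetical usage sketch; field names inferred from rnn_decoder above.
from types import SimpleNamespace

decoder_params = SimpleNamespace(
    ntokens=10000,   # vocabulary size
    emb_size=300,    # embedding dimension
    nhid=512,        # RNN hidden size
    nlayers=2,       # number of RNN layers
    attention=True,  # build an AttentionDecoder instead of a plain Decoder
    att_hid=128,     # attention hidden size (only used when attention=True)
    max_tokens=50,   # decoding step limit
    batch_size=32,   # used to reset the decoder state
)

decoder, decoder_params = rnn_decoder(decoder_params)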
def __init__(self, ntoken: HParam, emb_sz: HParam, nhid: HParam, nlayers: HParam,
             att_nhid: int, pad_token: int, eos_token: int, max_tokens: int = 50,
             share_embedding_layer: bool = False, tie_decoder: bool = True,
             bidir: bool = False, **kwargs):
    """
    Args:
        ntoken (Union[List[int], int]): Number of tokens for the encoder and the decoder
        emb_sz (Union[List[int], int]): Embedding size for the encoder and decoder embeddings
        nhid (Union[List[int], int]): Number of hidden dims for the encoder and the decoder
        nlayers (Union[List[int], int]): Number of layers for the encoder and the decoder
        att_nhid (int): Number of hidden dims for the attention module
        pad_token (int): The index of the token used for padding
        eos_token (int): The index of the token used for eos
        max_tokens (int): The maximum number of steps the decoder iterates before stopping
        share_embedding_layer (bool): if True the encoder and the decoder share the same embedding layer
        tie_decoder (bool): if True the decoder projection weights are tied to the decoder embeddings
        bidir (bool): if True use a bidirectional encoder
        **kwargs: Extra keyword arguments passed to the encoder and the decoder
    """
    super().__init__()
    # allow for the same or different parameters between encoder and decoder
    ntoken, emb_sz, nhid, nlayers = get_list(ntoken, 2), get_list(emb_sz, 2), \
        get_list(nhid, 2), get_list(nlayers, 2)
    dropoutd = get_kwarg(kwargs, name="dropoutd", default_value=0.5)  # output dropout
    dropoute = get_kwarg(kwargs, name="dropout_e", default_value=0.1)  # encoder embedding dropout
    dropoute = get_list(dropoute, 2)
    dropouti = get_kwarg(kwargs, name="dropout_i", default_value=0.65)  # input dropout
    dropouti = get_list(dropouti, 2)
    dropouth = get_kwarg(kwargs, name="dropout_h", default_value=0.3)  # RNN output layers dropout
    dropouth = get_list(dropouth, 2)
    wdrop = get_kwarg(kwargs, name="wdrop", default_value=0.5)  # RNN weights dropout
    wdrop = get_list(wdrop, 2)
    cell_type = get_kwarg(kwargs, name="cell_type", default_value="lstm")
    self.nlayers = nlayers
    self.nhid = nhid
    self.emb_sz = emb_sz
    self.pr_force = 1.0

    encoder_embedding_layer = DropoutEmbeddings(ntokens=ntoken[0],
                                                emb_size=emb_sz[0],
                                                dropoute=dropoute[0],
                                                dropouti=dropouti[0])
    encoder_rnn = RNNLayers(
        input_size=emb_sz[0],
        output_size=kwargs.get("output_size", emb_sz[0]),
        nhid=nhid[0],
        bidir=bidir,
        dropouth=dropouth[0],
        wdrop=wdrop[0],
        nlayers=nlayers[0],
        cell_type=cell_type,
    )
    self.encoder = Encoder(embedding_layer=encoder_embedding_layer,
                           encoder_layer=encoder_rnn)

    if share_embedding_layer:
        decoder_embedding_layer = encoder_embedding_layer
    else:
        decoder_embedding_layer = DropoutEmbeddings(ntokens=ntoken[-1],
                                                    emb_size=emb_sz[-1],
                                                    dropoute=dropoute[1],
                                                    dropouti=dropouti[1])
    # the attention decoder consumes the embedding concatenated with the
    # attention context, hence the doubled default input size
    decoder_rnn = RNNLayers(input_size=kwargs.get("input_size", emb_sz[-1] * 2),
                            output_size=kwargs.get("output_size", emb_sz[-1]),
                            nhid=nhid[-1],
                            bidir=False,
                            dropouth=dropouth[1],
                            wdrop=wdrop[1],
                            nlayers=nlayers[-1],
                            cell_type=cell_type)
    projection_layer = AttentionProjection(
        output_size=ntoken[-1],
        input_size=emb_sz[-1],
        dropout=dropoutd,
        att_nhid=att_nhid,
        tie_encoder=decoder_embedding_layer if tie_decoder else None)
    self.decoder = AttentionDecoder(
        decoder_layer=decoder_rnn,
        projection_layer=projection_layer,
        embedding_layer=decoder_embedding_layer,
        pad_token=pad_token,
        eos_token=eos_token,
        max_tokens=max_tokens,
    )
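The constructors in this section lean on the get_list and get_kwarg helpers to accept either one hyperparameter value or one value per sub-module. Their implementations are not shown in this section; the following is a plausible sketch inferred purely from the call sites (and from train_init being fetched with kwargs.pop elsewhere), not the repo's actual code:

# Plausible sketches of the helpers used above, inferred from their call sites;
# the real implementations may differ.
def get_list(value, n=1):
    # Broadcast a scalar hyperparameter to a list of n values, or pass a
    # list/tuple through so e.g. encoder and decoder can use different values.
    return list(value) if isinstance(value, (list, tuple)) else [value] * n


def get_kwarg(kwargs, name, default_value=None):
    # Pop a named hyperparameter out of **kwargs with a default, so the
    # remaining kwargs can be forwarded without duplication.
    return kwargs.pop(name, default_value)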
class HREDEncoder(nn.Module):
    def __init__(self, ntoken: int, emb_sz: int, nhid: HParam, nlayers: int,
                 bidir: bool = False, cell_type="gru", **kwargs):
        super().__init__()
        # allow for the same or different parameters between the query-level
        # and the session-level encoder
        nhid = get_list(nhid, 2)
        dropoute = get_kwarg(kwargs, name="dropout_e", default_value=0.1)  # encoder embedding dropout
        dropoute = get_list(dropoute, 2)
        dropouti = get_kwarg(kwargs, name="dropout_i", default_value=0.65)  # input dropout
        dropouti = get_list(dropouti, 2)
        dropouth = get_kwarg(kwargs, name="dropout_h", default_value=0.3)  # RNN output layers dropout
        dropouth = get_list(dropouth, 2)
        wdrop = get_kwarg(kwargs, name="wdrop", default_value=0.5)  # RNN weights dropout
        wdrop = get_list(wdrop, 2)
        train_init = get_kwarg(kwargs, name="train_init", default_value=False)
        dropoutinit = get_kwarg(kwargs, name="dropout_init", default_value=0.1)  # RNN initial states dropout
        dropoutinit = get_list(dropoutinit, 2)
        self.cell_type = cell_type
        self.nt = ntoken
        self.bidir = bidir

        encoder_embedding_layer = DropoutEmbeddings(ntokens=ntoken,
                                                    emb_size=emb_sz,
                                                    dropoute=dropoute[0],
                                                    dropouti=dropouti[0])
        encoder_rnn = RNNLayers(input_size=emb_sz,
                                output_size=kwargs.get("output_size_encoder", emb_sz),
                                nhid=nhid[0],
                                bidir=bidir,
                                dropouth=dropouth[0],
                                wdrop=wdrop[0],
                                nlayers=nlayers,
                                cell_type=self.cell_type,
                                train_init=train_init,
                                dropoutinit=dropoutinit[0])
        self.query_encoder = Encoder(embedding_layer=encoder_embedding_layer,
                                     encoder_layer=encoder_rnn)
        self.se_enc = RNNLayers(cell_type=self.cell_type,
                                input_size=encoder_rnn.output_size,
                                output_size=nhid[1],
                                nhid=nhid[1],
                                nlayers=1,
                                dropouth=dropouth[1],
                                wdrop=wdrop[1],
                                train_init=train_init,
                                dropoutinit=dropoutinit[1])

    def forward(self, inputs):
        query_encoder_outputs = self.query_level_encoding(inputs)
        outputs = self.se_enc(query_encoder_outputs)
        last_output = self.se_enc.hidden[-1]
        return outputs, last_output

    def reset(self, bs):
        self.query_encoder.reset(bs)
        self.se_enc.reset(bs)

    def query_level_encoding(self, encoder_inputs):
        query_encoder_outputs = []
        for index, context in enumerate(encoder_inputs):
            self.query_encoder.reset(bs=encoder_inputs.size(2))
            state = self.query_encoder.hidden
            outputs = self.query_encoder(context, state)  # context has size [sl, bs]
            out = concat_bidir_state(
                self.query_encoder.encoder_layer.get_last_hidden_state(),
                cell_type=self.cell_type,
                nlayers=1,
                bidir=self.query_encoder.encoder_layer.bidir)
            # keep the last output of the query_encoder for every utterance
            query_encoder_outputs.append(out)
            # BPTT: if the dialogue is too long, repackage the first half of the
            # outputs to truncate gradient backpropagation and fit into memory
            # out = repackage_var(outputs[-1][-1]) \
            #     if max_sl * num_utterances > self.BPTT_MAX_UTTERANCES and index <= num_utterances // 2 \
            #     else outputs[-1][-1]
        query_encoder_outputs = torch.cat(query_encoder_outputs, dim=0)  # [cl, bs, nhid]
        return query_encoder_outputs

    @property
    def embedding_layer(self):
        return self.query_encoder.embedding_layer

    @property
    def output_size(self):
        return self.se_enc.output_size

    @property
    def query_encoder_layer(self):
        return self.query_encoder.encoder_layer

    @property
    def session_encoder_layer(self):
        return self.se_enc
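A minimal sketch of driving HREDEncoder, assuming a dialogue batch shaped [num_utterances, sl, bs] as implied by query_level_encoding (the loop walks over utterances and reads the batch size from dim 2). All sizes are illustrative, and the V/T wrappers are the same fastai-era helpers used in the test at the end of this section:

# Hypothetical usage; sizes illustrative, input shape inferred from
# query_level_encoding above.
import numpy as np

encoder = HREDEncoder(ntoken=10000, emb_sz=300, nhid=[512, 512], nlayers=1)
encoder.reset(bs=32)

# a dialogue of 5 utterances, each an [sl, bs] = [20, 32] tensor of token ids
dialogue = V(T(np.random.randint(0, 10000, (5, 20, 32))))
session_outputs, session_state = encoder(dialogue)  # session_state: last se_enc hidden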
class HRED(nn.Module):
    """Basic HRED model

    paper: A Hierarchical Latent Variable Encoder-Decoder Model for Generating Dialogues.
        Iulian Vlad Serban et al. 2016a.
    github: https://github.com/julianser/hed-dlg-truncated
    arxiv: http://arxiv.org/abs/1605.06069
    """

    BPTT_MAX_UTTERANCES = 1000

    def __init__(self, ntoken: int, emb_sz: HParam, nhid: HParam, nlayers: HParam,
                 pad_token: int, eos_token: int, max_tokens: int = 50,
                 share_embedding_layer: bool = False, tie_decoder: bool = True,
                 bidir: bool = False, session_constraint: bool = False, **kwargs):
        """
        Args:
            ntoken (int): Number of tokens for the encoder and the decoder
            emb_sz (Union[List[int], int]): Embedding size for the encoder and decoder embeddings
            nhid (Union[List[int], int]): Number of hidden dims for the encoder (first two values) and the decoder
            nlayers (Union[List[int], int]): Number of layers for the encoder and the decoder
            pad_token (int): The index of the token used for padding
            eos_token (int): The index of the token used for eos
            max_tokens (int): The maximum number of steps the decoder iterates before stopping
            share_embedding_layer (bool): if True the encoder and the decoder share the same embedding layer
            tie_decoder (bool): if True the decoder projection weights are tied to the decoder embeddings
            bidir (bool): if True use a bidirectional encoder
            session_constraint (bool): if True the session state is concatenated to the decoder input as a constraint
            **kwargs: Extra keyword arguments passed to the encoder and the decoder
        """
        super().__init__()
        # allow for the same or different parameters between encoder and decoder
        ntoken, emb_sz, nhid, nlayers = get_list(ntoken), get_list(emb_sz, 2), \
            get_list(nhid, 3), get_list(nlayers, 3)
        dropoutd = get_kwarg(kwargs, name="dropout_d", default_value=0.5)  # output dropout
        dropoute = get_kwarg(kwargs, name="dropout_e", default_value=0.1)  # encoder embedding dropout
        dropoute = get_list(dropoute, 2)
        dropouti = get_kwarg(kwargs, name="dropout_i", default_value=0.65)  # input dropout
        dropouti = get_list(dropouti, 2)
        dropouth = get_kwarg(kwargs, name="dropout_h", default_value=0.3)  # RNN output layers dropout
        dropouth = get_list(dropouth, 3)
        wdrop = get_kwarg(kwargs, name="wdrop", default_value=0.5)  # RNN weights dropout
        wdrop = get_list(wdrop, 3)
        train_init = kwargs.pop("train_init", False)  # trainable initial states for the RNNs
        dropoutinit = get_kwarg(kwargs, name="dropout_init", default_value=0.1)  # RNN initial states dropout
        dropoutinit = get_list(dropoutinit, 3)
        self.cell_type = "gru"
        self.nt = ntoken[-1]
        self.pr_force = 1.0

        encoder_embedding_layer = DropoutEmbeddings(ntokens=ntoken[0],
                                                    emb_size=emb_sz[0],
                                                    dropoute=dropoute[0],
                                                    dropouti=dropouti[0])
        encoder_rnn = RNNLayers(input_size=emb_sz[0],
                                output_size=kwargs.get("output_size_encoder", emb_sz[0]),
                                nhid=nhid[0],
                                bidir=bidir,
                                dropouth=dropouth[0],
                                wdrop=wdrop[0],
                                nlayers=nlayers[0],
                                cell_type=self.cell_type,
                                train_init=train_init,
                                dropoutinit=dropoutinit[0])
        self.query_encoder = Encoder(embedding_layer=encoder_embedding_layer,
                                     encoder_layer=encoder_rnn)
        self.se_enc = RNNLayers(cell_type=self.cell_type,
                                input_size=encoder_rnn.output_size,
                                output_size=nhid[1],
                                nhid=nhid[1],
                                nlayers=1,
                                dropouth=dropouth[1],
                                wdrop=wdrop[1],
                                train_init=train_init,
                                dropoutinit=dropoutinit[1])
        if share_embedding_layer:
            decoder_embedding_layer = encoder_embedding_layer
        else:
            decoder_embedding_layer = DropoutEmbeddings(ntokens=ntoken[0],
                                                        emb_size=emb_sz[1],
                                                        dropoute=dropoute[1],
                                                        dropouti=dropouti[1])
        input_size_decoder = kwargs.get("input_size_decoder", emb_sz[1])
        input_size_decoder = input_size_decoder + self.se_enc.output_size \
            if session_constraint else input_size_decoder
        decoder_rnn = RNNLayers(input_size=input_size_decoder,
                                output_size=kwargs.get("output_size_decoder", emb_sz[1]),
                                nhid=nhid[2],
                                bidir=False,
                                dropouth=dropouth[2],
                                wdrop=wdrop[2],
                                nlayers=nlayers[2],
                                cell_type=self.cell_type,
                                train_init=train_init,
                                dropoutinit=dropoutinit[2])
        self.session_constraint = session_constraint
        # allow for changing sizes of decoder output
        input_size = decoder_rnn.output_size
        nhid = emb_sz[1] if input_size != emb_sz[1] else None
        projection_layer = Projection(output_size=ntoken[0],
                                      input_size=input_size,
                                      nhid=nhid,
                                      dropout=dropoutd,
                                      tie_encoder=decoder_embedding_layer if tie_decoder else None)
        self.decoder = Decoder(
            decoder_layer=decoder_rnn,
            projection_layer=projection_layer,
            embedding_layer=decoder_embedding_layer,
            pad_token=pad_token,
            eos_token=eos_token,
            max_tokens=max_tokens,
        )
        self.decoder_state_linear = nn.Linear(in_features=self.se_enc.output_size,
                                              out_features=self.decoder.layers[0].output_size)

    def forward(self, *inputs, num_beams=0):
        # dims: [num_utterances, sl, bs] for the encoder, [sl, bs] for the decoder
        encoder_inputs, decoder_inputs = assert_dims(inputs, [2, None, None])
        # reset the states for the new batch
        num_utterances, max_sl, bs = encoder_inputs.size()
        self.reset_encoders(bs)
        query_encoder_outputs = self.query_level_encoding(encoder_inputs)
        outputs = self.se_enc(query_encoder_outputs)
        last_output = self.se_enc.hidden[-1]
        state = self.decoder.hidden
        # tanh seems to deteriorate performance, so it is not used
        state[0] = self.decoder_state_linear(last_output)  # .tanh()
        constraints = last_output if self.session_constraint else None  # dims [1, bs, ed]
        outputs_dec, predictions = self.decoding(decoder_inputs, num_beams, state,
                                                 constraints=constraints)
        return predictions, [*outputs, *outputs_dec]

    def reset_encoders(self, bs):
        self.query_encoder.reset(bs)
        self.se_enc.reset(bs)
        self.decoder.reset(bs)

    def decoding(self, decoder_inputs, num_beams, state, constraints=None):
        if self.training:
            self.decoder.pr_force = self.pr_force
            nb = 1 if self.pr_force < 1 else 0
        else:
            nb = num_beams
        outputs_dec = self.decoder(decoder_inputs, hidden=state, num_beams=nb,
                                   constraints=constraints)
        predictions = outputs_dec[-1][:decoder_inputs.size(0)] \
            if num_beams == 0 else self.decoder.beam_outputs
        return outputs_dec, predictions

    def query_level_encoding(self, encoder_inputs):
        query_encoder_outputs = []
        for index, context in enumerate(encoder_inputs):
            self.query_encoder.reset(bs=encoder_inputs.size(2))
            state = self.query_encoder.hidden
            outputs = self.query_encoder(context, state)  # context has size [sl, bs]
            out = concat_bidir_state(self.query_encoder.encoder_layer.hidden[-1],
                                     cell_type=self.cell_type,
                                     nlayers=1,
                                     bidir=self.query_encoder.encoder_layer.bidir)
            # keep the last output of the query_encoder for every utterance
            query_encoder_outputs.append(out)
            # BPTT: if the dialogue is too long, repackage the first half of the
            # outputs to truncate gradient backpropagation and fit into memory
            # out = repackage_var(outputs[-1][-1]) \
            #     if max_sl * num_utterances > self.BPTT_MAX_UTTERANCES and index <= num_utterances // 2 \
            #     else outputs[-1][-1]
        query_encoder_outputs = torch.cat(query_encoder_outputs, dim=0)  # [cl, bs, nhid]
        return query_encoder_outputs
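A matching sketch for the full HRED model. forward unpacks exactly two inputs (assert_dims(inputs, [2, None, None])): a [num_utterances, sl, bs] dialogue tensor for the encoder and an [sl, bs] tensor for the decoder. The hyperparameter values below are illustrative only:

# Hypothetical usage; shapes inferred from HRED.forward above.
import numpy as np

model = HRED(ntoken=10000, emb_sz=300, nhid=[512, 512, 512], nlayers=[1, 1, 1],
             pad_token=1, eos_token=2, max_tokens=50)

encoder_inputs = V(T(np.random.randint(0, 10000, (5, 20, 32))))  # [num_utterances, sl, bs]
decoder_inputs = V(T(np.random.randint(0, 10000, (15, 32))))     # [sl, bs]
predictions, outputs = model(encoder_inputs, decoder_inputs)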
class HREDAttention(nn.Module):
    """HRED model with an attention decoder

    paper: A Hierarchical Latent Variable Encoder-Decoder Model for Generating Dialogues.
        Iulian Vlad Serban et al. 2016a.
    github: https://github.com/julianser/hed-dlg-truncated
    arxiv: http://arxiv.org/abs/1605.06069
    """

    BPTT_MAX_UTTERANCES = 1000

    def __init__(self, ntoken: int, emb_sz: HParam, nhid: HParam, nlayers: HParam,
                 att_nhid: int, pad_token: int, eos_token: int, max_tokens: int = 50,
                 share_embedding_layer: bool = False, tie_decoder: bool = True,
                 bidir: bool = False, **kwargs):
        """
        Args:
            ntoken (int): Number of tokens for the encoder and the decoder
            emb_sz (Union[List[int], int]): Embedding size for the encoder and decoder embeddings
            nhid (Union[List[int], int]): Number of hidden dims for the encoder (first two values) and the decoder
            nlayers (Union[List[int], int]): Number of layers for the encoder and the decoder
            att_nhid (int): Number of hidden dims for the attention module
            pad_token (int): The index of the token used for padding
            eos_token (int): The index of the token used for eos
            max_tokens (int): The maximum number of steps the decoder iterates before stopping
            share_embedding_layer (bool): if True the encoder and the decoder share the same embedding layer
            tie_decoder (bool): if True the decoder projection weights are tied to the decoder embeddings
            bidir (bool): if True use a bidirectional encoder
            **kwargs: Extra keyword arguments passed to the encoder and the decoder
        """
        super().__init__()
        # allow for the same or different parameters between encoder and decoder
        ntoken, emb_sz, nhid, nlayers = get_list(ntoken), get_list(emb_sz, 2), \
            get_list(nhid, 3), get_list(nlayers, 3)
        dropoutd = get_kwarg(kwargs, name="dropoutd", default_value=0.5)  # output dropout
        dropoute = get_kwarg(kwargs, name="dropout_e", default_value=0.1)  # encoder embedding dropout
        dropoute = get_list(dropoute, 2)
        dropouti = get_kwarg(kwargs, name="dropout_i", default_value=0.65)  # input dropout
        dropouti = get_list(dropouti, 2)
        dropouth = get_kwarg(kwargs, name="dropout_h", default_value=0.3)  # RNN output layers dropout
        dropouth = get_list(dropouth, 3)
        wdrop = get_kwarg(kwargs, name="wdrop", default_value=0.5)  # RNN weights dropout
        wdrop = get_list(wdrop, 3)
        self.cell_type = "gru"
        self.nt = ntoken[-1]
        self.pr_force = 1.0
        self.nlayers = nlayers

        encoder_embedding_layer = DropoutEmbeddings(ntokens=ntoken[0],
                                                    emb_size=emb_sz[0],
                                                    dropoute=dropoute[0],
                                                    dropouti=dropouti[0])
        encoder_rnn = RNNLayers(
            input_size=emb_sz[0],
            output_size=kwargs.get("output_size_encoder", emb_sz[0]),
            nhid=nhid[0],
            bidir=bidir,
            dropouth=dropouth[0],
            wdrop=wdrop[0],
            nlayers=nlayers[0],
            cell_type=self.cell_type,
        )
        self.query_encoder = Encoder(embedding_layer=encoder_embedding_layer,
                                     encoder_layer=encoder_rnn)
        self.session_encoder = RNNLayers(
            input_size=encoder_rnn.output_size,
            nhid=nhid[1],
            output_size=kwargs.get("output_size", emb_sz[0]),
            nlayers=1,
            bidir=False,
            cell_type=self.cell_type,
            wdrop=wdrop[1],
            dropouth=dropouth[1],
        )
        if share_embedding_layer:
            decoder_embedding_layer = encoder_embedding_layer
        else:
            decoder_embedding_layer = DropoutEmbeddings(ntokens=ntoken[-1],
                                                        emb_size=emb_sz[-1],
                                                        dropoute=dropoute[1],
                                                        dropouti=dropouti[1])
        decoder_rnn = RNNLayers(input_size=kwargs.get("input_size", emb_sz[-1] * 2),
                                output_size=kwargs.get("output_size", emb_sz[-1]),
                                nhid=nhid[-1],
                                bidir=False,
                                dropouth=dropouth[2],
                                wdrop=wdrop[2],
                                nlayers=nlayers[-1],
                                cell_type=self.cell_type)
        projection_layer = AttentionProjection(
            output_size=ntoken[-1],
            input_size=emb_sz[-1],
            dropout=dropoutd,
            att_nhid=att_nhid,
            att_type="SDP",
            tie_encoder=decoder_embedding_layer if tie_decoder else None)
        self.decoder = AttentionDecoder(
            decoder_layer=decoder_rnn,
            projection_layer=projection_layer,
            embedding_layer=decoder_embedding_layer,
            pad_token=pad_token,
            eos_token=eos_token,
            max_tokens=max_tokens,
        )

    def forward(self, *inputs, num_beams=0):
        # dims: [num_utterances, sl, bs] for the encoder, [sl, bs] for the decoder
        encoder_inputs, decoder_inputs = assert_dims(inputs, [2, None, None])
        # reset the states for the new batch
        bs = encoder_inputs.size(2)
        self.session_encoder.reset(bs)
        self.decoder.reset(bs)
        query_encoder_outputs = []
        outputs = []
        num_utterances, max_sl, *_ = encoder_inputs.size()
        for index, context in enumerate(encoder_inputs):
            self.query_encoder.reset(bs)
            outputs = self.query_encoder(context)  # context has size [sl, bs]
            # BPTT: if the dialogue is too long, repackage the first half of the
            # outputs to truncate gradient backpropagation and fit into memory
            # (to test before adding back)
            out = repackage_var(outputs[-1][-1]) \
                if max_sl * num_utterances > self.BPTT_MAX_UTTERANCES and index <= num_utterances // 2 \
                else outputs[-1][-1]
            # keep the last output of the query_encoder for every utterance
            query_encoder_outputs.append(out)
        query_encoder_outputs = torch.stack(query_encoder_outputs, dim=0)  # [cl, bs, nhid]
        session_outputs = self.session_encoder(query_encoder_outputs)
        self.decoder.projection_layer.reset(keys=session_outputs[-1])
        if self.training:
            self.decoder.pr_force = self.pr_force
            nb = 1 if self.pr_force < 1 else 0
        else:
            nb = num_beams
        state = self.decoder.hidden
        outputs_dec = self.decoder(decoder_inputs, hidden=state, num_beams=nb)
        predictions = outputs_dec[-1][:decoder_inputs.size(0)] \
            if num_beams == 0 else self.decoder.beam_outputs
        return predictions, [*outputs, *outputs_dec]
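Decoding behaves differently in training and evaluation: in training mode forward applies teacher forcing through pr_force, while in eval mode num_beams selects greedy decoding (0) or beam search. A minimal eval-time sketch with illustrative sizes, following the num_beams handling in HREDAttention.forward above:

# Hypothetical usage; num_beams semantics taken from HREDAttention.forward above.
import numpy as np

model = HREDAttention(ntoken=10000, emb_sz=300, nhid=[512, 512, 512],
                      nlayers=[1, 1, 1], att_nhid=128, pad_token=1, eos_token=2)
model.eval()  # disable teacher forcing so num_beams controls decoding

encoder_inputs = V(T(np.random.randint(0, 10000, (5, 20, 32))))  # [num_utterances, sl, bs]
decoder_inputs = V(T(np.random.randint(0, 10000, (15, 32))))     # [sl, bs]
predictions, outputs = model(encoder_inputs, decoder_inputs, num_beams=3)  # beam search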
def test_BiRNNEncoder():
    ntoken = 4
    emb_sz = 2
    nhid = 6
    nlayers = 2
    # Given a BiRNNEncoder
    embedding = DropoutEmbeddings(ntokens=ntoken,
                                  emb_size=emb_sz,
                                  pad_token=0,
                                  dropouti=0.0,
                                  dropoute=0.0)
    rnn_layers = RNNLayers(
        input_size=emb_sz,
        nhid=nhid,
        nlayers=nlayers,
        output_size=emb_sz,
        dropouth=0.0,
        wdrop=0.0,
    )
    encoder = Encoder(embedding_layer=embedding, encoder_layer=rnn_layers)
    encoder = to_gpu(encoder)
    assert encoder is not None
    weight = encoder.embedding_layer.weight
    assert (4, 2) == weight.shape
    sl = 2
    bs = 3
    np.random.seed(0)
    inputs = np.random.randint(0, ntoken, sl * bs).reshape(sl, bs)
    vin = V(T(inputs))

    # Then the initial hidden states should be zero
    encoder.reset(bs)
    initial_hidden = encoder.encoder_layer.hidden
    h = []
    c = []
    for layer in initial_hidden:
        h.append(layer[0].data.cpu().numpy())
        c.append(layer[1].data.cpu().numpy())
        assert h[-1].sum() == 0
        assert c[-1].sum() == 0
    embeddings = encoder.embedding_layer(vin)
    assert (2, 3, emb_sz) == embeddings.shape

    # Then the new states are different from before
    outputs = encoder(vin)
    assert_dims(outputs, [nlayers, sl, bs, (nhid, encoder.output_size)])
    initial_hidden = encoder.encoder_layer.hidden
    h1 = []
    c1 = []
    for hl, cl, layer in zip(h, c, initial_hidden):
        h1.append(to_np(layer[0]))
        c1.append(to_np(layer[1]))  # layer[1] is the cell state
        assert not np.allclose(hl, h1[-1])
        assert not np.allclose(cl, c1[-1])

    # Then running the encoder again changes the states once more
    outputs = encoder(vin)
    assert_dims(outputs, [nlayers, sl, bs, (nhid, encoder.output_size)])
    initial_hidden = encoder.encoder_layer.hidden
    for hl, cl, layer in zip(h1, c1, initial_hidden):
        h_new = to_np(layer[0])
        c_new = to_np(layer[1])  # layer[1] is the cell state
        assert not np.allclose(hl, h_new)
        assert not np.allclose(cl, c_new)