def __init__(self, args, pretrained):
    """Assemble the BiDAF model's sub-modules from ``args``.

    Args:
        args: hyper-parameter namespace; reads char_vocab_size, char_dim,
            char_channel_size, char_channel_width, word_dim, hidden_size
            and dropout.
        pretrained: pretrained word-vector tensor used to initialise the
            frozen word-embedding table.
    """
    super(BiDAF, self).__init__()
    self.args = args

    # 1. Character Embedding Layer
    self.char_emb = nn.Embedding(args.char_vocab_size, args.char_dim,
                                 padding_idx=1)
    nn.init.uniform_(self.char_emb.weight, -0.001, 0.001)
    self.char_conv = nn.Conv2d(1, args.char_channel_size,
                               (args.char_dim, args.char_channel_width))

    # 2. Word Embedding Layer
    # Initialised from GloVe and frozen, so gradients never update it.
    self.word_emb = nn.Embedding.from_pretrained(pretrained, freeze=True)

    # Highway network: the concatenated char + word features must exactly
    # fill the 2H highway width.
    assert self.args.hidden_size * 2 == (self.args.char_channel_size
                                         + self.args.word_dim)
    two_h = args.hidden_size * 2
    for layer in range(2):
        setattr(self, f'highway_linear{layer}',
                nn.Sequential(Linear(two_h, two_h), nn.ReLU()))
        setattr(self, f'highway_gate{layer}',
                nn.Sequential(Linear(two_h, two_h), nn.Sigmoid()))

    # 3. Contextual Embedding Layer
    self.context_LSTM = LSTM(input_size=two_h,
                             hidden_size=args.hidden_size,
                             bidirectional=True,
                             batch_first=True,
                             dropout=args.dropout)

    # 4. Attention Flow Layer
    self.att_weight_c = Linear(two_h, 1)
    self.att_weight_q = Linear(two_h, 1)
    self.att_weight_cq = Linear(two_h, 1)

    # 5. Modeling Layer — the first LSTM consumes the 8H attention
    # output, the second refines the resulting 2H representation.
    self.modeling_LSTM1 = LSTM(input_size=args.hidden_size * 8,
                               hidden_size=args.hidden_size,
                               bidirectional=True,
                               batch_first=True,
                               dropout=args.dropout)
    self.modeling_LSTM2 = LSTM(input_size=two_h,
                               hidden_size=args.hidden_size,
                               bidirectional=True,
                               batch_first=True,
                               dropout=args.dropout)

    # 6. Output Layer — span-start (p1) and span-end (p2) scorers.
    self.p1_weight_g = Linear(args.hidden_size * 8, 1, dropout=args.dropout)
    self.p1_weight_m = Linear(two_h, 1, dropout=args.dropout)
    self.p2_weight_g = Linear(args.hidden_size * 8, 1, dropout=args.dropout)
    self.p2_weight_m = Linear(two_h, 1, dropout=args.dropout)
    self.output_LSTM = LSTM(input_size=two_h,
                            hidden_size=args.hidden_size,
                            bidirectional=True,
                            batch_first=True,
                            dropout=args.dropout)

    self.dropout = nn.Dropout(p=args.dropout)
def __init__(self, hps):
    """Build the BiDAF sub-modules from a dict of hyper-parameters.

    Args:
        hps: dict with keys char_vocab_size, char_dim, char_channel_width,
            word_dim, hidden_size and dropout.  The key
            'char_channel_size' is computed here and written back into the
            dict, so the caller's dict is mutated.
    """
    super(BiDAF, self).__init__()
    self.hps = hps

    # 1. Character Embedding Layer
    self.char_emb = nn.Embedding(hps["char_vocab_size"], hps["char_dim"],
                                 padding_idx=1)
    nn.init.uniform_(self.char_emb.weight, -0.001, 0.001)

    # Size the char-CNN so that char + word features together fill the
    # 2H highway width exactly.
    hps['char_channel_size'] = hps["hidden_size"] * 2 - hps["word_dim"]
    assert hps['char_channel_size'] > 0
    self.char_conv = nn.Conv2d(1, hps["char_channel_size"],
                               (hps["char_dim"], hps["char_channel_width"]))

    # 2. Word Embedding Layer
    # NOTE(review): vocab size and dim are hard-coded (50000, 50) and the
    # table is randomly initialised — despite the original "GloVe" comment
    # no pretrained weights are loaded here; confirm intent.
    self.word_emb = nn.Embedding(50000, 50)

    # Highway network: two transform/gate pairs.
    width = hps["hidden_size"] * 2
    for k in range(2):
        setattr(self, 'highway_linear' + str(k),
                nn.Sequential(Linear(width, width), nn.ReLU()))
        setattr(self, 'highway_gate' + str(k),
                nn.Sequential(Linear(width, width), nn.Sigmoid()))

    # 3. Contextual Embedding Layer
    self.context_LSTM = LSTM(input_size=width,
                             hidden_size=hps["hidden_size"],
                             bidirectional=True,
                             batch_first=True,
                             dropout=hps["dropout"])

    # 4. Attention Flow Layer
    self.att_weight_c = Linear(width, 1)
    self.att_weight_q = Linear(width, 1)
    self.att_weight_cq = Linear(width, 1)

    # 5. Modeling Layer
    self.modeling_LSTM1 = LSTM(input_size=hps["hidden_size"] * 8,
                               hidden_size=hps["hidden_size"],
                               bidirectional=True,
                               batch_first=True,
                               dropout=hps["dropout"])
    self.modeling_LSTM2 = LSTM(input_size=width,
                               hidden_size=hps["hidden_size"],
                               bidirectional=True,
                               batch_first=True,
                               dropout=hps["dropout"])

    # 6. Output Layer — span-start (p1) and span-end (p2) scorers.
    self.p1_weight_g = Linear(hps["hidden_size"] * 8, 1,
                              dropout=hps["dropout"])
    self.p1_weight_m = Linear(width, 1, dropout=hps["dropout"])
    self.p2_weight_g = Linear(hps["hidden_size"] * 8, 1,
                              dropout=hps["dropout"])
    self.p2_weight_m = Linear(width, 1, dropout=hps["dropout"])
    self.output_LSTM = LSTM(input_size=width,
                            hidden_size=hps["hidden_size"],
                            bidirectional=True,
                            batch_first=True,
                            dropout=hps["dropout"])

    self.dropout = nn.Dropout(p=hps["dropout"])
def __init__(self, args):
    """Assemble BiDAF sub-modules (variant without embedding layers).

    Args:
        args: hyper-parameter namespace; reads char_channel_size,
            hidden_size and dropout.
    """
    super(BiDAF, self).__init__()
    self.args = args

    # In this variant the character features alone must fill the 2H
    # highway width (no word embedding is concatenated).
    assert self.args.hidden_size * 2 == (self.args.char_channel_size)
    double = args.hidden_size * 2
    for n in range(2):
        setattr(self, 'highway_linear{}'.format(n),
                nn.Sequential(Linear(double, double), nn.ReLU()))
        setattr(self, 'highway_gate{}'.format(n),
                nn.Sequential(Linear(double, double), nn.Sigmoid()))

    # 3. Contextual Embedding Layer
    self.context_LSTM = LSTM(input_size=double,
                             hidden_size=args.hidden_size,
                             bidirectional=True,
                             batch_first=True,
                             dropout=args.dropout)

    # 4. Attention Flow Layer
    self.att_weight_c = Linear(double, 1)
    self.att_weight_q = Linear(double, 1)
    self.att_weight_cq = Linear(double, 1)

    # 5. Modeling Layer
    self.modeling_LSTM1 = LSTM(input_size=args.hidden_size * 8,
                               hidden_size=args.hidden_size,
                               bidirectional=True,
                               batch_first=True,
                               dropout=args.dropout)
    self.modeling_LSTM2 = LSTM(input_size=double,
                               hidden_size=args.hidden_size,
                               bidirectional=True,
                               batch_first=True,
                               dropout=args.dropout)

    # 6. Output Layer — span-start (p1) and span-end (p2) scorers.
    self.p1_weight_g = Linear(args.hidden_size * 8, 1, dropout=args.dropout)
    self.p1_weight_m = Linear(double, 1, dropout=args.dropout)
    self.p2_weight_g = Linear(args.hidden_size * 8, 1, dropout=args.dropout)
    self.p2_weight_m = Linear(double, 1, dropout=args.dropout)
    self.output_LSTM = LSTM(input_size=double,
                            hidden_size=args.hidden_size,
                            bidirectional=True,
                            batch_first=True,
                            dropout=args.dropout)

    # Prevent from over-fitting
    self.dropout = nn.Dropout(p=args.dropout)
def __init__(self, char_vocab_size, word_vocab_size, pretrained,
             word_dim=100, char_dim=8, char_channel_width=5,
             char_channel_size=100, dropout_rate=0.2, hidden_size=100):
    """Construct BiDAF from explicit hyper-parameters.

    Args:
        char_vocab_size: number of entries in the character vocabulary.
        word_vocab_size: number of entries in the word vocabulary.
        pretrained: pretrained (GloVe) word-vector tensor.
        word_dim: word-embedding dimensionality.
        char_dim: character-embedding dimensionality.
        char_channel_width: kernel width of the char-CNN.
        char_channel_size: number of char-CNN output channels.
        dropout_rate: dropout probability shared by all layers.
        hidden_size: LSTM hidden size H; layer widths are multiples of H.
    """
    super(BiDAF, self).__init__()
    self.word_dim = word_dim
    self.char_dim = char_dim
    self.char_channel_width = char_channel_width
    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.char_vocab_size = char_vocab_size
    self.char_channel_size = char_channel_size
    self.word_vocab_size = word_vocab_size

    # 1. Character Embedding Layer
    self.char_emb = nn.Embedding(self.char_vocab_size, self.char_dim,
                                 padding_idx=1)
    nn.init.uniform_(self.char_emb.weight, -0.001, 0.001)
    self.char_conv = nn.Conv2d(1, self.char_channel_size,
                               (self.char_dim, self.char_channel_width))

    # 2. Word Embedding Layer
    # Initialised with GloVe; frozen to prevent gradient updates.
    self.word_emb = nn.Embedding.from_pretrained(pretrained, freeze=True)

    # Highway network: char + word features must exactly fill 2H.
    assert (self.hidden_size * 2) == (self.char_channel_size + self.word_dim)
    double_h = self.hidden_size * 2
    # Two transform/gate pairs.
    for idx in range(2):
        setattr(self, f'highway_linear{idx}',
                nn.Sequential(Linear(double_h, double_h), nn.ReLU()))
        setattr(self, f'highway_gate{idx}',
                nn.Sequential(Linear(double_h, double_h), nn.Sigmoid()))

    # 3. Contextual Embedding Layer
    self.context_LSTM = LSTM(input_size=double_h,
                             hidden_size=self.hidden_size,
                             bidirectional=True,
                             batch_first=True,
                             dropout=self.dropout_rate)

    # 4. Attention Flow Layer
    self.att_weight_c = Linear(double_h, 1)
    self.att_weight_q = Linear(double_h, 1)
    self.att_weight_cq = Linear(double_h, 1)

    # 5. Modeling Layer
    self.modeling_LSTM1 = LSTM(input_size=self.hidden_size * 8,
                               hidden_size=self.hidden_size,
                               bidirectional=True,
                               batch_first=True,
                               dropout=self.dropout_rate)
    self.modeling_LSTM2 = LSTM(input_size=double_h,
                               hidden_size=self.hidden_size,
                               bidirectional=True,
                               batch_first=True,
                               dropout=self.dropout_rate)

    # 6. Output Layer — raw scores, no softmax applied here; see
    # https://stackoverflow.com/questions/57516027/does-pytorch-apply-softmax-automatically-in-nn-linear
    self.p1_weight_g = Linear(self.hidden_size * 8, 1,
                              dropout=self.dropout_rate)
    self.p1_weight_m = Linear(double_h, 1, dropout=self.dropout_rate)
    self.p2_weight_g = Linear(self.hidden_size * 8, 1,
                              dropout=self.dropout_rate)
    self.p2_weight_m = Linear(double_h, 1, dropout=self.dropout_rate)
    self.output_LSTM = LSTM(input_size=double_h,
                            hidden_size=self.hidden_size,
                            bidirectional=True,
                            batch_first=True,
                            dropout=self.dropout_rate)

    self.dropout = nn.Dropout(p=self.dropout_rate)