def layer_forward(self, x, hx, cell, batch_sizes, reverse=False):
    h, c = hx
    init_h, init_c = h, c
    output, seq_len = [], len(x)
    steps = reversed(range(seq_len)) if reverse else range(seq_len)
    if self.training:
        hid_mask = SharedDropout.get_mask(h, self.dropout)

    for t in steps:
        last_batch_size, batch_size = len(h), batch_sizes[t]
        if last_batch_size < batch_size:
            h = torch.cat((h, init_h[last_batch_size:batch_size]))
            c = torch.cat((c, init_c[last_batch_size:batch_size]))
        else:
            h = h[:batch_size]
            c = c[:batch_size]
        h, c = cell(input=x[t], hx=(h, c))
        output.append(h)
        if self.training:
            h = h * hid_mask[:batch_size]
    if reverse:
        output.reverse()
    output = torch.cat(output)

    return output
def forward(self, x, hx=None):
    x, batch_sizes = x
    batch_size = batch_sizes[0]

    if hx is None:
        init = x.new_zeros(batch_size, self.hidden_size)
        hx = (init, init)

    for layer in range(self.num_layers):
        if self.training:
            mask = SharedDropout.get_mask(x[:batch_size], self.dropout)
            mask = torch.cat([mask[:batch_size] for batch_size in batch_sizes])
            x *= mask
        x = torch.split(x, batch_sizes.tolist())
        f_output = self.layer_forward(x=x, hx=hx, cell=self.f_cells[layer],
                                      batch_sizes=batch_sizes, reverse=False)
        b_output = self.layer_forward(x=x, hx=hx, cell=self.b_cells[layer],
                                      batch_sizes=batch_sizes, reverse=True)
        x = torch.cat([f_output, b_output], -1)
    x = PackedSequence(x, batch_sizes)

    return x
def layer_forward(self, x, hx, cell, batch_sizes, reverse=False):
    hx_0 = hx_i = hx
    hx_n, output = [], []
    steps = reversed(range(len(x))) if reverse else range(len(x))
    if self.training:
        hid_mask = SharedDropout.get_mask(hx_0[0], self.dropout)

    for t in steps:
        last_batch_size, batch_size = len(hx_i[0]), batch_sizes[t]
        if last_batch_size < batch_size:
            hx_i = [torch.cat((h, ih[last_batch_size:batch_size]))
                    for h, ih in zip(hx_i, hx_0)]
        else:
            hx_n.append([h[batch_size:] for h in hx_i])
            hx_i = [h[:batch_size] for h in hx_i]
        hx_i = [h for h in cell(x[t], hx_i)]
        output.append(hx_i[0])
        if self.training:
            hx_i[0] = hx_i[0] * hid_mask[:batch_size]
    if reverse:
        hx_n = hx_i
        output.reverse()
    else:
        hx_n.append(hx_i)
        hx_n = [torch.cat(h) for h in zip(*reversed(hx_n))]
    output = torch.cat(output)

    return output, hx_n
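# The three snippets above all call SharedDropout.get_mask to draw one dropout mask
# per sequence and reuse it at every timestep. The class below is only a minimal
# sketch consistent with those calls (an inverted-dropout mask shared across time);
# the actual implementation in the source codebase may differ in details.
import torch
import torch.nn as nn


class SharedDropout(nn.Module):
    """Dropout that samples a single mask per sequence and reuses it across timesteps."""

    def __init__(self, p=0.5, batch_first=True):
        super().__init__()
        self.p = p
        self.batch_first = batch_first

    @staticmethod
    def get_mask(x, p):
        # zero out entries with probability p and rescale survivors by 1 / (1 - p)
        return x.new_empty(x.shape).bernoulli_(1 - p) / (1 - p)

    def forward(self, x):
        if not self.training or self.p == 0:
            return x
        # sample the mask from a single timestep, then broadcast it over the sequence
        mask = self.get_mask(x[:, 0], self.p).unsqueeze(1) if self.batch_first \
            else self.get_mask(x[0], self.p)
        return x * mask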
def __init__(self, params):
    super(BiaffineParser, self).__init__()

    self.params = params
    # self.word_dropout = nn.Dropout(p=params['word_dropout'])
    # self.word_dropout_p = params['word_dropout']

    # BERT
    # self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
    self.bert = BertModel.from_pretrained('bert-base-cased')
    self.bert_dropout = SharedDropout(p=params['bert_dropout'])

    # the MLP layers
    self.mlp_arc_h = MLP(n_in=params['n_bert_hidden'], n_hidden=params['n_mlp_arc'],
                         dropout=params['mlp_dropout'])
    self.mlp_arc_d = MLP(n_in=params['n_bert_hidden'], n_hidden=params['n_mlp_arc'],
                         dropout=params['mlp_dropout'])
    self.mlp_rel_h = MLP(n_in=params['n_bert_hidden'], n_hidden=params['n_mlp_rel'],
                         dropout=params['mlp_dropout'])
    self.mlp_rel_d = MLP(n_in=params['n_bert_hidden'], n_hidden=params['n_mlp_rel'],
                         dropout=params['mlp_dropout'])

    # the Biaffine layers
    self.arc_attn = Biaffine(n_in=params['n_mlp_arc'], bias_x=True, bias_y=False)
    self.rel_attn = Biaffine(n_in=params['n_mlp_rel'], n_out=params['n_rels'],
                             bias_x=True, bias_y=True)
def __init__(self, n_in, n_hidden, dropout=0):
    super(MLP, self).__init__()

    self.linear = nn.Linear(n_in, n_hidden)
    self.activation = nn.LeakyReLU(negative_slope=0.1)
    self.dropout = SharedDropout(p=dropout)

    self.reset_parameters()
def __init__(self, n_in, n_hidden, activation=nn.LeakyReLU(0.1), dropout=0):
    super(MLP, self).__init__()

    self.linear = nn.Linear(n_in, n_hidden)
    self.activation = activation
    self.dropout = SharedDropout(dropout)

    self.reset_parameters()
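# Both MLP constructors above call reset_parameters() and leave forward() out of the
# snippet. Below is a hedged sketch of those two methods (to sit inside the MLP class),
# consistent with the attributes defined in the constructors; the initialization scheme
# is an assumption, not necessarily the original one.
import torch.nn as nn


def reset_parameters(self):
    # orthogonal init is a common choice for biaffine-parser MLPs; treat it as an assumption
    nn.init.orthogonal_(self.linear.weight)
    nn.init.zeros_(self.linear.bias)


def forward(self, x):
    # linear projection, nonlinearity, then the shared (timestep-tied) dropout
    x = self.linear(x)
    x = self.activation(x)
    x = self.dropout(x)
    return x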
def __init__(self, args):
    super(Model, self).__init__()

    self.args = args

    # the embedding layer
    self.word_embed = nn.Embedding(num_embeddings=args.n_words,
                                   embedding_dim=args.n_embed)
    if args.feat == 'char':
        self.feat_embed = CHAR_LSTM(n_chars=args.n_feats,
                                    n_embed=args.n_char_embed,
                                    n_out=args.n_embed)
    elif args.feat == 'bert':
        self.feat_embed = BertEmbedding(model=args.bert_model,
                                        n_layers=args.n_bert_layers,
                                        n_out=args.n_embed)
    else:
        self.feat_embed = nn.Embedding(num_embeddings=args.n_feats,
                                       embedding_dim=args.n_embed)
    self.embed_dropout = IndependentDropout(p=args.embed_dropout)

    # the word-lstm layer
    self.lstm = BiLSTM(input_size=args.n_embed * 2,
                       hidden_size=args.n_lstm_hidden,
                       num_layers=args.n_lstm_layers,
                       dropout=args.lstm_dropout)
    self.lstm_dropout = SharedDropout(p=args.lstm_dropout)

    # the MLP layers
    self.mlp_arc_h = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_arc,
                         dropout=args.mlp_dropout)
    self.mlp_arc_d = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_arc,
                         dropout=args.mlp_dropout)
    self.mlp_rel_h = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_rel,
                         dropout=args.mlp_dropout)
    self.mlp_rel_d = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_rel,
                         dropout=args.mlp_dropout)

    # the Biaffine layers
    self.arc_attn = Biaffine(n_in=args.n_mlp_arc, bias_x=True, bias_y=False)
    self.rel_attn = Biaffine(n_in=args.n_mlp_rel, n_out=args.n_rels,
                             bias_x=True, bias_y=True)
    self.pad_index = args.pad_index
    self.unk_index = args.unk_index
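# A hedged sketch of how a forward pass typically wires the modules built above
# (embeddings -> BiLSTM -> MLPs -> Biaffine scorers). The argument names `words` and
# `feats`, the masking details, the assumed Biaffine output shapes, and the -inf fill
# for padded positions are illustrative assumptions, not taken from the original code.
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


def forward(self, words, feats):
    mask = words.ne(self.pad_index)                  # [batch_size, seq_len]
    word_embed = self.word_embed(words)
    feat_embed = self.feat_embed(feats)
    word_embed, feat_embed = self.embed_dropout(word_embed, feat_embed)
    embed = torch.cat((word_embed, feat_embed), -1)  # matches input_size = n_embed * 2

    x = pack_padded_sequence(embed, mask.sum(1).tolist(),
                             batch_first=True, enforce_sorted=False)
    x, _ = self.lstm(x)      # assuming the BiLSTM returns (output, hidden), as in the
                             # PackedSequence forward further below
    x, _ = pad_packed_sequence(x, batch_first=True, total_length=words.shape[1])
    x = self.lstm_dropout(x)

    arc_d, arc_h = self.mlp_arc_d(x), self.mlp_arc_h(x)
    rel_d, rel_h = self.mlp_rel_d(x), self.mlp_rel_h(x)

    s_arc = self.arc_attn(arc_d, arc_h)              # [batch_size, seq_len, seq_len]
    # assuming rel_attn yields [batch_size, n_rels, seq_len, seq_len]
    s_rel = self.rel_attn(rel_d, rel_h).permute(0, 2, 3, 1)
    # mask out arcs pointing at padded positions
    s_arc.masked_fill_(~mask.unsqueeze(1), float('-inf'))

    return s_arc, s_rel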
def __init__(self, args):
    super(Model, self).__init__()

    self.args = args

    # the embedding layer
    if args.bert is False:
        self.word_embed = nn.Embedding(num_embeddings=args.n_words,
                                       embedding_dim=args.word_embed)
        if args.freeze_word_emb:
            self.word_embed.weight.requires_grad = False
    else:
        self.word_embed = BertEmbedding(model=args.bert_model,
                                        n_layers=args.n_bert_layers,
                                        n_out=args.word_embed)
    self.feat_embed = nn.Embedding(num_embeddings=args.n_feats,
                                   embedding_dim=args.n_embed)
    if args.freeze_feat_emb:
        self.feat_embed.weight.requires_grad = False
    self.embed_dropout = IndependentDropout(p=args.embed_dropout)

    # the word-lstm layer
    self.lstm = BiLSTM(input_size=args.word_embed + args.n_embed,
                       hidden_size=args.n_lstm_hidden,
                       num_layers=args.n_lstm_layers,
                       dropout=args.lstm_dropout)
    self.lstm_dropout = SharedDropout(p=args.lstm_dropout)

    # the MLP layers
    self.mlp_arc_h = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_arc,
                         dropout=args.mlp_dropout)
    self.mlp_arc_d = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_arc,
                         dropout=args.mlp_dropout)

    # the Biaffine layers
    self.arc_attn = Biaffine(n_in=args.n_mlp_arc, bias_x=True, bias_y=False)
    self.pad_index = args.pad_index
    self.unk_index = args.unk_index

    self.multinomial = nn.Parameter(torch.ones(args.n_feats, args.n_feats))
def __init__(self, vocab, n_embed, n_char_embed, n_char_out, n_lstm_hidden,
             n_lstm_layers, n_mlp_arc, n_mlp_lab, n_labels, drop):
    super(BiAffineParser, self).__init__()

    self.vocab = vocab

    # the embedding layer
    self.embed = nn.Embedding(vocab.n_train_words, n_embed)
    self.pretrained = nn.Embedding.from_pretrained(vocab.embeddings)

    # the char-lstm layer
    self.char_lstm = CharLSTM(n_char=vocab.n_chars,
                              n_embed=n_char_embed,
                              n_out=n_char_out)
    self.embed_drop = IndependentDropout(p=drop)

    # the word-lstm layer
    self.lstm = ParserLSTM(input_size=n_embed + n_char_out,
                           hidden_size=n_lstm_hidden,
                           num_layers=n_lstm_layers,
                           batch_first=True,
                           dropout=drop,
                           bidirectional=True)
    self.lstm_drop = SharedDropout(p=drop)

    # the MLP layers
    self.mlp_arc_h = MLP(n_in=n_lstm_hidden * 2, n_hidden=n_mlp_arc, drop=drop)
    self.mlp_arc_d = MLP(n_in=n_lstm_hidden * 2, n_hidden=n_mlp_arc, drop=drop)
    self.mlp_lab_h = MLP(n_in=n_lstm_hidden * 2, n_hidden=n_mlp_lab, drop=drop)
    self.mlp_lab_d = MLP(n_in=n_lstm_hidden * 2, n_hidden=n_mlp_lab, drop=drop)

    # the BiAffine layers
    self.arc_attn = BiAffine(n_in=n_mlp_arc, bias_x=True, bias_y=False)
    self.lab_attn = BiAffine(n_in=n_mlp_lab, n_out=n_labels,
                             bias_x=True, bias_y=True)

    self.reset_parameters()
def forward(self, sequence, hx=None):
    x, batch_sizes = sequence.data, sequence.batch_sizes.tolist()
    batch_size = batch_sizes[0]
    h_n, c_n = [], []

    if hx is None:
        ih = x.new_zeros(self.num_layers * 2, batch_size, self.hidden_size)
        h, c = ih, ih
    else:
        h, c = self.permute_hidden(hx, sequence.sorted_indices)
    h = h.view(self.num_layers, 2, batch_size, self.hidden_size)
    c = c.view(self.num_layers, 2, batch_size, self.hidden_size)

    for i in range(self.num_layers):
        x = torch.split(x, batch_sizes)
        if self.training:
            mask = SharedDropout.get_mask(x[0], self.dropout)
            x = [i * mask[:len(i)] for i in x]
        x_f, (h_f, c_f) = self.layer_forward(x=x,
                                             hx=(h[i, 0], c[i, 0]),
                                             cell=self.f_cells[i],
                                             batch_sizes=batch_sizes)
        x_b, (h_b, c_b) = self.layer_forward(x=x,
                                             hx=(h[i, 1], c[i, 1]),
                                             cell=self.b_cells[i],
                                             batch_sizes=batch_sizes,
                                             reverse=True)
        x = torch.cat((x_f, x_b), -1)
        h_n.append(torch.stack((h_f, h_b)))
        c_n.append(torch.stack((c_f, c_b)))
    x = PackedSequence(x,
                       sequence.batch_sizes,
                       sequence.sorted_indices,
                       sequence.unsorted_indices)
    hx = torch.cat(h_n, 0), torch.cat(c_n, 0)
    hx = self.permute_hidden(hx, sequence.unsorted_indices)

    return x, hx
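# A small hedged usage example for the PackedSequence-based forward above: a padded
# batch must be packed before it is fed to the BiLSTM. The tensor sizes and the name
# `lstm` for an already-constructed BiLSTM instance are illustrative assumptions.
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

embed = torch.randn(4, 10, 200)        # [batch_size, seq_len, input_size]
lengths = torch.tensor([10, 8, 5, 3])  # true lengths, sorted in decreasing order

packed = pack_padded_sequence(embed, lengths, batch_first=True)
output, (h_n, c_n) = lstm(packed)      # this variant returns (PackedSequence, hidden state)
output, _ = pad_packed_sequence(output, batch_first=True)
print(output.shape)                    # torch.Size([4, 10, 2 * hidden_size])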
def __init__(self, args, mask_token_id=0):
    super().__init__()

    self.args = args

    if args.n_embed:
        # the embedding layer
        self.word_embed = nn.Embedding(num_embeddings=args.n_words,
                                       embedding_dim=args.n_embed)
        self.unk_index = args.unk_index
    else:
        self.word_embed = None
    if args.feat == 'char':
        self.feat_embed = CharLSTM(n_chars=args.n_feats,
                                   n_embed=args.n_char_embed,
                                   n_out=args.n_feat_embed,
                                   pad_index=args.feat_pad_index)
        self.pad_index = args.pad_index
    elif args.feat == 'bert':
        self.feat_embed = BertEmbedding(model=args.bert_model,
                                        n_layers=args.n_bert_layers,
                                        n_out=args.n_feat_embed,
                                        requires_grad=args.bert_fine_tune,
                                        mask_token_id=mask_token_id,
                                        token_dropout=args.token_dropout,
                                        mix_dropout=args.mix_dropout,
                                        use_hidden_states=args.use_hidden_states,
                                        use_attentions=args.use_attentions,
                                        attention_layer=args.attention_layer)
        # self.args.n_mlp_arc = self.feat_embed.bert.config.max_position_embeddings
        self.args.n_feat_embed = self.feat_embed.n_out      # taken from the model
        self.args.n_bert_layers = self.feat_embed.n_layers  # taken from the model
        self.pad_index = self.feat_embed.pad_index          # taken from the model
        self.args.pad_index = self.pad_index                # update
    else:
        self.feat_embed = nn.Embedding(num_embeddings=args.n_feats,
                                       embedding_dim=args.n_feat_embed)
        self.pad_index = args.pad_index
    self.embed_dropout = IndependentDropout(p=args.embed_dropout)

    if args.n_lstm_layers:
        # the lstm layer
        self.lstm = BiLSTM(input_size=args.n_embed + args.n_feat_embed,
                           hidden_size=args.n_lstm_hidden,
                           num_layers=args.n_lstm_layers,
                           dropout=args.lstm_dropout)
        self.lstm_dropout = SharedDropout(p=args.lstm_dropout)
        mlp_input_size = args.n_lstm_hidden * 2
    else:
        self.lstm = None
        mlp_input_size = args.n_embed + args.n_feat_embed

    # the MLP layers
    self.mlp_arc_d = MLP(n_in=mlp_input_size, n_out=args.n_mlp_arc,
                         dropout=args.mlp_dropout)
    self.mlp_arc_h = MLP(n_in=mlp_input_size, n_out=args.n_mlp_arc,
                         dropout=args.mlp_dropout)
    self.mlp_rel_d = MLP(n_in=mlp_input_size, n_out=args.n_mlp_rel,
                         dropout=args.mlp_dropout)
    self.mlp_rel_h = MLP(n_in=mlp_input_size, n_out=args.n_mlp_rel,
                         dropout=args.mlp_dropout)

    # the Biaffine layers
    self.arc_attn = Biaffine(n_in=args.n_mlp_arc, bias_x=True, bias_y=False)
    self.rel_attn = Biaffine(n_in=args.n_mlp_rel, n_out=args.n_rels,
                             bias_x=True, bias_y=True)

    # transformer attention
    if args.use_attentions:
        self.attn_mix = nn.Parameter(torch.randn(1))
    # # distance
    # self.args.distance = False  # DEBUG
    # if self.args.distance:
    #     self.distance = DeepBiaffine(mlp_input_size, mlp_input_size,
    #                                  self.args.deep_biaff_hidden_dim, 1,
    #                                  dropout=args.mlp_dropout)

    self.criterion = nn.CrossEntropyLoss()
def __init__(self, args):
    super(Model, self).__init__()

    self.args = args
    self.pretrained = False

    # the embedding layer
    self.char_embed = nn.Embedding(num_embeddings=args.n_chars,
                                   embedding_dim=args.n_embed)
    n_lstm_input = args.n_embed
    if args.feat == 'bert':
        self.feat_embed = BertEmbedding(model=args.bert_model,
                                        n_layers=args.n_bert_layers,
                                        n_out=args.n_feat_embed)
        n_lstm_input += args.n_feat_embed
    if self.args.feat in {'bigram', 'trigram'}:
        self.bigram_embed = nn.Embedding(num_embeddings=args.n_bigrams,
                                         embedding_dim=args.n_embed)
        n_lstm_input += args.n_embed
    if self.args.feat == 'trigram':
        self.trigram_embed = nn.Embedding(num_embeddings=args.n_trigrams,
                                          embedding_dim=args.n_embed)
        n_lstm_input += args.n_embed
    self.embed_dropout = IndependentDropout(p=args.embed_dropout)

    # the lstm layer
    self.lstm = BiLSTM(input_size=n_lstm_input,
                       hidden_size=args.n_lstm_hidden,
                       num_layers=args.n_lstm_layers,
                       dropout=args.lstm_dropout)
    self.lstm_dropout = SharedDropout(p=args.lstm_dropout)

    # the MLP layers
    self.mlp_span_l = MLP(n_in=args.n_lstm_hidden * 2, n_out=args.n_mlp_span,
                          dropout=args.mlp_dropout)
    self.mlp_span_r = MLP(n_in=args.n_lstm_hidden * 2, n_out=args.n_mlp_span,
                          dropout=args.mlp_dropout)

    # the Biaffine layers
    self.span_attn = Biaffine(n_in=args.n_mlp_span, bias_x=True, bias_y=False)
    if args.link == 'mlp':
        # a representation that a fencepost is a split point
        self.mlp_span_s = MLP(n_in=args.n_lstm_hidden * 2, n_out=args.n_mlp_span,
                              dropout=args.mlp_dropout)
        # scores for split points
        self.score_split = nn.Linear(args.n_mlp_span, 1)
    elif args.link == 'att':
        self.split_attn = ElementWiseBiaffine(n_in=args.n_lstm_hidden,
                                              bias_x=True, bias_y=False)
    self.pad_index = args.pad_index
    self.unk_index = args.unk_index
def __init__(self, args):
    super(Model, self).__init__()

    self.args = args

    # the embedding layer
    self.word_embed = nn.Embedding(num_embeddings=args.n_words,
                                   embedding_dim=args.n_embed)
    if args.use_char:
        self.char_embed = CHAR_LSTM(n_chars=args.n_char_feats,
                                    n_embed=args.n_char_embed,
                                    n_out=args.n_embed)
    if args.use_bert:
        self.bert_embed = BertEmbedding(model=args.bert_model,
                                        n_layers=args.n_bert_layers,
                                        n_out=args.n_embed)
    if args.use_pos:
        self.pos_embed = nn.Embedding(num_embeddings=args.n_pos_feats,
                                      embedding_dim=args.n_embed)
    self.embed_dropout = IndependentDropout(p=args.embed_dropout)

    # the word-lstm layer
    self.lstm = BiLSTM(input_size=args.n_embed * (args.use_char + args.use_bert +
                                                  args.use_pos + 1),
                       hidden_size=args.n_lstm_hidden,
                       num_layers=args.n_lstm_layers,
                       dropout=args.lstm_dropout)
    self.lstm_dropout = SharedDropout(p=args.lstm_dropout)

    # the MLP layers
    self.mlp_arc_h = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_arc,
                         dropout=args.mlp_dropout)
    self.mlp_arc_d = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_arc,
                         dropout=args.mlp_dropout)
    self.mlp_rel_h = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_rel,
                         dropout=args.mlp_dropout)
    self.mlp_rel_d = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_rel,
                         dropout=args.mlp_dropout)

    # the Biaffine layers
    self.arc_attn = Biaffine(n_in=args.n_mlp_arc, bias_x=True, bias_y=False)
    self.rel_attn = Biaffine(n_in=args.n_mlp_rel, n_out=args.n_rels,
                             bias_x=True, bias_y=True)
    self.binary = args.binary

    # the Second Order Parts
    if self.args.use_second_order:
        self.use_sib = args.use_sib
        self.use_cop = args.use_cop
        self.use_gp = args.use_gp
        if args.use_sib:
            self.mlp_sib_h = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_sec,
                                 dropout=args.mlp_dropout, identity=self.binary)
            self.mlp_sib_d = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_sec,
                                 dropout=args.mlp_dropout, identity=self.binary)
            self.trilinear_sib = TrilinearScorer(args.n_mlp_sec, args.n_mlp_sec,
                                                 args.n_mlp_sec,
                                                 init_std=args.init_std,
                                                 rank=args.n_mlp_sec,
                                                 factorize=args.factorize)
        if args.use_cop:
            self.mlp_cop_h = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_sec,
                                 dropout=args.mlp_dropout, identity=self.binary)
            self.mlp_cop_d = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_sec,
                                 dropout=args.mlp_dropout, identity=self.binary)
            self.trilinear_cop = TrilinearScorer(args.n_mlp_sec, args.n_mlp_sec,
                                                 args.n_mlp_sec,
                                                 init_std=args.init_std,
                                                 rank=args.n_mlp_sec,
                                                 factorize=args.factorize)
        if args.use_gp:
            self.mlp_gp_h = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_sec,
                                dropout=args.mlp_dropout, identity=self.binary)
            self.mlp_gp_d = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_sec,
                                dropout=args.mlp_dropout, identity=self.binary)
            self.mlp_gp_hd = MLP(n_in=args.n_lstm_hidden * 2, n_hidden=args.n_mlp_sec,
                                 dropout=args.mlp_dropout, identity=self.binary)
            self.trilinear_gp = TrilinearScorer(args.n_mlp_sec, args.n_mlp_sec,
                                                args.n_mlp_sec,
                                                init_std=args.init_std,
                                                rank=args.n_mlp_sec,
                                                factorize=args.factorize)

    self.pad_index = args.pad_index
    self.unk_index = args.unk_index
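# One of the constructors above sets self.criterion = nn.CrossEntropyLoss(). Below is a
# hedged sketch of how arc and relation scores are commonly turned into that loss; the
# function name, argument layout, and assumed score shapes are illustrative assumptions,
# not the original training code.
import torch
import torch.nn as nn


def biaffine_loss(s_arc, s_rel, arcs, rels, mask):
    # s_arc: [batch_size, seq_len, seq_len], s_rel: [batch_size, seq_len, seq_len, n_rels]
    # arcs/rels: gold head indices and relation ids; mask: [batch_size, seq_len] of valid tokens
    criterion = nn.CrossEntropyLoss()
    s_arc, arcs = s_arc[mask], arcs[mask]         # [n, seq_len], [n]
    s_rel, rels = s_rel[mask], rels[mask]         # [n, seq_len, n_rels], [n]
    s_rel = s_rel[torch.arange(len(arcs)), arcs]  # rel scores at the gold heads: [n, n_rels]
    return criterion(s_arc, arcs) + criterion(s_rel, rels)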