def __init__(self, model: Transformer, weight_files):
    super(TransformerEnsemble, self).__init__()
    self.n = len(weight_files)
    # Replicate the base architecture once per checkpoint.
    self.models = ModuleList([copy.deepcopy(model) for _ in range(self.n)])
    # Each checkpoint file is expected to be a dict storing the weights under a 'state_dict' key.
    for i in range(self.n):
        state_dict_i = torch.load(weight_files[i])['state_dict']
        self.models[i].load_state_dict(state_dict_i)
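
# Usage sketch (illustrative; not part of the original source). It reproduces the same
# ensembling pattern with a toy module and shows the checkpoint layout that
# TransformerEnsemble.__init__ expects: a dict holding the weights under 'state_dict'.
# The toy nn.Linear stand-in and the in-memory buffers are assumptions for the example.
import copy
import io

import torch
from torch import nn
from torch.nn import ModuleList

def _toy_ensemble_sketch():
    base = nn.Linear(4, 2)                      # stand-in for a full Transformer
    checkpoints = []
    for _ in range(2):                          # two toy checkpoints saved with a 'state_dict' key
        buf = io.BytesIO()
        torch.save({'state_dict': nn.Linear(4, 2).state_dict()}, buf)
        buf.seek(0)
        checkpoints.append(buf)

    # Deep-copy the base model once per checkpoint and load each set of weights,
    # mirroring TransformerEnsemble.__init__ above.
    models = ModuleList([copy.deepcopy(base) for _ in range(len(checkpoints))])
    for i, ckpt in enumerate(checkpoints):
        models[i].load_state_dict(torch.load(ckpt)['state_dict'])
    return models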
def __init__(self, vocab_size, max_len, N_dec, padding_idx, d_model=512, d_k=64, d_v=64, h=8, d_ff=2048,
             dropout=.1, self_att_module=None, enc_att_module=None,
             self_att_module_kwargs=None, enc_att_module_kwargs=None):
    super(MeshedDecoder, self).__init__()
    self.d_model = d_model
    # Token embeddings plus a frozen sinusoidal positional-embedding table.
    self.word_emb = nn.Embedding(vocab_size, d_model, padding_idx=padding_idx)
    self.pos_emb = nn.Embedding.from_pretrained(sinusoid_encoding_table(max_len + 1, d_model, 0),
                                                freeze=True)
    # Stack of N_dec meshed decoder layers.
    self.layers = ModuleList(
        [MeshedDecoderLayer(d_model, d_k, d_v, h, d_ff, dropout,
                            self_att_module=self_att_module,
                            enc_att_module=enc_att_module,
                            self_att_module_kwargs=self_att_module_kwargs,
                            enc_att_module_kwargs=enc_att_module_kwargs)
         for _ in range(N_dec)])
    # Output projection from d_model to the vocabulary (no bias).
    self.fc = nn.Linear(d_model, vocab_size, bias=False)
    self.max_len = max_len
    self.padding_idx = padding_idx
    self.N = N_dec

    # Persistent buffers used during incremental (step-by-step) decoding:
    # the running causal self-attention mask and the running sequence of time steps.
    self.register_state('running_mask_self_attention', torch.zeros((1, 1, 0)).byte())
    self.register_state('running_seq', torch.zeros((1,)).long())
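
# Sketch (an assumption, not the repo's own code) of the `sinusoid_encoding_table`
# helper used above: the standard fixed sin/cos positional encoding from
# "Attention Is All You Need", with the row at `padding_idx` zeroed so padded
# positions receive a null positional embedding. The signature and zeroing behaviour
# are inferred from the call `sinusoid_encoding_table(max_len + 1, d_model, 0)`.
import numpy as np
import torch

def sinusoid_encoding_table_sketch(n_position, d_model, padding_idx=None):
    # angle(pos, i) = pos / 10000 ** (2 * (i // 2) / d_model)
    pos = np.arange(n_position)[:, None].astype(np.float64)
    dim = np.arange(d_model)[None, :]
    angles = pos / np.power(10000, 2 * (dim // 2) / d_model)
    table = np.zeros((n_position, d_model))
    table[:, 0::2] = np.sin(angles[:, 0::2])    # sine on even dimensions
    table[:, 1::2] = np.cos(angles[:, 1::2])    # cosine on odd dimensions
    if padding_idx is not None:
        table[padding_idx] = 0.                 # padded positions get a zero vector
    return torch.FloatTensor(table)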