def __init__(self, config, src_vocab, target_vocab, s_v, t_v, u):
    super(Transformer, self).__init__()
    self.config = config
    h, N, dropout = self.config.h, self.config.N, self.config.dropout
    d_model, d_ff = self.config.d_model, self.config.d_ff
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    # The cross encoder operates on concatenated representations, hence 2 * d_model.
    attncross = MultiHeadedAttention(h, d_model * 2)
    ffcross = PositionwiseFeedForward(d_model * 2, d_ff, dropout)
    positioncross = PositionalEncoding(d_model * 2, dropout)  # built but not used below
    self.encoder = Encoder(
        EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff), dropout), N)
    self.encoder_cross = EncoderCross(
        EncoderLayerCross(config.d_model * 2, deepcopy(attncross),
                          deepcopy(ffcross), dropout), N)
    self.src_embed = nn.Sequential(
        Embeddings(config.d_model, src_vocab, s_v, u),
        deepcopy(position))  # Embeddings followed by PE
    # self.src_embed.weight.data.copy_(src_vocab.vectors)
    self.target_embed = nn.Sequential(
        Embeddings(config.d_model, target_vocab, t_v, u),
        deepcopy(position))
    # self.target_embed.weight.data.copy_(target_vocab.vectors)
    # Fully-Connected Layer
    self.fc = nn.Linear(self.config.d_model, self.config.output_size)
    self.sigmoid = nn.Sigmoid()
    self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    self.softmax = nn.Softmax()
def __init__(self, config, src_vocab):
    super(Transformer, self).__init__()
    self.config = config
    self.src_vocab = src_vocab
    # Hyperparameters:
    # h is the number of attention heads, N the number of layers, dropout the drop rate.
    h, N, dropout = self.config.h, self.config.N, self.config.dropout
    # Word-embedding dimension and feed-forward hidden dimension.
    d_model, d_ff = self.config.d_model, self.config.d_ff
    # Multi-head attention layer.
    attn = MultiHeadedAttention(h, d_model)
    # Position-wise feed-forward layer.
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    # Positional encoding.
    position = PositionalEncoding(d_model, dropout)
    self.encoder = Encoder(
        EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff), dropout), N)
    self.src_embed = nn.Sequential(
        Embedding(self.config.d_model, self.src_vocab),
        deepcopy(position))  # embedding with position encoding
    self.fc = nn.Linear(self.config.d_model, self.config.output_size)
    # Explicit dim avoids the deprecated implicit-dim behavior of nn.Softmax().
    self.softmax = nn.Softmax(dim=-1)
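# A minimal instantiation sketch for the classifier-style constructors above.
# The Config fields below (h, N, dropout, d_model, d_ff, output_size) mirror the
# attributes these __init__ methods read; the concrete values, and passing the
# vocabulary size as a plain int, are illustrative assumptions.
from types import SimpleNamespace

config = SimpleNamespace(h=8, N=6, dropout=0.1, d_model=512, d_ff=2048, output_size=2)
model = Transformer(config, src_vocab=10000)  # vocab argument is a placeholder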
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
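# Usage sketch for make_model, in the style of "The Annotated Transformer":
# a small model for a toy copy task. The vocabulary sizes and N=2 are illustrative.
tmp_model = make_model(src_vocab=11, tgt_vocab=11, N=2)
print(sum(p.numel() for p in tmp_model.parameters()))  # trainable parameter count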
def construct_model():
    x_input = tf.keras.layers.Input((28, 28, 1))
    enc_x = EncoderLayer()(x_input)
    quant_x = VQVAE()(enc_x)
    # Straight-through estimator: the forward pass uses the quantized codes,
    # while gradients flow back to the encoder as if quantization were identity.
    # (The lambda parameter is renamed to `q` so it no longer shadows quant_x.)
    x_dec = tf.keras.layers.Lambda(
        lambda q: enc_x + tf.stop_gradient(q - enc_x))(quant_x)
    dec_x = DecoderLayer()(x_dec)
    model = tf.keras.models.Model(x_input, dec_x)
    # 0.25 is the commitment-cost weight (beta) from the VQ-VAE paper.
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=vqvae_loss(0.25, enc_x, quant_x),
                  experimental_run_tf_function=False)
    return model
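# Usage sketch for construct_model on MNIST-shaped inputs. The random data is a
# stand-in (an assumption, not from the original); EncoderLayer, VQVAE,
# DecoderLayer and vqvae_loss are the user-defined components referenced above.
import numpy as np

x_train = np.random.rand(64, 28, 28, 1).astype('float32')
vq_model = construct_model()
vq_model.fit(x_train, x_train, epochs=1, batch_size=32)  # autoencoder: target == input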
def __init__(self, config, src_vocab):
    super(Transformer, self).__init__()
    self.config = config
    h, N, dropout = self.config.h, self.config.N, self.config.dropout
    d_model, d_ff = self.config.d_model, self.config.d_ff
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.encoder = Encoder(
        EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff), dropout), N)
    # Note: no positional encoding is applied to the embeddings in this variant.
    self.src_embed = nn.Sequential(Embeddings(config.d_model, src_vocab))
    self.fc = nn.Linear(self.config.d_model, self.config.output_size)
    self.softmax = nn.Softmax()
def __init__(self, config, src_vocab):
    super(Matposer, self).__init__()
    self.config = config
    d_row, N, dropout = self.config.d_row, self.config.N, self.config.dropout
    d_model, d_ff = self.config.d_model, self.config.d_ff
    inter = Interactor(d_model, d_ff, out_row=d_row, dropout=dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    self.encoder = Encoder(
        EncoderLayer(d_model, deepcopy(inter), deepcopy(ff), dropout), N)
    self.src_embed = nn.Sequential(Embeddings(d_model, src_vocab),
                                   deepcopy(position))
    self.fc = nn.Linear(d_model, self.config.output_size)
    self.softmax = nn.Softmax()
def __init__(self, config):
    super(Transformer, self).__init__()
    self.config = config
    h, N, dropout = self.config.h, self.config.N, self.config.dropout
    d_model, d_ff = self.config.d_model, self.config.d_ff
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)  # unused while src_embed is commented out
    self.encoder = Encoder(
        EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff), dropout), N)
    # self.src_embed = nn.Sequential(Embeddings(config.d_model, src_vocab),
    #                                deepcopy(position))  # Embeddings followed by PE
    # Fully-Connected Layer
    self.fc = nn.Linear(self.config.d_model, self.config.output_size)
def __init__(self, config, pre_train_weight, embedding_size):
    super(Transformer, self).__init__()
    self.config = config
    self.pre_train_weight = pre_train_weight
    self.embedding_size = embedding_size
    # Hyperparameters:
    # h is the number of attention heads, N the number of layers, dropout the drop rate.
    h, N, dropout = self.config.h, self.config.N, self.config.dropout
    # Word-embedding dimension and feed-forward hidden dimension.
    d_model, d_ff = self.config.d_model, self.config.d_ff
    # Multi-head attention layer.
    attn = MultiHeadedAttention(h, d_model)
    # Position-wise feed-forward layer.
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    # Positional encoding.
    position = PositionalEncoding(d_model, dropout)
    self.encoder = Encoder(
        EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff), dropout), N)
    self.src_embed = nn.Sequential(
        Embedding(self.config.d_model, self.pre_train_weight, self.embedding_size),
        deepcopy(position))  # embedding with position encoding
def __init__(self, config, src_vocab):
    super(Transformer, self).__init__()
    self.config = config
    h, N, dropout = self.config.h, self.config.N, self.config.dropout
    d_model, d_ff = self.config.d_model, self.config.d_ff
    self.src_vocab = src_vocab
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.encoder_layer = EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff), dropout)
    self.encoder = Encoder(self.encoder_layer, N)
    self.src_word_emb = nn.Embedding(src_vocab, config.d_model, padding_idx=0)
    # self.pos_bias = nn.Embedding(src_vocab, config.d_model, padding_idx=0)
    # self.pos_bias = nn.Embedding.from_pretrained(
    #     get_sinusoid_encoding_table_dim(src_vocab, config.d_model, padding_idx=0), freeze=True)
    # self.pos_bias = nn.Embedding.from_pretrained(
    #     get_sinusoid_encoding_table_vocab(src_vocab, config.d_model, padding_idx=0), freeze=True)
    # self.pos_bias = nn.Embedding(1, config.d_model, padding_idx=0)
    # self.pos_bias = nn.Embedding(src_vocab, 1, padding_idx=0)
    # self.position_enc = nn.Embedding(src_vocab, config.d_model, padding_idx=0)
    self.position_enc = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(src_vocab, config.d_model, padding_idx=0),
        freeze=False)
    # position_enc = torch.randn(1000, config.d_model)
    # position_enc = position_enc.unsqueeze(0)
    # self.register_buffer('position_enc', position_enc)
    self.drop = nn.Dropout(p=dropout)
    self.fc = nn.Linear(self.config.d_model, self.config.output_size)
    self.softmax = nn.Softmax()
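# get_sinusoid_encoding_table is user-defined elsewhere in this codebase; the
# sketch below is an assumption, following the standard "Attention Is All You
# Need" sinusoid formula with a zeroed padding row, as in common PyTorch ports.
import numpy as np
import torch

def get_sinusoid_encoding_table(n_position, d_model, padding_idx=None):
    angles = np.array([
        [pos / np.power(10000, 2 * (i // 2) / d_model) for i in range(d_model)]
        for pos in range(n_position)])
    angles[:, 0::2] = np.sin(angles[:, 0::2])  # even dimensions: sine
    angles[:, 1::2] = np.cos(angles[:, 1::2])  # odd dimensions: cosine
    if padding_idx is not None:
        angles[padding_idx] = 0.0  # padding positions get a zero vector
    return torch.FloatTensor(angles)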
def __init__(self, enc_in, dec_in, c_out, out_len,
             factor=5, d_model=512, n_heads=8, e_layers=3, d_layers=2, d_ff=512,
             group_factors=None, group_operator='avg', group_step=1,
             dropout=0.0, attn='prob', embed='fixed', activation='gelu',
             has_minute=False, has_hour=True):
    super(HLInformer, self).__init__()
    self.pred_len = out_len
    self.attn = attn
    if group_factors is None:
        group_factors = [4, 1]
    else:
        group_factors = [*group_factors, 1]
    self.group_factors = group_factors
    # Grouping
    self.group_layers = nn.ModuleList(
        [GroupLayer(gf, group_operator, group_step) for gf in group_factors])
    # Encoding
    self.enc_embeddings = nn.ModuleList(
        [InformerDataEmbedding(enc_in, d_model, has_minute=has_minute, has_hour=has_hour)
         for _ in group_factors])
    self.dec_embeddings = nn.ModuleList(
        [InformerDataEmbedding(dec_in, d_model, has_minute=has_minute, has_hour=has_hour)
         for _ in group_factors])
    # Attention
    Attn = ProbAttention if attn == 'prob' else FullAttention
    # Encoder
    self.encoders = nn.ModuleList([Encoder(
        [
            EncoderLayer(
                AttentionLayer(Attn(False, factor, attention_dropout=dropout),
                               d_model, n_heads),
                d_model, d_ff, dropout=dropout, activation=activation
            ) for l in range(e_layers)
        ],
        [
            ConvLayer(d_model) for l in range(e_layers - 1)
        ],
        norm_layer=torch.nn.LayerNorm(d_model)
    ) for _ in group_factors])
    # Decoder
    self.decoders = nn.ModuleList([Decoder(
        [
            DecoderLayer(
                AttentionLayer(FullAttention(True, factor, attention_dropout=dropout),
                               d_model, n_heads),
                AttentionLayer(FullAttention(False, factor, attention_dropout=dropout),
                               d_model, n_heads),
                d_model, d_ff, dropout=dropout, activation=activation,
            ) for l in range(d_layers)
        ],
        norm_layer=torch.nn.LayerNorm(d_model)
    ) for _ in group_factors])
    # self.end_conv1 = nn.Conv1d(in_channels=label_len+out_len, out_channels=out_len, kernel_size=1, bias=True)
    # self.end_conv2 = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=1, bias=True)
    self.projections = nn.ModuleList(
        [nn.Linear(d_model * (i + 1), c_out, bias=True)
         for i, gf in enumerate(group_factors)])
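# Instantiation sketch for HLInformer. The feature counts, horizon, and group
# factor below are illustrative assumptions (e.g., a 7-variate series forecast
# 24 steps ahead); group_factors=[8] becomes [8, 1] inside the constructor.
hl_model = HLInformer(enc_in=7, dec_in=7, c_out=7, out_len=24,
                      group_factors=[8], group_operator='avg')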
def __init__(self, enc_in, dec_in, c_out, seq_len, label_len, out_len, batch_size,
             factor=5, d_model=512, n_heads=8, e_layers=3, d_layers=2, d_ff=512,
             dropout=0.0, attn='prob', embed='fixed', data='ETTh', activation='gelu'):
    super(Informer, self).__init__()
    self.pred_len = out_len
    self.attn = attn
    self.seq_len = seq_len
    self.label_len = label_len
    self.batch_size = batch_size
    # Encoding
    self.enc_embedding = DataEmbedding(enc_in, d_model, embed, data, dropout)
    self.dec_embedding = DataEmbedding(dec_in, d_model, embed, data, dropout)
    # Attention
    Attn = ProbAttention if attn == 'prob' else FullAttention
    # Encoder
    self.encoder = Encoder(
        [
            EncoderLayer(
                AttentionLayer(Attn(False, factor, attention_dropout=dropout),
                               d_model, n_heads),
                d_model, d_ff, dropout=dropout, activation=activation
            ) for l in range(e_layers)
        ],
        [ConvLayer(d_model) for l in range(e_layers - 1)],
        norm_layer=tf.keras.layers.LayerNormalization())
    # Decoder
    self.decoder = Decoder(
        [
            DecoderLayer(
                AttentionLayer(FullAttention(True, factor, attention_dropout=dropout),
                               d_model, n_heads),
                AttentionLayer(FullAttention(False, factor, attention_dropout=dropout),
                               d_model, n_heads),
                d_model, d_ff, dropout=dropout, activation=activation,
            ) for l in range(d_layers)
        ],
        norm_layer=tf.keras.layers.LayerNormalization())
    # self.end_conv1 = nn.Conv1d(in_channels=label_len+out_len, out_channels=out_len, kernel_size=1, bias=True)
    # self.end_conv2 = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=1, bias=True)
    self.projection = tf.keras.layers.Dense(c_out)
def main():
    train_data = SentenceDataset(args.train_file,
                                 encoding_type=args.encoding_type,
                                 filter_threshold=args.filter_threshold)
    val_data = SentenceDataset(args.val_file,
                               encoding_type=args.encoding_type,
                               filter_threshold=args.filter_threshold)
    train_loader = torch.utils.data.DataLoader(train_data, args.batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_data, args.batch_size)
    print(len(train_loader))

    input_dim = len(train_data.vocab.source_vocab)
    output_dim = len(train_data.vocab.target_vocab)
    static = args.embedding_type == 'static'

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    enc_embedding = Embeddings(input_dim, args.hidden_dim, args.max_len, device, static)
    encoder_layer = EncoderLayer(args.hidden_dim, args.num_enc_heads, args.inner_dim, args.dropout)
    encoder = Encoder(enc_embedding, encoder_layer, args.num_enc_layers, args.dropout)

    dec_embedding = Embeddings(input_dim, args.hidden_dim, args.max_len, device, static)
    decoder_layer = DecoderLayer(args.hidden_dim, args.num_dec_heads, args.inner_dim, args.dropout)
    decoder = Decoder(output_dim, args.hidden_dim, dec_embedding, decoder_layer,
                      args.num_dec_layers, args.dropout)

    pad_id = train_data.vocab.source_vocab['<pad>']
    model = Transformer(encoder, decoder, pad_id, device)
    print('Transformer has {:,} trainable parameters'.format(count_parames(model)))

    if args.load_model is not None:
        model.load(args.load_model)
    else:
        model.apply(init_weights)

    if args.mode == 'test':
        inferencer = Inferencer(model, train_data.vocab, device)
        # Intentionally misspelled input: this model is a typo corrector.
        greedy_out = inferencer.infer_greedy('helo world, I m testin a typo corector')
        print(greedy_out)
    elif args.mode == 'train':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        loss_function = nn.NLLLoss(ignore_index=pad_id)
        print('Started training...')
        train(model, train_loader, val_loader, optimizer, loss_function, device)
    else:
        raise ValueError('Mode not recognized')
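# Entry-point guard; assumes `args` is produced by an argparse parser at module
# scope (not shown in this snippet).
if __name__ == '__main__':
    main()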