def __init__(self, dec_inp_size, dec_out_size, d_latent, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1, device='cpu', d_map_latent=8):
    """Build the latent-conditioned trajectory decoder.

    The target embedding is sized d_model - d_map_latent so that the
    map-encoder features concatenated in forward() bring the total
    width back up to d_model.
    """
    super(DecoderY, self).__init__()
    self.dec_out_size = dec_out_size
    self.d_model = d_model
    self.device = device
    embed_dim = d_model - d_map_latent
    self.trg_embed = nn.Sequential(
        LinearEmbedding(dec_inp_size, embed_dim),
        PositionalEncoding(embed_dim, dropout))
    # One prototype layer, stacked N times by Decoder.
    self_attn = MultiHeadAttention(h, d_model)
    src_attn = MultiHeadAttention(h, d_model)
    feed_forward = ConcatPointerwiseFeedforward(d_model, d_latent, d_ff, dropout)
    self.decoder = Decoder(
        DecoderLayer(d_model, self_attn, src_attn, feed_forward, dropout), N)
    # Head emits mean and log-variance, hence twice the output size.
    self.fc = nn.Linear(d_model, dec_out_size * 2)
    self.init_weights(self.decoder.parameters())
    self.init_weights(self.fc.parameters())
    self.map_encoder = load_map_encoder(device)
def __init__(self, src_len, tgt_len, enc_inp_size, dec_inp_size, dec_out_size, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1, device='cpu'):
    """Assemble the encoder-decoder trajectory generator."""
    super(Generator, self).__init__()
    self.device = device
    self.src_len = src_len
    self.tgt_len = tgt_len
    self.dec_inp_size = dec_inp_size
    # Prototype sub-layers; every layer below receives its own deep copy.
    attn_proto = MultiHeadAttention(h, d_model)
    ff_proto = PointerwiseFeedforward(d_model, d_ff, dropout)
    pos_proto = PositionalEncoding(d_model, dropout)
    clone = copy.deepcopy
    encoder = Encoder(EncoderLayer(d_model, clone(attn_proto), clone(ff_proto), dropout), N)
    decoder = Decoder(DecoderLayer(d_model, clone(attn_proto), clone(attn_proto), clone(ff_proto), dropout), N)
    src_embed = nn.Sequential(LinearEmbedding(enc_inp_size, d_model), clone(pos_proto))
    tgt_embed = nn.Sequential(LinearEmbedding(dec_inp_size, d_model), clone(pos_proto))
    self.generator = EncoderDecoder(encoder, decoder, src_embed, tgt_embed,
                                    TFHeadGenerator(d_model, dec_out_size))
    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for param in self.generator.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)
def __init__(self, num_layers=None, d_model=None, num_heads=None, dff=None, input_vocab_size=None, target_vocab_size=None, model_dir=None, pe_input=None, pe_target=None, rate=0.1, decoder=None, final_layer=None, args=None):
    """Transformer with a pretrained BERT encoder.

    A custom `decoder` / `final_layer` may be injected; otherwise a fresh
    Decoder and Dense projection over the target vocabulary are built.
    """
    super(TransformerBert, self).__init__()
    self.encoder = BertEncoder(model_dir=model_dir, d_model=d_model, args=args)
    # Fall back to freshly built components when none are injected.
    self.decoder = decoder if decoder else Decoder(
        num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
    self.final_layer = final_layer if final_layer else tf.keras.layers.Dense(target_vocab_size)
def create_model(cls, args):
    """Instantiate the CIF speech model (conv frontend, encoder, assigner, decoder)."""
    from transformer.conv_encoder import Conv2dSubsample
    from transformer.encoder import Encoder
    from transformer.attentionAssigner import Attention_Assigner
    from transformer.decoder import Decoder_CIF as Decoder

    # Convolutional front-end subsamples the (possibly LFR-stacked) input frames.
    frontend = Conv2dSubsample(d_input=args.d_input * args.LFR_m,
                               d_model=args.d_model,
                               n_layers=args.n_conv_layers)
    acoustic_encoder = Encoder(d_input=args.d_model,
                               n_layers=args.n_layers_enc,
                               n_head=args.n_head,
                               d_model=args.d_model,
                               d_inner=args.d_inner,
                               dropout=args.dropout)
    fire_assigner = Attention_Assigner(d_input=args.d_model,
                                       d_hidden=args.d_assigner_hidden,
                                       w_context=args.w_context,
                                       n_layers=args.n_assigner_layers)
    token_decoder = Decoder(sos_id=args.sos_id,
                            n_tgt_vocab=args.vocab_size,
                            n_layers=args.n_layers_dec,
                            n_head=args.n_head,
                            d_model=args.d_model,
                            d_inner=args.d_inner,
                            dropout=args.dropout)
    return cls(frontend, acoustic_encoder, fire_assigner, token_decoder, args.spec_aug_cfg)
def __init__(self, enc_inp_size, dec_inp_size, dec_out_size, N=6, d_model=512, d_ff=2048, heads=8, dropout=0.1, mean=(0, 0), std=(0, 0)):
    """Helper: Construct a model from hyperparameters.

    Fixes vs. the original:
    * ``mean``/``std`` defaults were mutable lists (one shared object across
      all calls); immutable tuples are equivalent for callers and for
      ``np.array()``.
    * The summary line above was a stray string expression placed after
      ``super().__init__()``; it is now the real docstring.

    :param enc_inp_size: encoder input feature size
    :param dec_inp_size: decoder input feature size
    :param dec_out_size: output feature size of the generator head
    :param mean: normalisation mean stored on the module as ``self.mean``
    :param std: normalisation std stored on the module as ``self.std``
    """
    super(IndividualTF, self).__init__()
    c = copy.deepcopy
    # Prototype sub-layers; each transformer layer gets its own deep copy.
    attn = MultiHeadAttention(heads, d_model)
    ff = PointerwiseFeedforward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    self.mean = np.array(mean)
    self.std = np.array(std)
    self.model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(LinearEmbedding(enc_inp_size, d_model), c(position)),
        nn.Sequential(LinearEmbedding(dec_inp_size, d_model), c(position)),
        Generator(d_model, dec_out_size))
    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in self.model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
def create_model(cls, args):
    """Factory: build the conv-frontend speech Transformer from parsed args."""
    from transformer.decoder import Decoder
    from transformer.encoder import Encoder
    from transformer.conv_encoder import Conv2dSubsample

    # 2-D convolutional subsampler over (optionally LFR-stacked) features.
    frontend = Conv2dSubsample(d_input=args.d_input * args.LFR_m,
                               d_model=args.d_model,
                               n_layers=args.n_conv_layers)
    enc = Encoder(d_input=args.d_model, n_layers=args.n_layers_enc,
                  n_head=args.n_head, d_model=args.d_model,
                  d_inner=args.d_inner, dropout=args.dropout)
    dec = Decoder(sos_id=args.sos_id, eos_id=args.eos_id,
                  n_tgt_vocab=args.vocab_size, n_layers=args.n_layers_dec,
                  n_head=args.n_head, d_model=args.d_model,
                  d_inner=args.d_inner, dropout=args.dropout)
    return cls(frontend, enc, dec, spec_aug_cfg=args.spec_aug_cfg)
def make_model(
    src_vocab: int,
    tgt_vocab: int,
    n: int = 6,
    d_model: int = 512,
    d_ff: int = 2048,
    h: int = 8,
    dropout: float = 0.1,
    device: torch.device = torch.device("cpu"),
) -> EncoderDecoder:
    """Helper: Construct a model from hyperparameters."""
    clone = copy.deepcopy
    # Prototype sub-modules; every layer receives its own deep copy.
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)

    encoder = Encoder(EncoderLayer(d_model, clone(attn), clone(ff), dropout), n)
    decoder = Decoder(DecoderLayer(d_model, clone(attn), clone(attn), clone(ff), dropout), n)
    src_embed = nn.Sequential(Embeddings(d_model, src_vocab), clone(position))
    tgt_embed = nn.Sequential(Embeddings(d_model, tgt_vocab), clone(position))
    model = EncoderDecoder(encoder, decoder, src_embed, tgt_embed,
                           Generator(d_model, tgt_vocab)).to(device)

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
def test_forward(self):
    """Decoder forward pass returns a finite tensor shaped like its input."""
    # Configuration
    n_batch, seq_len = 64, 10
    d_k = d_v = d_model = feat_dim = 512
    d_ff = 2048
    n_layers = 6

    # Build an N-layer decoder from one prototype layer.
    proto_layer = DecoderLayer(
        size=feat_dim,
        self_attn=MultiHeadAttention(n_head=8, d_model=d_model, d_k=d_k, d_v=d_v, dropout=0.1),
        memory_attn=MultiHeadAttention(n_head=8, d_model=d_model, d_k=d_k, d_v=d_v, dropout=0.1),
        feed_forward=PositionwiseFeedForward(d_model=d_model, d_ff=d_ff, dropout=0.1),
        dropout=0.1)
    decoder = Decoder(layer=proto_layer, N=n_layers)

    # Dummy input and (identical) memory tensors.
    x = torch.ones((n_batch, seq_len, feat_dim))
    memory = torch.ones((n_batch, seq_len, feat_dim))
    # Causal mask: position i may only attend to positions <= i.
    decoder_mask = subsequent_mask(seq_len)

    out = decoder.forward(x, memory, decoder_mask, None)

    # Unit tests: type, shape preservation, and no NaNs.
    self.assertIsInstance(out, torch.Tensor)
    self.assertEqual(out.shape, x.shape)
    self.assertEqual(out.shape, memory.shape)
    self.assertEqual(x.shape, memory.shape)
    self.assertEqual(torch.isnan(out).sum(), 0)
class DecoderY(nn.Module):
    """Transformer decoder head that predicts a per-step Gaussian (mu, logvar),
    conditioned on encoder output, a latent vector, and map-encoder features."""

    def __init__(self, dec_inp_size, dec_out_size, d_latent, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1, device='cpu', d_map_latent=8):
        """dec_inp_size/dec_out_size: per-step input/output feature sizes;
        d_latent: latent width consumed by the concat feed-forward block;
        d_map_latent: width of map features appended to the embedding."""
        super(DecoderY, self).__init__()
        self.dec_out_size = dec_out_size
        self.d_model = d_model
        self.device = device
        # Embedding is d_model - d_map_latent wide so that, once the map
        # features are concatenated in forward(), the total width is d_model.
        self.trg_embed = nn.Sequential(
            LinearEmbedding(dec_inp_size, d_model - d_map_latent),
            PositionalEncoding(d_model - d_map_latent, dropout))
        self.decoder = Decoder(
            DecoderLayer(
                d_model,
                MultiHeadAttention(h, d_model),  # self-attention
                MultiHeadAttention(h, d_model),  # encoder-decoder attention
                ConcatPointerwiseFeedforward(d_model, d_latent, d_ff, dropout),
                dropout),
            N)
        # Single head emitting both mean and log-variance (2x output size).
        self.fc = nn.Linear(d_model, dec_out_size * 2)
        self.init_weights(self.decoder.parameters())
        self.init_weights(self.fc.parameters())
        self.map_encoder = load_map_encoder(device)

    def init_weights(self, params):
        """Xavier-initialise every parameter with more than one dimension."""
        for p in params:
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, enc_out, latents, trg, src_mask, trg_mask, map):
        """Return (mu, logvar), each sliced to dec_out_size channels.

        Assumes trg[..., :2] are 2-D positions used to query the map encoder
        and that `map` holds one image crop per (batch, step) — TODO confirm
        against the caller.
        """
        map = map.to(self.device)
        # Flatten (batch, seq) so the map encoder sees one crop per position.
        map_feat = self.map_encoder(trg[:, :, :2].reshape(-1, 2),
                                    map.reshape(-1, map.shape[2], map.shape[3], map.shape[4]),
                                    train=False)
        map_feat = map_feat.reshape((-1, trg.shape[1], map_feat.shape[-1]))
        # Concatenate map features back up to d_model channels.
        trg_emb = torch.cat((self.trg_embed(trg), map_feat), dim=-1)
        dec_out = self.decoder(trg_emb, enc_out, latents.unsqueeze(1), src_mask, trg_mask)  # bs, 12, 512
        stats = self.fc(dec_out)  # bs, 12, out*2
        # First half of the channels is the mean, second half the log-variance.
        mu = stats[:, :, :self.dec_out_size]
        logvar = stats[:, :, self.dec_out_size:]
        return mu, logvar
def __init__(self, *args, embedding_rank=None, inner_rank=None, ffward_rank=None, **kwargs):
    """Build the factorised Transformer NMT model.

    Any rank argument left as None falls back to the corresponding value in
    transformer_config.  *args / **kwargs are accepted but not used here.
    """
    # Run super constructor from NMTModel, but don't run NMTModel.__init__
    super(NMTModel, self).__init__()
    # NOTE(review): eager pickle load; the file handle is never closed explicitly.
    self.vocab = pickle.load(open(paths.vocab, 'rb'))
    if embedding_rank is None:
        embedding_rank = transformer_config.embedding_rank
    if inner_rank is None:
        inner_rank = transformer_config.inner_rank
    if ffward_rank is None:
        ffward_rank = transformer_config.ffward_rank
    # Debug output: which factorisations and ranks are actually in effect.
    print(transformer_config.embedding_factorization,
          transformer_config.inner_factorization,
          transformer_config.ffward_factorization)
    print(embedding_rank, inner_rank, ffward_rank)
    self.encoder = Encoder(len(self.vocab.src), embedding_rank, inner_rank, ffward_rank)
    self.decoder = Decoder(len(self.vocab.tgt), embedding_rank, inner_rank, ffward_rank)
    self.gpu = False
    self.initialize()
    # Noam learning-rate schedule (4000 warmup steps) wrapped around Adam.
    self.optimizer = NoamOpt(transformer_config.layer_dimension, train_config.lr, 4000,
                             Adam(
                                 self.parameters(),
                                 lr=0,
                                 betas=(0.9, 0.98),
                                 eps=1e-9,
                             ),
                             beginning_step=0)
    # Gradient-accumulation bookkeeping.
    self.num_accumulations = 0
    self.accumulate = max(1, train_config.accumulate)
def __init__(self, vocabulary_size_in, vocabulary_size_out, constants, hyperparams):
    """Encoder-decoder Transformer with optional three-way weight tying."""
    super(Transformer, self).__init__()
    self.constants = constants
    self.max_seq = hyperparams.MAX_SEQ
    d_model = hyperparams.D_MODEL
    self.EmbeddingSrc = Embedding(vocabulary_size=vocabulary_size_in,
                                  d_model=d_model,
                                  constants=constants)
    self.EmbeddingTgt = Embedding(vocabulary_size=vocabulary_size_out,
                                  d_model=d_model,
                                  constants=constants)
    self.Encoder = Encoder(nb_layers=hyperparams.NB_LAYERS,
                           nb_heads=hyperparams.NB_HEADS,
                           d_model=d_model,
                           nb_neurons=hyperparams.NB_NEURONS,
                           dropout=hyperparams.DROPOUT)
    self.Decoder = Decoder(nb_layers=hyperparams.NB_LAYERS,
                           nb_heads=hyperparams.NB_HEADS,
                           d_model=d_model,
                           nb_neurons=hyperparams.NB_NEURONS,
                           dropout=hyperparams.DROPOUT)
    self.Linear = nn.Linear(d_model, vocabulary_size_out, bias=False)
    if hyperparams.SHARE_WEIGHTS:
        # Tie both embedding tables to the output projection matrix.
        self.EmbeddingSrc.lookup_table.weight = self.Linear.weight
        self.EmbeddingTgt.lookup_table.weight = self.Linear.weight
def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
    """Standard Transformer: encoder, decoder, and a vocabulary projection head."""
    super(Transformer, self).__init__()
    # Encoder over the input vocabulary with its own positional-encoding length.
    self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                           input_vocab_size, pe_input, rate)
    # Decoder over the target vocabulary.
    self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                           target_vocab_size, pe_target, rate)
    # Projects decoder output onto target-vocabulary logits.
    self.final_layer = tf.keras.layers.Dense(target_vocab_size)
def __init__(self, n_src_vocab, n_trg_vocab, src_pad_idx, trg_pad_idx, d_word_vec=256, d_model=256, d_inner=512, n_layer=3, n_head=8, dropout=0.1, n_position=200):
    """Sequence-to-sequence Transformer with Xavier-initialised weights."""
    super(Transformer, self).__init__()
    self.src_pad_idx = src_pad_idx
    self.trg_pad_idx = trg_pad_idx
    # Source-side encoder (max_seq_len hard-wired to 32 here).
    self.encoder = Encoder(n_src_vocab=n_src_vocab, d_word_vec=d_word_vec,
                           d_model=d_model, d_inner=d_inner,
                           n_layer=n_layer, n_head=n_head,
                           pad_idx=src_pad_idx, dropout=dropout,
                           n_position=n_position, max_seq_len=32)
    # Target-side decoder.
    self.decoder = Decoder(n_trg_vocab=n_trg_vocab, d_word_vec=d_word_vec,
                           d_model=d_model, d_inner=d_inner,
                           n_layer=n_layer, n_head=n_head,
                           pad_idx=trg_pad_idx, n_position=n_position,
                           dropout=dropout)
    # Bias-free projection of decoder states onto the target vocabulary.
    self.trg_word_prj = nn.Linear(d_model, n_trg_vocab, bias=False)
    # Glorot initialisation for every weight matrix.
    for weight in self.parameters():
        if weight.dim() > 1:
            nn.init.xavier_uniform_(weight)
def create_model(cls, args):
    """Factory: build the Transformer ASR model from parsed args.

    Bug fix: the original ended with ``model = cls.create_model(encoder,
    decoder)`` — a re-entrant call into this very factory with the wrong
    arguments, which raises a TypeError (and is conceptually infinite
    recursion).  The sibling factories in this file construct the model
    with ``cls(...)``, so we do the same here.
    """
    from transformer.decoder import Decoder
    from transformer.encoder import Encoder
    encoder = Encoder(d_input=args.d_input * args.LFR_m,
                      n_layers=args.n_layers_enc,
                      n_head=args.n_head,
                      d_model=args.d_model,
                      d_inner=args.d_inner,
                      dropout=args.dropout)
    decoder = Decoder(sos_id=args.sos_id,
                      eos_id=args.eos_id,
                      n_tgt_vocab=args.vocab_size,
                      n_layers=args.n_layers_dec,
                      n_head=args.n_head,
                      d_model=args.d_model,
                      d_inner=args.d_inner,
                      dropout=args.dropout)
    model = cls(encoder, decoder)
    return model
def __init__(self, src_vocab_size, tgt_vocab_size, device, d_model=512, p_dropout=0.1):
    """Transformer with tied target-embedding / output-projection weights."""
    super(Transformer, self).__init__()
    self.d_model = d_model
    # Source pipeline: token embedding followed by positional encoding.
    self.src_embedding = nn.Embedding(src_vocab_size, d_model)
    self.positional_encoder1 = PositionalEncoder(device,
                                                 d_model=d_model,
                                                 p_dropout=p_dropout)
    # Target pipeline mirrors the source one.
    self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
    self.positional_encoder2 = PositionalEncoder(device,
                                                 d_model=d_model,
                                                 p_dropout=p_dropout)
    # Six-layer encoder and decoder stacks.
    self.encoder = Encoder(6, d_model)
    self.decoder = Decoder(6, d_model)
    # Output head: linear projection followed by log-softmax over the vocab.
    self.linear = nn.Linear(d_model, tgt_vocab_size)
    self.softmax = nn.LogSoftmax(dim=-1)
    # Share weights between the target embedding and the output projection.
    self.linear.weight = self.tgt_embedding.weight
def train_net(args):
    """Full training loop for the seq2seq Transformer translation model.

    Builds (or restores from checkpoint) the encoder/decoder model and the
    warmup optimizer, then alternates one epoch of training with one epoch
    of validation, checkpointing after every epoch.
    """
    # Fixed seeds for reproducible runs.
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_loss = float('inf')
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        # model
        encoder = Encoder(n_src_vocab, args.n_layers_enc, args.n_head,
                          args.d_k, args.d_v, args.d_model, args.d_inner,
                          dropout=args.dropout, pe_maxlen=args.pe_maxlen)
        decoder = Decoder(
            sos_id, eos_id, n_tgt_vocab, args.d_word_vec,
            args.n_layers_dec, args.n_head, args.d_k, args.d_v,
            args.d_model, args.d_inner,
            dropout=args.dropout,
            tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
            pe_maxlen=args.pe_maxlen)
        model = Transformer(encoder, decoder)
        # print(model)
        # model = nn.DataParallel(model)

        # optimizer: Adam wrapped in the Noam-style warmup scheduler.
        optimizer = TransformerOptimizer(
            torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09))
    else:
        # Resume model object and optimizer state wholesale from the checkpoint.
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    # Move to GPU, if available
    model = model.to(device)

    # Custom dataloaders; pad_collate pads variable-length sequences per batch.
    train_dataset = AiChallenger2017Dataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               collate_fn=pad_collate,
                                               shuffle=True,
                                               num_workers=args.num_workers)
    valid_dataset = AiChallenger2017Dataset('valid')
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=args.batch_size,
                                               collate_fn=pad_collate,
                                               shuffle=False,
                                               num_workers=args.num_workers)

    # Epochs
    for epoch in range(start_epoch, args.epochs):
        # One epoch's training
        train_loss = train(train_loader=train_loader, model=model,
                           optimizer=optimizer, epoch=epoch,
                           logger=logger, writer=writer)
        writer.add_scalar('epoch/train_loss', train_loss, epoch)
        writer.add_scalar('epoch/learning_rate', optimizer.lr, epoch)
        print('\nLearning rate: {}'.format(optimizer.lr))
        print('Step num: {}\n'.format(optimizer.step_num))

        # One epoch's validation
        valid_loss = valid(valid_loader=valid_loader, model=model, logger=logger)
        writer.add_scalar('epoch/valid_loss', valid_loss, epoch)

        # Check if there was an improvement
        is_best = valid_loss < best_loss
        best_loss = min(valid_loss, best_loss)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, optimizer,
                        best_loss, is_best)
def main():
    """Entry point: parse CLI args, build the speech Transformer, and train on NSML."""
    # Label-mapping globals rebound below once the label files are loaded.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers', type=int, default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=0.0001,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=WORD_MAXLEN,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument(
        '--word', action='store_true',
        help='Train/Predict model using word based label (default: False)')
    parser.add_argument('--gen_label_index', action='store_true',
                        help='Generate word label index map(default: False)')
    parser.add_argument('--iteration', type=str, help='Iteratiom')
    parser.add_argument('--premodel_session', type=str,
                        help='Session name of premodel')

    # transformer model parameter
    parser.add_argument('--d_model', type=int, default=128,
                        help='transformer_d_model')
    parser.add_argument('--n_head', type=int, default=8,
                        help='transformer_n_head')
    parser.add_argument('--num_encoder_layers', type=int, default=4,
                        help='num_encoder_layers')
    parser.add_argument('--num_decoder_layers', type=int, default=4,
                        help='transformer_num_decoder_layers')
    parser.add_argument('--dim_feedforward', type=int, default=2048,
                        help='transformer_d_model')
    parser.add_argument('--dropout', type=float, default=0.1,
                        help='transformer_dropout')

    # transformer warmup parameter
    parser.add_argument('--warmup_multiplier', type=int, default=3,
                        help='transformer_warmup_multiplier')
    parser.add_argument('--warmup_epoch', type=int, default=10,
                        help='transformer_warmup_epoch')

    args = parser.parse_args()

    # Character-level label map is always loaded; word-level optionally replaces it.
    char_loader = CharLabelLoader()
    char_loader.load_char2index('./hackathon.labels')
    label_loader = char_loader
    if args.word:
        if args.gen_label_index:
            generate_word_label_index_file(char_loader, TRAIN_LABEL_CHAR_PATH)
            from subprocess import call
            call(f'cat {TRAIN_LABEL_CHAR_PATH}', shell=True)
        # Switch to the word/POS label map (original comment was mojibake).
        word_loader = CharLabelLoader()
        word_loader.load_char2index('./hackathon.pos.labels')
        label_loader = word_loader
        if os.path.exists(TRAIN_LABEL_CHAR_PATH):
            generate_word_label_file(char_loader, word_loader,
                                     TRAIN_LABEL_POS_PATH,
                                     TRAIN_LABEL_CHAR_PATH)
    char2index = label_loader.char2index
    index2char = label_loader.index2char
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    ############ model
    print("model: transformer")
    # model = Transformer(d_model= args.d_model, n_head= args.n_head, num_encoder_layers= args.num_encoder_layers, num_decoder_layers= args.num_decoder_layers,
    #                     dim_feedforward= args.dim_feedforward, dropout= args.dropout, vocab_size= len(char2index), sound_maxlen= SOUND_MAXLEN, word_maxlen= WORD_MAXLEN)
    # NOTE(review): the hyper-parameters below are hard-coded and ignore the
    # --d_model/--n_head/... CLI arguments parsed above.
    encoder = Encoder(d_input=128,
                      n_layers=6,
                      n_head=4,
                      d_k=128,
                      d_v=128,
                      d_model=128,
                      d_inner=2048,
                      dropout=0.1,
                      pe_maxlen=SOUND_MAXLEN)
    decoder = Decoder(sos_id=SOS_token,
                      eos_id=EOS_token,
                      n_tgt_vocab=len(char2index),
                      d_word_vec=128,
                      n_layers=6,
                      n_head=4,
                      d_k=128,
                      d_v=128,
                      d_model=128,
                      d_inner=2048,
                      dropout=0.1,
                      tgt_emb_prj_weight_sharing=True,
                      pe_maxlen=SOUND_MAXLEN)
    model = Transformer(encoder, decoder)
    optimizer = TransformerOptimizer(
        torch.optim.Adam(model.parameters(), lr=0.0004,
                         betas=(0.9, 0.98), eps=1e-09))
    ############/

    # Uniform re-initialisation of all parameters (overrides the modules' own init).
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)
    """
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)

    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.max_epochs)
    scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=args.warmup_multiplier, total_epoch=args.warmup_epoch, after_scheduler=scheduler_cosine)


    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)
    """
    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    # target_path = os.path.join(DATASET_PATH, 'train_label')
    target_path = TRAIN_LABEL_CHAR_PATH
    if args.word:
        target_path = TRAIN_LABEL_POS_PATH
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    # Optionally resume weights from a previous NSML session/iteration.
    if args.iteration:
        if args.premodel_session:
            nsml.load(args.iteration, session=args.premodel_session)
            logger.info(f'Load {args.premodel_session} {args.iteration}')
        else:
            nsml.load(args.iteration)
            logger.info(f'Load {args.iteration}')
    logger.info('start')
    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        # learning rate scheduler
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      optimizer, device, train_begin,
                                      args.workers, 10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))
        train_loader.join()
        print("~~~~~~~~~~~~")

        # Evaluate only at epoch 10 and every 10th epoch after 48.
        if epoch == 10 or (epoch > 48 and epoch % 10 == 9):
            valid_queue = queue.Queue(args.workers * 2)
            valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                          args.batch_size, 0)
            valid_loader.start()
            eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                           device, args.max_len, args.batch_size)
            logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                        (epoch, eval_loss, eval_cer))
            valid_loader.join()
            nsml.report(False, step=epoch,
                        train_epoch__loss=train_loss,
                        train_epoch__cer=train_cer,
                        eval__loss=eval_loss,
                        eval__cer=eval_cer)
            best_model = (eval_loss < best_loss)
            nsml.save(args.save_name)
            if best_model:
                nsml.save('best')
                best_loss = eval_loss
def train_net(args):
    """Train the translation Transformer, logging per-epoch loss to files and TensorBoard."""
    # Seed RNGs so runs are reproducible.
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    writer = SummaryWriter()

    if checkpoint is None:
        # model
        encoder = Encoder(Config.vocab_size, args.n_layers_enc, args.n_head,
                          args.d_k, args.d_v, args.d_model, args.d_inner,
                          dropout=args.dropout, pe_maxlen=args.pe_maxlen)
        decoder = Decoder(Config.sos_id, Config.eos_id, Config.vocab_size,
                          args.d_word_vec, args.n_layers_dec, args.n_head,
                          args.d_k, args.d_v, args.d_model, args.d_inner,
                          dropout=args.dropout,
                          tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
                          pe_maxlen=args.pe_maxlen)
        model = Transformer(encoder, decoder)
        # optimizer: Adam wrapped in the Noam-style warmup scheduler.
        optimizer = TransformerOptimizer(
            torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09))
    else:
        # Resume model object and optimizer state from the checkpoint.
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    # Move to GPU, if available
    model = model.to(Config.device)

    # Custom dataloaders; pad_collate pads each batch to equal length.
    train_dataset = TranslateDataset()
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               collate_fn=pad_collate,
                                               shuffle=True,
                                               num_workers=args.num_workers)

    # Epochs
    Loss_list = []
    for epoch in range(start_epoch, args.epochs):
        # One epoch's training
        train_loss = train(train_loader=train_loader, model=model,
                           optimizer=optimizer, epoch=epoch,
                           logger=logger, writer=writer)
        l = str(train_loss)
        Loss_list.append(l)
        l_temp = l + '\n'
        # Append this epoch's loss immediately so progress survives crashes.
        with open('loss_epoch.txt', 'a+') as f:
            f.write(l_temp)
        writer.add_scalar('epoch/train_loss', train_loss, epoch)
        writer.add_scalar('epoch/learning_rate', optimizer.lr, epoch)
        print('\nLearning rate: {}'.format(optimizer.lr))
        print('Step num: {}\n'.format(optimizer.step_num))
        # Save checkpoint
        save_checkpoint(epoch, model, optimizer, train_loss)
    # Final dump of all per-epoch losses in one file.
    with open('loss.txt', 'w') as f:
        f.write('\n'.join(Loss_list))
class TransformerModel(NMTModel):
    """
    A standard Encoder-Decoder architecture. Base for this and many
    other models.
    """

    def __init__(self, *args, embedding_rank=None, inner_rank=None, ffward_rank=None, **kwargs):
        """Build the factorised Transformer; rank args default to transformer_config."""
        # Run super constructor from NMTModel, but don't run NMTModel.__init__
        super(NMTModel, self).__init__()
        self.vocab = pickle.load(open(paths.vocab, 'rb'))
        if embedding_rank is None:
            embedding_rank = transformer_config.embedding_rank
        if inner_rank is None:
            inner_rank = transformer_config.inner_rank
        if ffward_rank is None:
            ffward_rank = transformer_config.ffward_rank
        # Debug output of the factorisation configuration actually used.
        print(transformer_config.embedding_factorization,
              transformer_config.inner_factorization,
              transformer_config.ffward_factorization)
        print(embedding_rank, inner_rank, ffward_rank)
        self.encoder = Encoder(len(self.vocab.src), embedding_rank, inner_rank, ffward_rank)
        self.decoder = Decoder(len(self.vocab.tgt), embedding_rank, inner_rank, ffward_rank)
        self.gpu = False
        self.initialize()
        # Noam warmup schedule (4000 steps) wrapped around Adam.
        self.optimizer = NoamOpt(transformer_config.layer_dimension, train_config.lr, 4000,
                                 Adam(
                                     self.parameters(),
                                     lr=0,
                                     betas=(0.9, 0.98),
                                     eps=1e-9,
                                 ),
                                 beginning_step=0)
        # Gradient-accumulation bookkeeping.
        self.num_accumulations = 0
        self.accumulate = max(1, train_config.accumulate)

    def reset_optimizer(self):
        """Recreate the NoamOpt/Adam optimizer from scratch (step count resets)."""
        self.optimizer = NoamOpt(
            transformer_config.layer_dimension,
            1,
            4000,
            Adam(
                self.parameters(),
                lr=0,
                betas=(0.9, 0.98),
                eps=1e-9,
            ),
        )

    def __call__(self, src, tgt, update_params=True):
        "Take in and process masked src and target sequences."
        src_encoding, src_mask = self.encode(src)
        loss, norm = self.decode(
            src_encoding,
            src_mask,
            tgt,
        )
        if update_params:
            self.step(loss)
        if self.gpu:
            loss = loss.cpu()
        # Return the de-normalised loss as a plain numpy value.
        return loss.detach().numpy() * norm

    def encode(self, src):
        """Embed raw source sentences and run the encoder; returns (encoding, mask)."""
        src_encodings = self.prepare_sents(src, 'src')
        return self.encoder(src_encodings)

    def decode(self, src_encoding, src_mask, tgt):
        """Run the decoder against the source encoding; returns (loss, norm)."""
        tgt_enc = self.prepare_sents(tgt, 'tgt')
        return self.decoder(
            src_encoding,
            src_mask,
            tgt_enc,
        )

    def initialize(self):
        # Initialize parameters with Glorot
        # TODO: Make sure this works correctly
        # NOTE(review): nn.init.xavier_uniform is the deprecated alias of
        # xavier_uniform_; still functional, but worth updating.
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform(param)

    def update_lr(self, *args, **kwargs):
        """
        Overwrite update_lr needed by other models because Transformer is
        very sensitive to hyperparameters and manages its own lr decay
        """
        pass

    @staticmethod
    def load(model_path: str):
        """Construct a fresh model and (partially) load weights from model_path."""
        dict_path = model_path + ".dict.pt"
        model = TransformerModel()
        print("Loading whole model")
        load_partial_state_dict(model, torch.load(dict_path))
        return model

    def load_params(self, model_path, no_opt=False):
        """Load model weights, and optimizer state unless no_opt is set."""
        dict_path = model_path + ".dict.pt"
        self.load_state_dict(torch.load(dict_path))
        if not no_opt:
            opt_path = model_path + ".opt.pt"
            self.optimizer.load_state_dict(torch.load(opt_path))

    def save(self, path: str, no_opt=False):
        """Save model weights, and optimizer state unless no_opt is set."""
        dict_path = path + ".dict.pt"
        torch.save(self.state_dict(), dict_path)
        if not no_opt:
            opt_path = path + ".opt.pt"
            torch.save(self.optimizer.state_dict(), opt_path)

    def beam_search(self, src, max_step=100, replace=False, start_symbol=1):
        """Decode src with greedy or beam search, per decoder_config.greedy_search."""
        if decoder_config.greedy_search:
            batch_size = len(src)
            stop = 2  # id of the end-of-sentence token
            inferred = [None for _ in range(batch_size)]
            memory, src_mask = self.encode(src)
            # Every hypothesis starts with the start symbol.
            pred_sents = torch.ones(batch_size, 1, dtype=torch.long).fill_(start_symbol)
            scores = np.zeros((batch_size, ))
            if self.gpu:
                pred_sents = pred_sents.cuda()
            for i in range(1, max_step + 1):
                out = self.decoder.get_word_scores(memory, src_mask, Variable(pred_sents))
                # Greedy: single best next word per still-open sentence.
                next_scores, next_words = torch.max(out, dim=1)
                pred_sents = torch.cat(
                    [pred_sents, next_words.unsqueeze(1)], dim=1)
                stopped_sentences = np.where(
                    next_words.detach().cpu().numpy() == stop)[0]
                ongoing_sentences = np.where(
                    next_words.detach().cpu().numpy() != stop)[0]
                # Finished sentences are moved into `inferred`.
                place_in_inferred(inferred, pred_sents, scores, next_scores,
                                  stopped_sentences)
                # Keep decoding only the unfinished sentences.
                pred_sents = pred_sents[ongoing_sentences]
                memory = memory[ongoing_sentences]
                src_mask = src_mask[ongoing_sentences]
                if len(ongoing_sentences) == 0:
                    break
            # Flush sentences that hit max_step without emitting the stop token.
            place_in_inferred(inferred, pred_sents, scores, next_scores,
                              np.arange(len(ongoing_sentences)))
            return [[convert_hypothesis(inferred[i], self.vocab, scores[i])]
                    for i in range(batch_size)]
        else:
            # beam search
            try:
                memory, src_mask = self.encode(src)
            except IndexError:
                # NOTE(review): memory/src_mask stay unbound if this fires;
                # the BeamBatch construction below would raise NameError.
                print(src)
            batch_size = len(src)
            beam_size = decoder_config.beam_size
            beam_batch = BeamBatch(batch_size, beam_size, memory, src_mask,
                                   self.gpu)
            for i in range(1, max_step + 1):
                sizes = beam_batch.get_sizes()
                memory, src_mask = beam_batch.expand_memory_and_mask()
                pred_sents = beam_batch.open_hyps_tensor()
                out = self.decoder.get_word_scores(memory, src_mask,
                                                   Variable(pred_sents))
                out = out.detach().cpu().numpy()
                # Top beam_size candidate words per open hypothesis (unsorted).
                next_words = np.argpartition(-out, beam_size - 1,
                                             axis=-1)[:, :beam_size]
                next_words = torch.LongTensor(next_words).cuda()
                next_words = next_words.view(sum(sizes) * beam_size, 1)
                # Repeat each open hypothesis beam_size times, append candidates.
                pred_sents = pred_sents.repeat(beam_size, 1).reshape(
                    beam_size, sum(sizes),
                    -1).transpose(1, 0).reshape(beam_size * sum(sizes), -1)
                pred_sents = torch.cat([pred_sents, next_words], dim=1)
                next_scores = -np.partition(
                    -out, beam_size - 1)[:, :beam_size].flatten()
                old_scores = np.array(beam_batch.get_open_scores())
                old_scores = np.repeat(old_scores, beam_size)
                # Combine accumulated and per-step scores.
                next_scores = score_update(old_scores, next_scores, i)
                beam_batch.update(sizes, next_scores, pred_sents)
                if beam_batch.is_closed():
                    break
            # print([2 in beam_batch.best_results()[i][0] for i in range(batch_size)])
            # print([len(beam_batch.best_results()[i][0]) for i in range(batch_size)])
            return [[
                convert_hypothesis(beam_batch.best_results()[i][0], self.vocab,
                                   beam_batch.best_results()[i][1])
            ] for i in range(batch_size)]
def __init__(self, params: dict):
    """Build the full Transformer from a nested parameter dict.

    :param params: Configuration sections for the model, e.g.:

        params = {
            'd_model': 512,
            'src_vocab_size': 27000,
            'tgt_vocab_size': 27000,
            'N': 6,
            'dropout': 0.1,
            'attention': {'n_head': 8, 'd_k': 64, 'd_v': 64, 'dropout': 0.1},
            'feed-forward': {'d_ff': 2048, 'dropout': 0.1},
        }
    """
    # call base constructor
    super(Transformer, self).__init__()

    # Keep the raw config around so checkpoints can rebuild the model.
    self._params = params

    d_model = params['d_model']
    attn_cfg = params['attention']
    ff_cfg = params['feed-forward']

    def new_attention():
        # Fresh multi-head attention sub-layer from the shared config.
        return MultiHeadAttention(n_head=attn_cfg['n_head'],
                                  d_model=d_model,
                                  d_k=attn_cfg['d_k'],
                                  d_v=attn_cfg['d_v'],
                                  dropout=attn_cfg['dropout'])

    def new_feed_forward():
        # Fresh position-wise feed-forward sub-layer.
        return PositionwiseFeedForward(d_model=d_model,
                                       d_ff=ff_cfg['d_ff'],
                                       dropout=ff_cfg['dropout'])

    # Encoder: N stacked layers of (self-attention, feed-forward).
    self.encoder = Encoder(
        layer=EncoderLayer(size=d_model,
                           self_attention=new_attention(),
                           feed_forward=new_feed_forward(),
                           dropout=params['dropout']),
        n_layers=params['N'])

    # Decoder: N stacked layers of (self-attention, memory attention, feed-forward).
    self.decoder = Decoder(
        layer=DecoderLayer(size=d_model,
                           self_attn=new_attention(),
                           memory_attn=new_attention(),
                           feed_forward=new_feed_forward(),
                           dropout=params['dropout']),
        N=params['N'])

    # One positional-encoding module instance shared by both embedding pipelines.
    pos_encoding = PositionalEncoding(d_model=d_model, dropout=params['dropout'])
    self.src_embeddings = nn.Sequential(
        Embeddings(d_model=d_model, vocab_size=params['src_vocab_size']),
        pos_encoding)
    self.trg_embeddings = nn.Sequential(
        Embeddings(d_model=d_model, vocab_size=params['tgt_vocab_size']),
        pos_encoding)

    self.classifier = OutputClassifier(d_model=d_model,
                                       vocab=params['tgt_vocab_size'])

    # Initialize parameters with Glorot / fan_avg.
    for weight in self.parameters():
        if weight.dim() > 1:
            nn.init.xavier_uniform_(weight)
# Build encoder/decoder from CLI args; LFR_m stacks m frames per input step
# (presumably low-frame-rate stacking — confirm against the data pipeline).
encoder = Encoder(args.d_input * args.LFR_m, args.n_layers_enc, args.n_head,
                  args.d_k, args.d_v, args.d_model, args.d_inner,
                  dropout=args.dropout, pe_maxlen=args.pe_maxlen)
decoder = Decoder(
    sos_id, eos_id, vocab_size,
    args.d_word_vec,
    args.n_layers_dec, args.n_head,
    args.d_k, args.d_v,
    args.d_model, args.d_inner,
    dropout=args.dropout,
    tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
    pe_maxlen=args.pe_maxlen)
model = Transformer(encoder, decoder)
# Adam wrapped in a warmup schedule parameterised by (k, d_model, warmup_steps).
optimizer = TransformerOptimizer(
    torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
    args.k, args.d_model, args.warmup_steps)
# Debug: echo the scheduler hyper-parameters.
print(args.k)
print(args.d_model)
print(args.warmup_steps)
def __init__(self, n_src_vocab, n_tgt_vocab, len_max_seq_enc, len_max_seq_dec, d_word_vec=512, d_model=512, d_inner=2048, n_layers=6, n_head=8, d_k=64, d_v=64, dropout=0.1, tgt_emb_prj_weight_sharing=True, emb_src_tgt_weight_sharing=True, pretrained_embeddings=None):
    """Encoder-decoder Transformer with optional embedding/projection weight tying."""
    super().__init__()

    self.encoder = Encoder(
        n_src_vocab=n_src_vocab, len_max_seq=len_max_seq_enc,
        d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
        n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
        dropout=dropout, pretrained_embeddings=pretrained_embeddings)

    self.decoder = Decoder(
        n_tgt_vocab=n_tgt_vocab, len_max_seq=len_max_seq_dec,
        d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
        n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
        dropout=dropout, pretrained_embeddings=pretrained_embeddings)

    # Bias-free output projection onto the target vocabulary.
    self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
    nn.init.xavier_normal_(self.tgt_word_prj.weight)

    assert d_model == d_word_vec, \
        'To facilitate the residual connections, \
        the dimensions of all module outputs shall be the same.'

    if tgt_emb_prj_weight_sharing:
        # Share the weight matrix between target word embedding & the final logit dense layer
        self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
        self.x_logit_scale = d_model ** -0.5
    else:
        self.x_logit_scale = 1.0

    if emb_src_tgt_weight_sharing:
        # Share the weight matrix between source & target word embeddings
        assert n_src_vocab == n_tgt_vocab, \
            "To share word embedding table, the vocabulary size of src/tgt shall be the same."
        self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight