def __init__(self, args):
    """Assemble the DCP network from CLI args.

    Three pluggable parts are selected by name: the point-feature
    embedding backbone (``args.emb_nn``), the cross-attention "pointer"
    module (``args.pointer``), and the pose head (``args.head``).
    An unrecognized name raises ``Exception('Not implemented')``.
    """
    super(DCP, self).__init__()
    self.emb_dims = args.emb_dims
    self.cycle = args.cycle

    # Point-feature embedding backbone.
    emb_builders = {
        'pointnet': lambda: PointNet(emb_dims=self.emb_dims),
        'dgcnn': lambda: DGCNN(emb_dims=self.emb_dims),
        'lpdnet': lambda: LPDNet(args),
    }
    if args.emb_nn not in emb_builders:
        raise Exception('Not implemented')
    self.emb_nn = emb_builders[args.emb_nn]()

    # Cross-cloud attention ("pointer") module.
    pointer_builders = {
        'identity': lambda: Identity(),
        'transformer': lambda: Transformer(args=args),
    }
    if args.pointer not in pointer_builders:
        raise Exception("Not implemented")
    self.pointer = pointer_builders[args.pointer]()

    # Pose-estimation head.
    head_builders = {
        'mlp': lambda: MLPHead(args=args),
        'svd': lambda: SVDHead(args=args),
    }
    if args.head not in head_builders:
        raise Exception('Not implemented')
    self.head = head_builders[args.head]()
def model_fn(features, labels, mode, params):
    """tf.estimator model_fn: build the Transformer and a per-mode spec."""
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    with tf.variable_scope('model'):
        network = Transformer(params, is_training)
        logits = network(features['q'], features['a'])

        # Inference path: return the logits only, no loss/optimizer.
        if mode == tf.estimator.ModeKeys.PREDICT:
            export = {'response': tf.estimator.export.PredictOutput(logits)}
            return tf.estimator.EstimatorSpec(
                tf.estimator.ModeKeys.PREDICT,
                predictions=logits,
                export_outputs=export)

        # Training/eval path: summed token-level cross-entropy.
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels)
        loss = tf.reduce_sum(cross_entropy)

        optimizer = tf.contrib.opt.LazyAdamOptimizer(learning_rate=1e-3)
        train_op = optimizer.minimize(
            loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          train_op=train_op)
def get_model(text_proc, args):
    """Build the captioning Transformer, optionally warm-started from disk."""
    sent_vocab = text_proc.vocab
    net = Transformer(dict_size=len(sent_vocab),
                      image_feature_dim=args.image_feat_size,
                      vocab=sent_vocab,
                      tf_ratio=args.teacher_forcing)

    # Warm-start from a saved state dict when a path was supplied.
    if len(args.start_from) > 0:
        print("Initializing weights from {}".format(args.start_from))
        net.load_state_dict(
            torch.load(args.start_from,
                       map_location=lambda storage, location: storage))

    # Single-GPU placement when CUDA is available.
    if torch.cuda.is_available():
        net.cuda()
    return net
def __init__(self, args):
    """Assemble VCRNet: embedding backbone, optional pointer, VCP head, SVD.

    Components are selected by name from ``args``; unknown embedding or
    VCP names raise, an unknown pointer name disables the pointer.
    """
    super(VCRNet, self).__init__()
    self.emb_dims = args.emb_dims  # embedding dimension (default 512)
    self.cycle = args.cycle        # cycle-consistency flag (default False)

    # Feature-embedding backbone ('lpdnet' is the default choice).
    emb_builders = {
        'pointnet': lambda: PointNet(emb_dims=self.emb_dims),
        'dgcnn': lambda: DGCNN(emb_dims=self.emb_dims),
        'lpdnet': lambda: LPDNet(args),
    }
    if args.emb_nn not in emb_builders:
        raise Exception('Not implemented')
    self.emb_nn = emb_builders[args.emb_nn]()

    # Pointer module; any other name silently disables it.
    if args.pointer == 'identity':
        self.pointer = Identity()
    elif args.pointer == 'transformer':  # default
        self.pointer = Transformer(args=args)
    else:
        self.pointer = None

    # Virtual-corresponding-point head ('topK' is the default).
    vcp_builders = {
        'topK': lambda: VcpTopK(args=args),
        'att': lambda: VcpAtt(args=args),
        'dist': lambda: VcpByDis(args=args),
    }
    if args.vcp_nn not in vcp_builders:
        raise Exception("Not implemented")
    self.head = vcp_builders[args.vcp_nn]()

    # Closed-form rigid-transform solver.
    self.svd = SVDHead(args=args)
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8,
               dropout=0.1, device=None):
    """Build an encoder-decoder Transformer (Vaswani et al., 2017).

    Args:
        src_vocab: source vocabulary size.
        tgt_vocab: target vocabulary size.
        N: number of encoder/decoder layers.
        d_model: model (embedding) dimension.
        d_ff: feed-forward inner dimension.
        h: number of attention heads.
        dropout: dropout rate.
        device: torch device for the model; defaults to the global
            ``args.device`` for backward compatibility with existing callers.

    Returns:
        The initialized Transformer, placed on ``device``.
    """
    if device is None:
        device = args.device  # original behavior: read the global CLI args

    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = Transformer(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    # A single transfer at the end moves every registered submodule;
    # the original's per-submodule .to(device) calls were redundant.
    return model.to(device)
def __init__(self, params, mode, train_iter=None, valid_iter=None, test_iter=None):
    """Set up the Transformer, warmup-scheduled Adam, and padded CE loss."""
    self.params = params

    # Keep only the iterators relevant to the requested mode.
    if mode == 'train':
        self.train_iter = train_iter
        self.valid_iter = valid_iter
    else:
        self.test_iter = test_iter

    self.model = Transformer(self.params)
    self.model.to(self.params.device)

    # Adam wrapped in the inverse-sqrt warmup schedule from the paper.
    base_optimizer = optim.Adam(self.model.parameters(),
                                betas=(0.9, 0.98), eps=1e-9)
    self.optimizer = ScheduledAdam(base_optimizer,
                                   hidden_dim=params.hidden_dim,
                                   warm_steps=params.warm_steps)

    # Padding positions are excluded from the loss.
    self.criterion = nn.CrossEntropyLoss(ignore_index=self.params.pad_idx)
    self.criterion.to(self.params.device)
def build_model(args, source_vocab_length, target_vocab_length): if args.method == 'conventional': model = Transformer(d_model=args.d_model, nhead=args.nhead, num_encoder_layers=args.num_encoder_layers, num_decoder_layers=args.num_decoder_layers, dim_feedforward=args.dim_feedforward, source_vocab_length=source_vocab_length, target_vocab_length=target_vocab_length) elif args.method == 'proposed': model = MyTransformer(add_to_dec=args.add_to_dec, yamamoto=args.yamamoto, weighted=args.weighted_average, d_model=args.d_model, nhead=args.nhead, num_encoder_layers=args.num_encoder_layers, num_decoder_layers=args.num_decoder_layers, dim_feedforward=args.dim_feedforward, source_vocab_length=source_vocab_length, target_vocab_length=target_vocab_length) elif args.method == 'attention': model = MyTransformer2(d_model=args.d_model, nhead=args.nhead, num_encoder_layers=args.num_encoder_layers, num_decoder_layers=args.num_decoder_layers, dim_feedforward=args.dim_feedforward, source_vocab_length=source_vocab_length, target_vocab_length=target_vocab_length) return model
def __init__(self, d_model: int, partition: bool, layer_num: int,
             hidden_dropout: float, attention_dropout: float, dim_ff: int,
             nhead: int, kqv_dim: int, device: torch.device):
    """Wrap either a PartitionTransformer or a plain Transformer encoder.

    Both variants share the same constructor signature, so only the
    class differs based on ``partition``. ``device`` is accepted but not
    used here (kept for interface compatibility).
    """
    super(Encoder, self).__init__()
    self.partition = partition
    self.d_model = d_model
    transf_cls = PartitionTransformer if self.partition else Transformer
    self.transf = transf_cls(d_model, layer_num, nhead, dim_ff,
                             hidden_dropout, attention_dropout, 'relu',
                             kqv_dim=kqv_dim)
def prediction(text): params = Params('config/params.json') # load tokenizer and torchtext Fields pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb') cohesion_scores = pickle.load(pickle_tokenizer) tokenizer = LTokenizer(scores=cohesion_scores) pickle_kor = open('pickles/kor.pickle', 'rb') kor = pickle.load(pickle_kor) pickle_eng = open('pickles/eng.pickle', 'rb') eng = pickle.load(pickle_eng) eos_idx = eng.vocab.stoi['<eos>'] # select model and load trained model model = Transformer(params) model.load_state_dict(torch.load(params.save_model)) model.to(params.device) model.eval() # convert input into tensor and forward it through selected model tokenized = tokenizer.tokenize(text) indexed = [kor.vocab.stoi[token] for token in tokenized] source = torch.LongTensor(indexed).unsqueeze(0).to(params.device) # [1, source_len]: unsqueeze to add batch size target = torch.zeros(1, params.max_len).type_as(source.data) # [1, max_len] encoder_output = model.encoder(source) next_symbol = eng.vocab.stoi['<sos>'] for i in range(0, params.max_len): if next_symbol == eos_idx: break target[0][i] = next_symbol decoder_output, _ = model.decoder(target, source, encoder_output) # [1, target length, output dim] prob = decoder_output.squeeze(0).max(dim=-1, keepdim=False)[1] next_word = prob.data[i] next_symbol = next_word.item() #eos_idx = torch.where(target[0] == eos_idx)[0][0] #eos_idx = eos_idx.item() eos_index = 34 print(eos_idx) target = target[0][:eos_idx].unsqueeze(0) # translation_tensor = [target length] filed with word indices target, attention_map = model(source, target) target = target.squeeze(0).max(dim=-1)[1] reply_token = [eng.vocab.itos[token] for token in target if token != 3] print(reply_token) #translation = translated_token[:translated_token.index('<eos>')] #translation = ''.join(translation) reply = ' '.join(reply_token) #print(reply) #display_attention(tokenized, reply_token, attention_map[4].squeeze(0)[:-1]) return reply
def __init__(self):
    """Token embedding (pad id 1) feeding a 12-layer Transformer encoder."""
    super().__init__()
    emb_dim = args.enc_emb_dim
    self.embedding = nn.Embedding(INPUT_DIM, emb_dim, padding_idx=1)
    self.transform = Transformer(
        d_model=emb_dim,
        dim_feedforward=emb_dim,
        num_encoder_layers=12,
        dropout=args.enc_dropout,
    )
def predict(config): params = Params('config/params.json') # load tokenizer and torchtext Fields pickle_tokenizer = open('pickles/tokenizer.pickle', 'rb') cohesion_scores = pickle.load(pickle_tokenizer) tokenizer = LTokenizer(scores=cohesion_scores) pickle_kor = open('pickles/kor.pickle', 'rb') kor = pickle.load(pickle_kor) pickle_eng = open('pickles/eng.pickle', 'rb') eng = pickle.load(pickle_eng) # select model and load trained model model = Transformer(params) model.load_state_dict(torch.load(params.save_model)) model.to(params.device) model.eval() input = clean_text(config.input) # convert input into tensor and forward it through selected model tokenized = tokenizer.tokenize(input) indexed = [kor.vocab.stoi[token] for token in tokenized] source = torch.LongTensor(indexed).unsqueeze(0).to( params.device) # [1, source length]: unsqueeze to add batch size target = torch.zeros(1, params.max_len).type_as(source.data) encoder_output = model.encoder(source) next_symbol = eng.vocab.stoi['<sos>'] for i in range(0, params.max_len): target[0][i] = next_symbol dec_output = model.decoder(target, source, encoder_output) # dec_output = [1, target length, output dim] prob = dec_output.squeeze(0).max(dim=-1, keepdim=False)[1] next_word = prob.data[i] next_symbol = next_word.item() # translation_tensor = [target length] filed with word indices target = model(source, target) target = torch.argmax(target.squeeze(0), -1) # target = target.squeeze(0).max(dim=-1, keepdim=False) translation = [eng.vocab.itos[token] for token in target][1:] translation = ' '.join(translation) print(f'kor> {config.input}') print(f'eng> {translation.capitalize()}')
def _load_transformer(self, model_type, enc_len, dec_len):
    """Instantiate the transformer variant selected by ``model_type``.

    Vanilla uses a single ``num_layers`` count; the evolved and NER
    variants split encoder/decoder depth and add ``type_size`` (NER
    additionally fixes ``ner_size``). All share the common hyperparams.
    """
    get = self.params.get
    common = dict(
        d_model=get("d_model"),
        num_heads=get("num_heads"),
        dff=get("dff"),
        vocab_size=self.vocab_size,
        pe_encoder_len=enc_len,
        pe_decoder_len=dec_len,
        rate=get("dropout_rate"),
        is_embed_pos=get("is_embed_pos"),
        is_finetune=get("is_finetune"),
    )
    if model_type == Constants.MODEL.VANILLA_TRANSFORMER:
        return Transformer(num_layers=get("num_layers"), **common)

    enc_dec = dict(
        num_encoder_layers=get("num_encoder_layers"),
        num_decoder_layers=get("num_decoder_layers"),
        type_size=get("type_size"),
    )
    if model_type == Constants.MODEL.EVOLVED_TRANSFORMER:
        return EvolvedTransformer(**common, **enc_dec)
    return NERTransformer(ner_size=Constants.NER_SIZE, **common, **enc_dec)
def __init__(self, subword: str, transliterate: str, bert_path: str,
             d_model: int, partition: bool, position_emb_dropout: float,
             bert_emb_dropout: float, emb_dropout: float, layer_num: int,
             hidden_dropout: float, attention_dropout: float, dim_ff: int,
             nhead: int, kqv_dim: int, device: torch.device):
    """BERT-backed encoder with optional content/position partitioning.

    When ``partition`` is set, the ``d_model`` channels are split between
    a content half (projected BERT features) and a position half
    (learned positional embeddings); otherwise both use all of d_model.
    """
    super(NEREncoder, self).__init__()

    # Contextual subword features from a pretrained BERT.
    self.bert_encoder = BERTEncoder(subword, transliterate, bert_path, device)
    self.bert_hidden_size = self.bert_encoder.BERT.config.hidden_size

    self.partition = partition
    self.d_model = d_model
    if partition:
        self.d_content = d_model // 2
        self.d_position = d_model - d_model // 2
    else:
        self.d_content = d_model
        self.d_position = d_model
    self.device = device

    # Dropouts applied to the different embedding streams.
    self.position_emb_dropout = nn.Dropout(position_emb_dropout)
    self.bert_emb_dropout = nn.Dropout(bert_emb_dropout)
    self.emb_dropout = nn.Dropout(emb_dropout)
    self.layer_norm = nn.LayerNorm(d_model, elementwise_affine=True)

    # Project BERT features down to the content slice.
    self.bert_proj = nn.Linear(self.bert_hidden_size, self.d_content)
    self.position_embeddings = LearnedPositionalEmbedding(self.d_position,
                                                          max_len=512)

    # Both transformer variants share the same constructor signature.
    transf_cls = PartitionTransformer if self.partition else Transformer
    self.transf = transf_cls(d_model, layer_num, nhead, dim_ff,
                             hidden_dropout, attention_dropout, 'relu',
                             kqv_dim=kqv_dim)
def __build_model__(self):
    """
    Create the Translator model for the CPU

    Returns
    -------
    model : Transformer
        The transformer
    """
    device = torch.device("cpu")
    return Transformer(
        Encoder(self.source_vocab_size, self.word_embedding_size,
                self.num_encoder_layers, self.num_encoder_heads,
                self.encoder_pf_dim, self.encoder_dropout, device),
        Decoder(self.target_vocab_size, self.word_embedding_size,
                self.num_decoder_layers, self.num_decoder_heads,
                self.decoder_pf_dim, self.decoder_dropout, device),
        self.source_pad_id,
        self.target_sos_id,
        self.target_eos_id,
        self.target_pad_id,
        device,
    )
def load_model(opt, device):
    """Rebuild a Transformer from a checkpoint and restore its weights.

    The checkpoint stores both the training-time settings and the state
    dict, so the architecture is reconstructed exactly as trained.
    """
    checkpoint = torch.load(opt.model, map_location=device)
    cfg = checkpoint['settings']  # opts the model was trained with

    model = Transformer(
        cfg.src_vocab_size,
        cfg.trg_vocab_size,
        cfg.src_pad_idx,
        cfg.trg_pad_idx,
        trg_emb_prj_weight_sharing=cfg.proj_share_weight,
        src_emb_prj_weight_sharing=cfg.embs_share_weight,
        d_k=cfg.d_k,
        d_v=cfg.d_v,
        d_model=cfg.d_model,
        d_word_vec=cfg.d_word_vec,
        d_inner=cfg.d_inner_hid,
        n_layers=cfg.n_layers,
        n_head=cfg.n_head,
        dropout=cfg.dropout,
    ).to(device)
    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')
    return model
def __init__(self, params, mode, train_iter=None, valid_iter=None, test_iter=None):
    """Hold the Transformer, a plain Adam optimizer, and padded CE loss."""
    self.params = params

    self.model = Transformer(self.params)
    self.model.to(self.params.device)
    self.optimizer = optim.Adam(self.model.parameters())

    # Padding positions are excluded from the loss.
    self.criterion = nn.CrossEntropyLoss(ignore_index=self.params.pad_idx)
    self.criterion.to(self.params.device)

    # Keep only the iterators relevant to the requested mode.
    if mode == 'train':
        self.train_iter = train_iter
        self.valid_iter = valid_iter
    else:
        self.test_iter = test_iter
from tqdm import tqdm
import os
import time
import numpy as np
import pickle


def count_parameters(model):
    """Return the number of trainable (requires_grad) parameters."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


# Build the task data loaders.
p = Tasks()
data_loader_tr, data_loader_val, data_loader_test = p.get_all_data(batch_size=config.batch_size)

# Test-only mode: load the saved model, evaluate, and exit.
if(config.test):
    print("Test model",config.model)
    model = Transformer(p.vocab,model_file_path=config.save_path,is_eval=True)
    evaluate(model,data_loader_test,model_name=config.model,ty='test',verbose=True,log=True)
    exit(0)

# Training mode: fresh model.
model = Transformer(p.vocab)
print("MODEL USED",config.model)
print("TRAINABLE PARAMETERS",count_parameters(model))

best_ppl = 1000  # best perplexity seen so far (for early stopping, presumably)
cnt = 0
for e in range(config.epochs):
    print("Epoch", e)
    # NOTE(review): `p` is rebound here, shadowing the Tasks() instance above.
    p, l = [],[]
    pbar = tqdm(enumerate(data_loader_tr),total=len(data_loader_tr))
    # NOTE(review): this chunk appears truncated here — the training loop
    # body presumably continues beyond this excerpt.
    for i, d in pbar:
        torch.cuda.empty_cache()
def make_batch(inp, vacab): d = Dataset(inp, vacab) loader = torch.utils.data.DataLoader(dataset=d, batch_size=1, shuffle=False, collate_fn=collate_fn) return iter(loader).next() data_loader_tra, data_loader_val, data_loader_tst, vocab, program_number = prepare_data_seq( batch_size=config.batch_size) if (config.model == "trs"): model = Transformer(vocab, decoder_number=program_number, model_file_path=config.save_path, is_eval=True) elif (config.model == "experts"): model = Transformer_experts(vocab, decoder_number=program_number, model_file_path=config.save_path, is_eval=True) if (config.USE_CUDA): model.cuda() model = model.eval() print('Start to chat') context = deque(DIALOG_SIZE * ['None'], maxlen=DIALOG_SIZE) while (True): msg = input(">>> ") if (len(str(msg).rstrip().lstrip()) != 0):
def training(args):
    """Train the menu-demand Transformer and keep the best-RMSE checkpoint.

    Builds train/valid/test loaders from ``args.preprocessed_path``,
    trains with MSE on lunch/supper demand, validates each epoch, and
    saves a checkpoint whenever validation RMSE improves.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #===================================#
    #==============Logging==============#
    #===================================#

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    handler = TqdmLoggingHandler()
    handler.setFormatter(
        logging.Formatter(" %(asctime)s - %(message)s", "%Y-%m-%d %H:%M:%S"))
    logger.addHandler(handler)
    logger.propagate = False

    #===================================#
    #============Data Load==============#
    #===================================#

    # 1) Dataloader setting
    write_log(logger, "Load data...")
    # gc is disabled during the (allocation-heavy) dataset construction.
    gc.disable()
    dataset_dict = {
        'train': CustomDataset(data_path=args.preprocessed_path, phase='train'),
        'valid': CustomDataset(data_path=args.preprocessed_path, phase='valid'),
        'test': CustomDataset(data_path=args.preprocessed_path, phase='test')
    }
    unique_menu_count = dataset_dict['train'].unique_count()
    dataloader_dict = {
        'train': DataLoader(dataset_dict['train'], drop_last=True,
                            batch_size=args.batch_size, shuffle=True,
                            pin_memory=True, num_workers=args.num_workers,
                            collate_fn=PadCollate()),
        'valid': DataLoader(dataset_dict['valid'], drop_last=False,
                            batch_size=args.batch_size, shuffle=False,
                            pin_memory=True, num_workers=args.num_workers,
                            collate_fn=PadCollate()),
        'test': DataLoader(dataset_dict['test'], drop_last=False,
                           batch_size=args.batch_size, shuffle=False,
                           pin_memory=True, num_workers=args.num_workers,
                           collate_fn=PadCollate())
    }
    gc.enable()
    write_log(
        logger,
        f"Total number of trainingsets iterations - {len(dataset_dict['train'])}, {len(dataloader_dict['train'])}"
    )

    #===================================#
    #===========Model setting===========#
    #===================================#

    # 1) Model initiating
    write_log(logger, "Instantiating models...")
    model = Transformer(model_type=args.model_type,
                        input_size=unique_menu_count,
                        d_model=args.d_model,
                        d_embedding=args.d_embedding,
                        n_head=args.n_head,
                        dim_feedforward=args.dim_feedforward,
                        num_encoder_layer=args.num_encoder_layer,
                        dropout=args.dropout)
    model = model.train()
    model = model.to(device)

    # 2) Optimizer setting
    optimizer = optimizer_select(model, args)
    scheduler = shceduler_select(optimizer, dataloader_dict, args)
    criterion = nn.MSELoss()
    scaler = GradScaler(enabled=True)
    # NOTE(review): apex amp.initialize is mixed with native-AMP
    # GradScaler/autocast below — normally only one of the two is used;
    # confirm this combination is intentional.
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # 2) Model resume
    start_epoch = 0
    if args.resume:
        # NOTE(review): resume reads 'checkpoint.pth.tar' from
        # args.model_path, but saving below writes 'checkpoint_cap.pth.tar'
        # under args.save_path — filename/path mismatch, verify.
        checkpoint = torch.load(os.path.join(args.model_path, 'checkpoint.pth.tar'),
                                map_location='cpu')
        start_epoch = checkpoint['epoch'] + 1
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        # NOTE(review): scaler state is saved at checkpoint time but not
        # restored here.
        model = model.train()
        model = model.to(device)
        del checkpoint

    #===================================#
    #=========Model Train Start=========#
    #===================================#

    best_val_rmse = 9999999
    write_log(logger, 'Train start!')
    for epoch in range(start_epoch, args.num_epochs):
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
                train_start_time = time.time()
                freq = 0
            elif phase == 'valid':
                model.eval()
                val_loss = 0
                val_rmse = 0
            for i, (src_menu, label_lunch, label_supper) in enumerate(dataloader_dict[phase]):
                # Optimizer setting
                optimizer.zero_grad()

                # Input, output setting
                src_menu = src_menu.to(device, non_blocking=True)
                label_lunch = label_lunch.float().to(device, non_blocking=True)
                label_supper = label_supper.float().to(device, non_blocking=True)

                # Model
                # Gradients only in the train phase; mixed precision always on.
                with torch.set_grad_enabled(phase == 'train'):
                    with autocast(enabled=True):
                        if args.model_type == 'sep':
                            logit = model(src_menu)
                            logit_lunch = logit[:, 0]
                            # NOTE(review): 'sep' uses column 0 for supper as
                            # well as lunch; 'total' uses columns 0 and 1 —
                            # confirm this is intentional for the separate model.
                            logit_supper = logit[:, 0]
                        elif args.model_type == 'total':
                            logit = model(src_menu)
                            logit_lunch = logit[:, 0]
                            logit_supper = logit[:, 1]

                        # Loss calculate
                        loss_lunch = criterion(logit_lunch, label_lunch)
                        loss_supper = criterion(logit_supper, label_supper)
                        loss = loss_lunch + loss_supper

                # Back-propagation
                if phase == 'train':
                    scaler.scale(loss).backward()
                    scaler.unscale_(optimizer)
                    clip_grad_norm_(model.parameters(), args.clip_grad_norm)
                    scaler.step(optimizer)
                    scaler.update()

                    # Scheduler setting
                    if args.scheduler in ['constant', 'warmup']:
                        scheduler.step()
                    if args.scheduler == 'reduce_train':
                        scheduler.step(loss)

                # Print loss value
                rmse_loss = torch.sqrt(loss)
                if phase == 'train':
                    if i == 0 or freq == args.print_freq or i == len(
                            dataloader_dict['train']):
                        batch_log = "[Epoch:%d][%d/%d] train_MSE_loss:%2.3f | train_RMSE_loss:%2.3f | learning_rate:%3.6f | spend_time:%3.2fmin" \
                            % (epoch+1, i, len(dataloader_dict['train']), loss.item(), rmse_loss.item(), optimizer.param_groups[0]['lr'], (time.time() - train_start_time) / 60)
                        write_log(logger, batch_log)
                        freq = 0
                    freq += 1
                elif phase == 'valid':
                    val_loss += loss.item()
                    val_rmse += rmse_loss.item()

            if phase == 'valid':
                val_loss /= len(dataloader_dict['valid'])
                val_rmse /= len(dataloader_dict['valid'])
                write_log(logger, 'Validation Loss: %3.3f' % val_loss)
                write_log(logger, 'Validation RMSE: %3.3f' % val_rmse)
                if val_rmse < best_val_rmse:
                    write_log(logger, 'Checkpoint saving...')
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    torch.save(
                        {
                            'epoch': epoch,
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'scheduler': scheduler.state_dict(),
                            'scaler': scaler.state_dict()
                        },
                        os.path.join(args.save_path, f'checkpoint_cap.pth.tar'))
                    best_val_rmse = val_rmse
                    best_epoch = epoch
                else:
                    else_log = f'Still {best_epoch} epoch RMSE({round(best_val_rmse, 3)}) is better...'
                    write_log(logger, else_log)

    # 3)
    write_log(logger, f'Best Epoch: {best_epoch+1}')
    write_log(logger, f'Best Accuracy: {round(best_val_rmse, 3)}')
def training(args):
    """Train the NMT Transformer with AMP and keep the best-accuracy checkpoint.

    Loads preprocessed index data, trains with label-smoothed loss on the
    train split, evaluates cross-entropy/accuracy on the valid split each
    epoch, and saves a checkpoint whenever validation accuracy improves.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #===================================#
    #==============Logging==============#
    #===================================#

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    handler = TqdmLoggingHandler()
    handler.setFormatter(
        logging.Formatter(" %(asctime)s - %(message)s", "%Y-%m-%d %H:%M:%S"))
    logger.addHandler(handler)
    logger.propagate = False

    #===================================#
    #============Data Load==============#
    #===================================#

    # 1) Data open
    write_log(logger, "Load data...")
    # gc is disabled while unpickling the (large) processed data file.
    gc.disable()
    with open(os.path.join(args.preprocess_path, 'processed.pkl'), 'rb') as f:
        data_ = pickle.load(f)
        train_src_indices = data_['train_src_indices']
        valid_src_indices = data_['valid_src_indices']
        train_trg_indices = data_['train_trg_indices']
        valid_trg_indices = data_['valid_trg_indices']
        src_word2id = data_['src_word2id']
        trg_word2id = data_['trg_word2id']
        src_vocab_num = len(src_word2id)
        trg_vocab_num = len(trg_word2id)
        del data_
    gc.enable()
    write_log(logger, "Finished loading data!")

    # 2) Dataloader setting
    dataset_dict = {
        'train': CustomDataset(train_src_indices, train_trg_indices,
                               min_len=args.min_len,
                               src_max_len=args.src_max_len,
                               trg_max_len=args.trg_max_len),
        'valid': CustomDataset(valid_src_indices, valid_trg_indices,
                               min_len=args.min_len,
                               src_max_len=args.src_max_len,
                               trg_max_len=args.trg_max_len),
    }
    dataloader_dict = {
        'train': DataLoader(dataset_dict['train'], drop_last=True,
                            batch_size=args.batch_size, shuffle=True,
                            pin_memory=True, num_workers=args.num_workers),
        'valid': DataLoader(dataset_dict['valid'], drop_last=False,
                            batch_size=args.batch_size, shuffle=False,
                            pin_memory=True, num_workers=args.num_workers)
    }
    write_log(
        logger,
        f"Total number of trainingsets iterations - {len(dataset_dict['train'])}, {len(dataloader_dict['train'])}"
    )

    #===================================#
    #===========Train setting===========#
    #===================================#

    # 1) Model initiating
    write_log(logger, 'Instantiating model...')
    model = Transformer(
        src_vocab_num=src_vocab_num,
        trg_vocab_num=trg_vocab_num,
        pad_idx=args.pad_id,
        bos_idx=args.bos_id,
        eos_idx=args.eos_id,
        d_model=args.d_model,
        d_embedding=args.d_embedding,
        n_head=args.n_head,
        dim_feedforward=args.dim_feedforward,
        num_common_layer=args.num_common_layer,
        num_encoder_layer=args.num_encoder_layer,
        num_decoder_layer=args.num_decoder_layer,
        src_max_len=args.src_max_len,
        trg_max_len=args.trg_max_len,
        dropout=args.dropout,
        embedding_dropout=args.embedding_dropout,
        trg_emb_prj_weight_sharing=args.trg_emb_prj_weight_sharing,
        emb_src_trg_weight_sharing=args.emb_src_trg_weight_sharing,
        parallel=args.parallel)
    model.train()
    model = model.to(device)
    # Causal mask for teacher forcing: targets are shifted, hence max_len - 1.
    tgt_mask = model.generate_square_subsequent_mask(args.trg_max_len - 1, device)

    # 2) Optimizer & Learning rate scheduler setting
    optimizer = optimizer_select(model, args)
    scheduler = shceduler_select(optimizer, dataloader_dict, args)
    scaler = GradScaler()

    # 3) Model resume
    start_epoch = 0
    if args.resume:
        write_log(logger, 'Resume model...')
        # NOTE(review): resume reads from args.save_path, but the
        # checkpoint below is saved to the current working directory as
        # f'checkpoint_{args.parallel}.pth.tar' — path mismatch, verify.
        checkpoint = torch.load(
            os.path.join(args.save_path, 'checkpoint.pth.tar'))
        start_epoch = checkpoint['epoch'] + 1
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        scaler.load_state_dict(checkpoint['scaler'])
        del checkpoint

    #===================================#
    #=========Model Train Start=========#
    #===================================#

    best_val_acc = 0
    write_log(logger, 'Traing start!')
    for epoch in range(start_epoch + 1, args.num_epochs + 1):
        start_time_e = time()
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
            if phase == 'valid':
                write_log(logger, 'Validation start...')
                val_loss = 0
                val_acc = 0
                model.eval()
            for i, (src, trg) in enumerate(
                    tqdm(dataloader_dict[phase],
                         bar_format='{l_bar}{bar:30}{r_bar}{bar:-2b}')):

                # Optimizer setting
                optimizer.zero_grad(set_to_none=True)

                # Input, output setting
                src = src.to(device, non_blocking=True)
                trg = trg.to(device, non_blocking=True)

                # Targets are the sequence shifted left by one; padding
                # positions are masked out and the rest flattened.
                trg_sequences_target = trg[:, 1:]
                non_pad = trg_sequences_target != args.pad_id
                trg_sequences_target = trg_sequences_target[
                    non_pad].contiguous().view(-1)

                # Train
                if phase == 'train':
                    # Loss calculate
                    with autocast():
                        predicted = model(src, trg[:, :-1], tgt_mask,
                                          non_pad_position=non_pad)
                        predicted = predicted.view(-1, predicted.size(-1))
                        loss = label_smoothing_loss(predicted,
                                                    trg_sequences_target,
                                                    args.pad_id)
                    scaler.scale(loss).backward()
                    scaler.unscale_(optimizer)
                    clip_grad_norm_(model.parameters(), args.clip_grad_norm)
                    scaler.step(optimizer)
                    scaler.update()

                    if args.scheduler in ['constant', 'warmup']:
                        scheduler.step()
                    if args.scheduler == 'reduce_train':
                        scheduler.step(loss)

                    # Print loss value only training
                    # NOTE(review): `freq` is first assigned inside this
                    # branch (i == 0 short-circuits the condition on the
                    # very first batch); it then persists across epochs.
                    if i == 0 or freq == args.print_freq or i == len(
                            dataloader_dict['train']):
                        acc = (predicted.max(dim=1)[1] == trg_sequences_target
                               ).sum() / len(trg_sequences_target)
                        iter_log = "[Epoch:%03d][%03d/%03d] train_loss:%03.3f | train_acc:%03.2f%% | learning_rate:%1.6f | spend_time:%02.2fmin" % \
                            (epoch, i, len(dataloader_dict['train']), loss.item(), acc*100, optimizer.param_groups[0]['lr'], (time() - start_time_e) / 60)
                        write_log(logger, iter_log)
                        freq = 0
                    freq += 1

                # Validation
                if phase == 'valid':
                    with torch.no_grad():
                        predicted = model(src, trg[:, :-1], tgt_mask,
                                          non_pad_position=non_pad)
                        loss = F.cross_entropy(predicted, trg_sequences_target)
                    val_loss += loss.item()
                    val_acc += (predicted.max(dim=1)[1] == trg_sequences_target
                                ).sum() / len(trg_sequences_target)
                    # NOTE(review): these schedulers step once per
                    # validation batch on the running val_loss — confirm
                    # that per-batch (vs per-epoch) stepping is intended.
                    if args.scheduler == 'reduce_valid':
                        scheduler.step(val_loss)
                    if args.scheduler == 'lambda':
                        scheduler.step()

            if phase == 'valid':
                val_loss /= len(dataloader_dict[phase])
                val_acc /= len(dataloader_dict[phase])
                write_log(logger, 'Validation Loss: %3.3f' % val_loss)
                write_log(logger,
                          'Validation Accuracy: %3.2f%%' % (val_acc * 100))
                if val_acc > best_val_acc:
                    write_log(logger, 'Checkpoint saving...')
                    torch.save(
                        {
                            'epoch': epoch,
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'scheduler': scheduler.state_dict(),
                            'scaler': scaler.state_dict()
                        }, f'checkpoint_{args.parallel}.pth.tar')
                    best_val_acc = val_acc
                    best_epoch = epoch
                else:
                    else_log = f'Still {best_epoch} epoch accuracy({round(best_val_acc.item()*100, 2)})% is better...'
                    write_log(logger, else_log)

    # 3) Print results
    print(f'Best Epoch: {best_epoch}')
    print(f'Best Accuracy: {round(best_val_acc.item(), 2)}')
def main():
    """CLI entry point: parse args, build/resume the model, run train/val loop.

    Resumes from ``output_dir/last/models/last_model.pt`` when present;
    otherwise builds a fresh Transformer. Validates every ``val_every``
    epochs and checkpoints with a best-loss flag.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--problem', required=True)
    parser.add_argument('--train_step', type=int, default=200)
    parser.add_argument('--batch_size', type=int, default=4096)
    parser.add_argument('--max_length', type=int, default=100)
    parser.add_argument('--n_layers', type=int, default=6)
    parser.add_argument('--hidden_size', type=int, default=512)
    parser.add_argument('--filter_size', type=int, default=2048)
    parser.add_argument('--warmup', type=int, default=16000)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--label_smoothing', type=float, default=0.1)
    parser.add_argument('--val_every', type=int, default=5)
    parser.add_argument('--output_dir', type=str, default='./output')
    parser.add_argument('--data_dir', type=str, default='./data')
    parser.add_argument('--no_cuda', action='store_true')
    parser.add_argument('--summary_grad', action='store_true')
    opt = parser.parse_args()

    device = torch.device('cpu' if opt.no_cuda else 'cuda')

    if not os.path.exists(opt.output_dir + '/last/models'):
        os.makedirs(opt.output_dir + '/last/models')
    if not os.path.exists(opt.data_dir):
        os.makedirs(opt.data_dir)

    # problem.prepare also augments `opt` (e.g. share_target_embedding,
    # has_inputs are read below — presumably set here; TODO confirm).
    train_data, validation_data, i_vocab_size, t_vocab_size, opt = \
        problem.prepare(opt.problem, opt.data_dir, opt.max_length,
                        opt.batch_size, device, opt)
    if i_vocab_size is not None:
        print("# of vocabs (input):", i_vocab_size)
    print("# of vocabs (target):", t_vocab_size)

    # Resume from the last checkpoint when one exists; otherwise build fresh.
    if os.path.exists(opt.output_dir + '/last/models/last_model.pt'):
        print("Load a checkpoint...")
        last_model_path = opt.output_dir + '/last/models'
        model, global_step = utils.load_checkpoint(last_model_path, device,
                                                   is_eval=False)
    else:
        model = Transformer(i_vocab_size, t_vocab_size,
                            n_layers=opt.n_layers,
                            hidden_size=opt.hidden_size,
                            filter_size=opt.filter_size,
                            dropout_rate=opt.dropout,
                            share_target_embedding=opt.share_target_embedding,
                            has_inputs=opt.has_inputs)
        model = model.to(device=device)
        global_step = 0

    num_params = sum(p.numel() for p in
                     model.parameters() if p.requires_grad)
    print("# of parameters: {}".format(num_params))

    # Noam-style warmup scheduler wrapping the trainable parameters.
    optimizer = LRScheduler(
        filter(lambda x: x.requires_grad, model.parameters()),
        opt.hidden_size, opt.warmup, step=global_step)

    writer = SummaryWriter(opt.output_dir + '/last')
    val_writer = SummaryWriter(opt.output_dir + '/last/val')

    best_val_loss = float('inf')
    for t_step in range(opt.train_step):
        print("Epoch", t_step)
        start_epoch_time = time.time()
        global_step = train(train_data, model, opt, global_step,
                            optimizer, t_vocab_size, opt.label_smoothing,
                            writer)
        print("Epoch Time: {:.2f} sec".format(time.time() - start_epoch_time))

        # Validate (and checkpoint) only every val_every epochs.
        if t_step % opt.val_every != 0:
            continue

        val_loss = validation(validation_data, model, global_step,
                              t_vocab_size, val_writer, opt)
        # The flag marks whether this checkpoint is the best so far.
        utils.save_checkpoint(model, opt.output_dir + '/last/models',
                              global_step, val_loss < best_val_loss)
        best_val_loss = min(val_loss, best_val_loss)
padding_num=-1) # 파라미터 설정 vocab_size = len(preprocessing.id_to_word) wordvec_size = 300 head_size = 8 batch_size = 128 max_epoch = 20 max_grad = 5.0 x_test, x_train = preprocessing.divide_test_train(x_train, test_rate=0.1) t_test, t_train = preprocessing.divide_test_train(t_train, test_rate=0.1) model = Transformer(vocab_size, wordvec_size, head_size, num_heads=8, num_encoders=1, num_decoders=1) if os.path.isfile("../pkl/myTransformer_params.pkl"): model.load_params("../pkl/myTransformer_params.pkl") optimizer = Adam(lr=0.00001) # optimizer = SGD(lr=0.00005) # optimizer = RMSprop(lr=0.00005) trainer = Trainer(model, optimizer) acc_list = [] for epoch in range(max_epoch): trainer.fit(x_train, t_train,
# for reproduction RANDOM_SEED = 42 np.random.seed(RANDOM_SEED) train_path = '../../data/mitdb_refined_csv/mitbih_train_balanced.csv' test_path = '../../data/mitdb_refined_csv/mitbih_test_balanced.csv' wandb.init(project="mitbih-transformer") train_dataset = MITBIHDataset(train_path) test_dataset = MITBIHDataset(test_path) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = args.batch, sampler = None, shuffle = True, num_workers = args.num_workers) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = args.batch, sampler = None, shuffle = True, num_workers = args.num_workers) model = Transformer(config.n_layer, config.d_model, config.n_head, config.d_head, config.d_ff, config.n_classes, config.dropout) criterion_cls = torch.nn.CrossEntropyLoss() t_total = len(train_loader) * args.epoch lr = args.lr params = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if 'bias' in key: params += [{'params' : [value], 'lr' : lr * 2, \ 'weight_decay' : 0 }] else: params += [{'params':[value],'lr':lr, 'weight_decay': 0.0005}]
def main(args):
    """Batch beam-search decoding of a Hanja->Korean Transformer.

    Loads vocabularies and the preprocessed test set, restores a trained
    Transformer checkpoint, decodes every test batch with beam search
    (with optional repetition penalty and length normalization), and
    pickles predictions plus labels (raw ids and SentencePiece-decoded
    text) to a results file.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # tqdm-friendly logger setup.
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    handler = TqdmLoggingHandler()
    handler.setFormatter(logging.Formatter(" %(asctime)s - %(message)s"))
    logger.addHandler(handler)
    logger.propagate = False

    write_log(logger, "Load data")

    def load_data(args):
        # Load pickled vocabularies and test indices. gc is disabled around
        # the large pickle loads to speed up deserialization.
        gc.disable()
        with open(f"{args.preprocessed_data_path}/hanja_korean_word2id.pkl", "rb") as f:
            data = pickle.load(f)
            hanja_word2id = data['hanja_word2id']
            korean_word2id = data['korean_word2id']
        with open(f"{args.preprocessed_data_path}/preprocessed_test.pkl", "rb") as f:
            data = pickle.load(f)
            test_hanja_indices = data['hanja_indices']
            test_korean_indices = data['korean_indices']
        gc.enable()
        write_log(logger, "Finished loading data!")
        return hanja_word2id, korean_word2id, test_hanja_indices, test_korean_indices

    hanja_word2id, korean_word2id, test_hanja_indices, test_korean_indices = load_data(args)
    hanja_vocab_num = len(hanja_word2id)
    korean_vocab_num = len(korean_word2id)

    hk_dataset = HanjaKoreanDataset(test_hanja_indices, test_korean_indices,
                                    min_len=args.min_len,
                                    src_max_len=args.src_max_len,
                                    trg_max_len=args.trg_max_len)
    # drop_last=True keeps every batch exactly hk_batch_size, which the
    # beam-expansion reshapes below rely on.
    hk_loader = DataLoader(hk_dataset, drop_last=True,
                           batch_size=args.hk_batch_size, num_workers=4,
                           prefetch_factor=4, pin_memory=True)
    write_log(logger, f"hanja-korean: {len(hk_dataset)}, {len(hk_loader)}")
    del test_hanja_indices, test_korean_indices

    write_log(logger, "Build model")
    model = Transformer(hanja_vocab_num, korean_vocab_num,
                        pad_idx=args.pad_idx, bos_idx=args.bos_idx,
                        eos_idx=args.eos_idx, src_max_len=args.src_max_len,
                        trg_max_len=args.trg_max_len, d_model=args.d_model,
                        d_embedding=args.d_embedding, n_head=args.n_head,
                        dim_feedforward=args.dim_feedforward,
                        num_encoder_layer=args.num_encoder_layer,
                        num_decoder_layer=args.num_decoder_layer,
                        num_mask_layer=args.num_mask_layer)
    model.load_state_dict(
        torch.load(args.checkpoint_path, map_location=device)['model'])
    # Drop the source-side reconstruction head and mask encoders — only the
    # translation path is needed here, so free their memory.
    model.src_output_linear = None
    model.src_output_linear2 = None
    model.src_output_norm = None
    model.mask_encoders = None
    model = model.to(device)
    model.eval()

    write_log(logger, "Load SentencePiece model")
    parser = spm.SentencePieceProcessor()
    parser.Load(os.path.join(args.preprocessed_data_path, 'm_korean.model'))

    predicted_list = list()
    label_list = list()
    # Offsets of each sentence's first beam slot in the flattened
    # (batch_size * beam_size) layout.
    every_batch = torch.arange(0, args.beam_size * args.hk_batch_size,
                               args.beam_size, device=device)
    # Pre-build causal masks for every possible decoded length.
    tgt_masks = {
        l: model.generate_square_subsequent_mask(l, device)
        for l in range(1, args.trg_max_len + 1)
    }

    with torch.no_grad():
        for src_sequences, trg_sequences in tqdm(hk_loader):
            src_sequences = src_sequences.to(device)
            label_list.extend(trg_sequences.tolist())

            # Encoding
            # encoder_out: (src_seq, batch_size, d_model)
            # src_key_padding_mask: (batch_size, src_seq)
            encoder_out = model.src_embedding(src_sequences).transpose(0, 1)
            src_key_padding_mask = (src_sequences == model.pad_idx)
            for encoder in model.encoders:
                encoder_out = encoder(
                    encoder_out, src_key_padding_mask=src_key_padding_mask)

            # Expanding: replicate encoder states beam_size times per sentence.
            # encoder_out: (src_seq, batch_size * k, d_model)
            # src_key_padding_mask: (batch_size * k, src_seq)
            src_seq_size = encoder_out.size(0)
            src_key_padding_mask = src_key_padding_mask.view(
                args.hk_batch_size, 1, -1).repeat(1, args.beam_size, 1)
            src_key_padding_mask = src_key_padding_mask.view(-1, src_seq_size)
            encoder_out = encoder_out.view(-1, args.hk_batch_size, 1,
                                           args.d_model).repeat(1, 1, args.beam_size, 1)
            encoder_out = encoder_out.view(src_seq_size, -1, args.d_model)

            # Scores save vector & decoding list setting.
            # scores_save holds the final score of each finished beam; zero
            # marks "not finished yet" (see post-loop fixup below).
            scores_save = torch.zeros(args.beam_size * args.hk_batch_size, 1, device=device)
            top_k_scores = torch.zeros(args.beam_size * args.hk_batch_size, 1, device=device)
            complete_seqs = dict()
            complete_ind = set()

            # Decoding start token setting: every beam begins with <bos>.
            seqs = torch.tensor([[model.bos_idx]], dtype=torch.long, device=device)
            seqs = seqs.repeat(args.beam_size * args.hk_batch_size, 1).contiguous()

            for step in range(model.trg_max_len):
                # Decoder setting
                # tgt_mask: (out_seq, out_seq) causal mask
                # tgt_key_padding_mask: (batch_size * k, out_seq)
                tgt_mask = tgt_masks[seqs.size(1)]
                tgt_key_padding_mask = (seqs == model.pad_idx)

                # Decoding sentence
                # decoder_out: (out_seq, batch_size * k, d_model)
                decoder_out = model.trg_embedding(seqs).transpose(0, 1)
                for decoder in model.decoders:
                    decoder_out = decoder(
                        decoder_out, encoder_out, tgt_mask=tgt_mask,
                        memory_key_padding_mask=src_key_padding_mask,
                        tgt_key_padding_mask=tgt_key_padding_mask)

                # Score calculation over the last position only.
                # scores: (batch_size * k, vocab_num)
                scores = F.gelu(model.trg_output_linear(decoder_out[-1]))
                scores = model.trg_output_linear2(
                    model.trg_output_norm(scores))
                scores = F.log_softmax(scores, dim=1)

                # Repetition Penalty: scale down the log-prob of the token
                # each beam produced on the previous step.
                # NOTE(review): multiplying a negative log-prob by a factor
                # < 1 makes it *larger* (more likely) — confirm the intended
                # sign/range of args.repetition_penalty.
                if step > 0 and args.repetition_penalty > 0:
                    prev_ix = next_word_inds.view(-1)
                    for index, prev_token_id in enumerate(prev_ix):
                        scores[index][prev_token_id] *= args.repetition_penalty

                # Add accumulated beam scores.
                scores = top_k_scores.expand_as(scores) + scores
                if step == 0:
                    # All k beams of a sentence are identical at step 0, so
                    # keep one row per sentence.
                    # scores: (batch_size, vocab_num)
                    # top_k_scores: (batch_size, k)
                    scores = scores[::args.beam_size]
                    scores[:, model.eos_idx] = float(
                        '-inf')  # set eos token probability zero in first step
                    top_k_scores, top_k_words = scores.topk(
                        args.beam_size, 1, True, True)
                else:
                    # Select top-k over all k*vocab continuations per sentence.
                    top_k_scores, top_k_words = scores.view(
                        args.hk_batch_size, -1).topk(args.beam_size, 1, True, True)

                # Previous and Next word extract from the flattened index.
                # seqs: (batch_size * k, out_seq + 1)
                prev_word_inds = top_k_words // korean_vocab_num
                next_word_inds = top_k_words % korean_vocab_num
                top_k_scores = top_k_scores.view(
                    args.hk_batch_size * args.beam_size, -1)
                top_k_words = top_k_words.view(
                    args.hk_batch_size * args.beam_size, -1)
                # Re-gather the surviving beam prefixes, offsetting each
                # sentence's beam indices into the flattened layout.
                seqs = seqs[prev_word_inds.view(-1) + every_batch.unsqueeze(
                    1).repeat(1, args.beam_size).view(-1)]
                seqs = torch.cat([
                    seqs,
                    next_word_inds.view(args.beam_size * args.hk_batch_size, -1)
                ], dim=1)

                # Find and Save Complete Sequences Score: a beam that just
                # emitted <eos> is frozen the first time it completes.
                eos_ind = torch.where(
                    next_word_inds.view(-1) == model.eos_idx)[0]
                if len(eos_ind) > 0:
                    eos_ind = eos_ind.tolist()
                    complete_ind_add = set(eos_ind) - complete_ind
                    complete_ind_add = list(complete_ind_add)
                    complete_ind.update(eos_ind)
                    if len(complete_ind_add) > 0:
                        scores_save[complete_ind_add] = top_k_scores[
                            complete_ind_add]
                        for ix in complete_ind_add:
                            complete_seqs[ix] = seqs[ix].tolist()

            # If eos token doesn't exist in sequence: close out beams that
            # never finished, using their current score and sequence.
            score_save_pos = torch.where(scores_save == 0)
            if len(score_save_pos[0]) > 0:
                for ix in score_save_pos[0].tolist():
                    complete_seqs[ix] = seqs[ix].tolist()
                scores_save[score_save_pos] = top_k_scores[score_save_pos]

            # Beam Length Normalization (GNMT-style length penalty).
            # NOTE(review): the usual formula is ((5 + len)^alpha / 6^alpha);
            # here beam_size stands in for the constant 5 — confirm intended.
            lp = torch.tensor([
                len(complete_seqs[i])
                for i in range(args.hk_batch_size * args.beam_size)
            ], device=device)
            lp = (((lp + args.beam_size)**args.beam_alpha) /
                  ((args.beam_size + 1)**args.beam_alpha))
            scores_save = scores_save / lp.unsqueeze(1)

            # Predicted and Label processing: pick the best beam per sentence.
            ind = scores_save.view(args.hk_batch_size, args.beam_size, -1).argmax(dim=1)
            ind_expand = ind.view(-1) + every_batch
            predicted_list.extend(
                [complete_seqs[i] for i in ind_expand.tolist()])

    # Persist raw id sequences and their SentencePiece decodings.
    with open(
            f'./results_beam_{args.beam_size}_{args.beam_alpha}_{args.repetition_penalty}.pkl',
            'wb') as f:
        pickle.dump(
            {
                'prediction': predicted_list,
                'label': label_list,
                'prediction_decode':
                [parser.DecodeIds(pred) for pred in predicted_list],
                'label_decode':
                [parser.DecodeIds(label) for label in label_list]
            }, f)
# NOTE(review): dropout_rate is defined but not used within this span —
# confirm it is consumed by code outside this view (or by Transformer).
dropout_rate = 0.1

train_dataset, val_dataset = helper_tfds.get_dataset()

# Transformer hyper-parameters. num_layers, d_model, num_heads, dff and the
# vocab sizes are defined outside this span.
# NOTE(review): "inp_max_seq_len" is set to input_vocab_size — this looks
# like a copy-paste of the vocab size into a sequence-length field; verify
# against the Transformer constructor.
trans_params = {
    "num_encoder": num_layers,
    "num_decoder": num_layers,
    "d_model": d_model,
    "num_heads": num_heads,
    "dff": dff,
    "inp_max_seq_len": input_vocab_size,
    "target_vocab_size": target_vocab_size,
    "inp_vocab_size": input_vocab_size,
    "tar_vocab_size": target_vocab_size
}
transformer = Transformer(**trans_params)

# Warmup-style learning-rate schedule keyed on d_model, with the Adam
# hyper-parameters from "Attention Is All You Need" (beta2=0.98, eps=1e-9).
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

# Checkpoint both the model and optimizer state so training can resume.
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
def main(args):
    """Evaluate masked-token reconstruction accuracy of a trained model.

    Loads the Hanja test set, restores a Transformer checkpoint, strips the
    decoder/translation head (only the source-side reconstruction path is
    needed), and reports top-1/5/10 accuracy on masked positions, saving
    the result dict to ./mask_result.pkl.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def load_data(args):
        # gc is disabled around the large pickle loads to speed them up.
        gc.disable()
        with open(f"{args.preprocessed_data_path}/hanja_korean_word2id.pkl", "rb") as f:
            data = pickle.load(f)
            hanja_word2id = data['hanja_word2id']
            korean_word2id = data['korean_word2id']
        with open(f"{args.preprocessed_data_path}/preprocessed_test.pkl", "rb") as f:
            data = pickle.load(f)
            test_hanja_indices = data['hanja_indices']
            test_additional_hanja_indices = data['additional_hanja_indices']
        gc.enable()
        return hanja_word2id, korean_word2id, test_hanja_indices, test_additional_hanja_indices

    hanja_word2id, korean_word2id, hanja_indices, additional_hanja_indices = load_data(args)
    hanja_vocab_num = len(hanja_word2id)
    korean_vocab_num = len(korean_word2id)

    print('Loader and Model Setting...')
    h_dataset = HanjaDataset(hanja_indices, additional_hanja_indices,
                             hanja_word2id, min_len=args.min_len,
                             src_max_len=args.src_max_len)
    h_loader = DataLoader(h_dataset, drop_last=True,
                          batch_size=args.batch_size, num_workers=4,
                          prefetch_factor=4)

    model = Transformer(hanja_vocab_num, korean_vocab_num,
                        pad_idx=args.pad_idx, bos_idx=args.bos_idx,
                        eos_idx=args.eos_idx, src_max_len=args.src_max_len,
                        trg_max_len=args.trg_max_len, d_model=args.d_model,
                        d_embedding=args.d_embedding, n_head=args.n_head,
                        dim_feedforward=args.dim_feedforward,
                        num_encoder_layer=args.num_encoder_layer,
                        num_decoder_layer=args.num_decoder_layer,
                        num_mask_layer=args.num_mask_layer)
    model.load_state_dict(
        torch.load(args.checkpoint_path, map_location='cpu')['model'])
    # Drop the whole target-language (translation) path — this evaluation
    # only exercises the masked-reconstruction head.
    model.decoders = None
    model.trg_embedding = None
    model.trg_output_linear = None
    model.trg_output_linear2 = None
    model.trg_output_norm = None
    model = model.to(device)
    model.eval()

    # masking_acc[k] accumulates per-batch top-k accuracy; averaged below.
    masking_acc = defaultdict(float)
    with torch.no_grad():
        for inputs, labels in h_loader:
            # Setting: positions where labels != pad are the masked targets.
            inputs = inputs.to(device)
            labels = labels.to(device)
            masked_position = labels != args.pad_idx
            # masked_labels: (total_mask_count, 1) so it broadcasts against
            # the (total_mask_count, k) top-k prediction slices below.
            masked_labels = labels[masked_position].contiguous().view(
                -1).unsqueeze(1)
            total_mask_count = masked_labels.size(0)

            # Prediction, output: Batch * Length * Vocab; keep top-10 ids.
            pred = model.reconstruct_predict(inputs, masked_position=masked_position)
            _, pred = pred.topk(10, 1, True, True)

            # Top1, 5, 10: a hit iff the label appears among the first k
            # predicted ids (broadcasted equality, summed over both dims).
            masking_acc[1] += (torch.sum(
                masked_labels == pred[:, :1]).item()) / total_mask_count
            masking_acc[5] += (torch.sum(
                masked_labels == pred[:, :5]).item()) / total_mask_count
            masking_acc[10] += (torch.sum(
                masked_labels == pred).item()) / total_mask_count

    # Average the per-batch accuracies over the number of batches.
    for key in masking_acc.keys():
        masking_acc[key] /= len(h_loader)
    for key, value in masking_acc.items():
        print(f'Top {key} Accuracy: {value:.4f}')

    with open('./mask_result.pkl', 'wb') as f:
        pickle.dump(masking_acc, f)
def main():
    """End-to-end toy Korean->English Transformer demo.

    Builds a two-sentence parallel corpus, trains a small Transformer with
    teacher forcing and padding-masked loss, then greedily decodes one
    sentence and (optionally) plots attention heads.
    """
    training_dataset = [[
        "안녕하세요, 제 이름은 윤주성입니다", "hello, my name is joosung yoon"
    ], ["저는 텐서플로우를 좋아합니다", "i like tensorflow"]]

    # Split pairs into source/target sentence lists.
    X_y_split = list(zip(*training_dataset))
    X_train_str = list(
        X_y_split[0])  # ['안녕하세요, 제 이름은 윤주성입니다', '저는 텐서플로우를 좋아합니다']
    y_train_str = list(
        X_y_split[1]
    )  # ['Hello, my name is joosung Yoon', 'I like TensorFlow']
    print(X_train_str)
    print(y_train_str)

    # Build one shared vocabulary over both languages.
    corpus = []
    corpus.extend(X_train_str)
    corpus.extend(y_train_str)
    vocab = build_vocab(corpus)
    print(vocab.idx2word)

    max_sequence_len = 13
    # tar_inp drops the final token, tar_real drops the leading <s>.
    X_train, _, _ = word_to_pad_word_ids(text_batch=X_train_str,
                                         vocab=vocab,
                                         maxlen=max_sequence_len,
                                         add_start_end_token=True)
    _, tar_inp, tar_real = word_to_pad_word_ids(
        text_batch=y_train_str,
        vocab=vocab,
        maxlen=max_sequence_len,
        add_start_end_token=True)  # add +1 maxlen for start, end token
    print(X_train)
    print(tar_inp)
    print(tar_real)
    print(decode_word_ids(X_train, vocab))

    # Model hyper-parameters for the toy Transformer.
    config = {}
    config['vocab_size'] = len(vocab.idx2word)
    config['maxlen'] = max_sequence_len
    config['embed_dim'] = 100
    config['head_num'] = 5
    config['split_embed_dim'] = 20
    config['layer_num'] = 2
    config['feed_forward_dim'] = 100

    # define model
    model = Transformer(config=config)

    # reduction='none' keeps per-token losses so padding can be masked out.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')  # input label == index of class
    optimizer = tf.keras.optimizers.Adam()

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')
    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        name='test_accuracy')

    def loss_function(real, pred):
        """Cross-entropy averaged over all positions, with padding masked."""
        mask = tf.math.logical_not(tf.math.equal(real, 0))  # 1 where not padding
        loss_ = loss_object(real, pred)
        mask = tf.cast(mask, dtype=loss_.dtype)
        # Keep losses at non-padding positions; zero out padded positions.
        loss_ *= mask
        return tf.reduce_mean(loss_)

    def create_padding_mask(seq):
        """1.0 where seq is the pad id (0), broadcastable to attention logits."""
        seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
        # add extra dimensions so that we can add the padding
        # to the attention logits.
        return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

    def create_look_ahead_mask(step_size):
        """Causal mask preventing decoder self-attention from seeing future tokens.

        Each decoding step may only attend to itself and earlier steps, so
        step i masks the remaining step_size - i - 1 future positions; the
        result is an upper-triangular (step_size, step_size) matrix of 1s
        above the diagonal.

        example)
            x = tf.random.uniform((1, 3))
            temp = create_look_ahead_mask(x.shape[1])
            temp: <tf.Tensor: shape=(3, 3), dtype=float32, numpy=
                  array([[ 0.,  1.,  1.],
                         [ 0.,  0.,  1.],
                         [ 0.,  0.,  0.]], dtype=float32)>

        Special usecase:
            tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
            tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
            tf.matrix_band_part(input, 0, 0)  ==> Diagonal.

        :param step_size: decoded sequence length
        :return: (step_size, step_size) float mask
        """
        mask = 1 - tf.linalg.band_part(tf.ones((step_size, step_size)), -1, 0)
        return mask  # (seq_len, seq_len)

    def create_masks(inp, tar):
        # Encoder padding mask
        enc_padding_mask = create_padding_mask(inp)

        # Used in the 2nd attention block in the decoder.
        # This padding mask is used to mask the encoder outputs.
        dec_padding_mask = create_padding_mask(inp)

        # Used in the 1st attention block in the decoder.
        # It is used to pad and mask future tokens in the input received by
        # the decoder.
        look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
        dec_target_padding_mask = create_padding_mask(tar)
        combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
        return enc_padding_mask, combined_mask, dec_padding_mask

    # Wrapping a Python function with the tf.function() decorator would
    # JIT-compile it as one graph (eager mode -> graph mode); left disabled
    # here so the .numpy() debug prints inside keep working.
    # @tf.function
    def train_step(enc_input, tar_inp, tar_real):
        """One optimizer step on a single batch, with teacher forcing."""
        # tar_inp = label[:, :-1]  # remove </s>
        # tar_real = label[:, 1:]  # remove <s>
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            enc_input, tar_inp)
        with tf.GradientTape() as tape:
            predictions, attention_weights = model(enc_input, tar_inp, True,
                                                   enc_padding_mask,
                                                   combined_mask,
                                                   dec_padding_mask)
            loss = loss_function(tar_real, predictions)  # masking losses for padding

            # Debug prints of the batch and its greedy predictions.
            predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32).numpy()
            print("X_train: ", decode_word_ids(enc_input.numpy(), vocab))
            print("tar_inp: ", decode_word_ids(tar_inp.numpy(), vocab))
            print("tar_real: ", decode_word_ids(tar_real.numpy(), vocab))
            print("result: ", decode_word_ids(predicted_id, vocab))

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        train_loss(loss)
        train_accuracy(tar_real, predictions)

    def plot_attention_weights(attention, sentence, result, layer):
        """Plot each attention head of `layer` for one decoded sentence."""
        import matplotlib.pyplot as plt
        from matplotlib import font_manager, rc

        # A Korean-capable font is required to render the axis labels.
        font_name = font_manager.FontProperties(
            fname='/Library/Fonts/NanumSquareBold.ttf').get_name()
        rc('font', family=font_name)

        fig = plt.figure(figsize=(16, 8))
        sentence, _, _ = word_to_pad_word_ids(
            text_batch=[sentence], vocab=vocab, maxlen=max_sequence_len,
            add_start_end_token=True)
        attention = tf.squeeze(attention[layer], axis=0)

        for head in range(attention.shape[0]):
            ax = fig.add_subplot(2, 4, head + 1)

            # plot the attention weights
            im = ax.matshow(
                attention[head][:, :], cmap='viridis')

            fontdict = {'fontsize': 10}
            ax.set_xticks(range(len(decode_word_ids(sentence, vocab)[0])))
            ax.set_yticks(range(len(decode_word_ids(result, vocab)[0])))

            from mpl_toolkits.axes_grid1 import make_axes_locatable
            divider = make_axes_locatable(ax)
            cax = divider.append_axes("right", size="5%", pad=0.05)
            plt.colorbar(im, cax=cax)

            ax.set_xticklabels(decode_word_ids(sentence, vocab)[0],
                               fontdict=fontdict,
                               rotation=90)
            ax.set_yticklabels(decode_word_ids(result, vocab)[0],
                               fontdict=fontdict)
            ax.set_xlabel('Head {}'.format(head + 1))
        plt.tight_layout()
        plt.show()

    def evaluate(inp_sentence, vocab, max_sequence_len):
        """Greedy decoding; stops at </s> or after max_sequence_len steps."""
        # At inference time there is no need to add +1 to the length.
        encoder_input, _, _ = word_to_pad_word_ids(text_batch=[inp_sentence],
                                                   vocab=vocab,
                                                   maxlen=max_sequence_len,
                                                   add_start_end_token=True)
        print("encoder_input: ", encoder_input)

        # Decoder starts from the <s> token only.
        decoder_input = ['<s>']
        decoder_input = [vocab.word2idx[_] for _ in decoder_input]
        output = tf.expand_dims(decoder_input, 0)
        print("output: ", decode_word_ids(output.numpy(), vocab))

        for i in range(max_sequence_len):
            enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
                encoder_input, output)

            # predictions.shape == (batch_size, seq_len, vocab_size)
            predictions, attention_weights = model(encoder_input, output,
                                                   False, enc_padding_mask,
                                                   combined_mask,
                                                   dec_padding_mask)
            print("predicted_id: ",
                  tf.cast(tf.argmax(predictions, axis=-1), tf.int32))

            # select the last word from the seq_len dimension
            predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
            predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

            # return the result if the predicted_id is equal to the end token
            if tf.equal(predicted_id, vocab.word2idx['</s>']):
                return tf.squeeze(output, axis=0), attention_weights

            # concatenate the predicted_id to the output which is given to
            # the decoder as its input.
            output = tf.concat([output, predicted_id], axis=-1)
            print("output: ", decode_word_ids(output.numpy(), vocab))

        return tf.squeeze(output, axis=0), attention_weights

    def translate(sentence, vocab, max_sequence_len, plot=''):
        """Decode one sentence and optionally plot a layer's attention."""
        result, attention_weights = evaluate(sentence, vocab, max_sequence_len)
        result = [result.numpy()]
        predicted_sentence = decode_word_ids(result, vocab)

        print('Input: {}'.format(sentence))
        print('Predicted translation: {}'.format(predicted_sentence))

        if plot:
            plot_attention_weights(attention_weights, sentence, result, plot)

    ### Training
    EPOCHS = 4000
    BATCH_SIZE = 45
    train_ds = tf.data.Dataset.from_tensor_slices((X_train, tar_inp, tar_real))
    train_ds = train_ds.repeat(EPOCHS).shuffle(1024).batch(BATCH_SIZE)
    train_ds = train_ds.prefetch(tf.data.experimental.AUTOTUNE)

    for step, (X_train_batch, tar_inp, tar_real) in enumerate(train_ds):
        train_step(X_train_batch, tar_inp, tar_real)
        template = 'Step {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
        print(
            template.format(step + 1, train_loss.result(),
                            train_accuracy.result() * 100, test_loss.result(),
                            test_accuracy.result() * 100))

    translate("안녕하세요, 제 이름은 윤주성입니다",
              vocab,
              max_sequence_len,
              plot='decoder_layer2_block2')
    model.summary()
source=source, target=target, ) random.seed(const.SEED) np.random.seed(const.SEED) torch.manual_seed(const.SEED) torch.cuda.manual_seed(const.SEED) torch.backends.cudnn.deterministic = True model = Transformer( source_vocab_size=SOURCE_VOCAB_SIZE, target_vocab_size=TARGET_VOCAB_SIZE, source_padding_index=SRC_PAD_IDX, target_padding_index=TRG_PAD_IDX, embedding_size=const.EMBEDDING_SIZE, number_of_layers=const.NUMBER_OF_LAYERS, number_of_heads=const.NUMBER_OF_HEADS, forward_expansion=const.FORWARD_EXPANSION, device=device, ).to(device) model.apply(model_utils.initialize_weights) optimizer = torch.optim.Adam(model.parameters(), lr=const.LEARNING_RATE) cross_entropy = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX) print(f'The model has {model_utils.count_parameters(model):,} trainable parameters') trainer = Trainer( const=const, optimizer=optimizer,
def main():
    '''
    Train a Transformer for machine translation from either an all-in-one
    data pickle or BPE-encoded train/validation files.

    Usage:
    python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 256 -warmup 128000

    Raises:
        ValueError: if neither a log name nor a save path is given, or if
            no data source (-data_pkl, or -train_path/-val_path) is given.
    '''

    parser = argparse.ArgumentParser()

    parser.add_argument('-data_pkl', default=None)     # all-in-1 data pickle or bpe field
    parser.add_argument('-train_path', default=None)   # bpe encoded data
    parser.add_argument('-val_path', default=None)     # bpe encoded data

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-b', '--batch_size', type=int, default=2048)

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    # Word-embedding width must equal the model dimension.
    opt.d_word_vec = opt.d_model

    if not opt.log and not opt.save_model:
        # FIX: the original printed a message and then executed a bare
        # `raise` outside any except block, which fails with
        # "RuntimeError: No active exception to re-raise". Raise an
        # explicit, meaningful exception instead.
        raise ValueError('No experiment result will be saved.')

    if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000:
        print('[Warning] The warmup steps may be not enough.\n' \
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n' \
              'Using smaller batch w/o longer warmup may cause ' \
              'the warmup stage ends with only little data trained.')

    device = torch.device('cuda' if opt.cuda else 'cpu')

    # ========= Loading Dataset =========#
    if all((opt.train_path, opt.val_path)):
        training_data, validation_data = prepare_dataloaders_from_bpe_files(
            opt, device)
    elif opt.data_pkl:
        training_data, validation_data = prepare_dataloaders(opt, device)
    else:
        # FIX: was another bare `raise` — see note above.
        raise ValueError(
            'No data source given: pass -data_pkl, or both -train_path and -val_path.')

    print(opt)

    # opt.src_vocab_size / opt.trg_vocab_size / pad indices are filled in by
    # the prepare_dataloaders* helpers above.
    transformer = Transformer(opt.src_vocab_size,
                              opt.trg_vocab_size,
                              src_pad_idx=opt.src_pad_idx,
                              trg_pad_idx=opt.trg_pad_idx,
                              trg_emb_prj_weight_sharing=opt.proj_share_weight,
                              src_emb_prj_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    # Warm-start from a pretrained checkpoint before fine-tuning.
    model_path = 'checkpoints/pretrained.chkpt'
    checkpoint = torch.load(model_path, map_location=device)
    transformer.load_state_dict(checkpoint['model'])

    # Inverse-sqrt warmup schedule from "Attention Is All You Need".
    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
        2.0, opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
from utils.dataset import Dataset from utils.infer import get_upper_triangular import os save_folder = 'weights' save_file = "triu_mask_model.pkl" step = 1000 total_loss = -1. src_sequence_size = 8 tgt_sequence_size = 8 if __name__ == "__main__": dataset = Dataset(bd.en_dict, bd.cn_dict, bd.sentence_pair_demo, src_sequence_size, tgt_sequence_size) model = Transformer(src_vocab_size=len(bd.en_dict), tgt_vocab_size=len(bd.cn_dict), word_emb_dim=8, tgt_sequence_size=8) loss_f = torch.nn.NLLLoss() optimizer = torch.optim.Adam(model.parameters(), lr=0.001) model.train() upper_tri = get_upper_triangular(8) for i in range(step): optimizer.zero_grad() src, tgt_in, tgt_out, _, _ = dataset.get_batch(batch_size=1) output = model(src, tgt_in, tgt_mask=upper_tri) loss = loss_f(torch.log(output), tgt_out) if total_loss < 0: total_loss = loss.detach().numpy() else: total_loss = total_loss * 0.95 + loss.detach().numpy() * 0.05 loss.backward()