def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-read_train_dir', required=True)
    parser.add_argument('-read_dev_dir', required=True)
    parser.add_argument('-read_test_dir', required=True)
    parser.add_argument('-read_vocab_file', required=True)
    parser.add_argument('-load_model_file', required=True)
    parser.add_argument('-save_model_dir', required=True)
    # the epoch of an initialized model is 0; after 1 epoch of training, epoch is 1
    # if continuing training, curr_epoch should be model.epoch + 1
    parser.add_argument('-epoch', type=int, default=50)
    parser.add_argument('-optim_start_lr', type=float, default=0.001)
    parser.add_argument('-optim_soft_coefficient', type=float, default=1000)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument('-use_gpu', action='store_true')
    parser.add_argument('-save_interval', type=int, default=10)
    opt = parser.parse_args()

    print('[PROCEDURE] prepare training.')
    checkpoint = torch.load(opt.load_model_file,
                            map_location=lambda storage, loc: storage)
    model = checkpoint['model']
    model_options = checkpoint['model_options']
    print('[INFO] loading model with parameter:\n\t{}'.format(model_options))

    print('[INFO] reading training data...')
    train_data = initialize_batch_loader(opt.read_train_dir + '/feats.scp',
                                         opt.read_train_dir + '/text',
                                         opt.read_vocab_file, opt.batch_size)
    print('[INFO] reading dev data...')
    dev_data = initialize_batch_loader(opt.read_dev_dir + '/feats.scp',
                                       opt.read_dev_dir + '/text',
                                       opt.read_vocab_file, opt.batch_size)
    print('[INFO] reading test data...')
    test_data = initialize_batch_loader(opt.read_test_dir + '/feats.scp',
                                        opt.read_test_dir + '/text',
                                        opt.read_vocab_file, opt.batch_size)
    print('[INFO] batch loader is initialized')

    vocab_size = len(torch.load(opt.read_vocab_file))
    crit = get_criterion(vocab_size)
    print('[INFO] using cross entropy loss.')

    optimizer = ScheduledOptim(
        optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        start_lr=opt.optim_start_lr,
        soft_coefficient=opt.optim_soft_coefficient)
    print('[INFO] using adam as optimizer.')

    print('[PROCEDURE] training start...')
    if opt.use_gpu:
        model = model.cuda()
        crit = crit.cuda()
    train(model, train_data, dev_data, test_data, crit, optimizer, opt, model_options)
def driver(_config, _run):
    if _config["selectively_omitted_index"] != -1:
        ex.add_config({
            "omitted_feature_name":
            _config["selective_audio_visual_feature_omission"][
                _config["selectively_omitted_index"]]["name"]
        })
    output = open('config_file.pkl', 'wb')
    pickle.dump(_config, output)
    ex.add_artifact('config_file.pkl')
    output.close()

    set_random_seed()
    # print("inside driver")
    # X_train, y_train, X_valid, y_valid, X_test, y_test = load_saved_data()
    # print(X_train, y_train, X_valid, y_valid, X_test, y_test)
    train_data_loader, dev_data_loader, test_data_loader = set_up_data_loader()

    multimodal_context_config = _config["multimodal_context_configs"]
    model = Contextual_MFN(_config, my_logger).to(_config["device"])

    # For now, we will use the same scheduler for the entire model.
    # Later, if necessary, we may use the default optimizer of MFN.
    # TODO: May have to use separate schedulers for the transformer and mfn
    # We are using the optimizer and scheduler of mfn as a last resort
    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        multimodal_context_config["d_model"],
        multimodal_context_config["n_warmup_steps"])

    # TODO: May have to change the criterion
    # criterion = nn.L1Loss()
    criterion = nn.BCEWithLogitsLoss()
    criterion = criterion.to(_config["device"])

    # optimizer = optim.Adam(
    #     filter(lambda x: x.requires_grad, model.parameters()), lr=_config["config"]["lr"],
    #     betas=(0.9, 0.98), eps=1e-09)
    # torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    # optimizer = ReduceLROnPlateau(optimizer, mode='min', patience=100, factor=0.5, verbose=False)
    # scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=100, factor=0.5, verbose=True)

    train(model, train_data_loader, dev_data_loader, optimizer, criterion)
    # test_accuracy = test_score_from_model(model, test_data_loader, criterion)
    test_accuracy = test_score_from_file(test_data_loader, criterion)
    ex.log_scalar("test.accuracy", test_accuracy)

    results = dict()
    # I believe it will try to minimize this target; let's see how it plays out
    results["optimization_target"] = 1 - test_accuracy

    stat_file = open("all_accuracies_for_stat.txt", "a")
    stat_file.write(str(_config["experiment_config_index"]) + "," + str(test_accuracy) + "\n")
    stat_file.close()
    return results
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-read_feats_scp_file', required=True)
    parser.add_argument('-read_text_file', required=True)
    parser.add_argument('-read_vocab_file', required=True)
    parser.add_argument('-load_model_file', required=True)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-epoch', type=int, default=10)
    # the epoch of an initialized model is 0; after 1 epoch of training, epoch is 1
    # if continuing training, curr_epoch should be model.epoch + 1
    parser.add_argument('-curr_epoch', type=int, default=1)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument('-save_model_perfix', required=True)
    parser.add_argument('-use_gpu', action='store_true')
    opt = parser.parse_args()

    print('--------------------[PROCEDURE]--------------------')
    print('[PROCEDURE] prepare training.')
    train_data = initialize_batch_loader(opt.read_feats_scp_file, opt.read_text_file,
                                         opt.read_vocab_file, opt.batch_size)
    eval_data = initialize_batch_loader(opt.read_feats_scp_file, opt.read_text_file,
                                        opt.read_vocab_file, opt.batch_size)
    print('[INFO] batch loader is initialized')

    checkpoint = torch.load(opt.load_model_file)
    model = checkpoint['model']
    model_options = checkpoint['model_options']
    print('[INFO] loading model with parameter: {}'.format(model_options))

    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[constants.PAD] = 0
        return nn.CrossEntropyLoss(weight, size_average=False)

    vocab_size = len(torch.load(opt.read_vocab_file))
    crit = get_criterion(vocab_size)
    print('[INFO] using cross entropy loss.')

    optimizer = ScheduledOptim(
        optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        model_options.d_model * 2, opt.n_warmup_steps)
    print('[INFO] using adam as optimizer.')

    print('--------------------[PROCEDURE]--------------------')
    print('[PROCEDURE] training start...')
    if opt.use_gpu:
        train(model.cuda(), train_data, eval_data, crit.cuda(), optimizer, opt, model_options)
    else:
        train(model, train_data, eval_data, crit, optimizer, opt, model_options)
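The nested get_criterion above (and the similar helpers using Constants.PAD further down) builds its PAD-masked loss with nn.CrossEntropyLoss(weight, size_average=False); size_average has since been deprecated in PyTorch. A minimal sketch of an equivalent criterion using the non-deprecated arguments; the pad_idx parameter name is ours, and the caller would pass constants.PAD as in the snippet:

import torch.nn as nn

def get_criterion(pad_idx):
    # Summed token-level cross entropy that skips PAD targets; this matches the
    # zero-weight trick above but via ignore_index / reduction instead.
    return nn.CrossEntropyLoss(ignore_index=pad_idx, reduction='sum')

# e.g. crit = get_criterion(constants.PAD)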
def skyline_iteration_provider(transformer):
    opt = model_config()
    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    def iteration(src_seq, src_pos, tgt_seq, tgt_pos, gold):
        optimizer.zero_grad()
        loss = transformer(src_seq, src_pos, tgt_seq, tgt_pos, gold)
        loss.backward()
        optimizer.step_and_update_lr()

    return iteration
def prepare_for_training(_config):
    train_data_loader, dev_data_loader, test_data_loader = set_up_data_loader()

    if _config["model"] == "trans_mfn":
        model = Transformed_mfn(_config).to(_config["device"])
    else:
        model = Multimodal_Video_transformer(_config, my_logger).to(_config["device"])

    # For now, we will use the same scheduler for the entire model.
    # Later, if necessary, we may use the default optimizer of MFN.
    # TODO: May have to use separate schedulers for the transformer and mfn
    # We are using the optimizer and scheduler of mfn as a last resort
    if _config["optim"] == "transformer":
        optimizer = ScheduledOptim(
            optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                       betas=(0.9, 0.98), eps=1e-09),
            _config["sentence_transformer_configs"]["d_model"] * 5,
            _config["sentence_transformer_configs"]["n_warmup_steps"])
    elif _config["optim"] == "paul":
        print("initializing paul trans")
        optimizer_adam = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                                    lr=_config["lr"], betas=(0.9, 0.98), eps=1e-09)
        scheduler = ReduceLROnPlateau(optimizer_adam, mode='min', patience=100,
                                      factor=0.5, verbose=True)
        optimizer = Optimizer_Scheduler(optimizer_adam, scheduler)

    # We are multiplying by 3 as there are three different transformer units
    # TODO: May have to change the criterion
    # Since the scores are in float format, we use the L1Loss
    if _config["loss_function"] == "ll1":
        criterion = nn.L1Loss()
    else:
        criterion = nn.BCEWithLogitsLoss()
    criterion = criterion.to(_config["device"])

    return train_data_loader, dev_data_loader, test_data_loader, model, optimizer, criterion
def train(self, niter=1):
    optimizer = ScheduledOptim(
        optim.Adam(self.module.parameters(), betas=(0.9, 0.98), eps=1e-09),
        2.0, self.opt.d_model, self.opt.n_warmup_steps)
    for _ in range(niter):
        optimizer.zero_grad()
        pred = self.module(*self.example_inputs)
        loss, n_correct, n_word = cal_performance(
            pred, self.gold, self.opt.trg_pad_idx,
            smoothing=self.opt.label_smoothing)
        loss.backward()
        optimizer.step_and_update_lr()
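Every snippet here hands its Adam instance to a ScheduledOptim wrapper and then drives it through zero_grad() and step_and_update_lr(); the older snippets pass (optimizer, d_model, n_warmup_steps), the newer ones add an lr_mul factor (2.0 above). For reference, a minimal sketch of such a wrapper, assuming it follows the inverse-square-root warmup schedule from "Attention Is All You Need"; the exact signature, the n_step resume argument, and the returned learning rate are assumptions, not the verbatim implementation used by these projects:

class ScheduledOptim:
    """Wrap an optimizer; apply linear warmup then inverse-square-root LR decay."""

    def __init__(self, optimizer, lr_mul, d_model, n_warmup_steps, n_step=0):
        self._optimizer = optimizer
        self.lr_mul = lr_mul
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = n_step  # non-zero when resuming from a checkpoint

    def zero_grad(self):
        self._optimizer.zero_grad()

    def step_and_update_lr(self):
        lr = self._update_learning_rate()
        self._optimizer.step()
        return lr  # some callers log the returned learning rate

    def _update_learning_rate(self):
        self.n_steps += 1
        # lr = lr_mul * d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)
        scale = (self.d_model ** -0.5) * min(
            self.n_steps ** -0.5, self.n_steps * self.n_warmup_steps ** -1.5)
        lr = self.lr_mul * scale
        for group in self._optimizer.param_groups:
            group['lr'] = lr
        return lr

Construction then mirrors the calls above, e.g. ScheduledOptim(optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09), 2.0, 512, 4000).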
def main():
    ''' Main function '''
    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len
    trn_data, val_data = prepare_dataloaders(data, opt)
    opt.src_vocab_size = trn_data.dataset.src_vocab_size
    opt.tgt_vocab_size = trn_data.dataset.tgt_vocab_size

    # ========= Preparing Model =========#
    if opt.embs_share_weight:
        assert trn_data.dataset.src_word2idx == trn_data.dataset.tgt_word2idx, \
            ('The src/tgt word2idx table are different but asked to share '
             'word embedding.')
    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, trn_data, val_data, optimizer, device, opt)
def prep_for_training(_config):
    encoder_config = _config["encoder"]
    model = Multi_Transformer(
        n_src_features=encoder_config["n_source_features"],
        len_max_seq=encoder_config["max_token_seq_len"],
        _config=_config,
        tgt_emb_prj_weight_sharing=encoder_config["proj_share_weight"],
        emb_src_tgt_weight_sharing=encoder_config["embs_share_weight"],
        d_k=encoder_config["d_k"],
        d_v=encoder_config["d_v"],
        d_model=encoder_config["d_model"],
        d_word_vec=encoder_config["d_word_vec"],
        d_inner=encoder_config["d_inner_hid"],
        n_layers=encoder_config["n_layers"],
        n_head=encoder_config["n_head"],
        dropout=encoder_config["dropout"]).to(_config["device"])

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        encoder_config["d_model"], encoder_config["n_warmup_steps"])

    if _config["loss_function"] == "bce":
        print("using bce loss")
        criterion = nn.BCEWithLogitsLoss()
    else:
        criterion = nn.L1Loss()
    criterion = criterion.to(_config["device"])

    # optimizer = optim.Adam(
    #     filter(lambda x: x.requires_grad, transformer.parameters()), lr=_config["learning_rate"],
    #     betas=(0.9, 0.98), eps=1e-09)
    # torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    # optimizer = ReduceLROnPlateau(optimizer, mode='min', patience=100, factor=0.5, verbose=False)
    # scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=100, factor=0.5, verbose=True)

    return model, optimizer, criterion
class Dataset: def __init__(self, source_dataset, batch_size, epochs, window_size, device, plot_file, train_data, test_data, valid_data, target_column, target_min, target_max, d_inner, n_layers, n_head_, d_k, d_v, n_warmup_steps, criterion, target_name, d_model, model_file=None, load_data=False, load_model=False): self.data_frame = self.read_dataset(source_dataset) self.batch_size = batch_size self.epochs = epochs self.device = device self.target_column = target_column self.window = window_size self.plot_file = plot_file self.n_layers = n_layers self.n_head = n_head_ self.d_inner = d_inner self.warmup_step = n_warmup_steps self.d_k = d_k self.d_v = d_v self.d_model = d_model self.target_name = target_name self.input_mask = torch.ones([self.batch_size, 1, self.window], dtype=torch.int, device=device) self.target_max = target_max self.target_min = target_min self.model_file = model_file self.prev_epoch = 0 if load_data: self.train_df = pd.read_csv(train_data) self.test_df = pd.read_csv(test_data) self.valid_df = pd.read_csv(valid_data) else: self.train_df, self.valid_df, self.test_df = self.organize_dataset( train_data, test_data, valid_data) pad_col = [ 'col' + str(i) for i in range(self.train_df.shape[1], self.d_model) ] for col in pad_col: self.train_df[col] = 0 self.test_df[col] = 0 self.valid_df[col] = 0 self.columns = self.train_df.shape[1] self.model = Encoder(n_position=200, d_word_vec=self.columns, d_model=self.columns, d_inner=d_inner, n_layers=n_layers, n_head=n_head_, d_k=d_k, d_v=d_v, dropout=0).to(device) if load_model: self.model = torch.load(self.model_file)['model'] self.model.eval() self.model = self.model.to(device) self.prev_epoch = torch.load(self.model_file)['epoch'] self.criterion = criterion self.optimizer = ScheduledOptim( optim.Adam(self.model.parameters(), betas=(0.9, 0.98), eps=1e-09), 2.0, self.columns, n_warmup_steps, n_step=self.prev_epoch * (math.floor( self.train_df.shape[0] / self.window * self.batch_size))) self.loss_list = [] self.lr_list = [] def read_dataset(self, source_dataset): return pd.read_csv(source_dataset) def organize_dataset(self, train_data, test_data, valid_data): train_df = self.data_frame valid_df = self.data_frame test_df = self.data_frame return train_df, valid_df, test_df def train(self): train_tensor = torch.tensor(self.train_df.values, dtype=torch.float, device=self.device) train_rows = self.train_df.shape[0] section_size = self.window * self.batch_size avg_loss = 0 for i in range(self.epochs): chosen_idx = np.random.choice(train_rows, replace=True, size=math.floor(train_rows / 10)) imputing_df = self.train_df.copy() imputing_df.iloc[[j in chosen_idx for j in range(train_rows)], self.target_column] = 0 imputing_tensor = torch.tensor(imputing_df.values, dtype=torch.float, device=self.device) avg_loss = 0 lr = 0 for j in range(math.floor(train_rows / section_size)): batch_imputing_tensor = imputing_tensor[j * section_size:(j + 1) * section_size, :] batch_train_tensor = train_tensor[j * section_size:(j + 1) * section_size, :] input_tensor = self.unsqueeze(batch_imputing_tensor) self.optimizer.zero_grad() imputed_tensor = self.squeeze( self.model(input_tensor, self.input_mask)[0]) imputing_idx = [ k in chosen_idx for k in range(j * section_size, (j + 1) * section_size) ] imputing_idx_tensor = torch.tensor(imputing_idx) imputed_label_tensor = imputed_tensor[imputing_idx_tensor, self.target_column] true_label_tensor = batch_train_tensor[imputing_idx_tensor, self.target_column] loss = torch.sqrt( self.criterion(imputed_label_tensor, 
true_label_tensor)) # loss = self.criterion(imputed_label_tensor, true_label_tensor) if imputed_label_tensor.shape[0] > 0: loss.backward() #here compute engine lr = self.optimizer.step_and_update_lr() avg_loss = (j * avg_loss + loss) / (j + 1) self.loss_list.append(avg_loss * (self.target_max - self.target_min)) self.lr_list.append(10000 * lr) self.save_model(i) print(avg_loss * (self.target_max - self.target_min)) self.draw_plots(avg_loss * (self.target_max - self.target_min)) def validate(self): valid_tensor = torch.tensor(self.valid_df.values, dtype=torch.float, device=self.device) valid_rows = self.valid_df.shape[0] section_size = self.window * self.batch_size chosen_idx = np.random.choice(valid_rows, replace=True, size=math.floor(valid_rows / 10)) imputing_df = self.valid_df.copy() imputing_df.iloc[[j in chosen_idx for j in range(valid_rows)], self.target_column] = 0 imputing_tensor = torch.tensor(imputing_df.values, dtype=torch.float, device=self.device) avg_loss = 0 imputed_list = [] for j in range(math.floor(valid_rows / section_size)): batch_imputing_tensor = imputing_tensor[j * section_size:(j + 1) * section_size, :] batch_valid_tensor = valid_tensor[j * section_size:(j + 1) * section_size, :] input_tensor = self.unsqueeze(batch_imputing_tensor) imputed_tensor = self.squeeze( self.model(input_tensor, self.input_mask)[0]) imputing_idx = [ k in chosen_idx for k in range(j * section_size, (j + 1) * section_size) ] imputing_idx_tensor = torch.tensor(imputing_idx) imputed_label_tensor = imputed_tensor[imputing_idx_tensor, self.target_column] true_label_tensor = batch_valid_tensor[imputing_idx_tensor, self.target_column] imputed_list = imputed_list + imputed_tensor[:, self. target_column].tolist( ) # loss = torch.sqrt(self.criterion(imputed_label_tensor, true_label_tensor)) loss = self.criterion(imputed_label_tensor, true_label_tensor) if imputed_label_tensor.shape[0] > 0: avg_loss = (j * avg_loss + loss) / (j + 1) print(avg_loss * (self.target_max - self.target_min)) valid_list = valid_tensor[:, self.target_column].tolist() imputed_list = [(imputed_list[i] * (i in chosen_idx) + valid_list[i] * (i not in chosen_idx)) for i in range(len(imputed_list))] plt.plot(imputed_list, 'r', label="Imputed") plt.plot(valid_list, 'b', label="True") plt.legend(loc="upper right") plt.show() def unsqueeze(self, batch_tensor): temp_tensor = torch.zeros((self.batch_size, self.window, self.columns), dtype=torch.float, device=self.device) for i in range(self.batch_size): temp_tensor[i, :, :] = batch_tensor[i * self.window:(i + 1) * self.window, :] return temp_tensor def squeeze(self, predict_tensor): temp_tensor = torch.zeros( (self.batch_size * self.window, self.columns), dtype=torch.float, device=self.device) for i in range(self.batch_size): temp_tensor[i * self.window:(i + 1) * self.window, :] = predict_tensor[i, :, :] return temp_tensor def draw_plots(self, avg_loss): plt.plot(self.loss_list, 'r', label="Loss") plt.plot(self.lr_list, 'b', label="10000 * Learning Rate") title = 'n_layers: ' + str(self.n_layers) + '\n' + 'n_heads: ' + str( self.n_head ) + '\n' + 'd_inner: ' + str( self.d_inner ) + '\n' + 'warmup_step: ' + str( self.warmup_step ) + '\n' + 'd_v: ' + str(self.d_v) + '\n' + 'd_k: ' + str( self.d_k ) + '\n' + 'd_model: ' + str(self.d_model) + '\n' + 'window: ' + str( self.window ) + '\n' + 'target_column: ' + self.target_name + '\n' + 'Loss_function: ' + str( self.criterion) + '\n' + 'avg_loss: ' + str(float(avg_loss.data)) plt.legend(loc="upper right", title=title) timestr = 
time.strftime("%Y%m%d-%H%M%S") plt.savefig(self.plot_file + timestr, quality=90) def save_model(self, epoch): checkpoint = { 'epoch': epoch, 'lr_list': self.lr_list, 'loss_list': self.loss_list, 'model': self.model } if self.model_file: torch.save(checkpoint, self.model_file)
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    # parser.add_argument(
    #     '-data', default='/data/nfsdata/data/sunzijun/transformer/burry4/data.pt')
    parser.add_argument('-data', default='./mini_data.pt')
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)
    # parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=3)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default='trained')
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='all')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    debug = False
    if debug:
        opt.batch_size = 2
        opt.dropout = 0
        opt.epoch = 300
        opt.log = '/home/sunzijun/data/CRNN_trans'

    # ========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len
    training_data, validation_data = my_prepare_dataloaders(data, opt)
    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size
    # build the id2word dictionary
    idx2word = {idx: word for word, idx in data['dict']['tgt'].items()}

    # ========= Preparing Model =========#
    print("************ prepare model ****************")
    if opt.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx table are different but asked to share word embedding.'
    print(opt)

    device_ids = [1, 3]
    device = torch.device('cuda', device_ids[0])
    transformer = CRNN_Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout)
    transformer = transformer.cuda(device_ids[0])  # move the model onto cuda device 0
    transformer = torch.nn.DataParallel(
        transformer, device_ids=device_ids)  # re-wrap with DataParallel

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt, idx2word)
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-data', required=True) parser.add_argument('-epoch', type=int, default=None) parser.add_argument('-step', type=int, default=None) parser.add_argument('-batch_size', type=int, default=64) #parser.add_argument('-d_word_vec', type=int, default=512) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) # NOTE(keshav2): This just refers to the learning rate schedule, # nothing performance related. parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('--checkpoint_dir', type=str, default='/lfs/1/keshav2/checkpoints/transformer') parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='all') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') parser.add_argument('--dist-url', default='env://', type=str, help='url used to set up distributed training') parser.add_argument('--dist-backend', default='nccl', type=str, help='Distributed backend') parser.add_argument('--local_rank', default=0, type=int, help='Local rank') parser.add_argument('--rank', default=None, type=int, help='Rank') parser.add_argument('--world_size', default=None, type=int, help='World size') parser.add_argument('--master_addr', default=None, type=str, help='Master address to use for distributed run') parser.add_argument('--master_port', default=None, type=int, help='Master port to use for distributed run') parser.add_argument('--throughput_estimation_interval', type=int, default=None, help='Steps between logging steps completed') parser.add_argument('--max_duration', type=int, default=None, help='Maximum duration in seconds') parser.add_argument('--enable_gavel_iterator', action='store_true', default=False, help='If set, use Gavel iterator') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model torch.cuda.set_device(opt.local_rank) if opt.epoch is not None and opt.step is not None: raise ValueError('Only one of epoch and step may be set') elif opt.epoch is None and opt.step is None: raise ValueError('One of epoch and step must be set') opt.distributed = False if opt.master_addr is not None: opt.distributed = True os.environ['MASTER_ADDR'] = opt.master_addr os.environ['MASTER_PORT'] = str(opt.master_port) dist.init_process_group(backend=opt.dist_backend, init_method=opt.dist_url, world_size=opt.world_size, rank=opt.rank) #========= Loading Dataset =========# data = torch.load(opt.data) opt.max_token_seq_len = data['settings'].max_token_seq_len training_data, validation_data = prepare_dataloaders( data, opt, opt.master_addr is not None) opt.src_vocab_size = training_data.dataset.src_vocab_size opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size #========= Preparing Model =========# if opt.embs_share_weight: assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \ 'The src/tgt word2idx table are different but asked to share word embedding.' 
print(opt) device = torch.device('cuda' if opt.cuda else 'cpu') transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) if opt.distributed: transformer = DDP(transformer, device_ids=[opt.local_rank], output_device=opt.local_rank) if opt.enable_gavel_iterator: training_data = GavelIterator(training_data, opt.checkpoint_dir, load_checkpoint, save_checkpoint) optimizer = ScheduledOptim( optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) train(transformer, training_data, validation_data, optimizer, device, opt)
def main():
    parser = argparse.ArgumentParser(description='main_train.py')
    # dir = "../data/jd/big"
    # dir = "../data/jd/middle"
    dir = "../data/jd/pure"
    parser.add_argument('-data_dir', default=dir)
    parser.add_argument('-epoch', type=int, default=30)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    # parser.add_argument('-d_inner_hid', type=int, default=1024)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-ct_layers', type=int, default=1)  # ContextLayers
    parser.add_argument('-n_layers', type=int, default=3)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true', default=True)
    parser.add_argument('-proj_share_weight', action='store_true', default=True)
    parser.add_argument('-label_smoothing', action='store_true', default=True)
    parser.add_argument('-log', default="log")
    # parser.add_argument('-save_model', default="model")
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')
    parser.add_argument(
        '-device', action='store_true',
        default=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'))
    args = parser.parse_args()
    args.model_name = str(args.ct_layers) + '_' + str(args.n_layers) + '_'
    if not os.path.exists(args.log):
        os.mkdir(args.log)

    print("Loading vocabulary")
    reader = torch.load(args.data_dir + "/reader.data")
    args.max_token_seq_len = reader['settings']["max_token_seq_len"]
    args.max_word_seq_len = reader['settings']["max_word_seq_len"]

    print("Loading validation data")
    valid_src = read_file(path=args.data_dir + "/valid_src.txt")
    valid_tgt = read_file(path=args.data_dir + "/valid_tgt.txt")
    valid_ctx = read_file(path=args.data_dir + "/valid_attr.txt")
    valid_src, valid_ctx, valid_tgt = \
        digitalize(src=valid_src, tgt=valid_tgt, ctx=valid_ctx,
                   max_sent_len=args.max_token_seq_len - 2,
                   word2idx=reader['dict']['src'],
                   index2freq=reader["dict"]["frequency"], topk=3)
    # training_data, validation_data = prepare_dataloaders(reader, data, args)
    validation_data = torch.utils.data.DataLoader(
        SeqDataset(src_word2idx=reader['dict']['src'],
                   tgt_word2idx=reader['dict']['tgt'],
                   ctx_word2idx=reader['dict']['ctx'],
                   src_insts=valid_src,
                   ctx_insts=valid_ctx,
                   tgt_insts=valid_tgt),
        num_workers=4, pin_memory=False, batch_size=args.batch_size,
        collate_fn=tri_collate_fn)
    del valid_src, valid_ctx, valid_tgt

    print("Loading training data")
    begin, end = 0, sys.maxsize
    # begin, end = 0, 100
    train_src = read_file(path=args.data_dir + "/train_src.txt", begin=begin, end=end)
    train_tgt = read_file(path=args.data_dir + "/train_tgt.txt", begin=begin, end=end)
    train_ctx = read_file(path=args.data_dir + "/train_attr.txt", begin=begin, end=end)
    train_src, train_ctx, train_tgt = \
        digitalize(src=train_src, tgt=train_tgt, ctx=train_ctx,
                   max_sent_len=args.max_token_seq_len - 2,
                   word2idx=reader['dict']['src'],
                   index2freq=reader["dict"]["frequency"], topk=0)
    training_data = torch.utils.data.DataLoader(
        SeqDataset(src_word2idx=reader['dict']['src'],
                   tgt_word2idx=reader['dict']['tgt'],
                   ctx_word2idx=reader['dict']['ctx'],
                   src_insts=train_src,
                   ctx_insts=train_ctx,
                   tgt_insts=train_tgt),
        num_workers=4, pin_memory=False, batch_size=args.batch_size,
        collate_fn=tri_collate_fn, shuffle=True)
    del train_src, train_ctx, train_tgt

    args.src_vocab_size = training_data.dataset.src_vocab_size
    args.tgt_vocab_size = training_data.dataset.tgt_vocab_size
    args.ctx_vocab_size = training_data.dataset.ctx_vocab_size
    args.idx2word = {idx: word for word, idx in reader['dict']['src'].items()}

    print("--- Preparing model ---")
    if args.embs_share_weight:
        assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
            'The src/tgt word2idx tables are different but asked to share word embedding.'
    print(args)

    args.model_path = "log/" + args.model_name + ".model"
    if os.path.exists(args.model_path):
        checkpoint = torch.load(args.model_path, map_location=args.device)
        model_opt = checkpoint['settings']
        transformer = ContextTransformer(
            model_opt.ctx_vocab_size,
            model_opt.src_vocab_size,
            model_opt.tgt_vocab_size,
            model_opt.max_token_seq_len,
            tgt_emb_prj_weight_sharing=model_opt.proj_share_weight,
            emb_src_tgt_weight_sharing=model_opt.embs_share_weight,
            d_k=model_opt.d_k,
            d_v=model_opt.d_v,
            d_model=model_opt.d_model,
            d_word_vec=model_opt.d_word_vec,
            d_inner=model_opt.d_inner_hid,
            ct_layers=model_opt.en_layers,
            n_layers=model_opt.n_layers,
            n_head=model_opt.n_head,
            dropout=model_opt.dropout)
        if args.ct_layers < 0:
            transformer = Transformer(
                args.src_vocab_size,
                args.tgt_vocab_size,
                args.max_token_seq_len,
                tgt_emb_prj_weight_sharing=args.proj_share_weight,
                emb_src_tgt_weight_sharing=args.embs_share_weight,
                d_k=args.d_k,
                d_v=args.d_v,
                d_model=args.d_model,
                d_word_vec=args.d_word_vec,
                d_inner=args.d_inner_hid,
                n_layers=args.n_layers,
                n_head=args.n_head,
                dropout=args.dropout).to(args.device)
        transformer.load_state_dict(checkpoint['model'])
        transformer = transformer.to(args.device)
        print('[Info] Loaded checkpoint, resuming training')
    else:
        transformer = ContextTransformer(
            args.ctx_vocab_size,
            args.src_vocab_size,
            args.tgt_vocab_size,
            args.max_token_seq_len,
            tgt_emb_prj_weight_sharing=args.proj_share_weight,
            emb_src_tgt_weight_sharing=args.embs_share_weight,
            d_k=args.d_k,
            d_v=args.d_v,
            d_model=args.d_model,
            d_word_vec=args.d_word_vec,
            d_inner=args.d_inner_hid,
            ct_layers=args.ct_layers,
            n_layers=args.n_layers,
            n_head=args.n_head,
            dropout=args.dropout).to(args.device)
        if args.ct_layers < 0:
            transformer = Transformer(
                args.src_vocab_size,
                args.tgt_vocab_size,
                args.max_token_seq_len,
                tgt_emb_prj_weight_sharing=args.proj_share_weight,
                emb_src_tgt_weight_sharing=args.embs_share_weight,
                d_k=args.d_k,
                d_v=args.d_v,
                d_model=args.d_model,
                d_word_vec=args.d_word_vec,
                d_inner=args.d_inner_hid,
                n_layers=args.n_layers,
                n_head=args.n_head,
                dropout=args.dropout).to(args.device)

    optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                                 betas=(0.9, 0.98), eps=1e-09)
    args_optimizer = ScheduledOptim(optimizer, args.d_model, args.n_warmup_steps)

    printModel(transformer)
    train(transformer, training_data, validation_data, args_optimizer, args)
def train(model, training_data, validation_data, device, opt, training_mode): ''' Start training ''' log_train_file, log_valid_file = None, None try: os.makedirs(opt.save_folder) except FileExistsError: pass if opt.log: if training_mode == TRAIN_BASE: log_train_file = os.path.join(opt.save_folder, opt.log + '.train.base.log') log_valid_file = os.path.join(opt.save_folder, opt.log + '.valid.base.log') elif training_mode == TRAIN_ENCODER: log_train_file = os.path.join( opt.save_folder, opt.log + '.train.encoder.highway.log') log_valid_file = os.path.join( opt.save_folder, opt.log + '.valid.encoder.highway.log') elif training_mode == TRAIN_DECODER: log_train_file = os.path.join( opt.save_folder, opt.log + '.train.decoder.highway.log') log_valid_file = os.path.join( opt.save_folder, opt.log + '.valid.decoder.highway.log') print('[Info] Training performance will be written to file: {} and {}'. format(log_train_file, log_valid_file)) with open(log_train_file, 'w') as log_tf, open(log_valid_file, 'w') as log_vf: log_tf.write('epoch,loss,ppl,accuracy\n') log_vf.write('epoch,loss,ppl,accuracy\n') def print_performances(header, loss, accu, start_time): print(' - {header:12} ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %, '\ 'elapse: {elapse:3.3f} min'.format( header=f"({header})", ppl=math.exp(min(loss, 100)), accu=100*accu, elapse=(time.time()-start_time)/60)) no_decay = ["bias", "LayerNorm.weight"] if training_mode == TRAIN_BASE: optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if ("encoder_highway" not in n) and ( "decoder_highway" not in n) and (not any( nd in n for nd in no_decay)) ], "weight_decay": opt.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if ("encoder_highway" not in n) and ("decoder_highway" not in n) and (any(nd in n for nd in no_decay)) ], "weight_decay": 0.0, }, ] elif training_mode == TRAIN_ENCODER: optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if ("encoder_highway" in n) and (not any( nd in n for nd in no_decay)) ], "weight_decay": opt.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if ("encoder_highway" in n) and (any(nd in n for nd in no_decay)) ], "weight_decay": 0.0, }, ] elif training_mode == TRAIN_DECODER: optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if ("decoder_highway" in n) and (not any( nd in n for nd in no_decay)) ], "weight_decay": opt.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if ("decoder_highway" in n) and (any(nd in n for nd in no_decay)) ], "weight_decay": 0.0, }, ] optimizer = ScheduledOptim( optim.Adam(optimizer_grouped_parameters, betas=(0.9, 0.98), eps=1e-09), 2.0, opt.d_model, opt.n_warmup_steps) if training_mode == TRAIN_BASE: training_epoch = opt.base_epoch elif training_mode == TRAIN_ENCODER: training_epoch = opt.highway_encoder_epoch elif training_mode == TRAIN_DECODER: training_epoch = opt.highway_decoder_epoch #valid_accus = [] valid_losses = [] for epoch_i in range(training_epoch): print('[ Epoch', epoch_i, ']') start = time.time() train_loss, train_accu = train_epoch(model, training_data, optimizer, opt, device, smoothing=opt.label_smoothing, training_mode=training_mode) print_performances('Training', train_loss, train_accu, start) start = time.time() valid_loss, valid_accu = eval_epoch(model, validation_data, device, opt, training_mode) print_performances('Validation', valid_loss, valid_accu, start) valid_losses += [valid_loss] checkpoint = { 'epoch': epoch_i, 
'settings': opt, 'model': model.state_dict() } if opt.save_model: if opt.save_mode == 'all': if training_mode == TRAIN_BASE: model_name = os.path.join( opt.save_folder, opt.save_model + '_accu_{accu:3.3f}.chkpt'.format( accu=100 * valid_accu)) elif training_mode == TRAIN_ENCODER: model_name = os.path.join( opt.save_folder, opt.save_model + '_loss_{loss:3.3f}_encoder_highway.chkpt'.format( valid_loss)) elif training_mode == TRAIN_DECODER: model_name = os.path.join( opt.save_folder, opt.save_model + '_accu_{accu:3.3f}_decoder_highway.chkpt'.format( accu=100 * valid_accu)) torch.save(checkpoint, model_name) elif opt.save_mode == 'best': if training_mode == TRAIN_BASE: model_name = os.path.join(opt.save_folder, opt.save_model + '.chkpt') elif training_mode == TRAIN_ENCODER: model_name = os.path.join( opt.save_folder, opt.save_model + '_encoder_highway.chkpt') elif training_mode == TRAIN_DECODER: model_name = os.path.join( opt.save_folder, opt.save_model + '_decoder_highway.chkpt') if training_mode == TRAIN_BASE and valid_loss <= min( valid_losses): torch.save(checkpoint, model_name) elif training_mode == TRAIN_ENCODER or training_mode == TRAIN_DECODER: torch.save(checkpoint, model_name) print(' - [Info] The checkpoint file has been updated.') if log_train_file and log_valid_file: with open(log_train_file, 'a') as log_tf, open(log_valid_file, 'a') as log_vf: log_tf.write( '{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format( epoch=epoch_i, loss=train_loss, ppl=math.exp(min(train_loss, 100)), accu=100 * train_accu)) log_vf.write( '{epoch},{loss: 8.5f},{ppl: 8.5f},{accu:3.3f}\n'.format( epoch=epoch_i, loss=valid_loss, ppl=math.exp(min(valid_loss, 100)), accu=100 * valid_accu))
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-data', required=True) parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-batch_size', type=int, default=64) # parser.add_argument('-d_word_vec', type=int, default=512) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('-save_model', default=None) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model # ========= Loading Dataset =========# data = torch.load(opt.data) opt.max_token_seq_len = data['settings'].max_token_seq_len training_data, validation_data = prepare_dataloaders(data, opt) opt.src_vocab_size = training_data.dataset.src_vocab_size opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size # ========= Preparing Model =========# if opt.embs_share_weight: assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \ 'The src/tgt word2idx table are different but asked to share word embedding.' print(opt) device = torch.device('cuda' if opt.cuda else 'cpu') transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) optimizer = ScheduledOptim( optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) train(transformer, training_data, validation_data, optimizer, device, opt)
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-data', required=True) parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-batch_size', type=int, default=64) # parser.add_argument('-d_word_vec', type=int, default=512) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default='default') parser.add_argument('-tensorboard', default=None) parser.add_argument('-save_model', default=None) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model global global_counter global_counter = 0 writer = None if opt.tensorboard: writer = SummaryWriter(os.path.join('./logs', opt.tensorboard)) # ========= Loading Dataset =========# data = torch.load(opt.data) global idx2char idx2char = {v: k for k, v in data['dict']['src'].items()} opt.max_token_seq_len = data['settings'].max_token_seq_len training_data, validation_data, unique_char_len = prepare_dataloaders( data, opt) opt.src_vocab_size = training_data.dataset.src_vocab_size opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size # ========= Preparing Model =========# if opt.embs_share_weight: assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \ 'The src/tgt word2idx table are different but asked to share word embedding.' print(opt) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) try: transformer.load_state_dict(torch.load('./checkpoints/model.pt')) print("Model loaded successfully.......") except: pass optimizer = ScheduledOptim( optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) train(transformer, training_data, validation_data, optimizer, device, opt, unique_char_len, writer)
def main(): parser = argparse.ArgumentParser() parser.add_argument('-data', default='./data/preprocessedData') parser.add_argument('-epoch', type=int, default=50) parser.add_argument('-batch_size', type=int, default=64) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default='log') # None parser.add_argument('-save_model', default='trained') # None parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-beam_size', type=int, default=5, help='Beam size') parser.add_argument('-n_best', type=int, default=1, help="""If verbose is set, will output the n_best decoded sentences""") parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true', default=True) opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model # Loading Dataset data = torch.load(opt.data) opt.max_token_seq_len = data['settings'].max_token_seq_len training_data, validation_data = prepare_dataloaders(data, opt) opt.src_vocab_size = training_data.dataset.src_vocab_size opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size # Preparing Model if opt.embs_share_weight: assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \ 'The src/tgt word2idx table are different but asked to share word embedding.' print(opt) device = torch.device('cuda' if opt.cuda else 'cpu') # device = torch.device('cpu') transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout) discriminator = Discriminator(opt.d_model, 1024, opt.max_token_seq_len, device) #''' if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") transformer = nn.DataParallel(transformer) # ''' transformer.to(device) discriminator.to(device) optimizer = ScheduledOptim( optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) optimizer_d = optim.RMSprop(discriminator.parameters(), lr=5e-4) train(transformer, discriminator, training_data, validation_data, optimizer, optimizer_d, device, opt)
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-data', required=True) parser.add_argument('-epoch', type=int, default=100) parser.add_argument('-batch_size', type=int, default=64) #parser.add_argument('-d_word_vec', type=int, default=512) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=1024) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('-save_model', default=None) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='all') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-multi_gpu', action='store_true') parser.add_argument('-use_ctx', action='store_true') parser.add_argument( '-external_validation_script', type=str, default=None, metavar='PATH', nargs='*', help= "location of validation script (to run your favorite metric for validation) (default: %(default)s)" ) opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model #========= Loading Dataset =========# data = torch.load(opt.data) opt.max_token_seq_len = data['settings'].max_token_seq_len #========= Preparing DataLoader =========# training_data = DataLoader( data['dict']['src'], data['dict']['tgt'], src_insts=data['train']['src'], tgt_insts=data['train']['tgt'], ctx_insts=(data['train']['ctx'] if opt.use_ctx else None), batch_size=opt.batch_size, cuda=opt.cuda, is_train=True, sort_by_length=True) validation_data = DataLoader( data['dict']['src'], data['dict']['tgt'], src_insts=data['valid']['src'], tgt_insts=data['valid']['tgt'], ctx_insts=(data['valid']['ctx'] if opt.use_ctx else None), batch_size=opt.batch_size, shuffle=False, cuda=opt.cuda, is_train=False, sort_by_length=True) opt.src_vocab_size = training_data.src_vocab_size opt.tgt_vocab_size = training_data.tgt_vocab_size #========= Preparing Model =========# if opt.embs_share_weight and training_data.src_word2idx != training_data.tgt_word2idx: print( '[Warning]', 'The src/tgt word2idx table are different but asked to share word embedding.' 
) print(opt) transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, proj_share_weight=opt.proj_share_weight, embs_share_weight=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner_hid=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout, use_ctx=opt.use_ctx) #print(transformer) # optimizer = ScheduledOptim( # optim.Adam( # transformer.get_trainable_parameters(), # betas=(0.9, 0.98), eps=1e-09), # opt.d_model, opt.n_warmup_steps) optimizer = ScheduledOptim( optim.Adam(transformer.get_trainable_parameters(), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) def get_criterion(vocab_size): ''' With PAD token zero weight ''' weight = torch.ones(vocab_size) weight[Constants.PAD] = 0 #return nn.CrossEntropyLoss(weight, size_average=False) return nn.NLLLoss(weight, size_average=False) crit = get_criterion(training_data.tgt_vocab_size) logsoftmax = nn.LogSoftmax() if opt.cuda: transformer = transformer.cuda() crit = crit.cuda() logsoftmax = logsoftmax.cuda() if opt.multi_gpu: transformer = nn.DataParallel(transformer) crit = nn.DataParallel(crit) logsoftmax = nn.DataParallel(logsoftmax) train(transformer, training_data, validation_data, crit, logsoftmax, optimizer, opt)
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-data', required=True) parser.add_argument('-mined_data', required=True) parser.add_argument('-snippet_model', required=True) parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-batch_size', type=int, default=64) #parser.add_argument('-d_word_vec', type=int, default=512) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', type=bool, default=True) parser.add_argument('-save_model_dir', default=None, required=True) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='all') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') # For bleu eval parser.add_argument('-beam_size', type=int, default=5, help='Beam size') parser.add_argument('-n_best', type=int, default=1, help="""If verbose is set, will output the n_best decoded sentences""") parser.add_argument('-test_epoch', type=int, default=5, help='Test every x epochs') parser.add_argument('-resume_from_epoch', type=int, default=0, help='Warm restart') # Not really needed parser.add_argument('-alpha', type=float, default=1.0, help='Weighting loss') parser.add_argument('-loss_weight', type=float, default=0.1, help='Mined loss weight') parser.add_argument('-lr', type=float, default=1e-3, help='Learning rate') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model # Snippet model sentencepiece sp.Load(opt.snippet_model) #========= Loading Dataset =========# data = torch.load(opt.data) mined_data = torch.load(opt.mined_data) opt.inp_seq_max_len = 4 * data['settings'].train_max_input_len opt.out_seq_max_len = 4 * data['settings'].train_max_output_len opt.max_token_seq_len = int(opt.out_seq_max_len / 4) training_data, validation_data, test_data, mined_data = prepare_dataloaders( data, mined_data, opt) opt.src_vocab_size = training_data.dataset.src_vocab_size opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size print(opt.inp_seq_max_len, opt.out_seq_max_len, opt.src_vocab_size, opt.tgt_vocab_size) #========= Preparing Model =========# if opt.embs_share_weight: assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \ 'The src/tgt word2idx table are different but asked to share word embedding.' 
print(opt) device = torch.device('cuda' if opt.cuda else 'cpu') transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.inp_seq_max_len, opt.out_seq_max_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) optimizer = ScheduledOptim( optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09, lr=opt.lr), opt.d_model, opt.n_warmup_steps) save_params(opt) opt = check_restart_conditions(opt) if opt.resume_from_epoch >= 1: print('Loading Old model') print('Loading model files from folder: %s' % opt.save_model_dir) transformer = load_models(transformer, opt, opt.resume_from_epoch) train(transformer, training_data, validation_data, test_data, mined_data, optimizer, device, opt)
def main(): ''' Usage: python train.py -data_pkl m30k_deen_shr.pkl -embs_share_weight -proj_share_weight -label_smoothing -output_dir output -b 256 -warmup 128000 ''' parser = argparse.ArgumentParser() parser.add_argument('-data_pkl', default=None) # all-in-1 data pickle or bpe field parser.add_argument('-train_path', default=None) # bpe encoded data parser.add_argument('-val_path', default=None) # bpe encoded data parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-b', '--batch_size', type=int, default=2048) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-warmup','--n_warmup_steps', type=int, default=4000) parser.add_argument('-lr_mul', type=float, default=2.0) parser.add_argument('-seed', type=int, default=None) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-scale_emb_or_prj', type=str, default='prj') parser.add_argument('-output_dir', type=str, default=None) parser.add_argument('-use_tb', action='store_true') parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model # https://pytorch.org/docs/stable/notes/randomness.html # For reproducibility if opt.seed is not None: torch.manual_seed(opt.seed) torch.backends.cudnn.benchmark = False # torch.set_deterministic(True) np.random.seed(opt.seed) random.seed(opt.seed) if not opt.output_dir: print('No experiment result will be saved.') raise if not os.path.exists(opt.output_dir): os.makedirs(opt.output_dir) if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000: print('[Warning] The warmup steps may be not enough.\n'\ '(sz_b, warmup) = (2048, 4000) is the official setting.\n'\ 'Using smaller batch w/o longer warmup may cause '\ 'the warmup stage ends with only little data trained.') device = torch.device('cuda' if opt.cuda else 'cpu') #========= Loading Dataset =========# if all((opt.train_path, opt.val_path)): training_data, validation_data = prepare_dataloaders_from_bpe_files(opt, device) elif opt.data_pkl: training_data, validation_data = prepare_dataloaders(opt, device) else: raise print(opt) transformer = Transformer( opt.src_vocab_size, opt.trg_vocab_size, src_pad_idx=opt.src_pad_idx, trg_pad_idx=opt.trg_pad_idx, trg_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_trg_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout, scale_emb_or_prj=opt.scale_emb_or_prj).to(device) optimizer = ScheduledOptim( optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09), opt.lr_mul, opt.d_model, opt.n_warmup_steps) train(transformer, training_data, validation_data, optimizer, device, opt)
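Under the schedule assumed in the earlier sketch, the learning rate climbs linearly for n_warmup_steps updates, peaks at roughly lr_mul * (d_model * n_warmup_steps) ** -0.5, and then decays with the inverse square root of the step count, which is why the script above warns when a small batch size is combined with the default 4000 warmup steps. A small worked check (lr_at_step is a hypothetical helper, not part of the training script):

def lr_at_step(step, lr_mul=2.0, d_model=512, n_warmup_steps=4000):
    # Inverse-square-root decay with linear warmup (assumed, as sketched above).
    return lr_mul * (d_model ** -0.5) * min(step ** -0.5,
                                            step * n_warmup_steps ** -1.5)

for step in (100, 4000, 100000):
    print(step, lr_at_step(step))
# Peak at step == n_warmup_steps: 2.0 * (512 * 4000) ** -0.5 ≈ 1.4e-3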
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-epoch', type=int, default=1) parser.add_argument('-batch_size', type=int, default=4) parser.add_argument('-context_width', type=int, default=1) parser.add_argument('-frame_rate', type=int, default=30) #parser.add_argument('-d_word_vec', type=int, default=512) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=1024) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=400) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-log', default=None) parser.add_argument('-save_model', default='./exp') parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') opt = parser.parse_args() cfg_path = './config/transformer.cfg' config = configparser.ConfigParser() config.read(cfg_path) #========= Preparing DataLoader =========# training_data = DataLoader('train', config, DEVICE, batch_size=opt.batch_size, context_width=opt.context_width, frame_rate=opt.frame_rate) validation_data = DataLoader('dev', config, DEVICE, batch_size=opt.batch_size, context_width=opt.context_width, frame_rate=opt.frame_rate) test_data = DataLoader('test', config, DEVICE, batch_size=opt.batch_size, context_width=opt.context_width, frame_rate=opt.frame_rate) #========= Preparing Model =========# print(opt) input_dim = training_data.features_dim output_dim = training_data.vocab_size n_inputs_max_seq = max(training_data.inputs_max_seq_lengths, validation_data.inputs_max_seq_lengths, test_data.inputs_max_seq_lengths) n_outputs_max_seq = max(training_data.outputs_max_seq_lengths, validation_data.outputs_max_seq_lengths, test_data.outputs_max_seq_lengths) print('*************************') print('The max length of inputs is %d:' % n_inputs_max_seq) print('The max length of targets is %d' % n_outputs_max_seq) transformer = Transformer(input_dim, output_dim, n_inputs_max_seq, n_outputs_max_seq, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_inner_hid=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout, device=DEVICE) # print(transformer) optimizer = ScheduledOptim( optim.Adam(transformer.get_trainable_parameters(), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) def get_criterion(output_dim): ''' With PAD token zero weight ''' weight = torch.ones(output_dim) weight[Constants.PAD] = 0 return nn.CrossEntropyLoss(weight, size_average=False) crit = get_criterion(training_data.vocab_size) transformer = transformer.to(DEVICE) crit = crit.to(DEVICE) train(transformer, training_data, validation_data, crit, optimizer, opt)
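# get_criterion above masks padding out of the loss by giving the PAD class zero weight.
# With a summed reduction this is equivalent to ignore_index, which also avoids the
# deprecated size_average argument. A minimal sketch, assuming PAD is index 0 (the
# scripts take the real value from their Constants module):
import torch.nn as nn

PAD = 0  # assumed padding index
criterion = nn.CrossEntropyLoss(ignore_index=PAD, reduction='sum')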
def main(): ''' Usage: python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 256 -warmup 128000 ''' global C global shapes global Beta parser = argparse.ArgumentParser() parser.add_argument('-data_pkl', default=None) # all-in-1 data pickle or bpe field parser.add_argument('-srn', type=bool, default=False) parser.add_argument('-optimize_c', type=bool, default=False) # note: argparse's type=bool treats any non-empty string (even "False") as True parser.add_argument('-Beta', type=float, default=1.0) parser.add_argument("-lr", type=float, default=1e-1) parser.add_argument("-scheduler_mode", type=str, default=None) parser.add_argument("-scheduler_factor", type=float, default=0.5) parser.add_argument('-train_path', default=None) # bpe encoded data parser.add_argument('-val_path', default=None) # bpe encoded data parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-b', '--batch_size', type=int, default=2048) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('-save_model', default=None) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model Beta = opt.Beta if not opt.log and not opt.save_model: print('No experiment result will be saved.') raise if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000: print('[Warning] The warmup steps may be not enough.\n'\ '(sz_b, warmup) = (2048, 4000) is the official setting.\n'\ 'Using smaller batch w/o longer warmup may cause '\ 'the warmup stage ends with only little data trained.') device = torch.device('cuda' if opt.cuda else 'cpu') #========= Loading Dataset =========# if all((opt.train_path, opt.val_path)): training_data, validation_data = prepare_dataloaders_from_bpe_files( opt, device) elif opt.data_pkl: training_data, validation_data = prepare_dataloaders(opt, device) else: raise print(opt) transformer = Transformer(opt.src_vocab_size, opt.trg_vocab_size, src_pad_idx=opt.src_pad_idx, trg_pad_idx=opt.trg_pad_idx, trg_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_trg_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) if opt.srn: transformer = migrate_to_srn(transformer) transformer = transformer.to(device) if opt.optimize_c: srn_modules = [ module for module in transformer.modules() if isinstance(module, (SRNLinear, SRNConv2d)) ] sranks = [] shapes = [] for module in srn_modules: W = module.weight.detach() shape_w = W.shape W = W.view(shape_w[0], -1) sranks.append(stable_rank(W).item()) shapes.append(W.shape) # a rule of thumb: initialize the target srank with the current srank of the model C = [ Parameter((torch.ones(1) * sranks[i] / 
min(shapes[i])).view(())) for i in range(len(srn_modules)) ] for i, module in enumerate(srn_modules): C[i].data = C[i].data.to(device) # Parameter.to() is not in-place, so move the underlying data instead module.c = C[i] criteria = criteria_ else: criteria = cal_performance optimizer = ScheduledOptim(optim.Adam(transformer.parameters(), lr=1e-2, betas=(0.9, 0.98), eps=1e-09), opt.lr, opt.d_model, opt.n_warmup_steps, mode=opt.scheduler_mode, factor=opt.scheduler_factor, patience=3) train(transformer, training_data, validation_data, optimizer, device, opt, loss=criteria) print("~~~~~~~~~~~~~C~~~~~~~~~~~~~") print(C) print("~~~~~~~~~~~~~~~~~~~~~~~~~~~") print("-----------Model-----------") print(transformer) print("---------------------------") with torch.no_grad(): for pname, p in transformer.named_parameters(): if len(p.shape) > 1: print("...Parameter ", pname, ", srank=", stable_rank(p.view(p.shape[0], -1)).item())
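# stable_rank, SRNLinear and SRNConv2d are repo-specific. The initialization of C above
# appears to use the usual notion of stable (numerical) rank, ||W||_F^2 / ||W||_2^2,
# normalized by the smaller matrix dimension. A sketch under that assumption:
import torch

def stable_rank(W):
    """Stable rank of a 2-D matrix: squared Frobenius norm over squared spectral norm."""
    fro_sq = W.pow(2).sum()
    sigma_max = torch.linalg.svdvals(W)[0]  # largest singular value
    return fro_sq / sigma_max.pow(2)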
def main(): ''' Usage: python train.py -data_pkl m30k_deen_shr.pkl -log m30k_deen_shr -embs_share_weight -proj_share_weight -label_smoothing -save_model trained -b 256 -warmup 128000 ''' parser = argparse.ArgumentParser() parser.add_argument('-data_pkl', default=None) # all-in-1 data pickle or bpe field parser.add_argument('-train_path', default=None) # bpe encoded data parser.add_argument('-val_path', default=None) # bpe encoded data parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-b', '--batch_size', type=int, default=2048) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('-save_model', default=None) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', default=True, action='store_true') parser.add_argument('-label_smoothing', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model if not opt.log and not opt.save_model: print('No experiment result will be saved.') raise if opt.batch_size < 2048 and opt.n_warmup_steps <= 4000: print('[Warning] The warmup steps may be not enough.\n'\ '(sz_b, warmup) = (2048, 4000) is the official setting.\n'\ 'Using smaller batch w/o longer warmup may cause '\ 'the warmup stage ends with only little data trained.') device = torch.device('cuda' if opt.cuda else 'cpu') #========= Loading Dataset =========# if all((opt.train_path, opt.val_path)): training_data, validation_data = prepare_dataloaders_from_bpe_files( opt, device) elif opt.data_pkl: training_data, validation_data = prepare_dataloaders(opt, device) else: raise print(opt) transformer = Transformer(opt.src_vocab_size, opt.trg_vocab_size, src_pad_idx=opt.src_pad_idx, trg_pad_idx=opt.trg_pad_idx, trg_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_trg_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) optimizer = ScheduledOptim( optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09), 2.0, opt.d_model, opt.n_warmup_steps) train(transformer, training_data, validation_data, optimizer, device, opt)
def main(): ''' Main function ''' parser = argparse.ArgumentParser() #---------------------- all arguments and their default values are defined here parser.add_argument('-data', required=False) parser.add_argument('-epoch', type=int, default=1) # set to 1 for now just to get the pipeline running parser.add_argument('-batch_size', type=int, default=32) #parser.add_argument('-d_word_vec', type=int, default=512) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('-save_model', default='/transformer_my') parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') # action='store_true' means the option becomes True when the flag is passed opt = parser.parse_args() opt.d_word_vec = opt.d_model ''' Hard-code the run settings below for convenience. ''' opt.saved_weight = '/trained.chkpt' # path of the previously trained checkpoint opt.data = 'yunixng_bash/data/multi30k.atok.low.pt' # location of the dataset opt.save_model = 'trained' # name prefix for saved checkpoints opt.save_mode = 'best' # keep only the best checkpoint opt.proj_share_weight = True # share the decoder embedding with the output projection opt.label_smoothing = True # use label smoothing opt.cuda = False opt.batch_size = 200 opt.epoch = 30 print(opt) #========= Loading Dataset =========# data = torch.load( opt.data ) # the data is already numericalized; the encoding tables are stored inside this dict. The src and tgt vocabularies differ, so the embs_share_weight flag above must stay False. The whole dataset is only about 3 MB (multi30k.atok.low.pt in the project root): roughly 30k sentence pairs with a ~3k-word vocabulary of common English words, encoded at the word level with no word pieces, so out-of-vocabulary tokens are normal for arbitrary input sentences. For now this makes testing very convenient and fast. opt.max_token_seq_len = data['settings'].max_token_seq_len # length preprocessing: just pads sequences up to this value training_data, validation_data = prepare_dataloaders(data, opt) opt.src_vocab_size = training_data.dataset.src_vocab_size opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size #========= Preparing Model =========# if opt.embs_share_weight: assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \ 'The src/tgt word2idx table are different but asked to share word embedding.' print('Final configuration:') print(opt) device = torch.device('cuda' if opt.cuda else 'cpu') transformer = Transformer( # build the model opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) optimizer = ScheduledOptim( optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) train(transformer, training_data, validation_data, optimizer, device, opt)
def main(): h = logging.StreamHandler() formatter = logging.Formatter("[%(asctime)s][%(levelname)s]%(message)s", datefmt="%Y-%m-%d %H:%M:%S") h.setFormatter(formatter) logger = logging.getLogger() logger.setLevel(logging.INFO) logger.addHandler(h) parser = argparse.ArgumentParser() parser.add_argument('-data_path', default="../Data/dataset") parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-b', '--batch_size', type=int, default=512) parser.add_argument('-d_model', type=int, default=15) parser.add_argument('-d_inner_hid', type=int, default=256) parser.add_argument('-d_k', type=int, default=15) parser.add_argument('-d_v', type=int, default=15) parser.add_argument('-n_head', type=int, default=1) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=100000) parser.add_argument('-lr_mul', type=float, default=2.0) parser.add_argument('-seed', type=int, default=None) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-scale_emb_or_prj', type=str, default='prj') parser.add_argument('-output_dir', type=str, default='./checkpoint/') parser.add_argument('-summary_dir', type=str, default='./summary') parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model logging.info(opt) writer = SummaryWriter(log_dir=str(opt.summary_dir)) if opt.seed is not None: torch.manual_seed(opt.seed) torch.backends.cudnn.benchmark = False np.random.seed(opt.seed) random.seed(opt.seed) if not os.path.exists(opt.output_dir): os.makedirs(opt.output_dir) device = torch.device('cuda' if opt.cuda else 'cpu') #========= Loading Dataset =========# pkl_files = os.listdir(opt.data_path) pwd = os.getcwd() pkl_files = [ os.path.join(pwd, opt.data_path, file) for file in pkl_files if 'train' in file ] data_list = [data for data in pkl_files if '.pkl' in data] random.shuffle(data_list) logging.info(data_list) transformer = Transformer(trg_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_trg_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout, scale_emb_or_prj=opt.scale_emb_or_prj).to(device) optimizer = ScheduledOptim( optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09), opt.lr_mul, opt.d_model, opt.n_warmup_steps) test(transformer, data_list, optimizer, device, opt, writer)
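# The SummaryWriter created above is handed to test(); since that function's body is not
# shown here, the tags below are illustrative only. Logging per-epoch scalars to TensorBoard
# is just a matter of add_scalar calls on the writer:
from torch.utils.tensorboard import SummaryWriter

def log_epoch(writer, epoch, train_loss, valid_loss, lr):
    # hypothetical tag names; use whatever metrics the training loop actually tracks
    writer.add_scalar('loss/train', train_loss, epoch)
    writer.add_scalar('loss/valid', valid_loss, epoch)
    writer.add_scalar('learning_rate', lr, epoch)
    writer.flush()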
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-data', default="./pssp-data/data.pt") parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-batch_size', type=int, default=17) #parser.add_argument('-d_word_vec', type=int, default=512) parser.add_argument('-d_model', type=int, default=256) parser.add_argument('-d_inner_hid', type=int, default=512) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('-save_model', default="model") parser.add_argument('-save_plot', default="loss.png") parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model #========= Loading Dataset =========# data = torch.load(opt.data) opt.max_token_seq_len = data['settings'].max_token_seq_len training_data, validation_data, test_data = prepare_dataloaders(data, opt) opt.src_vocab_size = training_data.dataset.src_vocab_size opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size opt.vocab_src = training_data.dataset.src_word2idx opt.vocab_tgt = training_data.dataset.tgt_word2idx #========= Preparing Model =========# if opt.embs_share_weight: assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \ 'The src/tgt word2idx table are different but asked to share word embedding.' print(opt) device = torch.device('cuda' if opt.cuda else 'cpu') transformer = Transformer( opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout) transformer = DataParallel(transformer, range(0, torch.cuda.device_count())).to(device) optimizer = ScheduledOptim( optim.Adam( filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) weight_mask = None crossEntropy = nn.CrossEntropyLoss(weight_mask, reduction='sum', ignore_index=Constants.PAD) train_loss, val_loss = train( transformer, training_data, validation_data, optimizer, device, opt, crossEntropy) print("Starting Test...") test(transformer, test_data, device, opt, crossEntropy) print("Making loss graph...") plt = plot(train_loss, val_loss) plt.savefig(opt.save_plot) # opt.save_plot already includes the .png extension print("Finished!")
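# The plot() helper used above is not shown; it presumably takes the per-epoch loss lists
# returned by train() and returns a matplotlib handle to save. A minimal stand-in under
# that assumption:
import matplotlib
matplotlib.use('Agg')  # non-interactive backend so figures can be saved on headless machines
import matplotlib.pyplot as plt

def plot(train_loss, val_loss):
    plt.figure()
    plt.plot(train_loss, label='train')
    plt.plot(val_loss, label='validation')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
    return plt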
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-data', required=True) parser.add_argument('-epoch', type=int, default=10) parser.add_argument('-batch_size', type=int, default=64) #parser.add_argument('-d_word_vec', type=int, default=512) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=1024) parser.add_argument('-d_k', type=int, default=64) parser.add_argument('-d_v', type=int, default=64) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=6) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.1) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default=None) parser.add_argument('-save_model', default=None) parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model #========= Loading Dataset =========# data = torch.load(opt.data) opt.max_token_seq_len = data['settings'].max_token_seq_len #========= Preparing DataLoader =========# training_data = DataLoader(data['dict']['src'], data['dict']['tgt'], src_insts=data['train']['src'], tgt_insts=data['train']['tgt'], batch_size=opt.batch_size, cuda=opt.cuda) validation_data = DataLoader(data['dict']['src'], data['dict']['tgt'], src_insts=data['valid']['src'], tgt_insts=data['valid']['tgt'], batch_size=opt.batch_size, shuffle=False, test=True, cuda=opt.cuda) opt.src_vocab_size = training_data.src_vocab_size opt.tgt_vocab_size = training_data.tgt_vocab_size #========= Preparing Model =========# if opt.embs_share_weight and training_data.src_word2idx != training_data.tgt_word2idx: print( '[Warning]', 'The src/tgt word2idx table are different but asked to share word embedding.' ) print(opt) transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, proj_share_weight=opt.proj_share_weight, embs_share_weight=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner_hid=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout) #print(transformer) optimizer = ScheduledOptim( optim.Adam(transformer.get_trainable_parameters(), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) def get_criterion(vocab_size): ''' With PAD token zero weight ''' weight = torch.ones(vocab_size) weight[Constants.PAD] = 0 return nn.CrossEntropyLoss(weight, size_average=False) crit = get_criterion(training_data.tgt_vocab_size) if opt.cuda: transformer = transformer.cuda() crit = crit.cuda() print("===>TRAIN\n") train(transformer, training_data, validation_data, crit, optimizer, opt)
def main(): """ Main function """ parser = argparse.ArgumentParser() parser.add_argument("-data", required=True) parser.add_argument("-epoch", type=int, default=10) parser.add_argument("-batch_size", type=int, default=64) # parser.add_argument("-d_word_vec", type=int, default=512) parser.add_argument("-d_model", type=int, default=512) parser.add_argument("-d_inner_hid", type=int, default=2048) parser.add_argument("-d_k", type=int, default=64) parser.add_argument("-d_v", type=int, default=64) parser.add_argument("-n_head", type=int, default=8) parser.add_argument("-n_layers", type=int, default=6) parser.add_argument("-n_warmup_steps", type=int, default=4000) parser.add_argument("-dropout", type=float, default=0.1) parser.add_argument("-embs_share_weight", action="store_true") parser.add_argument("-proj_share_weight", action="store_true") parser.add_argument("-log", default=None) parser.add_argument("-save_model", default=None) parser.add_argument("-save_mode", type=str, choices=["all", "best"], default="best") parser.add_argument("-no_cuda", action="store_true") parser.add_argument("-label_smoothing", action="store_true") opt = parser.parse_args() opt.cuda = not opt.no_cuda opt.d_word_vec = opt.d_model # ========= Loading Dataset =========# data = torch.load(opt.data) opt.max_token_seq_len = data["settings"].max_token_seq_len training_data, validation_data = prepare_dataloaders(data, opt) opt.src_vocab_size = training_data.dataset.src_vocab_size opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size # ========= Preparing Model =========# if opt.embs_share_weight: assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \ "The src/tgt word2idx table are different but asked to share word embedding." print(opt) device = torch.device("cuda" if opt.cuda else "cpu") transformer = Transformer( opt.src_vocab_size, opt.tgt_vocab_size, opt.max_token_seq_len, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) optimizer = ScheduledOptim( optim.Adam( filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) train(transformer, training_data, validation_data, optimizer, device, opt)
def main(): ''' Main function ''' parser = argparse.ArgumentParser() parser.add_argument('-data_all', default='data/csv/data_train_2_sort.torch') parser.add_argument('-save_model', default='module/2018-7-30.pt') parser.add_argument('-start_time', default='2018-07-01') parser.add_argument('-end_time', default='2018-08-30') parser.add_argument('-epoch', type=int, default=16) parser.add_argument('-batch_size', type=int, default=128) parser.add_argument('-d_model', type=int, default=512) parser.add_argument('-d_inner_hid', type=int, default=2048) parser.add_argument('-d_k', type=int, default=32) parser.add_argument('-d_v', type=int, default=32) parser.add_argument('-n_head', type=int, default=8) parser.add_argument('-n_layers', type=int, default=2) parser.add_argument('-n_warmup_steps', type=int, default=4000) parser.add_argument('-dropout', type=float, default=0.3) parser.add_argument('-embs_share_weight', action='store_true') parser.add_argument('-proj_share_weight', action='store_true') parser.add_argument('-log', default='log/logs.log') parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best') parser.add_argument('-no_cuda', action='store_true') parser.add_argument('-label_smoothing', action='store_true') parser.add_argument('-batch_x', default=32) parser.add_argument('-batch_y', default=32) parser.add_argument('-train_type', default='name') opt = parser.parse_args() opt.cuda = torch.cuda.is_available() opt.d_word_vec = opt.d_model device = torch.device('cuda' if opt.cuda else 'cpu') # define the device before it is used below # ========= Loading Dataset =========# # opt.max_token_seq_len = data['settings'].max_token_seq_len training_data, validation_data, voc_name, data_val_ofpa = ld.get_data_loader( opt, device) opt.src_vocab_size = voc_name opt.tgt_vocab_size = opt.src_vocab_size if opt.train_type == 'time': voc = ld.get_time_vac(opt) opt.tgt_vocab_size = voc if voc > 500 else 728 # ========= Preparing Model =========# if opt.embs_share_weight: assert opt.src_vocab_size == opt.tgt_vocab_size, \ 'The src/tgt word2idx table are different but asked to share word embedding.' print(opt) transformer = Transformer(opt.src_vocab_size, opt.tgt_vocab_size, opt.batch_x, tgt_emb_prj_weight_sharing=opt.proj_share_weight, emb_src_tgt_weight_sharing=opt.embs_share_weight, d_k=opt.d_k, d_v=opt.d_v, d_model=opt.d_model, d_word_vec=opt.d_word_vec, d_inner=opt.d_inner_hid, n_layers=opt.n_layers, n_head=opt.n_head, dropout=opt.dropout).to(device) optimizer = ScheduledOptim( optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()), betas=(0.9, 0.98), eps=1e-09), opt.d_model, opt.n_warmup_steps) if opt.train_type == 'time': print("train time dim ") # train(transformer, train_time, val_time, optimizer, device, opt) else: train(transformer, training_data, validation_data, optimizer, device, opt, data_val_ofpa)
print(' - (Training) accuracy: {accu:3.3f} %, '\ 'elapse: {elapse:3.3f} min'.format( accu=100*train_accu, elapse=(time.time()-start)/60)) start = time.time() valid_loss, valid_accu = eval_epoch(model, validation_data, predicates) print(' - (Validation) accuracy: {accu:3.3f} %, '\ 'elapse: {elapse:3.3f} min'.format( accu=100*valid_accu, elapse=(time.time()-start)/60)) valid_accus += [valid_accu] device = torch.device('cpu') word2idx,ints,en1_pos,en2_pos,predicates,relation2idx = data.build_sentences() training_data, validation_data = prepare_dataloaders(word2idx,ints,en1_pos,en2_pos,predicates) model = Transformer( n_src_vocab=len(word2idx), len_max_seq=config.max_seq_len).to(device) optimizer = ScheduledOptim( optim.Adam( filter(lambda x: x.requires_grad, model.parameters()), betas=(0.9, 0.98), eps=1e-09), 512, 1000) train(model, training_data, validation_data, optimizer,predicates)