def fit(x, y, z, dev_x, dev_y, dev_z, lr, decay_weight, n_epochs=n_epochs):
    train_K = np.load(ROOT_PATH + '/mendelian_precomp/{}_train_K.npy'.format(sname))
    dev_K = np.load(ROOT_PATH + '/mendelian_precomp/{}_dev_K.npy'.format(sname))
    train_K = torch.from_numpy(train_K).float()
    dev_K = torch.from_numpy(dev_K).float()

    n_data = x.shape[0]
    net = Net(x.shape[1])
    es = EarlyStopping(patience=5)
    optimizer = optim.Adam(list(net.parameters()), lr=lr, weight_decay=decay_weight)

    for epoch in range(n_epochs):
        permutation = torch.randperm(n_data)
        for i in range(0, n_data, batch_size):
            indices = permutation[i:i + batch_size]
            batch_x, batch_y = x[indices], y[indices]

            # training loop
            def closure():
                optimizer.zero_grad()
                pred_y = net(batch_x)
                loss = my_loss(pred_y, batch_y, indices, train_K)
                loss.backward()
                return loss

            optimizer.step(closure)  # does the update

        if epoch % 5 == 0 and epoch >= 5 and dev_x is not None:  # 5, 10 for small; 5, 50 for large
            g_pred = net(test.x.float())
            test_err = ((g_pred - test.g.float()) ** 2).mean()
            dev_err = my_loss(net(dev_x), dev_y, None, dev_K)
            print('test', test_err, 'dev', dev_err)
            if es.step(dev_err):
                break
    return es.best, epoch, net
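# Every snippet in this section relies on an `EarlyStopping` helper that is
# not defined here. Below is a minimal sketch consistent with how it is used:
# es.step(metric) returns True when training should stop, and es.best holds
# the best metric seen so far. The real class evidently also supports
# percentage-based improvement and logger reporting (see the variants further
# down); those options are omitted from this sketch.
class EarlyStopping:
    def __init__(self, patience=5, mode='min', min_delta=0.0, percentage=False):
        self.patience = patience
        self.mode = mode                # 'min' for losses, 'max' for scores
        self.min_delta = min_delta
        self.percentage = percentage    # accepted but not implemented in this sketch
        self.best = None
        self.num_bad_steps = 0

    def step(self, metric):
        # Returns True once `metric` has failed to improve for `patience` calls.
        metric = float(metric)
        if self.best is None:
            self.best = metric
            return False
        if self.mode == 'min':
            improved = metric < self.best - self.min_delta
        else:
            improved = metric > self.best + self.min_delta
        if improved:
            self.best = metric
            self.num_bad_steps = 0
        else:
            self.num_bad_steps += 1
        return self.num_bad_steps >= self.patience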
def fit(x, y, z, dev_x, dev_y, dev_z, a, lr, decay_weight, ax, y_axz, w_samples, n_epochs=n_epochs):
    if 'mnist' in sname:
        train_K = torch.eye(x.shape[0])
    else:
        train_K = (kernel(z, None, a, 1) + kernel(z, None, a / 10, 1) + kernel(z, None, a * 10, 1)) / 3
    if dev_z is not None:
        if 'mnist' in sname:
            dev_K = torch.eye(x.shape[0])
        else:
            dev_K = (kernel(dev_z, None, a, 1) + kernel(dev_z, None, a / 10, 1) + kernel(dev_z, None, a * 10, 1)) / 3

    n_data = x.shape[0]
    net = FCNN(x.shape[1]) if sname not in ['mnist_x', 'mnist_xz'] else CNN()
    es = EarlyStopping(patience=10)  # 10 for small
    optimizer = optim.Adam(list(net.parameters()), lr=lr, weight_decay=decay_weight)

    test_errs, dev_errs, exp_errs, mse_s = [], [], [], []
    for epoch in range(n_epochs):
        permutation = torch.randperm(n_data)
        for i in range(0, n_data, batch_size):
            indices = permutation[i:i + batch_size]
            batch_x, batch_y = x[indices], y[indices]

            # training loop
            def closure():
                optimizer.zero_grad()
                pred_y = net(batch_x)
                loss = my_loss(pred_y, batch_y, indices, train_K)
                loss.backward()
                return loss

            optimizer.step(closure)  # does the update

        if epoch % 5 == 0 and epoch >= 50 and dev_x is not None:  # 5, 10 for small; 5, 50 for large
            # test_X (not dev_x) is intended here.
            g_pred = net(test_X)
            # This error is deliberately not kernel-reweighted: it measures the
            # agreement between predictions and labels.
            test_err = ((g_pred - test_Y) ** 2).mean()
            if epoch == 50 and 'mnist' in sname:
                if z.shape[1] > 100:
                    # np.load returns a NumPy array; convert before torch.exp.
                    train_K = torch.from_numpy(
                        np.load(ROOT_PATH + '/mnist_precomp/{}_train_K0.npy'.format(sname))).float()
                    train_K = (torch.exp(-train_K / a ** 2 / 2) + torch.exp(-train_K / a ** 2 * 50)
                               + torch.exp(-train_K / a ** 2 / 200)) / 3
                    dev_K = torch.from_numpy(
                        np.load(ROOT_PATH + '/mnist_precomp/{}_dev_K0.npy'.format(sname))).float()
                    dev_K = (torch.exp(-dev_K / a ** 2 / 2) + torch.exp(-dev_K / a ** 2 * 50)
                             + torch.exp(-dev_K / a ** 2 / 200)) / 3
                else:
                    train_K = (kernel(z, None, a, 1) + kernel(z, None, a / 10, 1) + kernel(z, None, a * 10, 1)) / 3
                    dev_K = (kernel(dev_z, None, a, 1) + kernel(dev_z, None, a / 10, 1) + kernel(dev_z, None, a * 10, 1)) / 3
            dev_err = my_loss(net(dev_x), dev_y, None, dev_K)
            # y_samples comes from the enclosing scope.
            err_in_expectation, mse = conditional_expected_loss(
                net=net, ax=ax, w_samples=w_samples, y_samples=y_samples, y_axz=y_axz, x_on=False)
            print('test', test_err, 'dev', dev_err, 'err_in_expectation', err_in_expectation, 'mse: ', mse)
            test_errs.append(test_err)
            dev_errs.append(dev_err)
            exp_errs.append(err_in_expectation)
            mse_s.append(mse)
            if es.step(dev_err):
                break
    losses = {'test': test_errs, 'dev': dev_errs, 'exp': exp_errs, 'mse_': mse_s}
    return es.best, epoch, net, losses
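# `my_loss` is not defined in this section. Given that it receives a kernel
# matrix over the instruments z and batch indices into it, one plausible form
# is a kernel-weighted quadratic form of the residuals (a V-statistic-style
# maximum-moment-restriction loss). This is an assumption, not the confirmed
# implementation; with train_K = identity (the mnist branch) it reduces to a
# scaled sum of squared residuals.
import numpy as np
import torch

def my_loss(pred_y, y, indices, K):
    residual = pred_y.flatten() - y.flatten()
    if indices is not None:
        K = K[indices][:, indices]  # restrict the kernel to the mini-batch
    n = residual.shape[0]
    return residual @ K @ residual / n ** 2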
def train(opt):
    if torch.cuda.is_available():
        logger.info("%s", torch.cuda.get_device_name(0))

    # set etc
    torch.autograd.set_detect_anomaly(True)

    # set config
    config = load_config(opt)
    config['opt'] = opt
    logger.info("%s", config)

    # set path
    set_path(config)

    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(config)

    with temp_seed(opt.seed):
        # prepare model
        model = prepare_model(config)

        # create optimizer, scheduler, summary writer, scaler
        optimizer, scheduler, writer, scaler = prepare_osws(config, model, train_loader)
        config['optimizer'] = optimizer
        config['scheduler'] = scheduler
        config['writer'] = writer
        config['scaler'] = scaler

        # training
        early_stopping = EarlyStopping(logger, patience=opt.patience, measure='f1', verbose=1)
        local_worse_steps = 0
        prev_eval_f1 = -float('inf')
        best_eval_f1 = -float('inf')
        for epoch_i in range(opt.epoch):
            epoch_st_time = time.time()
            eval_loss, eval_f1 = train_epoch(model, config, train_loader, valid_loader, epoch_i)

            # early stopping
            if early_stopping.validate(eval_f1, measure='f1'):
                break
            if eval_f1 > best_eval_f1:
                best_eval_f1 = eval_f1
                if opt.save_path:
                    logger.info("[Best model saved] : {:10.6f}".format(best_eval_f1))
                    save_model(config, model)
                    # save finetuned bert model/config/tokenizer
                    if config['emb_class'] in ['bert', 'distilbert', 'albert', 'roberta', 'bart', 'electra']:
                        if not os.path.exists(opt.bert_output_dir):
                            os.makedirs(opt.bert_output_dir)
                        model.bert_tokenizer.save_pretrained(opt.bert_output_dir)
                        model.bert_model.save_pretrained(opt.bert_output_dir)
                early_stopping.reset(best_eval_f1)
            early_stopping.status()

            # scheduling: apply learning-rate decay when the measure (e.g. F1)
            # keeps getting worse for opt.lr_decay_steps epochs.
            if prev_eval_f1 >= eval_f1:
                local_worse_steps += 1
            else:
                local_worse_steps = 0
            logger.info('Scheduler: local_worse_steps / opt.lr_decay_steps = %d / %d' %
                        (local_worse_steps, opt.lr_decay_steps))
            if not opt.use_transformers_optimizer and \
               epoch_i > opt.warmup_epoch and \
               (local_worse_steps >= opt.lr_decay_steps or early_stopping.step() > opt.lr_decay_steps):
                scheduler.step()
                local_worse_steps = 0
            prev_eval_f1 = eval_f1
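# The function above uses a richer early-stopping API than the sketch earlier:
# validate(metric, measure=...) returns True to stop, reset(best) re-arms the
# counter after an improvement, status() logs progress, and step() returns how
# many validations in a row have failed to improve. The class name and the
# internals below are hypothetical, reconstructed only from those call sites.
class EarlyStoppingF1:
    def __init__(self, logger, patience=7, measure='f1', verbose=0):
        self.logger = logger
        self.patience = patience
        self.measure = measure
        self.verbose = verbose
        self.best = -float('inf')
        self.bad_steps = 0

    def validate(self, metric, measure='f1'):
        # True once `metric` has not beaten `best` for more than `patience` epochs.
        if metric > self.best:
            self.bad_steps = 0
        else:
            self.bad_steps += 1
        return self.bad_steps > self.patience

    def reset(self, best):
        self.best = best
        self.bad_steps = 0

    def step(self):
        return self.bad_steps

    def status(self):
        if self.verbose:
            self.logger.info('early stopping: best=%f, bad_steps=%d' % (self.best, self.bad_steps))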
def train_pytorch(**kwargs):
    CHECKPOINT_PATH.mkdir(parents=True, exist_ok=True)

    # Calling logging.basicConfig attaches a root logger to the process, so that
    # log records from loggers in other modules show up on the console (child
    # loggers propagate to the root logger, which prints through its built-in
    # StreamHandler). Without logging.basicConfig, every child logger would need
    # its own StreamHandler, which is tedious.
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Print logs to the terminal.
    # stream_handler = logging.StreamHandler()
    # stream_handler.setFormatter(formatter)

    # Save logs to file.
    log_path = CHECKPOINT_PATH / 'train.log'
    file_handler = logging.FileHandler(filename=log_path, mode='w', encoding='utf-8')
    file_handler.setFormatter(formatter)

    # logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    inputs = kwargs['inputs']
    outputs = kwargs['outputs']
    # test_inputs = kwargs['test_inputs']

    gkf = GroupKFold(n_splits=kwargs['n_splits']).split(X=df_train.q2, groups=df_train.id)
    # sss = StratifiedShuffleSplit(n_splits=kwargs['n_splits'], test_size=0.2,
    #                              random_state=RANDOM_SEED).split(X=df_train.q2, y=df_train.label)
    # skf = StratifiedKFold(n_splits=kwargs['n_splits'], shuffle=True,
    #                       random_state=RANDOM_SEED).split(X=df_train.q2, y=outputs)

    # oof = np.zeros((len(df_train), 1))
    # all_pred = np.zeros(shape=(len(df_train), 2))  # classification task
    all_pred = np.zeros(shape=(len(df_train)))  # regression task
    all_true = np.zeros(shape=(len(df_train)))
    for fold, (train_idx, valid_idx) in enumerate(gkf):
        # for fold, (train_idx, valid_idx) in enumerate(skf):
        logger.info(f'Fold No. {fold}')
        train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]
        train_outputs = outputs[train_idx]
        train_qa_id = df_train[['id', 'id_sub', 'label']].iloc[train_idx]

        # ===============================================================
        # Sample augmentation via back-translation (positives only).
        # Collect the (id, id_sub) pairs of the training samples:
        # train_id_set = set([f'{x[0]},{x[1]}' for x in df_train.iloc[train_idx][['id', 'id_sub']].to_numpy()])
        # # Select augmented samples whose (id, id_sub) appears in the training split:
        # mask = df_train_ex[['id', 'id_sub']].apply(lambda x: f'{x["id"]},{x["id_sub"]}' in train_id_set, axis=1)
        # df_train_fold = df_train_ex[mask]

        # Collect the (id, id_sub) pairs of the training samples:
        # train_id_set = set([f'{x[0]},{x[1]}' for x in df_train.iloc[train_idx][['id', 'id_sub']].to_numpy()])
        # # Select augmented samples whose (id, id_sub) appears in the training split:
        # mask = df_train_aug[['id', 'id_sub']].apply(lambda x: f'{x["id"]},{x["id_sub"]}' in train_id_set, axis=1)
        # df_train_fold = df_train_aug[mask]
        # train_inputs, train_inputs_overlap = compute_input_arrays(df_train_fold, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
        # train_outputs = compute_output_arrays(df_train_fold, output_categories)

        # df_train_fold = df_train.iloc[train_idx]
        # train_q_aug = []
        # for x in tqdm(df_train_fold['q1']):
        #     train_q_aug.append(eda_one(x))
        # train_a_aug = []
        # for x in tqdm(df_train_fold['q2']):
        #     train_a_aug.append(eda_one(x))
        # df_train_fold = pd.DataFrame(data={'q1': train_q_aug, 'q2': train_a_aug})
        # train_inputs, train_inputs_overlap = compute_input_arrays(df_train_fold, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
        # train_outputs = compute_output_arrays(df_train_fold, output_categories)

        # Add the Anjuke data to the training set:
        # train_inputs = [np.concatenate([train_inputs[i], anjuke_inputs[i]], axis=0) for i in range(len(inputs))]
        # train_outputs = np.concatenate([train_outputs, anjuke_outputs], axis=0)
        # ================================================================

        valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]
        valid_outputs = outputs[valid_idx]
        valid_qa_id = df_train[['id', 'id_sub', 'label']].iloc[valid_idx]

        train_set = HouseDataset(train_inputs, train_outputs, train_qa_id)
        valid_set = HouseDataset(valid_inputs, valid_outputs, valid_qa_id)
        # test_set = HouseDataset(test_inputs, np.zeros_like(test_inputs[0]))  # the test set has no labels
        logger.info('Train set size: {}, valid set size {}'.format(len(train_set), len(valid_set)))

        train_loader = DataLoader(
            train_set,
            batch_size=kwargs['batch_size'],
            # shuffle=True  # set to True when training as classification
        )
        valid_loader = DataLoader(valid_set, batch_size=kwargs['valid_batch_size'])
        # test_loader = DataLoader(test_set, batch_size=512)

        device = torch.device(f"cuda:{kwargs['device']}")
        # model = BertForHouseQA().cuda(device)
        model = torch.nn.DataParallel(BertForHouseQA(), device_ids=[1, 2, 3]).cuda(device)

        # Find and load the checkpoint file with the highest score:
        # best_score_ = max([float(x.name[len(MODEL_NAME) + 1:-3]) for x in CHECKPOINT_PATH.iterdir() if x.is_file()])
        # best_ckpt_path = CHECKPOINT_PATH / f'{MODEL_NAME}_{best_score_}.pt'
        # ckpt = torch.load(best_ckpt_path)
        # model.load_state_dict(ckpt['model_state_dict'])

        # Load a point-wise model and continue training pair-wise,
        # or load the Anjuke model:
        # =====================================================
        # org_model = BertForHouseQA().cuda(device)
        # time_str = '2020-11-18-12:49:44'
        # org_ckpt_path = DATA_PATH / f"model_record/{MODEL_NAME}/{time_str}"
        # org_ckpt_path = DATA_PATH / f'anjuke/model_record/{MODEL_NAME}/{time_str}'
        # org_ckpt_paths = [x for x in org_ckpt_path.iterdir() if x.is_file() and x.suffix == '.pt']
        # prefix = f'{MODEL_NAME}_'
        # best_ckpt_path = [x for x in org_ckpt_paths if str(x.name).startswith(prefix)][0]
        # ckpt = torch.load(best_ckpt_path)
        # org_model.load_state_dict(ckpt['model_state_dict'])
        # model = BertClsToReg(org_model).cuda(device)
        # model = BertClsToCls(org_model).cuda(device)
        # =====================================================

        # List all modules inside the model.
        logger.info('Model modules:')
        for i, m in enumerate(model.named_children()):
            logger.info('{} -> {}'.format(i, m))

        # Get the number of total parameters.
        # total_params = sum(p.numel() for p in model.parameters())
        # trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        # logger.info("Total params: {:,}".format(total_params))
        # logger.info("Trainable params: {:,}".format(trainable_params))

        # Use hinge loss.
        criterion = torch.nn.MarginRankingLoss(margin=1.0)
        # criterion = torch.nn.MSELoss()
        # criterion = torch.nn.CrossEntropyLoss()
        # criterion_scl = SupConLoss(temperature=0.1, device=device)

        # optimizer = torch.optim.Adam(model.parameters(), lr=kwargs['lr'], weight_decay=kwargs['weight_decay'])
        optimizer = transformers.AdamW(model.parameters(), lr=kwargs['lr'], weight_decay=kwargs['weight_decay'])
        logger.info('Optimizer:')
        logger.info(optimizer)

        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
        #                                                        mode='min',
        #                                                        patience=int(kwargs['patience'] / 2),
        #                                                        verbose=True)
        scheduler = transformers.get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=4, num_training_steps=kwargs['epoch'])

        # best_score = 0.0
        stopper = EarlyStopping(patience=kwargs['patience'], mode='max')
        ckpt_path = None
        for epoch in range(kwargs['epoch']):
            # =======================Training===========================
            # Set model to train mode.
            model.train()
            steps = int(np.ceil(len(train_set) / kwargs['batch_size']))
            pbar = tqdm(desc='Epoch {}, loss {}'.format(epoch, 'NAN'), total=steps)
            for i, sample in enumerate(train_loader):
                x, y = sample[0].cuda(device).long(), sample[1].cuda(device).long()
                optimizer.zero_grad()
                feat, model_outputs = model(x)  # [batch_size, 2]

                # CrossEntropy
                # loss = criterion(model_outputs, y)
                # MSE
                # loss = criterion(model_outputs, y.float().unsqueeze(-1))
                # Hinge loss
                train_qa_id_sub = sample[2].cpu().detach().numpy()
                loss = get_hinge_loss(model_outputs, train_qa_id_sub, criterion)

                # SCL (supervised contrastive loss)
                # feat = F.normalize(feat, dim=-1).unsqueeze(1)
                # scl = criterion_scl(feat, y)
                # scl_weight = 0.3
                # loss = (1 - scl_weight) * loss + scl_weight * scl
                # loss += scl

                loss.backward()
                optimizer.step()
                pbar.set_description('Epoch {}, train loss {:.4f}'.format(epoch, loss.item()))
                pbar.update()
            pbar.close()
            # =========================================================

            # =======================Validation========================
            # Set model to evaluation mode.
            model.eval()
            with torch.no_grad():
                # Validation step
                valid_loss = []
                valid_pred = []
                valid_true = []
                steps = int(np.ceil(len(valid_set) / kwargs['valid_batch_size']))
                pbar = tqdm(desc='Validating', total=steps)
                for i, sample in enumerate(valid_loader):
                    y_true_local = sample[1].numpy()
                    x, y_true = sample[0].cuda(device).long(), sample[1].cuda(device).long()
                    feat, model_outputs = model(x)

                    # MSELoss
                    # loss = criterion(model_outputs, y_true.float().unsqueeze(-1)).cpu().detach().item()
                    # Hinge loss
                    valid_qa_id_sub = sample[2].cpu().detach().numpy()
                    loss = get_hinge_loss(model_outputs, valid_qa_id_sub, criterion).cpu().detach().item()
                    y_pred = model_outputs.cpu().detach().squeeze(-1).numpy()

                    # CrossEntropy
                    # loss = criterion(model_outputs, y_true).cpu().detach().item()
                    # y_pred = F.softmax(model_outputs.cpu().detach(), dim=1).numpy()

                    valid_loss.append(loss)
                    valid_pred.append(y_pred)
                    valid_true.append(y_true_local)
                    pbar.update()
                pbar.close()
            valid_loss = np.asarray(valid_loss).mean()
            valid_pred = np.concatenate(valid_pred, axis=0)
            valid_true = np.concatenate(valid_true, axis=0)

            # When using the regression model:
            valid_f1, thr = search_f1(valid_true, valid_pred)
            logger.info("Epoch {}, valid loss {:.5f}, valid f1 {:.4f}".format(epoch, valid_loss, valid_f1))

            # When using the classification model:
            # valid_pred_label = np.argmax(valid_pred, axis=1)
            # valid_auc = roc_auc_score(valid_true, valid_pred_label)
            # valid_p, valid_r, valid_f1, _ = precision_recall_fscore_support(
            #     valid_true, valid_pred_label, average='binary')
            # logger.info(
            #     "Epoch {}, valid loss {:.5f}, valid P {:.4f}, valid R {:.4f}, valid f1 {:.4f}, valid auc {:.4f}".format(
            #         epoch, valid_loss, valid_p, valid_r, valid_f1, valid_auc))
            # logger.info('Confusion Matrix: ')
            # logger.info(confusion_matrix(y_true=valid_true, y_pred=valid_pred_label, normalize='all'))

            # Advance the warmup/cosine schedule once per epoch (unlike
            # ReduceLROnPlateau, it takes no metric argument).
            scheduler.step()

            stop_flag, best_flag = stopper.step(valid_f1)
            if best_flag:
                # Delete the previously saved checkpoint.
                if ckpt_path is not None:
                    ckpt_path.unlink()
                ckpt_path = CHECKPOINT_PATH / f"{MODEL_NAME}_{fold}_{epoch}_{stopper.best_score}.pt"
                # Save the current best model.
                torch.save(
                    {
                        "model_name": "BertForHouseQA",
                        "epoch": epoch,
                        "valid_loss": valid_loss,
                        "valid_f1": valid_f1,
                        "model_state_dict": model.state_dict(),
                        "train_idx": train_idx,
                        "valid_idx": valid_idx,
                        "fold": fold,
                        # "optimizer_state_dict": optimizer.state_dict(),
                        "thr": thr,
                        # 'scheduler_state_dict': scheduler.state_dict()
                    },
                    f=ckpt_path,
                )
                logger.info("A best score! Saved to checkpoints.")
                # Store this validation fold's predictions for the final F1
                # evaluation over the whole training set.
                all_pred[valid_idx] = valid_pred
                all_true[valid_idx] = valid_true
            if stop_flag:
                logger.info("Stop training due to early stopping.")
                break  # stop training

        # Store each validation fold's predictions for the final F1 evaluation:
        # oof[valid_idx] = valid_pred
        # valid_f1, _ = search_f1(valid_outputs, valid_pred)  # search for the best threshold and F1 score
        # print('Valid f1 score = ', valid_f1)
        # ==========================================================

    # After all folds, evaluate on the whole training set.
    # CrossEntropy
    # all_pred = np.argmax(all_pred, axis=1)
    # all_auc = roc_auc_score(all_true, all_pred)
    # all_p, all_r, all_f1, _ = precision_recall_fscore_support(all_true, all_pred, average='binary')
    # logger.info("all P {:.4f}, all R {:.4f}, all f1 {:.4f}, all auc {:.4f}".format(all_p, all_r, all_f1, all_auc))
    # logger.info('Confusion Matrix: ')
    # logger.info(confusion_matrix(y_true=all_true, y_pred=all_pred, normalize='all'))

    # MSELoss
    all_f1, all_thr = search_f1(all_true, all_pred)
    logger.info("All f1 {:.4f}, all thr {:.4f}".format(all_f1, all_thr))
    return all_f1, CHECKPOINT_PATH
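# `get_hinge_loss` is not defined in this section. Judging from its arguments
# (model scores, the (id, id_sub, label) array, and a MarginRankingLoss), a
# plausible sketch pairs positive and negative answers within each question id
# and asks each positive score to exceed each negative score by the margin.
# This is an assumption about the pair-wise ranking setup, not confirmed code.
import numpy as np
import torch

def get_hinge_loss(scores, qa_id, criterion):
    scores = scores.squeeze(-1)                 # [batch_size]
    ids, labels = qa_id[:, 0], qa_id[:, 2]
    pos, neg = [], []
    for qid in np.unique(ids):
        rows = np.where(ids == qid)[0]
        pos_rows = [r for r in rows if labels[r] == 1]
        neg_rows = [r for r in rows if labels[r] == 0]
        for p in pos_rows:
            for n in neg_rows:
                pos.append(scores[p])
                neg.append(scores[n])
    if not pos:
        return scores.sum() * 0.0               # no valid pair in this batch
    pos, neg = torch.stack(pos), torch.stack(neg)
    target = torch.ones_like(pos)               # positives should rank higher
    return criterion(pos, neg, target)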
def fit(x, y, z, dev_x, dev_y, dev_z, a, lr, decay_weight, n_epochs=n_epochs):
    if 'mnist' in sname:
        train_K = torch.eye(x.shape[0])
    else:
        train_K = (kernel(z, None, a, 1) + kernel(z, None, a / 10, 1) + kernel(z, None, a * 10, 1)) / 3
    if dev_z is not None:
        if 'mnist' in sname:
            dev_K = torch.eye(x.shape[0])
        else:
            dev_K = (kernel(dev_z, None, a, 1) + kernel(dev_z, None, a / 10, 1) + kernel(dev_z, None, a * 10, 1)) / 3

    n_data = x.shape[0]
    net = FCNN(x.shape[1]) if sname not in ['mnist_x', 'mnist_xz'] else CNN()
    es = EarlyStopping(patience=5)  # 10 for small
    optimizer = optim.Adam(list(net.parameters()), lr=lr, weight_decay=decay_weight)
    # optimizer = optim.SGD(list(net.parameters()), lr=1e-1, momentum=0.9)
    # optimizer = optim.Adadelta(list(net.parameters()))

    for epoch in range(n_epochs):
        permutation = torch.randperm(n_data)
        for i in range(0, n_data, batch_size):
            indices = permutation[i:i + batch_size]
            batch_x, batch_y = x[indices], y[indices]

            # training loop
            def closure():
                optimizer.zero_grad()
                pred_y = net(batch_x)
                loss = my_loss(pred_y, batch_y, indices, train_K)
                loss.backward()
                return loss

            optimizer.step(closure)  # does the update

        if epoch % 5 == 0 and epoch >= 50 and dev_x is not None:  # 5, 10 for small; 5, 50 for large
            g_pred = net(test_X)
            test_err = ((g_pred - test_G) ** 2).mean()
            if epoch == 50 and 'mnist' in sname:
                if z.shape[1] > 100:
                    # np.load returns a NumPy array; convert before torch.exp.
                    train_K = torch.from_numpy(
                        np.load(ROOT_PATH + '/mnist_precomp/{}_train_K0.npy'.format(sname))).float()
                    train_K = (torch.exp(-train_K / a ** 2 / 2) + torch.exp(-train_K / a ** 2 * 50)
                               + torch.exp(-train_K / a ** 2 / 200)) / 3
                    dev_K = torch.from_numpy(
                        np.load(ROOT_PATH + '/mnist_precomp/{}_dev_K0.npy'.format(sname))).float()
                    dev_K = (torch.exp(-dev_K / a ** 2 / 2) + torch.exp(-dev_K / a ** 2 * 50)
                             + torch.exp(-dev_K / a ** 2 / 200)) / 3
                else:
                    train_K = (kernel(z, None, a, 1) + kernel(z, None, a / 10, 1) + kernel(z, None, a * 10, 1)) / 3
                    dev_K = (kernel(dev_z, None, a, 1) + kernel(dev_z, None, a / 10, 1) + kernel(dev_z, None, a * 10, 1)) / 3
            dev_err = my_loss(net(dev_x), dev_y, None, dev_K)
            print('test', test_err, 'dev', dev_err)
            if es.step(dev_err):
                break
    return es.best, epoch, net
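# `kernel` is not defined in this section. The calls above average three
# Gaussian kernels at bandwidths a/10, a, and 10a, so a minimal RBF sketch is
# given below. Treating the last argument as an exponent on the scaled squared
# distance is an assumption; with power=1 this is a plain Gaussian kernel.
import torch

def kernel(x, y, a, power=1):
    y = x if y is None else y
    x = x.reshape(len(x), -1).float()           # cdist expects 2-D float inputs
    y = y.reshape(len(y), -1).float()
    dist2 = torch.cdist(x, y) ** 2              # pairwise squared distances
    return torch.exp(-(dist2 / (2 * a ** 2)) ** power)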
def train_pytorch(**kwargs):
    CHECKPOINT_PATH.mkdir(parents=True, exist_ok=True)

    # Calling logging.basicConfig attaches a root logger to the process, so that
    # log records from loggers in other modules show up on the console (child
    # loggers propagate to the root logger, which prints through its built-in
    # StreamHandler). Without logging.basicConfig, every child logger would need
    # its own StreamHandler, which is tedious.
    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        level=logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Print logs to the terminal.
    # stream_handler = logging.StreamHandler()
    # stream_handler.setFormatter(formatter)

    # Save logs to file.
    log_path = CHECKPOINT_PATH / 'train.log'
    file_handler = logging.FileHandler(filename=log_path, mode='w', encoding='utf-8')
    file_handler.setFormatter(formatter)

    # logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    inputs = kwargs['inputs']
    outputs = kwargs['outputs']
    # test_inputs = kwargs['test_inputs']

    # gkf = GroupKFold(n_splits=kwargs['n_splits']).split(X=df_train.q2, groups=df_train.id)
    sss = StratifiedShuffleSplit(n_splits=kwargs['n_splits'], test_size=0.2).split(X=df_train.q2, y=df_train.label)
    # skf = StratifiedKFold(n_splits=kwargs['n_splits'], shuffle=True,
    #                       random_state=RANDOM_SEED).split(X=df_train.q2, y=outputs)

    # oof = np.zeros((len(df_train), 1))
    all_pred = np.zeros(shape=(len(df_train), 2))  # classification task
    # all_pred = np.zeros(shape=(len(df_train)))  # regression task
    all_true = np.zeros(shape=(len(df_train)))
    for fold, (train_idx, valid_idx) in enumerate(sss):
        # for fold, (train_idx, valid_idx) in enumerate(skf):
        logger.info(f'Fold No. {fold}')
        train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]
        train_outputs = outputs[train_idx]
        train_qa_id = df_train[['id', 'id_sub', 'label']].iloc[train_idx]

        # Sample augmentation via back-translation (positives only).
        # Collect the (id, id_sub) pairs of the training samples:
        # train_id_set = set([f'{x[0]},{x[1]}' for x in df_train.iloc[train_idx][['id', 'id_sub']].to_numpy()])
        # # Select augmented samples whose (id, id_sub) appears in the training split:
        # mask = df_train_ex[['id', 'id_sub']].apply(lambda x: f'{x["id"]},{x["id_sub"]}' in train_id_set, axis=1)
        # df_train_fold = df_train_ex[mask]

        # Collect the (id, id_sub) pairs of the training samples:
        # train_id_set = set([f'{x[0]},{x[1]}' for x in df_train.iloc[train_idx][['id', 'id_sub']].to_numpy()])
        # # Select augmented samples whose (id, id_sub) appears in the training split:
        # mask = df_train_aug[['id', 'id_sub']].apply(lambda x: f'{x["id"]},{x["id_sub"]}' in train_id_set, axis=1)
        # df_train_fold = df_train_aug[mask]
        # train_inputs = compute_input_arrays(df_train_fold, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
        # train_outputs = compute_output_arrays(df_train_fold, output_categories)

        valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]
        valid_outputs = outputs[valid_idx]
        valid_qa_id = df_train[['id', 'id_sub', 'label']].iloc[valid_idx]

        train_set = HouseDataset(train_inputs, train_outputs, train_qa_id)
        valid_set = HouseDataset(valid_inputs, valid_outputs, valid_qa_id)
        # test_set = HouseDataset(test_inputs, np.zeros_like(test_inputs[0]))  # the test set has no labels
        logger.info('Train set size: {}, valid set size {}'.format(len(train_set), len(valid_set)))

        train_loader = DataLoader(train_set,
                                  batch_size=kwargs['batch_size'],
                                  shuffle=True  # recommended when training as classification
                                  )
        valid_loader = DataLoader(valid_set, batch_size=kwargs['valid_batch_size'])
        # test_loader = DataLoader(test_set, batch_size=512)

        device = torch.device(f"cuda:{kwargs['device']}")
        model = BertForHouseQA().cuda(device)

        # List all modules inside the model.
        logger.info('Model modules:')
        for i, m in enumerate(model.named_children()):
            logger.info('{} -> {}'.format(i, m))

        # Get the number of total parameters.
        # total_params = sum(p.numel() for p in model.parameters())
        # trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        # logger.info("Total params: {:,}".format(total_params))
        # logger.info("Trainable params: {:,}".format(trainable_params))

        # Use hinge loss.
        # criterion = torch.nn.MarginRankingLoss(margin=1.0)
        # criterion = torch.nn.MSELoss()
        criterion = torch.nn.CrossEntropyLoss()

        optimizer = torch.optim.Adam(model.parameters(), lr=kwargs['lr'], weight_decay=kwargs['weight_decay'])
        logger.info('Optimizer:')
        logger.info(optimizer)

        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
        #                                                        mode='min',
        #                                                        patience=8,
        #                                                        verbose=True)

        # best_score = 0.0
        stopper = EarlyStopping(patience=kwargs['patience'], mode='max')
        ckpt_path = None
        for epoch in range(kwargs['epoch']):
            # =======================Training===========================
            # Set model to train mode.
            model.train()
            steps = int(np.ceil(len(train_set) / kwargs['batch_size']))
            pbar = tqdm(desc='Epoch {}, loss {}'.format(epoch, 'NAN'), total=steps)
            for i, sample in enumerate(train_loader):
                x, y = sample[0].cuda(device).long(), sample[1].cuda(device).long()
                optimizer.zero_grad()
                model_outputs = model(x)  # [batch_size, 2]

                # CrossEntropy
                loss = criterion(model_outputs, y)
                # MSE
                # loss = criterion(model_outputs, y.float().unsqueeze(-1))
                # Hinge loss
                # train_qa_id_sub = sample[2].numpy()
                # loss = get_hinge_loss(model_outputs, train_qa_id_sub, criterion)
                # SCL (left unfinished in the original)
                # inners = torch.do

                loss.backward()
                optimizer.step()
                pbar.set_description('Epoch {}, train loss {:.4f}'.format(epoch, loss.item()))
                pbar.update()
            pbar.close()
            # =========================================================

            # =======================Validation========================
            # Set model to evaluation mode.
            model.eval()
            with torch.no_grad():
                # Validation step
                valid_loss = []
                valid_pred = []
                valid_true = []
                steps = int(np.ceil(len(valid_set) / kwargs['valid_batch_size']))
                pbar = tqdm(desc='Validating', total=steps)
                for i, sample in enumerate(valid_loader):
                    y_true_local = sample[1].numpy()
                    x, y_true = sample[0].cuda(device).long(), sample[1].cuda(device).long()
                    model_outputs = model(x)

                    # MSELoss
                    # loss = criterion(model_outputs, y_true.float().unsqueeze(-1)).cpu().detach().item()
                    # Hinge loss
                    # valid_qa_id_sub = sample[2].numpy()
                    # loss = get_hinge_loss(model_outputs, valid_qa_id_sub, criterion)
                    # y_pred = model_outputs.cpu().detach().squeeze(-1).numpy()
                    # CrossEntropy
                    loss = criterion(model_outputs, y_true).cpu().detach().item()
                    y_pred = F.softmax(model_outputs.cpu().detach(), dim=1).numpy()

                    valid_loss.append(loss)
                    valid_pred.append(y_pred)
                    valid_true.append(y_true_local)
                    pbar.update()
                pbar.close()
            valid_loss = np.asarray(valid_loss).mean()
            valid_pred = np.concatenate(valid_pred, axis=0)
            valid_true = np.concatenate(valid_true, axis=0)

            # When using the regression model:
            # valid_f1, thr = search_f1(valid_true, valid_pred)
            # logger.info("Epoch {}, valid loss {:.5f}, valid f1 {:.4f}".format(epoch, valid_loss, valid_f1))

            # When using the classification model:
            valid_pred_label = np.argmax(valid_pred, axis=1)
            valid_auc = roc_auc_score(valid_true, valid_pred_label)
            valid_p, valid_r, valid_f1, _ = precision_recall_fscore_support(
                valid_true, valid_pred_label, average='binary')

            # Apply ReduceLROnPlateau to the lr.
            # scheduler.step(valid_loss)

            logger.info(
                "Epoch {}, valid loss {:.5f}, valid P {:.4f}, valid R {:.4f}, valid f1 {:.4f}, valid auc {:.4f}".format(
                    epoch, valid_loss, valid_p, valid_r, valid_f1, valid_auc))
            logger.info('Confusion Matrix: ')
            logger.info(confusion_matrix(y_true=valid_true, y_pred=valid_pred_label, normalize='all'))

            stop_flag, best_flag = stopper.step(valid_f1)
            if best_flag:
                # Delete the previously saved checkpoint.
                if ckpt_path is not None:
                    ckpt_path.unlink()
                ckpt_path = CHECKPOINT_PATH / f"{MODEL_NAME}_{fold}_{epoch}_{stopper.best_score}.pt"
                # Save the current best model.
                torch.save(
                    {
                        "model_name": "BertForHouseQA",
                        "epoch": epoch,
                        "valid_loss": valid_loss,
                        "valid_f1": valid_f1,
                        "model_state_dict": model.state_dict(),
                        # "optimizer_state_dict": optimizer.state_dict(),
                        # "thr": thr
                        # 'scheduler_state_dict': scheduler.state_dict()
                    },
                    f=ckpt_path,
                )
                logger.info("A best score! Saved to checkpoints.")
                # Store this validation fold's predictions for the final F1
                # evaluation over the whole training set.
                all_pred[valid_idx] = valid_pred
                all_true[valid_idx] = valid_true
            if stop_flag:
                logger.info("Stop training due to early stopping.")
                break  # stop training

        # Store each validation fold's predictions for the final F1 evaluation:
        # oof[valid_idx] = valid_pred
        # valid_f1, _ = search_f1(valid_outputs, valid_pred)  # search for the best threshold and F1 score
        # print('Valid f1 score = ', valid_f1)
        # ==========================================================

    # After all folds, evaluate on the whole training set.
    # CrossEntropy
    all_pred = np.argmax(all_pred, axis=1)
    all_auc = roc_auc_score(all_true, all_pred)
    all_p, all_r, all_f1, _ = precision_recall_fscore_support(all_true, all_pred, average='binary')
    logger.info("all P {:.4f}, all R {:.4f}, all f1 {:.4f}, all auc {:.4f}".format(all_p, all_r, all_f1, all_auc))
    logger.info('Confusion Matrix: ')
    logger.info(confusion_matrix(y_true=all_true, y_pred=all_pred, normalize='all'))

    # MSELoss
    # all_f1, all_thr = search_f1(all_true, all_pred)
    # logger.info("All f1 {:.4f}, all thr {:.4f}".format(all_f1, all_thr))
    return all_f1, CHECKPOINT_PATH
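# `search_f1` is not defined in this section. In the regression path it is
# used as a threshold sweep that binarizes the scores and returns the best F1
# together with the threshold achieving it. A minimal sketch with sklearn's
# f1_score; the sweep range and granularity here are assumptions.
import numpy as np
from sklearn.metrics import f1_score

def search_f1(y_true, y_score):
    best_f1, best_thr = 0.0, 0.0
    for thr in np.arange(0.0, 1.0, 0.01):
        f1 = f1_score(y_true, (y_score > thr).astype(int))
        if f1 > best_f1:
            best_f1, best_thr = f1, thr
    return best_f1, best_thr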
def main(args):
    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    splits = ['train', 'valid'] + (['test'] if args.test else [])

    datasets = OrderedDict()
    for split in splits:
        datasets[split] = PTB(data_dir=args.data_dir,
                              split=split,
                              create_data=args.create_data,
                              max_sequence_length=args.max_sequence_length,
                              min_occ=args.min_occ)

    model = SentenceVAE(vocab_size=datasets['train'].vocab_size,
                        sos_idx=datasets['train'].sos_idx,
                        eos_idx=datasets['train'].eos_idx,
                        pad_idx=datasets['train'].pad_idx,
                        unk_idx=datasets['train'].unk_idx,
                        max_sequence_length=args.max_sequence_length,
                        embedding_size=args.embedding_size,
                        rnn_type=args.rnn_type,
                        hidden_size=args.hidden_size,
                        word_dropout=args.word_dropout,
                        embedding_dropout=args.embedding_dropout,
                        latent_size=args.latent_size,
                        num_layers=args.num_layers,
                        bidirectional=args.bidirectional)

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join(args.logdir, expierment_name(args, ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path, ts)
    os.makedirs(save_model_path)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1 / (1 + np.exp(-k * (step - x0))))
        elif anneal_function == 'linear':
            return min(1, step / x0)
        else:
            return 1.0

    # reduction='sum' replaces the deprecated size_average=False
    NLL = torch.nn.NLLLoss(reduction='sum', ignore_index=datasets['train'].pad_idx)

    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k, x0):
        # cut off unnecessary padding from target, and flatten
        target = target[:, :torch.max(length).item()].contiguous().view(-1)
        logp = logp.view(-1, logp.size(2))

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
    step = 0
    early_stop = EarlyStopping(min_delta=0.001, patience=5)
    for epoch in range(args.epochs):
        for split in splits:
            data_loader = DataLoader(dataset=datasets[split],
                                     batch_size=args.batch_size,
                                     shuffle=split == 'train',
                                     num_workers=cpu_count(),
                                     pin_memory=torch.cuda.is_available())

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            for iteration, batch in enumerate(data_loader):
                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'], batch['length'])

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'], batch['length'],
                                                       mean, logv, args.anneal_function, step,
                                                       args.k, args.x0)
                if split != 'train':
                    KL_weight = 1.0

                loss = (NLL_loss + KL_weight * KL_loss) / batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                # bookkeeping (view(1) so the 0-dim loss can be concatenated;
                # .item() replaces the deprecated .data[0] below)
                tracker['ELBO'] = torch.cat((tracker['ELBO'], loss.detach().view(1)))

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO" % split.upper(), loss.item(),
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss" % split.upper(), NLL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss" % split.upper(), KL_loss.item() / batch_size,
                                      epoch * len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight" % split.upper(), KL_weight,
                                      epoch * len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration + 1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f" %
                          (split.upper(), iteration, len(data_loader) - 1, loss.item(),
                           NLL_loss.item() / batch_size, KL_loss.item() / batch_size, KL_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(batch['target'].data,
                                                        i2w=datasets['train'].get_i2w(),
                                                        pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f" %
                  (split.upper(), epoch, args.epochs, torch.mean(tracker['ELBO'])))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO" % split.upper(), torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {'target_sents': tracker['target_sents'], 'z': tracker['z'].tolist()}
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/' + ts)
                with open(os.path.join('dumps/' + ts + '/valid_E%i.json' % epoch), 'w') as dump_file:
                    json.dump(dump, dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path, "E%i.pytorch" % (epoch))
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s" % checkpoint_path)

            if split == 'valid' and early_stop.step(torch.mean(tracker['ELBO'])):
                print("Early Stopping after {}".format(epoch))
                exit(0)
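# `to_var` is not defined in this section; it is presumably the usual
# Variable-era helper that moves a tensor to the GPU when one is available.
# A minimal sketch (modern PyTorch no longer needs the Variable wrapper):
import torch

def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return x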
# (This snippet begins mid-call; the train_loader construction is reconstructed
# from the parallel val_loader below, with shuffle=True assumed for training.)
train_loader = torch.utils.data.DataLoader(dataset.listDataset(
    train_list,
    shuffle=True,
    transform=tf,
), batch_size=1)
val_loader = torch.utils.data.DataLoader(dataset.listDataset(
    val_list,
    shuffle=False,
    transform=tf,
), batch_size=1)

train_loss_values = []
for epoch in range(num_of_epochs):
    print(' --- teacher training: epoch {}'.format(epoch + 1))
    train_loss = train(model, optimizer, train_loader)

    # evaluate for one epoch on the validation set
    val = evaluate(model, val_loader)
    train_loss_values.append(val)  # note: this actually records the validation metric

    # checkpoint every epoch (the best-metric guard is currently disabled)
    if True:
        # print("New Best!")
        # best = val.item()
        torch.save(model.state_dict(), 'checkpoints/CP_{}.pth'.format(epoch))
        print("Checkpoint {} saved!".format(epoch + 1))

    if es.step(val):
        plt.plot(train_loss_values)
        print("Early Stopping . . .")
        break

    scheduler.step()
def fit(self, input_xy, lr=0.01, weight=None, epoch='auto', print_batch_num=10,
        batch_size=128, tot_size=np.inf, max_epoch=20, test_input=None,
        target_names=None, enable_early_stopping=False, **kwargs):
    optimizer = torch.optim.SGD(self.nn_module.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(
        weight=torch.tensor(weight, dtype=torch.float32, device=self.device)
        if weight is not None else None)
    if test_input is not None:
        # use the F1 score returned by test() when a validation set is specified
        early_stopping = EarlyStopping(mode='max', patience=5, percentage=True)
        test_input = tee(test_input, max_epoch if test_input is not None else 0)
    else:
        # else use the average loss
        early_stopping = EarlyStopping(mode='min', patience=5, percentage=False)

    logging.info('Train starting...')
    for epoch_idx, epoch_input in enumerate(tee(input_xy, max_epoch)):
        tot_loss = []
        for batch_cnt, batch_data in enumerate(zip(*([iter(epoch_input)] * batch_size))):
            outputs = []
            trues = []
            optimizer.zero_grad()
            for x, y in batch_data:
                output = self.nn_module(x).view(-1)
                outputs.append(output)
                trues.append(y.argmax())
            outputs = torch.stack(outputs)
            trues = torch.tensor(trues, dtype=torch.long, device=self.device)
            loss = criterion(outputs, trues)
            if (batch_cnt + 1) % print_batch_num == 0:
                logging.debug('Epoch %5.3f, Current loss: %10.8f' %
                              (epoch_idx + batch_cnt * batch_size / tot_size, loss))
            tot_loss.append(loss.item())
            loss.backward()
            optimizer.step()
        if test_input is not None:
            score = self.test(test_input[epoch_idx], target_names=target_names)
            logging.info('Epoch %d, test score %4.2f' % (epoch_idx, score))
        else:
            score = sum(tot_loss) / len(tot_loss)
            logging.info('Epoch %d, avg loss %4.2f' % (epoch_idx, score))
        if enable_early_stopping and early_stopping.step(score):
            logging.info('Early stopped.')
            break
    logging.info('Train end.')
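# The itertools.tee calls above are what let this fit() consume a one-shot
# generator for several epochs: tee(gen, n) returns n independent iterators
# over the same underlying stream, one per epoch. A small self-contained
# illustration of the pattern (note that tee buffers items in memory while
# any returned iterator lags behind the others):
from itertools import tee

def sample_stream():
    for i in range(4):
        yield i  # stands in for an (x, y) pair

epochs = tee(sample_stream(), 3)          # one iterator per epoch
for epoch_idx, epoch_input in enumerate(epochs):
    print(epoch_idx, list(epoch_input))   # each epoch sees the full stream
# 0 [0, 1, 2, 3]
# 1 [0, 1, 2, 3]
# 2 [0, 1, 2, 3]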
def init_model_and_train(
        label='',
        crf=parameters['crf'],
        char_mode=parameters['char_mode'],
        encoder_mode=parameters['encoder_mode'],
        use_gpu=parameters['use_gpu'],
        eval_every=parameters['eval_every'],    # calculate F-1 score after this many iterations
        plot_every=parameters['plot_every'],    # store loss after this many iterations
        gradient_clip=parameters['gradient_clip'],
        total_epochs=parameters['epochs'] + 1,
        output_dir=parameters['output_dir'],
        embedding_dim=parameters['word_dim'],
        hidden_dim=parameters['word_lstm_dim']):
    # Create model
    model = BiLSTM_CRF(vocab_size=len(word_to_id),
                       tag_to_ix=tag_to_id,
                       embedding_dim=embedding_dim,
                       hidden_dim=hidden_dim,
                       use_gpu=use_gpu,
                       char_to_ix=char_to_id,
                       pre_word_embeds=word_embeds,
                       use_crf=crf,
                       char_mode=char_mode,
                       encoder_mode=encoder_mode)

    # Enable GPU
    if use_gpu:
        model.cuda()

    print(f"Char mode: {char_mode}, Encoder mode: {encoder_mode}")

    # Training parameters
    learning_rate = 0.015
    momentum = 0.9
    decay_rate = 0.05
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

    # Variables used in the training process
    losses = []          # list to store all losses
    loss = 0.0           # loss initialization
    best_dev_F = -1.0    # current best F-1 score on the dev set
    best_test_F = -1.0   # current best F-1 score on the test set
    best_train_F = -1.0  # current best F-1 score on the train set
    all_F = [[0, 0, 0]]      # list storing all the F-1 scores
    all_acc = [[0, 0, 0]]    # list storing all the accuracy scores
    count = 0            # counts the number of iterations
    train_length = len(train_data)

    # Define early stopping
    es = EarlyStopping(patience=3, mode='max')
    # eval_every = 1

    tr = time.time()
    model.train(True)
    for epoch in range(1, total_epochs):
        print(f'Epoch {epoch}:')
        for i, index in enumerate(np.random.permutation(train_length)):
            # for i, index in enumerate(np.random.permutation(eval_every)):
            count += 1
            data = train_data[index]

            # gradient updates for each data entry
            model.zero_grad()

            sentence_in = data['words']
            sentence_in = Variable(torch.LongTensor(sentence_in))
            tags = data['tags']
            chars2 = data['chars']

            if char_mode == 'LSTM':
                chars2_sorted = sorted(chars2, key=lambda p: len(p), reverse=True)
                d = {}
                # map sorted positions back to original positions
                # (loop variables renamed so they do not shadow the outer i)
                for orig_pos, ci in enumerate(chars2):
                    for sorted_pos, cj in enumerate(chars2_sorted):
                        if ci == cj and sorted_pos not in d and orig_pos not in d.values():
                            d[sorted_pos] = orig_pos
                            continue
                chars2_length = [len(c) for c in chars2_sorted]
                char_maxl = max(chars2_length)
                chars2_mask = np.zeros((len(chars2_sorted), char_maxl), dtype='int')
                for row, c in enumerate(chars2_sorted):
                    chars2_mask[row, :chars2_length[row]] = c
                chars2_mask = Variable(torch.LongTensor(chars2_mask))

            if char_mode == 'CNN':
                d = {}
                # pad each word to the max word length of the sentence
                chars2_length = [len(c) for c in chars2]
                char_maxl = max(chars2_length)
                chars2_mask = np.zeros((len(chars2_length), char_maxl), dtype='int')
                for row, c in enumerate(chars2):
                    chars2_mask[row, :chars2_length[row]] = c
                chars2_mask = Variable(torch.LongTensor(chars2_mask))

            targets = torch.LongTensor(tags)

            # calculate the negative log-likelihood for the predicted tags
            # using the predefined function
            if use_gpu:
                neg_log_likelihood = model.get_neg_log_likelihood(
                    sentence_in.cuda(), targets.cuda(), chars2_mask.cuda(), chars2_length, d)
            else:
                neg_log_likelihood = model.get_neg_log_likelihood(
                    sentence_in, targets, chars2_mask, chars2_length, d)
            loss += neg_log_likelihood.item() / len(data['words'])
            neg_log_likelihood.backward()

            # gradient clipping to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
            optimizer.step()

            # Storing loss
            if count % plot_every == 0:
                loss /= plot_every
                print(count, ': ', loss)
                if losses == []:
                    losses.append(loss)
                losses.append(loss)
                loss = 0.0

        # Evaluating on Train, Test, Dev Sets
        if (epoch > 20) or (epoch % eval_every == 0):
            print(f'Evaluating on Train, Test, Dev Sets at count={count}')
            model.train(False)
            best_train_F, new_train_F, new_train_acc, _ = evaluating(
                model, train_data, best_train_F, "Train", char_mode=char_mode, use_gpu=use_gpu)
            best_dev_F, new_dev_F, new_dev_acc, save = evaluating(
                model, dev_data, best_dev_F, "Dev", char_mode=char_mode, use_gpu=use_gpu)
            if save:
                print("Saving Model to ", model_name)
                torch.save(model.state_dict(), model_name)
            best_test_F, new_test_F, new_test_acc, _ = evaluating(
                model, test_data, best_test_F, "Test", char_mode=char_mode, use_gpu=use_gpu)
            all_F.append([new_train_F, new_dev_F, new_test_F])
            all_acc.append([new_train_acc, new_dev_acc, new_test_acc])
            model.train(True)

        # early stopping is tracked on the dev F-1 score
        if (epoch > 20 or epoch % eval_every == 0) and es.step(all_F[-1][1]):
            print(f'Early stopping: epoch={epoch}, count={count}, new_dev_F={all_F[-1][1]}')
            break  # the early stopping criterion is met, we can stop now

        # Decay the learning rate
        adjust_learning_rate(optimizer, lr=learning_rate / (1 + decay_rate * count / len(train_data)))

    print(f'{(time.time() - tr) / 60} minutes')
    torch.save(model, output_dir + '/' + label + '.model')

    plt.figure(0)
    plt.plot(losses)
    plt.savefig(output_dir + '/' + label + '_appended.png', transparent=True)
    plt.figure(1)
    plt.clf()
    plt.plot(losses)
    plt.savefig(output_dir + '/' + label + '.png', transparent=True)
    return all_F
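# `adjust_learning_rate` is not defined in this section. It is presumably the
# usual helper that overwrites the learning rate on every parameter group of
# the optimizer; a minimal sketch:
def adjust_learning_rate(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr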