def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1

    # raw_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df = pd.read_csv('./mnt/inputs/pseudos/top2_e078_e079_e080_e081_e082_e083/opt_pseudo_tst_df.csv')
    # raw_pseudo_df2 = pd.read_csv('./mnt/inputs/pseudos/top2_e121_e125_e126_e127_e128_e129/raw_pseudo_tst_df.csv')
    # half_opt_pseudo_df2 = pd.read_csv('./mnt/inputs/pseudos/top2_e121_e125_e126_e127_e128_e129/half_opt_pseudo_tst_df.csv')
    # opt_pseudo_df2 = pd.read_csv('./mnt/inputs/pseudos/top2_e121_e125_e126_e127_e128_e129/opt_pseudo_tst_df.csv')

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    # load additional tokens
    # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin:
    #     additional_tokens = pickle.load(fin)

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original rows for validation
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)

        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' ')))
            )).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]  # + additional_tokens

        # fold_trn_df = pd.concat([fold_trn_df, raw_pseudo_df, opt_pseudo_df, half_opt_pseudo_df, raw_pseudo_df2, opt_pseudo_df2, half_opt_pseudo_df2], axis=0)

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        # fobj = BCEWithLogitsLoss()
        # fobj = FocalLossKaggle(gamma=2)
        fobj = MarginRankingLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            config_path=MODEL_CONFIG_PATH,
            state_dict=state_dict,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
            cat_last_layer_num=1,
            do_ratio=0.2,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpointed model, optimizer, and scheduler states
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            # warm up the classifier head for the first epoch, then unfreeze BERT
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader,
                                       DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')
            scheduler.step()

            # accumulate per-fold histories
            histories['trn_loss'].setdefault(fold, []).append(trn_loss)
            histories['val_loss'].setdefault(fold, []).append(val_loss)
            histories['val_metric'].setdefault(fold, []).append(val_metric)
            histories['val_metric_raws'].setdefault(fold, []).append(
                val_metric_raws)

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '
            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)

            # unwrap DataParallel before checkpointing
            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric,
            )

        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer,
            clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
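# The `val_metric` returned by `test()` is not defined in this snippet; for a
# QUEST-style multi-label target the usual choice is the mean column-wise
# Spearman rho, so a hypothetical sketch of such a metric (names assumed):
from scipy.stats import spearmanr

def mean_columnwise_spearman(y_trues, y_preds):
    # one rho per target column; nanmean guards against constant columns
    raws = [spearmanr(y_trues[:, i], y_preds[:, i]).correlation
            for i in range(y_trues.shape[1])]
    return float(np.nanmean(raws)), raws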
def main(args):
    # Select the hardware device to use for training.
    if torch.cuda.is_available():
        device = torch.device('cuda', torch.cuda.current_device())
        torch.backends.cudnn.benchmark = True
    else:
        device = torch.device('cpu')

    # Disable gradient calculations by default.
    torch.set_grad_enabled(False)

    # create checkpoint dir
    os.makedirs(args.checkpoint, exist_ok=True)

    if args.arch == 'hg1':
        model = hg1(pretrained=False)
    elif args.arch == 'hg2':
        model = hg2(pretrained=False)
    elif args.arch == 'hg8':
        model = hg8(pretrained=False)
    else:
        raise Exception('unrecognised model architecture: ' + args.arch)

    model = DataParallel(model).to(device)

    optimizer = RMSprop(model.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay)
    best_acc = 0

    # optionally resume from a checkpoint
    title = 'mpii ' + args.arch
    if args.resume:
        assert os.path.isfile(args.resume)
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        args.start_epoch = checkpoint['epoch']
        best_acc = checkpoint['best_acc']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.resume, checkpoint['epoch']))
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                        title=title,
                        resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(
            ['Epoch', 'LR', 'Train Loss', 'Val Loss', 'Train Acc', 'Val Acc'])

    # create data loaders
    train_dataset = Mpii(args.image_path, is_train=True)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.train_batch,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)
    val_dataset = Mpii(args.image_path, is_train=False)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.test_batch,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

    # train and eval
    lr = args.lr
    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, lr, args.schedule,
                                  args.gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))

        # train for one epoch
        train_loss, train_acc = do_training_epoch(train_loader, model, device,
                                                  optimizer)

        # evaluate on validation set
        valid_loss, valid_acc, predictions = do_validation_epoch(
            val_loader, model, device, False)

        # append logger file
        logger.append(
            [epoch + 1, lr, train_loss, valid_loss, train_acc, valid_acc])

        # remember best acc and save checkpoint
        is_best = valid_acc > best_acc
        best_acc = max(valid_acc, best_acc)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            },
            predictions,
            is_best,
            checkpoint=args.checkpoint,
            snapshot=args.snapshot)

    logger.close()
    logger.plot(['Train Acc', 'Val Acc'])
    savefig(os.path.join(args.checkpoint, 'log.eps'))
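# `adjust_learning_rate` is referenced above but not defined in this snippet;
# a minimal sketch under the usual convention (multiply the rate by `gamma`
# whenever `epoch` reaches a milestone listed in `schedule`):
def adjust_learning_rate(optimizer, epoch, lr, schedule, gamma):
    """Decay `lr` by `gamma` at the epochs listed in `schedule`."""
    if epoch in schedule:
        lr *= gamma
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
    return lr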
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_useful_start_idx = get_useful_start_idx(sequence_length,
                                                  train_num_each)
    val_useful_start_idx = get_useful_start_idx(sequence_length, val_num_each)

    # truncate so the sample count divides evenly across the GPUs
    num_train_we_use = len(train_useful_start_idx) // num_gpu * num_gpu
    num_val_we_use = len(val_useful_start_idx) // num_gpu * num_gpu
    # num_train_we_use = 4
    # num_val_we_use = 800

    train_we_use_start_idx = train_useful_start_idx[0:num_train_we_use]
    val_we_use_start_idx = val_useful_start_idx[0:num_val_we_use]

    train_idx = []
    for i in range(num_train_we_use):
        for j in range(sequence_length):
            train_idx.append(train_we_use_start_idx[i] + j)

    val_idx = []
    for i in range(num_val_we_use):
        for j in range(sequence_length):
            val_idx.append(val_we_use_start_idx[i] + j)

    num_train_all = len(train_idx)
    num_val_all = len(val_idx)

    print('num train start idx : {:6d}'.format(len(train_useful_start_idx)))
    print('last idx train start: {:6d}'.format(train_useful_start_idx[-1]))
    print('num of train dataset: {:6d}'.format(num_train))
    print('num of train we use : {:6d}'.format(num_train_we_use))
    print('num of all train use: {:6d}'.format(num_train_all))
    print('num valid start idx : {:6d}'.format(len(val_useful_start_idx)))
    print('last idx valid start: {:6d}'.format(val_useful_start_idx[-1]))
    print('num of valid dataset: {:6d}'.format(num_val))
    print('num of valid we use : {:6d}'.format(num_val_we_use))
    print('num of all valid use: {:6d}'.format(num_val_all))

    train_loader = DataLoader(train_dataset,
                              batch_size=train_batch_size,
                              sampler=train_idx,
                              num_workers=workers,
                              pin_memory=False)
    val_loader = DataLoader(val_dataset,
                            batch_size=val_batch_size,
                            sampler=val_idx,
                            num_workers=workers,
                            pin_memory=False)

    model = multi_gru()
    if use_gpu:
        model = model.cuda()
    model = DataParallel(model)
    criterion_1 = nn.BCEWithLogitsLoss(size_average=False)
    criterion_2 = nn.CrossEntropyLoss(size_average=False)
    sig_f = nn.Sigmoid()

    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD(model.parameters(),
                                  lr=learning_rate,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(
                    optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif multi_optim == 1:
        # per-module learning rates: shared CNN trunk gets lr/10
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {'params': model.module.share.parameters()},
                {'params': model.module.gru.parameters(), 'lr': learning_rate},
                {'params': model.module.fc.parameters(), 'lr': learning_rate},
            ],
                                  lr=learning_rate / 10,
                                  momentum=momentum,
                                  dampening=dampening,
                                  weight_decay=weight_decay,
                                  nesterov=use_nesterov)
            if sgd_adjust_lr == 0:
                exp_lr_scheduler = lr_scheduler.StepLR(
                    optimizer, step_size=sgd_adjust_lr, gamma=sgd_gamma)
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(
                    optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {'params': model.module.share.parameters()},
                {'params': model.module.gru.parameters(), 'lr': learning_rate},
                {'params': model.module.fc.parameters(), 'lr': learning_rate},
            ],
                                   lr=learning_rate / 10)

    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy_1 = 0.0
    best_val_accuracy_2 = 0.0  # model selection is judged by accu_2
    correspond_train_acc_1 = 0.0
    correspond_train_acc_2 = 0.0
    record_np = np.zeros([epochs, 8])

    for epoch in range(epochs):
        # np.random.seed(epoch)
        np.random.shuffle(train_we_use_start_idx)
        train_idx = []
        for i in range(num_train_we_use):
            for j in range(sequence_length):
                train_idx.append(train_we_use_start_idx[i] + j)
        train_loader = DataLoader(train_dataset,
                                  batch_size=train_batch_size,
                                  sampler=train_idx,
                                  num_workers=workers,
                                  pin_memory=False)

        model.train()
        train_loss_1 = 0.0
        train_loss_2 = 0.0
        train_corrects_1 = 0
        train_corrects_2 = 0
        train_start_time = time.time()

        for data in train_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = Variable(inputs.cuda())
                labels_1 = Variable(labels_1.cuda())
                labels_2 = Variable(labels_2.cuda())
            else:
                inputs = Variable(inputs)
                labels_1 = Variable(labels_1)
                labels_2 = Variable(labels_2)

            optimizer.zero_grad()
            outputs_1, outputs_2 = model.forward(inputs)
            _, preds_2 = torch.max(outputs_2.data, 1)
            # multi-label head: sigmoid then threshold at 0.5
            sig_out = outputs_1.data.cpu()
            sig_out = sig_f(sig_out)
            preds_1 = torch.ByteTensor(sig_out > 0.5)
            preds_1 = preds_1.long()
            train_corrects_1 += torch.sum(preds_1 == labels_1.data.cpu())
            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)
            loss_2 = criterion_2(outputs_2, labels_2)
            loss = loss_1 + loss_2
            loss.backward()
            optimizer.step()

            train_loss_1 += loss_1.data[0]
            train_loss_2 += loss_2.data[0]
            train_corrects_2 += torch.sum(preds_2 == labels_2.data)

        train_elapsed_time = time.time() - train_start_time
        train_accuracy_1 = train_corrects_1 / num_train_all / 7
        train_accuracy_2 = train_corrects_2 / num_train_all
        train_average_loss_1 = train_loss_1 / num_train_all / 7
        train_average_loss_2 = train_loss_2 / num_train_all

        # begin eval
        model.eval()
        val_loss_1 = 0.0
        val_loss_2 = 0.0
        val_corrects_1 = 0
        val_corrects_2 = 0
        val_start_time = time.time()

        for data in val_loader:
            inputs, labels_1, labels_2 = data
            # keep only the label of the last frame of each sequence
            labels_2 = labels_2[(sequence_length - 1)::sequence_length]
            if use_gpu:
                inputs = Variable(inputs.cuda(), volatile=True)
                labels_1 = Variable(labels_1.cuda(), volatile=True)
                labels_2 = Variable(labels_2.cuda(), volatile=True)
            else:
                inputs = Variable(inputs, volatile=True)
                labels_1 = Variable(labels_1, volatile=True)
                labels_2 = Variable(labels_2, volatile=True)

            if crop_type == 0 or crop_type == 1:
                outputs_1, outputs_2 = model.forward(inputs)
            elif crop_type == 5:
                # 5-crop test-time augmentation: average over the crops
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs_1, outputs_2 = model.forward(inputs)
                outputs_1 = outputs_1.view(5, -1, 7)
                outputs_1 = torch.mean(outputs_1, 0)
                outputs_2 = outputs_2.view(5, -1, 7)
                outputs_2 = torch.mean(outputs_2, 0)
            elif crop_type == 10:
                inputs = inputs.permute(1, 0, 2, 3, 4).contiguous()
                inputs = inputs.view(-1, 3, 224, 224)
                outputs_1, outputs_2 = model.forward(inputs)
                outputs_1 = outputs_1.view(10, -1, 7)
                outputs_1 = torch.mean(outputs_1, 0)
                outputs_2 = outputs_2.view(10, -1, 7)
                outputs_2 = torch.mean(outputs_2, 0)

            outputs_2 = outputs_2[sequence_length - 1::sequence_length]
            _, preds_2 = torch.max(outputs_2.data, 1)
            sig_out = outputs_1.data.cpu()
            sig_out = sig_f(sig_out)
            preds_1 = torch.ByteTensor(sig_out > 0.5)
            preds_1 = preds_1.long()
            val_corrects_1 += torch.sum(preds_1 == labels_1.data.cpu())
            labels_1 = Variable(labels_1.data.float())
            loss_1 = criterion_1(outputs_1, labels_1)
            val_loss_1 += loss_1.data[0]
            loss_2 = criterion_2(outputs_2, labels_2)
            val_loss_2 += loss_2.data[0]
            val_corrects_2 += torch.sum(preds_2 == labels_2.data)

        val_elapsed_time = time.time() - val_start_time
        val_accuracy_1 = val_corrects_1 / (num_val_all * 7)
        val_accuracy_2 = val_corrects_2 / num_val_we_use
        val_average_loss_1 = val_loss_1 / (num_val_all * 7)
        val_average_loss_2 = val_loss_2 / num_val_we_use

        print('epoch: {:4d}'
              ' train time: {:2.0f}m{:2.0f}s'
              ' train loss_1: {:4.4f}'
              ' train accu_1: {:.4f}'
              ' valid time: {:2.0f}m{:2.0f}s'
              ' valid loss_1: {:4.4f}'
              ' valid accu_1: {:.4f}'.format(
                  epoch, train_elapsed_time // 60, train_elapsed_time % 60,
                  train_average_loss_1, train_accuracy_1,
                  val_elapsed_time // 60, val_elapsed_time % 60,
                  val_average_loss_1, val_accuracy_1))
        print('epoch: {:4d}'
              ' train time: {:2.0f}m{:2.0f}s'
              ' train loss_2: {:4.4f}'
              ' train accu_2: {:.4f}'
              ' valid time: {:2.0f}m{:2.0f}s'
              ' valid loss_2: {:4.4f}'
              ' valid accu_2: {:.4f}'.format(
                  epoch, train_elapsed_time // 60, train_elapsed_time % 60,
                  train_average_loss_2, train_accuracy_2,
                  val_elapsed_time // 60, val_elapsed_time % 60,
                  val_average_loss_2, val_accuracy_2))

        if optimizer_choice == 0:
            if sgd_adjust_lr == 0:
                exp_lr_scheduler.step()
            elif sgd_adjust_lr == 1:
                exp_lr_scheduler.step(val_average_loss_1 + val_average_loss_2)

        # model selection: prefer higher accu_2 (given accu_1 > 0.95),
        # break ties by accu_1, then by the corresponding train accuracies
        if val_accuracy_2 > best_val_accuracy_2 and val_accuracy_1 > 0.95:
            best_val_accuracy_2 = val_accuracy_2
            best_val_accuracy_1 = val_accuracy_1
            correspond_train_acc_1 = train_accuracy_1
            correspond_train_acc_2 = train_accuracy_2
            best_model_wts = copy.deepcopy(model.state_dict())
        elif val_accuracy_2 == best_val_accuracy_2 and val_accuracy_1 > 0.95:
            if val_accuracy_1 > best_val_accuracy_1:
                correspond_train_acc_1 = train_accuracy_1
                correspond_train_acc_2 = train_accuracy_2
                best_model_wts = copy.deepcopy(model.state_dict())
            elif val_accuracy_1 == best_val_accuracy_1:
                if train_accuracy_2 > correspond_train_acc_2:
                    correspond_train_acc_2 = train_accuracy_2
                    correspond_train_acc_1 = train_accuracy_1
                    best_model_wts = copy.deepcopy(model.state_dict())
                elif train_accuracy_2 == correspond_train_acc_2:
                    if train_accuracy_1 > correspond_train_acc_1:
                        correspond_train_acc_1 = train_accuracy_1
                        best_model_wts = copy.deepcopy(model.state_dict())

        record_np[epoch, 0] = train_accuracy_1
        record_np[epoch, 1] = train_accuracy_2
        record_np[epoch, 2] = train_average_loss_1
        record_np[epoch, 3] = train_average_loss_2
        record_np[epoch, 4] = val_accuracy_1
        record_np[epoch, 5] = val_accuracy_2
        record_np[epoch, 6] = val_average_loss_1
        record_np[epoch, 7] = val_average_loss_2

    print('best accuracy_1: {:.4f} cor train accu_1: {:.4f}'.format(
        best_val_accuracy_1, correspond_train_acc_1))
    print('best accuracy_2: {:.4f} cor train accu_2: {:.4f}'.format(
        best_val_accuracy_2, correspond_train_acc_2))

    save_val_1 = int("{:4.0f}".format(best_val_accuracy_1 * 10000))
    save_val_2 = int("{:4.0f}".format(best_val_accuracy_2 * 10000))
    save_train_1 = int("{:4.0f}".format(correspond_train_acc_1 * 10000))
    save_train_2 = int("{:4.0f}".format(correspond_train_acc_2 * 10000))
    public_name = "cnn_gru" \
                  + "_epoch_" + str(epochs) \
                  + "_length_" + str(sequence_length) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_mulopt_" + str(multi_optim) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train1_" + str(save_train_1) \
                  + "_train2_" + str(save_train_2) \
                  + "_val1_" + str(save_val_1) \
                  + "_val2_" + str(save_val_2)
    model_name = public_name + ".pth"
    torch.save(best_model_wts, model_name)
    record_name = public_name + ".npy"
    np.save(record_name, record_np)
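# `get_useful_start_idx` is consumed above but not shown; from its usage it
# must return every frame index at which a full `sequence_length` window fits
# inside a single video. A sketch under that assumption:
def get_useful_start_idx(sequence_length, num_each):
    """num_each: frame count per video, with all videos concatenated."""
    idx = []
    offset = 0
    for count in num_each:
        # only windows that do not cross a video boundary are useful
        idx.extend(range(offset, offset + count - (sequence_length - 1)))
        offset += count
    return idx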
# define model
net = model.attention_net(topN=PROPOSAL_NUM)
if resume:
    ckpt = torch.load(resume)
    net.load_state_dict(ckpt['net_state_dict'])
    start_epoch = ckpt['epoch'] + 1
creterion = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(net.parameters(), lr=LR, momentum=0.9,
                            weight_decay=WD)
# optimizer = torch.optim.Adam(net.parameters(), lr=LR, weight_decay=WD)
schedulers = [MultiStepLR(optimizer, milestones=[60, 100], gamma=0.1)]
net = net.cuda()
net = DataParallel(net)

for epoch in range(start_epoch, 500):
    for scheduler in schedulers:
        scheduler.step()

    # begin training
    _print('--' * 50)
    net.train()
    total_tmp = 0
    raw_tmp = 0
    rank_tmp = 0
    concat_tmp = 0
    partcls_tmp = 0
    for i, data in enumerate(trainloader):
        img, label = data[0].cuda(), data[1].cuda()
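        # --- the snippet ends here in the source; what follows is a
        # hypothetical sketch of the usual NTS-Net loss combination, assuming
        # `model.list_loss` / `model.ranking_loss` helpers and a network that
        # returns (raw_logits, concat_logits, part_logits, _, top_n_prob) ---
        batch_size = img.size(0)
        optimizer.zero_grad()
        raw_logits, concat_logits, part_logits, _, top_n_prob = net(img)
        part_loss = model.list_loss(
            part_logits.view(batch_size * PROPOSAL_NUM, -1),
            label.unsqueeze(1).repeat(1, PROPOSAL_NUM).view(-1)).view(
                batch_size, PROPOSAL_NUM)
        raw_loss = creterion(raw_logits, label)
        concat_loss = creterion(concat_logits, label)
        rank_loss = model.ranking_loss(top_n_prob, part_loss)
        partcls_loss = creterion(
            part_logits.view(batch_size * PROPOSAL_NUM, -1),
            label.unsqueeze(1).repeat(1, PROPOSAL_NUM).view(-1))
        total_loss = raw_loss + rank_loss + concat_loss + partcls_loss
        total_loss.backward()
        optimizer.step()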
print('{} train iters per epoch:'.format(len(trainloader)))

test_dataset = Dataset(opt.test_root, opt.test_pd_root, opt.test_list,
                       phase='test', input_shape=opt.input_shape)
testloader = data.DataLoader(test_dataset,
                             batch_size=opt.train_batch_size,
                             shuffle=False,
                             num_workers=opt.num_workers)

criterion = LossFunction()
embedding_net = Unet_down()
regression_net = Unet_up()
model = AlignmentNet(embedding_net, regression_net)

if opt.finetune:
    model = DataParallel(model)
    # the checkpoint is applied twice: once through the helper and once
    # directly via load_state_dict
    load_model(model, opt.load_model_path)
    model.load_state_dict(torch.load(opt.load_model_path))
    model.to(torch.device("cuda"))
else:
    model.to(device)
    model = DataParallel(model)

if opt.optimizer == 'sgd':
    optimizer = torch.optim.SGD([{'params': model.parameters()}],
                                lr=opt.lr, weight_decay=opt.weight_decay)
else:
    optimizer = torch.optim.Adam([{'params': model.parameters()}],
                                 lr=opt.lr, weight_decay=opt.weight_decay)
scheduler = StepLR(optimizer, step_size=opt.lr_step, gamma=0.1)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str,
                        required=False, help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json', type=str,
                        required=False, help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt',
                        type=str, required=False, help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json',
                        type=str, required=False, help='raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/',
                        type=str, required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true',
                        help='tokenize the raw corpus first')
    parser.add_argument('--epochs', default=5, type=int, required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False,
                        help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int,
                        required=False, help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False,
                        help='report the loss every this many steps')
    parser.add_argument('--stride', default=768, type=int, required=False,
                        help='stride of the training-data window')
    parser.add_argument('--gradient_accumulation', default=1, type=int,
                        required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true',
                        help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str,
                        required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float,
                        required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False,
                        help='how many pieces to split the corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False,
                        help='minimum article length to keep')
    parser.add_argument('--output_dir', default='model/', type=str,
                        required=False, help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str,
                        required=False,
                        help='pretrained model to start training from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/',
                        type=str, required=False,
                        help='TensorBoard log directory')
    parser.add_argument('--segment', action='store_true',
                        help='tokenize Chinese at the word level')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json",
                        type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe",
                        type=str, help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs this process may use

    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path,
                    tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces,
                    full_tokenizer=full_tokenizer,
                    min_length=min_length)
        print('files built')

    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(
            config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                  'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size /
                      gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=lr,
                                           correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i),
                      'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            # slide a window of n_ctx tokens with the configured stride
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point: start_point + n_ctx])
                start_point += stride
            start_point -= stride
            # pad the tail window to n_ctx ids (`last` is not added to
            # `samples` here)
            last = tokens[start_point + n_ctx:]
            last.extend(full_tokenizer.convert_tokens_to_ids(['[PAD]']) *
                        (n_ctx - len(last)))
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last

                # prepare data
                batch = samples[step * batch_size: (step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass
                outputs = model.forward(input_ids=batch_inputs,
                                        labels=batch_labels)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                # optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    overall_step += 1
                if (overall_step + 1) % log_step == 0:
                    tb_writer.add_scalar('loss', loss.item(), overall_step)
                    print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        (step + 1) // gradient_accumulation,
                        piece_num,
                        epoch + 1,
                        running_loss * gradient_accumulation / log_step))
                    running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir +
                                      'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))

        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
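# Example invocation (assuming the script above is saved as train.py);
# --raw tokenizes data/train.json into data/tokenized/ before training:
#
#   python train.py --raw --device 0,1 --epochs 5 --batch_size 8 \
#       --model_config config/model_config_small.json \
#       --tokenizer_path cache/vocab_small.txt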
def main(args: argparse.Namespace):
    logger = CompleteLogger(args.log, args.phase)
    print(args)

    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    cudnn.benchmark = True

    # Data loading code
    train_transform = utils.get_train_transform(args.height, args.width,
                                                args.train_resizing,
                                                random_horizontal_flip=True,
                                                random_color_jitter=False,
                                                random_gray_scale=False,
                                                random_erasing=False)
    val_transform = utils.get_val_transform(args.height, args.width)
    print("train_transform: ", train_transform)
    print("val_transform: ", val_transform)

    working_dir = osp.dirname(osp.abspath(__file__))
    source_root = osp.join(working_dir, args.source_root)
    target_root = osp.join(working_dir, args.target_root)

    # source dataset
    source_dataset = datasets.__dict__[args.source](
        root=osp.join(source_root, args.source.lower()))
    sampler = RandomMultipleGallerySampler(source_dataset.train,
                                           args.num_instances)
    train_source_loader = DataLoader(convert_to_pytorch_dataset(
        source_dataset.train,
        root=source_dataset.images_dir,
        transform=train_transform),
                                     batch_size=args.batch_size,
                                     num_workers=args.workers,
                                     sampler=sampler,
                                     pin_memory=True,
                                     drop_last=True)
    train_source_iter = ForeverDataIterator(train_source_loader)
    val_loader = DataLoader(convert_to_pytorch_dataset(
        list(set(source_dataset.query) | set(source_dataset.gallery)),
        root=source_dataset.images_dir,
        transform=val_transform),
                            batch_size=args.batch_size,
                            num_workers=args.workers,
                            shuffle=False,
                            pin_memory=True)

    # target dataset
    target_dataset = datasets.__dict__[args.target](
        root=osp.join(target_root, args.target.lower()))
    train_target_loader = DataLoader(convert_to_pytorch_dataset(
        target_dataset.train,
        root=target_dataset.images_dir,
        transform=train_transform),
                                     batch_size=args.batch_size,
                                     num_workers=args.workers,
                                     shuffle=True,
                                     pin_memory=True,
                                     drop_last=True)
    train_target_iter = ForeverDataIterator(train_target_loader)
    test_loader = DataLoader(convert_to_pytorch_dataset(
        list(set(target_dataset.query) | set(target_dataset.gallery)),
        root=target_dataset.images_dir,
        transform=val_transform),
                             batch_size=args.batch_size,
                             num_workers=args.workers,
                             shuffle=False,
                             pin_memory=True)

    # create model
    num_classes = source_dataset.num_train_pids
    backbone = utils.get_model(args.arch)
    pool_layer = nn.Identity() if args.no_pool else None
    model = ReIdentifier(backbone,
                         num_classes,
                         finetune=args.finetune,
                         pool_layer=pool_layer).to(device)
    model = DataParallel(model)

    # define optimizer and lr scheduler
    optimizer = Adam(model.module.get_parameters(base_lr=args.lr,
                                                 rate=args.rate),
                     args.lr,
                     weight_decay=args.weight_decay)
    lr_scheduler = WarmupMultiStepLR(optimizer,
                                     args.milestones,
                                     gamma=0.1,
                                     warmup_factor=0.1,
                                     warmup_steps=args.warmup_steps)

    # resume from the best checkpoint
    if args.phase != 'train':
        checkpoint = torch.load(logger.get_checkpoint_path('best'),
                                map_location='cpu')
        model.load_state_dict(checkpoint)

    # analyze the model
    if args.phase == 'analysis':
        # plot t-SNE
        utils.visualize_tsne(source_loader=val_loader,
                             target_loader=test_loader,
                             model=model,
                             filename=osp.join(logger.visualize_directory,
                                               'analysis', 'TSNE.pdf'),
                             device=device)
        # visualize ranked results
        visualize_ranked_results(test_loader,
                                 model,
                                 target_dataset.query,
                                 target_dataset.gallery,
                                 device,
                                 visualize_dir=logger.visualize_directory,
                                 width=args.width,
                                 height=args.height,
                                 rerank=args.rerank)
        return

    if args.phase == 'test':
        print("Test on source domain:")
        validate(val_loader, model, source_dataset.query,
                 source_dataset.gallery, device, cmc_flag=True,
                 rerank=args.rerank)
        print("Test on target domain:")
        validate(test_loader, model, target_dataset.query,
                 target_dataset.gallery, device, cmc_flag=True,
                 rerank=args.rerank)
        return

    # define loss functions
    criterion_ce = CrossEntropyLossWithLabelSmooth(num_classes).to(device)
    criterion_triplet = SoftTripletLoss(margin=args.margin).to(device)

    # start training
    best_val_mAP = 0.
    best_test_mAP = 0.
    for epoch in range(args.epochs):
        # print learning rate
        print(lr_scheduler.get_lr())

        # train for one epoch
        train(train_source_iter, train_target_iter, model, criterion_ce,
              criterion_triplet, optimizer, epoch, args)

        # update learning rate
        lr_scheduler.step()

        if (epoch + 1) % args.eval_step == 0 or (epoch == args.epochs - 1):
            # evaluate on validation set
            print("Validation on source domain...")
            _, val_mAP = validate(val_loader, model, source_dataset.query,
                                  source_dataset.gallery, device,
                                  cmc_flag=True)

            # remember best mAP and save checkpoint
            torch.save(model.state_dict(),
                       logger.get_checkpoint_path('latest'))
            if val_mAP > best_val_mAP:
                shutil.copy(logger.get_checkpoint_path('latest'),
                            logger.get_checkpoint_path('best'))
            best_val_mAP = max(val_mAP, best_val_mAP)

            # evaluate on test set
            print("Test on target domain...")
            _, test_mAP = validate(test_loader, model, target_dataset.query,
                                   target_dataset.gallery, device,
                                   cmc_flag=True, rerank=args.rerank)
            best_test_mAP = max(test_mAP, best_test_mAP)

    # evaluate on test set
    model.load_state_dict(torch.load(logger.get_checkpoint_path('best')))
    print("Test on target domain:")
    _, test_mAP = validate(test_loader, model, target_dataset.query,
                           target_dataset.gallery, device, cmc_flag=True,
                           rerank=args.rerank)
    print("test mAP on target = {}".format(test_mAP))
    print("oracle mAP on target = {}".format(best_test_mAP))
    logger.close()
def __init__(self, model_name, batch_size, gpu_memory):
    super().__init__(batch_size, gpu_memory)
    if model_name in ['pt_vgg', 'pt_resnet', 'pt_inception', 'pt_densenet']:
        # torchvision model zoo weights: standard ImageNet normalization
        model = model_class_dict[model_name](pretrained=True)
        self.mean = np.reshape([0.485, 0.456, 0.406], [1, 3, 1, 1])
        self.std = np.reshape([0.229, 0.224, 0.225], [1, 3, 1, 1])
        model = DataParallel(model.cuda())
    else:
        model = model_class_dict[model_name]()
        if model_name in ['pt_post_avg_cifar10', 'pt_post_avg_imagenet']:
            # checkpoint = torch.load(model_path_dict[model_name])
            self.mean = np.reshape([0.485, 0.456, 0.406], [1, 3, 1, 1])
            self.std = np.reshape([0.229, 0.224, 0.225], [1, 3, 1, 1])
        else:
            model = DataParallel(model).cuda()
            checkpoint = torch.load(model_path_dict[model_name] + '.pth')
            self.mean = np.reshape([0.485, 0.456, 0.406], [1, 3, 1, 1])
            self.std = np.reshape([0.225, 0.225, 0.225], [1, 3, 1, 1])
            model.load_state_dict(checkpoint)
            model.float()
    self.mean, self.std = self.mean.astype(np.float32), self.std.astype(
        np.float32)
    model.eval()
    self.model = model
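# How `self.mean` / `self.std` are consumed is not shown in this snippet; a
# plausible (assumed) inference helper would normalize a float32 NCHW numpy
# batch before the wrapped forward pass:
def predict(self, x):
    x = (x - self.mean) / self.std  # broadcasts over N, H, W
    with torch.no_grad():
        logits = self.model(torch.from_numpy(x).cuda())
    return logits.cpu().numpy()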
# training parameters
BATCH_SIZE = 100
LR = 0.0002
EPOCHS = 20

# data_loader
IMG_SIZE = 32

'''
Build the generator and discriminator networks
'''
Net_G = Generator(depth=128)
Net_D = Discriminator(depth=128)
Net_G.weight_init(mean=0.0, std=0.02)
Net_D.weight_init(mean=0.0, std=0.02)
Net_G = DataParallel(Net_G)
Net_D = DataParallel(Net_D)
if GPU_NUMS > 1:
    Net_G.cuda()
    Net_D.cuda()

'''
Load the data and preprocess it
'''
transform = Compose([
    Scale(IMG_SIZE),
    ToTensor(),
    Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
])
train_loader = torch.utils.data.DataLoader(
    # MNIST('data', train=True, download=True, transform=transform),
    MNISTDataSet('../ganData/mnist.npz', train=True, transform=transform),
    batch_size=BATCH_SIZE,
    shuffle=True)  # batch_size/shuffle assumed to close the truncated call
def main():
    global args
    args = parser.parse_args()
    bestLoss = 1000
    torch.manual_seed(0)
    torch.cuda.set_device(0)

    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir

    if args.resume:
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(save_dir + 'detector_' + args.resume)
        start_epoch = checkpoint['epoch']
        net.load_state_dict(checkpoint['state_dict'])

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    if args.test != 1:
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))

    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    net = DataParallel(net)

    datadir = config_training['preprocess_result_path']
    luna_data = np.load(
        '/home/jiancong/LungNodule_DL/detector/luna_folds/luna_fold6.npy')
    luna_train = luna_data[1]
    luna_test = luna_data[0]

    if args.test == 1:
        print("start test")
        margin = 32
        sidelen = 144
        split_comber = SplitComb(sidelen, config['max_stride'],
                                 config['stride'], margin,
                                 config['pad_value'])
        dataset = LungNodule3Ddetector(datadir,
                                       luna_test,
                                       config,
                                       phase='test',
                                       split_comber=split_comber)
        test_loader = DataLoader(dataset,
                                 batch_size=1,
                                 shuffle=False,
                                 num_workers=args.workers,
                                 collate_fn=collate,
                                 pin_memory=False)
        test(test_loader, net, get_pbb, save_dir, config)
        return

    dataset = LungNodule3Ddetector(datadir, luna_train, config, phase='train')
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)
    dataset = LungNodule3Ddetector(datadir, luna_test, config, phase='val')
    val_loader = DataLoader(dataset,
                            batch_size=16,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

    optimizer = torch.optim.SGD(net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)

    def get_lr(epoch):
        # piecewise-constant decay over the training run
        if epoch <= args.epochs * 0.2:
            lr = args.lr
        elif epoch <= args.epochs * 0.4:
            lr = 0.1 * args.lr
        elif epoch <= args.epochs * 0.6:
            lr = 0.05 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    for epoch in range(start_epoch, args.epochs + 1):
        train(train_loader, net, loss, epoch, optimizer, get_lr, save_dir)
        print("finished epoch {}".format(epoch))
        valiloss = validate(val_loader, net, loss)
        if bestLoss > valiloss:
            bestLoss = valiloss
            state_dict = net.module.state_dict()
            for key in state_dict.keys():
                state_dict[key] = state_dict[key].cpu()
            torch.save(
                {
                    'epoch': epoch + 1,
                    'save_dir': save_dir,
                    'state_dict': state_dict,
                    'args': args
                }, os.path.join(save_dir, 'detector_%03d.ckpt' % epoch))
            print("save model on epoch %d" % epoch)
for epoch in range(o.epoch):
    for i in tqdm(d):
        g, y, k, s = [x.to(o.device) for x in i]
        x = y
        optimizer.zero_grad()
        out = m(x)
        log("out", out)
        loss = npsnr(out, g)
        loss.backward()
        optimizer.step()
        losss.append(loss.detach().item())
        assert not isnan(losss[-1])
        print("stage", stage, "epoch", epoch + 1)
        log("loss", mean(losss[-5:]))
        num += 1
        # if num > (o.epoch * iter_num - 4):
        if num % 50 == 1:
            show(
                torch.cat((y[0, 0], g[0, 0], out[0, 0]), 1),
                # save=f"save/{stage:02}{epoch:02}.png",
            )
        plt.clf()
        plt.plot(range(len(losss)), losss)
        plt.xlabel("batch")
        plt.ylabel("loss")
        plt.title(f"{iter_num} iter x {o.epoch} epoch")
        plt.savefig(f"save/{stage:02}loss.png")

m = DataParallel(M()).to(o.device)
train(m)
def main():
    args = parser.parse_args()

    log_out_dir = os.path.join(RESULT_DIR, 'logs', args.out_dir,
                               'fold%d' % args.fold)
    if not os.path.exists(log_out_dir):
        os.makedirs(log_out_dir)
    log = Logger()
    log.open(os.path.join(log_out_dir, 'log.train.txt'), mode='a')

    model_out_dir = os.path.join(RESULT_DIR, 'models', args.out_dir,
                                 'fold%d' % args.fold)
    log.write(">> Creating directory if it does not exist:\n>> '{}'\n".format(
        model_out_dir))
    if not os.path.exists(model_out_dir):
        os.makedirs(model_out_dir)

    # set cuda visible device
    if not args.all_gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    cudnn.benchmark = True
    # cudnn.enabled = False

    # set random seeds
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    np.random.seed(0)

    model_params = {}
    model_params['architecture'] = args.arch
    model_params['num_classes'] = args.num_classes
    model_params['in_channels'] = args.in_channels
    if 'efficientnet' in args.arch:
        model_params['image_size'] = args.img_size
        model_params['encoder'] = args.effnet_encoder
    model = init_network(model_params)

    if args.load_state_dict_path is not None:
        if args.load_state_dict_path == 'use-img-level-densenet-ckpt':
            model_dir = '../output/models/densenet121_1024_all_data__obvious_neg__gradaccum_20__start_lr_3e6'
            pretrained_ckpt_path = os.path.join(f'{model_dir}',
                                                f'fold{args.fold}',
                                                'final.pth')
        else:
            pretrained_ckpt_path = args.load_state_dict_path
        init_pretrained = torch.load(pretrained_ckpt_path)
        model.load_state_dict(init_pretrained['state_dict'])

    if args.all_gpus:
        model = DataParallel(model)
    model.cuda()

    # define loss function (criterion)
    try:
        criterion = eval(args.loss)().cuda()
    except:
        raise RuntimeError("Loss {} not available!".format(args.loss))

    start_epoch = 0
    best_loss = 1e5
    best_epoch = 0
    best_focal = float('inf')

    # define scheduler
    try:
        scheduler = eval(args.scheduler)(
            scheduler_lr_multiplier=args.scheduler_lr_multiplier,
            scheduler_epoch_offset=args.scheduler_epoch_offset)
    except:
        raise RuntimeError("Scheduler {} not available!".format(
            args.scheduler))
    optimizer = scheduler.schedule(model, start_epoch, args.epochs)[0]

    # optionally resume from a checkpoint
    if args.resume:
        args.resume = os.path.join(model_out_dir, args.resume)
        if os.path.isfile(args.resume):
            # load checkpoint weights and update model and optimizer
            log.write(">> Loading checkpoint:\n>> '{}'\n".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            best_epoch = checkpoint['best_epoch']
            best_focal = checkpoint['best_map']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer_fpath = args.resume.replace('.pth', '_optim.pth')
            if os.path.exists(optimizer_fpath):
                log.write(">> Loading checkpoint:\n>> '{}'\n".format(
                    optimizer_fpath))
                optimizer.load_state_dict(
                    torch.load(optimizer_fpath)['optimizer'])
            log.write(">>>> loaded checkpoint:\n>>>> '{}' (epoch {})\n".format(
                args.resume, checkpoint['epoch']))
        else:
            log.write(">> No checkpoint found at '{}'\n".format(args.resume))

    # Data loading code
    train_transform = train_multi_augment2
    with open('../input/imagelevel_folds_obvious_staining_5.pkl', 'rb') as f:
        folds = pickle.load(f)
    fold = args.fold
    trn_img_paths, val_img_paths = folds[fold]

    train_df = get_train_df_ohe(clean_from_duplicates=True)
    basepath_2_ohe_vector = {
        img: vec
        for img, vec in zip(train_df['img_base_path'],
                            train_df.iloc[:, 2:].values)
    }
    public_hpa_df_17 = get_public_df_ohe(clean_from_duplicates=True)
    public_basepath_2_ohe_vector = {
        img_path: vec
        for img_path, vec in zip(public_hpa_df_17['img_base_path'],
                                 public_hpa_df_17.iloc[:, 2:].values)
    }
    basepath_2_ohe_vector.update(public_basepath_2_ohe_vector)

    available_paths = set(
        np.concatenate((train_df['img_base_path'].values,
                        public_hpa_df_17['img_base_path'].values)))
    trn_img_paths = [path for path in trn_img_paths
                     if path in available_paths]
    val_img_paths = [path for path in val_img_paths
                     if path in available_paths]

    labels_df = pd.read_hdf(args.cell_level_labels_path)

    # modifying minor class labels
    cherrypicked_mitotic_spindle = pd.read_csv(
        '../input/mitotic_cells_selection.csv')
    cherrypicked_mitotic_spindle_img_cell = set(
        cherrypicked_mitotic_spindle[['ID', 'cell_i']].apply(tuple,
                                                             axis=1).values)
    cherrypicked_mitotic_spindle_img_cell = {
        (img, cell_i - 1)
        for img, cell_i in cherrypicked_mitotic_spindle_img_cell
    }

    class_names = get_class_names()
    mitotic_spindle_class_i = class_names.index('Mitotic spindle')
    if args.include_nn_mitotic:
        cherrypicked_mitotic_spindle_based_on_nn = pd.read_csv(
            '../input/mitotic_pos_nn_added.csv')
        cherrypicked_mitotic_spindle_img_cell.update(
            set(cherrypicked_mitotic_spindle_based_on_nn[[
                'ID', 'cell_i'
            ]].apply(tuple, axis=1).values))
    print('len cherrypicked_mitotic_spindle_img_cell',
          len(cherrypicked_mitotic_spindle_img_cell))

    mitotic_bool_idx = labels_df.index.isin(
        cherrypicked_mitotic_spindle_img_cell)

    def modify_label(labels, idx, val):
        labels[idx] = val
        return labels

    labels_df.loc[mitotic_bool_idx, 'image_level_pred'] = labels_df.loc[
        mitotic_bool_idx, 'image_level_pred'].map(
            lambda x: modify_label(x, mitotic_spindle_class_i, 1))

    if args.include_nn_mitotic:
        cherrypicked_not_mitotic_spindle_based_on_nn = pd.read_csv(
            '../input/mitotic_neg_nn_added.csv')
        cherrypicked_not_mitotic_spindle_based_on_nn = set(
            cherrypicked_not_mitotic_spindle_based_on_nn[[
                'ID', 'cell_i'
            ]].apply(tuple, axis=1).values)
        not_mitotic_bool_idx = labels_df.index.isin(
            cherrypicked_not_mitotic_spindle_based_on_nn)
        labels_df.loc[not_mitotic_bool_idx,
                      'image_level_pred'] = labels_df.loc[
                          not_mitotic_bool_idx, 'image_level_pred'].map(
                              lambda x: modify_label(
                                  x, mitotic_spindle_class_i, 0))

    if args.ignore_negative:
        raise NotImplementedError

    if args.upsample_minorities:
        cells_to_upsample = list(cherrypicked_mitotic_spindle_img_cell)
        aggresome_class_i = class_names.index('Aggresome')
        confident_aggresome_indices = list(
            labels_df.index[labels_df['image_level_pred'].map(
                lambda x: x[aggresome_class_i] > 0.9)])
        print('confident_aggresome_indices len',
              len(confident_aggresome_indices))
        print('confident_aggresome_indices[:5]',
              confident_aggresome_indices[:5])
        cells_to_upsample += confident_aggresome_indices
    else:
        cells_to_upsample = None

    train_dataset = ProteinDatasetCellSeparateLoading(
        trn_img_paths,
        labels_df=labels_df,
        cells_to_upsample=cells_to_upsample,
        img_size=args.img_size,
        in_channels=args.in_channels,
        transform=train_transform,
        basepath_2_ohe=basepath_2_ohe_vector,
        normalize=args.normalize,
        target_raw_img_size=args.target_raw_img_size)
    train_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=args.batch_size,
        drop_last=False,
        num_workers=args.workers,
        pin_memory=True,
    )
    # valid_dataset = ProteinDatasetCellLevel(val_img_paths,
    #                                         labels_df=labels_df,
    #                                         img_size=args.img_size,
    #                                         batch_size=64,
    #                                         is_trainset=True,
    #                                         in_channels=args.in_channels)
    valid_dataset = ProteinDatasetCellSeparateLoading(
        val_img_paths,
        labels_df=labels_df,
        img_size=args.img_size,
        in_channels=args.in_channels,
        basepath_2_ohe=basepath_2_ohe_vector,
        normalize=args.normalize,
        target_raw_img_size=args.target_raw_img_size)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=args.batch_size,
                              drop_last=False,
                              num_workers=args.workers,
                              pin_memory=True)

    log.write('** start training here! **\n')
    log.write('\n')
    log.write(
        'epoch iter rate | train_loss/acc | valid_loss/acc/map/focal |best_epoch/best_focal| min \n')
    log.write(
        '-----------------------------------------------------------------------------------------------------------------\n')

    start_epoch += 1
    if args.eval_at_start:
        with torch.no_grad():
            valid_loss, valid_acc, val_focal, val_map_score = validate(
                valid_loader, model, criterion, -1, log)
        print('\r', end='', flush=True)
        log.write(
            '%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.1f | %6.4f %6.4f | %3.1f min \n' %
            (-1, -1, -1, -1, -1, valid_loss, valid_acc, val_map_score,
             val_focal, best_epoch, best_focal, -1))

    for epoch in range(start_epoch, args.epochs + 1):
        end = time.time()

        # set manual seeds per epoch
        np.random.seed(epoch)
        torch.manual_seed(epoch)
        torch.cuda.manual_seed_all(epoch)

        # adjust learning rate for each epoch
        lr_list = scheduler.step(model, epoch, args.epochs)
        lr = lr_list[0]

        # train for one epoch on train set
        iter, train_loss, train_acc = train(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            clipnorm=args.clipnorm,
            lr=lr,
            agg_steps=args.gradient_accumulation_steps)
        with torch.no_grad():
            valid_loss, valid_acc, val_focal, val_map_score = validate(
                valid_loader, model, criterion, epoch, log)

        # remember the best focal score (the model-selection metric) and
        # save a checkpoint
        is_best = val_focal < best_focal
        best_loss = min(valid_loss, best_loss)
        best_epoch = epoch if is_best else best_epoch
        best_focal = val_focal if is_best else best_focal

        print('\r', end='', flush=True)
        log.write(
            '%5.1f %5d %0.6f | %0.4f %0.4f | %0.4f %6.4f %6.4f %6.1f | %6.4f %6.4f | %3.1f min \n' %
            (epoch, iter + 1, lr, train_loss, train_acc, valid_loss,
             valid_acc, val_map_score, val_focal, best_epoch, best_focal,
             (time.time() - end) / 60))

        save_model(model,
                   is_best,
                   model_out_dir,
                   optimizer=optimizer,
                   epoch=epoch,
                   best_epoch=best_epoch,
                   best_map=best_focal)
def train(args):
    print('start training...')
    model, model_file = create_model(args)

    train_loader, val_loader = get_train_val_loaders(
        batch_size=args.train_batch_size,
        val_batch_size=args.val_batch_size)
    frame_loader, _ = get_frame_train_loader(batch_size=args.frame_batch_size)

    # model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
    if args.optim == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr,
                               weight_decay=0.0001)
    elif args.optim == 'RAdam':
        optimizer = RAdam(model.parameters(), lr=args.lr, weight_decay=0.0001)
    else:
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9,
                              weight_decay=0.0001)

    if args.lrs == 'plateau':
        lr_scheduler = ReduceLROnPlateau(optimizer, mode='max',
                                         factor=args.factor,
                                         patience=args.patience,
                                         min_lr=args.min_lr)
    else:
        lr_scheduler = CosineAnnealingLR(optimizer, args.t_max,
                                         eta_min=args.min_lr)

    model = model.cuda()
    if torch.cuda.device_count() > 1:
        # DataParallel hides custom attributes, so restore the name
        model_name = model.name
        model = DataParallel(model)
        model.name = model_name
    # model = model.train()

    best_f2 = 0.
    best_key = 'top1'

    print('epoch | lr | % | loss | avg | loss | top1 | top10 | best | time | save |')

    if not args.no_first_val:
        val_metrics = validate(args, model, val_loader)
        print('val | | | | | {:.4f} | {:.4f} | {:.4f} | {:.4f} | | |'.format(
            val_metrics['valid_loss'], val_metrics['top1'],
            val_metrics['top10'], val_metrics[best_key]))
        best_f2 = val_metrics[best_key]

    if args.val:
        return

    model.train()

    if args.lrs == 'plateau':
        lr_scheduler.step(best_f2)
    else:
        lr_scheduler.step()

    # for epoch in range(args.start_epoch, args.num_epochs):
    def get_batch(loader, iterator=None, epoch=0, batch_idx=0):
        ret_epoch = epoch
        ret_batch_idx = batch_idx + 1
        if iterator is None:
            iterator = loader.__iter__()
        try:
            b = iterator.__next__()
        except StopIteration:
            # restart the exhausted loader: advance the epoch counter and
            # reset the batch index for the new epoch
            iterator = loader.__iter__()
            b = iterator.__next__()
            ret_epoch += 1
            ret_batch_idx = 0
        return b, iterator, ret_epoch, ret_batch_idx

    frame_epoch = args.start_epoch
    train_epoch = 0
    frame_iter = frame_loader.__iter__()
    train_iter = train_loader.__iter__()
    train_step = 0
    frame_batch_idx = -1
    train_batch_idx = -1

    while frame_epoch <= args.num_epochs:
        frame_loss = 0.
        train_loss = 0.
        current_lr = get_lrs(optimizer)
        bg = time.time()

        def train_batch(rgb, audio, labels):
            output = model(rgb, audio)
            loss = criterion(output, labels)
            batch_size = rgb.size(0)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            return loss.item()

        # alternate: 200 frame-level batches, then 100 video-level batches
        for i in range(200):
            batch, frame_iter, frame_epoch, frame_batch_idx = get_batch(
                frame_loader, frame_iter, frame_epoch, frame_batch_idx)
            rgb, audio, labels = batch[0].cuda(), batch[2].cuda(), batch[4].cuda()
            loss_val = train_batch(rgb, audio, labels)
            frame_loss += loss_val
            print('\r F{:4d} | {:.7f} | {:06d}/{} | {:.4f} | {:.4f} |'.format(
                frame_epoch, float(current_lr[0]),
                args.frame_batch_size * (frame_batch_idx + 1),
                frame_loader.num, loss_val, frame_loss / (i + 1)), end='')
        print('')

        for i in range(100):
            batch, train_iter, train_epoch, train_batch_idx = get_batch(
                train_loader, train_iter, train_epoch, train_batch_idx)
            rgb, audio, labels = [x.cuda() for x in batch]
            loss_val = train_batch(rgb, audio, labels)
            train_loss += loss_val
            print('\r T{:4d} | {:.7f} | {:06d}/{} | {:.4f} | {:.4f} |'.format(
                train_epoch, float(current_lr[0]),
                args.train_batch_size * (train_batch_idx + 1),
                train_loader.num, loss_val, train_loss / (i + 1)), end='')

        if train_step > 0 and train_step % args.iter_val == 0:
            if isinstance(model, DataParallel):
                torch.save(model.module.state_dict(), model_file + '_latest')
            else:
                torch.save(model.state_dict(), model_file + '_latest')

            val_metrics = validate(args, model, val_loader)

            _save_ckp = ''
            if args.always_save or val_metrics[best_key] > best_f2:
                best_f2 = val_metrics[best_key]
                if isinstance(model, DataParallel):
                    torch.save(model.module.state_dict(), model_file)
                else:
                    torch.save(model.state_dict(), model_file)
                _save_ckp = '*'
            print(' {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.2f} | {:4s} |'.format(
                val_metrics['valid_loss'], val_metrics['top1'],
                val_metrics['top10'], best_f2,
                (time.time() - bg) / 60, _save_ckp))
            model.train()

            if args.lrs == 'plateau':
                lr_scheduler.step(best_f2)
            else:
                lr_scheduler.step()
            current_lr = get_lrs(optimizer)

        train_step += 1
def main():
    global args
    args = parser.parse_args()
    torch.manual_seed(0)
    torch.cuda.set_device(0)

    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir

    if args.resume:
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results', save_dir)
        net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    if args.test != 1:
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))

    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    net = DataParallel(net)
    datadir = config_training['preprocess_result_path']

    if args.test == 1:
        margin = 32
        sidelen = 144
        split_comber = SplitComb(sidelen, config['max_stride'],
                                 config['stride'], margin,
                                 config['pad_value'])
        dataset = data.DataBowl3Detector(datadir,
                                         'full.npy',
                                         config,
                                         phase='test',
                                         split_comber=split_comber)
        test_loader = DataLoader(dataset,
                                 batch_size=1,
                                 shuffle=False,
                                 num_workers=args.workers,
                                 collate_fn=data.collate,
                                 pin_memory=False)
        test(test_loader, net, get_pbb, save_dir, config)
        return
    # net = DataParallel(net)

    dataset = data.DataBowl3Detector(datadir,
                                     'kaggleluna_full.npy',
                                     config,
                                     phase='train')
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)
    dataset = data.DataBowl3Detector(datadir,
                                     'valsplit.npy',
                                     config,
                                     phase='val')
    val_loader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

    optimizer = torch.optim.SGD(net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)

    def get_lr(epoch):
        if epoch <= args.epochs * 0.5:
            lr = args.lr
        elif epoch <= args.epochs * 0.8:
            lr = 0.1 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    for epoch in range(start_epoch, args.epochs + 1):
        train(train_loader, net, loss, epoch, optimizer, get_lr,
              args.save_freq, save_dir)
        validate(val_loader, net, loss)
def train_linear(epochs, batch_size, dev_ids, learning_rate=0.001,
                 save_file=None, show_batch=True):
    # get data and dataloader
    voice_data = VoiceData()
    dataloader = DataLoader(voice_data, shuffle=True, batch_size=batch_size)

    # get device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # declare model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        model = LinMod().to(device)
        model = DataParallel(model, device_ids=dev_ids)
    else:
        model = LinMod()

    # training mode
    model.train()

    # declare training methodology
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    track_epoch_time = []
    for e in range(epochs):
        start = time.time()
        print("Starting Epoch", e)
        # loop over minibatches
        for i, (x, y) in enumerate(dataloader):
            # move to device
            x = x.to(device)
            y = y.to(device)
            # zero gradient
            optimizer.zero_grad()
            # forward
            y_ = model(x)
            loss = criterion(y_, y)
            loss.backward()
            optimizer.step()
            if show_batch:
                print("batch {}/{} \t loss: {}".format(
                    i, len(dataloader), float(loss)), end='\r')
        epoch_time = time.time() - start
        print("\nEpoch-time: ", epoch_time)
        track_epoch_time.append(epoch_time)

    # save file
    if save_file:
        torch.save(model, save_file)

    # drop the first (warm-up) epoch before averaging; guard against a
    # single-epoch run so the division cannot fail
    track_epoch_time = track_epoch_time[1:]
    total_epoch_time = sum(track_epoch_time)
    avg_epoch_time = total_epoch_time / max(len(track_epoch_time), 1)
    return {'GPUs': len(dev_ids), 'batch_size': batch_size,
            'epoch_time': avg_epoch_time}
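
# Hypothetical invocation of train_linear() for a two-GPU throughput
# benchmark; the argument values are illustrative only (LinMod and VoiceData
# are defined elsewhere in the project):
if __name__ == '__main__':
    stats = train_linear(epochs=3, batch_size=256, dev_ids=[0, 1],
                         learning_rate=1e-3, save_file='linmod.pt')
    print(stats)  # e.g. {'GPUs': 2, 'batch_size': 256, 'epoch_time': ...}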
class MSG_GAN:
    """ Unconditional TeacherGAN

        args:
            depth: depth of the GAN (will be used for each generator and
                   discriminator)
            latent_size: latent size of the manifold used by the GAN
            use_eql: whether to use the equalized learning rate
            use_ema: whether to use exponential moving averages.
            ema_decay: value of ema decay. Used only if use_ema is True
            device: device to run the GAN on (GPU / CPU)
    """

    def __init__(self, depth=7, latent_size=512, use_eql=True, use_ema=True,
                 ema_decay=0.999, th_low=0.45, th_high=0.8,
                 dis_optimize_always=False, device=th.device("cpu")):
        """ constructor for the class """
        from torch.nn import DataParallel

        self.gen = Generator(depth, latent_size, use_eql=use_eql).to(device)

        # Parallelize them if required:
        if device == th.device("cuda"):
            self.gen = DataParallel(self.gen)
            self.dis = Discriminator(depth, latent_size, use_eql=use_eql,
                                     gpu_parallelize=True).to(device)
        else:
            self.dis = Discriminator(depth, latent_size,
                                     use_eql=use_eql).to(device)

        # state of the object
        self.use_ema = use_ema
        self.ema_decay = ema_decay
        self.th_low = th_low
        self.th_high = th_high
        self.dis_optimize_always = dis_optimize_always
        self.use_eql = use_eql
        self.latent_size = latent_size
        self.depth = depth
        self.device = device

        if self.use_ema:
            from MSG_GAN.CustomLayers import update_average

            # create a shadow copy of the generator
            self.gen_shadow = copy.deepcopy(self.gen)

            # updater function:
            self.ema_updater = update_average

            # initialize the gen_shadow weights equal to the weights of gen
            self.ema_updater(self.gen_shadow, self.gen, beta=0)

        # by default the generator and discriminator are in eval mode
        self.gen.eval()
        self.dis.eval()
        if self.use_ema:
            self.gen_shadow.eval()

    def generate_samples(self, num_samples):
        """ generate samples using this gan
            :param num_samples: number of samples to be generated
            :return: generated samples tensor: list[ Tensor(B x H x W x C)]
        """
        noise = th.randn(num_samples, self.latent_size).to(self.device)
        generated_images = self.gen(noise)

        # reshape the generated images
        generated_images = list(
            map(lambda x: (x.detach().permute(0, 2, 3, 1) / 2) + 0.5,
                generated_images))

        return generated_images

    def optimize_discriminator(self, dis_optim, noise, real_batch, loss_fn,
                               gen_loss):
        """ performs one step of weight update on discriminator using the
            batch of data
            :param dis_optim: discriminator optimizer
            :param noise: input noise of sample generation
            :param real_batch: real samples batch; should contain a list of
                               tensors at different scales
            :param loss_fn: loss function to be used (object of GANLoss)
            :param gen_loss: generator loss from the previous step, used by
                             the balancing heuristic below
            :return: current loss
        """
        # generate a batch of samples
        fake_samples = self.gen(noise)
        fake_samples = list(map(lambda x: x.detach(), fake_samples))

        loss = loss_fn.dis_loss(real_batch, fake_samples)
        dis_loss = loss.item()

        # optimize discriminator
        # From http://blog.otoro.net/2016/04/01/generating-large-images-from-latent-vectors
        # "...calculate D's loss function first, and only perform gradient
        # descent on D if G's loss function is less than some upper bound
        # (so it is relatively not that weak against D in the first place),
        # and also if D's loss function is greater than some lower bound
        # (so that it is not relatively that strong versus G). We have tried
        # to use an upper bound of 0.80 and a lower bound of 0.45."
        if (gen_loss < self.th_high and dis_loss > self.th_low) \
                or self.dis_optimize_always:
            dis_optim.zero_grad()
            loss.backward()
            dis_optim.step()

        return loss.item()

    def optimize_generator(self, gen_optim, noise, real_batch, loss_fn):
        """ performs one step of weight update on generator using the batch
            of data
            :param gen_optim: generator optimizer
            :param noise: input noise of sample generation
            :param real_batch: real samples batch; should contain a list of
                               tensors at different scales
            :param loss_fn: loss function to be used (object of GANLoss)
            :return: current loss
        """
        # generate a batch of samples
        fake_samples = self.gen(noise)

        loss = loss_fn.gen_loss(real_batch, fake_samples)

        # optimize generator
        gen_optim.zero_grad()
        loss.backward()
        gen_optim.step()

        # if self.use_ema is true, apply the moving average here:
        if self.use_ema:
            self.ema_updater(self.gen_shadow, self.gen, self.ema_decay)

        return loss.item()

    def create_grid(self, samples, img_files):
        """ utility function to create a grid of GAN samples
            :param samples: generated samples for storing list[Tensors]
            :param img_files: list of names of files to write
            :return: None (saves multiple files)
        """
        from torchvision.utils import save_image
        from torch.nn.functional import interpolate
        from numpy import sqrt, power

        # dynamically adjust the colour of the images
        samples = [Generator.adjust_dynamic_range(sample)
                   for sample in samples]

        # resize the samples to have same resolution:
        for i in range(len(samples)):
            samples[i] = interpolate(
                samples[i], scale_factor=power(2, self.depth - 1 - i))

        # save the images:
        for sample, img_file in zip(samples, img_files):
            save_image(sample, img_file,
                       nrow=int(sqrt(sample.shape[0])),
                       normalize=True, scale_each=True, padding=0)

    def train(self, data, gen_optim, dis_optim, loss_fn,
              normalize_latents=True, start=1, num_epochs=12,
              feedback_factor=10, checkpoint_factor=1, data_percentage=100,
              num_samples=36, log_dir=None, sample_dir="./samples",
              save_dir="./models", save_real=False):
        """ Method for training the network
            :param data: pytorch dataloader which iterates over images
            :param gen_optim: Optimizer for generator.
                              please wrap this inside a Scheduler if you want to
            :param dis_optim: Optimizer for discriminator.
                              please wrap this inside a Scheduler if you want to
            :param loss_fn: Object of GANLoss
            :param normalize_latents: whether to normalize the latent vectors
                                      during training
            :param start: starting epoch number
            :param num_epochs: total number of epochs to run for (ending epoch
                               number; note this is absolute, not relative to
                               start)
            :param feedback_factor: number of logs generated and samples
                                    generated during training per epoch
            :param checkpoint_factor: save model after these many epochs
            :param data_percentage: amount of data to be used
            :param num_samples: number of samples to be drawn for feedback grid
            :param log_dir: path to directory for saving the loss.log file
            :param sample_dir: path to directory for saving generated samples' grids
            :param save_dir: path to directory for saving the trained models
            :return: None (writes multiple files to disk)
        """
        from torch.nn.functional import avg_pool2d

        # turn the generator and discriminator into train mode
        self.gen.train()
        self.dis.train()

        assert isinstance(gen_optim, th.optim.Optimizer), \
            "gen_optim is not an Optimizer"
        assert isinstance(dis_optim, th.optim.Optimizer), \
            "dis_optim is not an Optimizer"

        print("Starting the training process ... ")

        # create fixed_input for debugging
        fixed_input = th.randn(num_samples, self.latent_size).to(self.device)
        if normalize_latents:
            fixed_input = (fixed_input
                           / fixed_input.norm(dim=-1, keepdim=True)
                           * (self.latent_size ** 0.5))

        # create a global time counter
        global_time = time.time()
        global_step = 0

        # See http://blog.otoro.net/2016/04/01/generating-large-images-from-latent-vectors/
        # and comments in optimize_discriminator() above
        gen_loss = 0

        for epoch in range(start, num_epochs + 1):
            start_time = timeit.default_timer()  # record time at the start of epoch
            print("\nEpoch: %d" % epoch)
            total_batches = len(iter(data))
            limit = int((data_percentage / 100) * total_batches)

            for (i, batch) in enumerate(data, 1):
                # extract current batch of data for training
                images = batch.to(self.device)
                extracted_batch_size = images.shape[0]

                # create a list of downsampled images from the real images:
                images = [images] + [avg_pool2d(images, int(np.power(2, i)))
                                     for i in range(1, self.depth)]
                images = list(reversed(images))

                # sample some random latent points
                gan_input = th.randn(extracted_batch_size,
                                     self.latent_size).to(self.device)

                # normalize them if asked
                if normalize_latents:
                    gan_input = (gan_input
                                 / gan_input.norm(dim=-1, keepdim=True)
                                 * (self.latent_size ** 0.5))

                # optimize the discriminator:
                dis_loss = self.optimize_discriminator(dis_optim, gan_input,
                                                       images, loss_fn,
                                                       gen_loss)

                # optimize the generator:
                gen_loss = self.optimize_generator(gen_optim, gan_input,
                                                   images, loss_fn)

                # provide a loss feedback
                # (the "+ 1" avoids a div-by-zero error on small training sets)
                if i % (int(limit / feedback_factor) + 1) == 0 or i == 1:
                    elapsed = time.time() - global_time
                    elapsed = str(datetime.timedelta(seconds=elapsed))
                    print("Elapsed [%s] batch: %d d_loss: %f g_loss: %f"
                          % (elapsed, i, dis_loss, gen_loss))

                    # also write the losses to the log file:
                    if log_dir is not None:
                        log_file = os.path.join(log_dir, "loss.log")
                        os.makedirs(os.path.dirname(log_file), exist_ok=True)
                        with open(log_file, "a") as log:
                            log.write(str(global_step) + "\t"
                                      + str(dis_loss) + "\t"
                                      + str(gen_loss) + "\n")

                    # create a grid of samples and save it
                    reses = [str(int(np.power(2, dep))) + "_x_"
                             + str(int(np.power(2, dep)))
                             for dep in range(2, self.depth + 2)]
                    gen_img_files = [
                        os.path.join(sample_dir, res,
                                     "gen_" + str(epoch) + "_"
                                     + str(i) + ".png")
                        for res in reses]

                    # make sure all the required directories exist,
                    # otherwise make them
                    os.makedirs(sample_dir, exist_ok=True)
                    for gen_img_file in gen_img_files:
                        os.makedirs(os.path.dirname(gen_img_file),
                                    exist_ok=True)

                    dis_optim.zero_grad()
                    gen_optim.zero_grad()
                    with th.no_grad():
                        self.create_grid(
                            self.gen(fixed_input) if not self.use_ema
                            else self.gen_shadow(fixed_input),
                            gen_img_files)

                    # create a grid of real images and save it
                    if save_real:
                        real_img_files = [
                            os.path.join(sample_dir, res,
                                         "real_" + str(epoch) + "_"
                                         + str(i) + ".png")
                            for res in reses]

                        # make sure all the required directories exist,
                        # otherwise make them
                        os.makedirs(sample_dir, exist_ok=True)
                        for real_img_file in real_img_files:
                            os.makedirs(os.path.dirname(real_img_file),
                                        exist_ok=True)

                        self.create_grid(images, real_img_files)

                # increment the global_step:
                global_step += 1

                if i > limit:
                    break

            # calculate the time required for the epoch
            stop_time = timeit.default_timer()
            print("Time taken for epoch: %.3f secs" % (stop_time - start_time))

            if (epoch % checkpoint_factor == 0 or epoch == 1
                    or epoch == num_epochs):
                os.makedirs(save_dir, exist_ok=True)
                gen_save_file = os.path.join(
                    save_dir, "GAN_GEN_" + str(epoch) + ".pth")
                dis_save_file = os.path.join(
                    save_dir, "GAN_DIS_" + str(epoch) + ".pth")
                gen_optim_save_file = os.path.join(
                    save_dir, "GAN_GEN_OPTIM_" + str(epoch) + ".pth")
                dis_optim_save_file = os.path.join(
                    save_dir, "GAN_DIS_OPTIM_" + str(epoch) + ".pth")

                th.save(self.gen.state_dict(), gen_save_file)
                th.save(self.dis.state_dict(), dis_save_file)
                th.save(gen_optim.state_dict(), gen_optim_save_file)
                th.save(dis_optim.state_dict(), dis_optim_save_file)

                if self.use_ema:
                    gen_shadow_save_file = os.path.join(
                        save_dir, "GAN_GEN_SHADOW_" + str(epoch) + ".pth")
                    th.save(self.gen_shadow.state_dict(),
                            gen_shadow_save_file)

        print("Training completed ...")

        # return the generator and discriminator back to eval mode
        self.gen.eval()
        self.dis.eval()
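
# Hypothetical driver for MSG_GAN above (not part of the original file).
# The loss class import, hyper-parameters, and the FakeData stand-in dataset
# are all assumptions; any DataLoader yielding bare image tensors of shape
# [B, 3, 256, 256] (depth=7 => 4 * 2**6 = 256) would work the same way.
import torch as th
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from MSG_GAN.Losses import RelativisticAverageHingeGAN  # assumed available

device = th.device("cuda" if th.cuda.is_available() else "cpu")
fake = datasets.FakeData(size=64, image_size=(3, 256, 256),
                         transform=transforms.ToTensor())
# MSG_GAN.train() calls batch.to(device), so collate to a bare image tensor:
loader = DataLoader(fake, batch_size=8, shuffle=True,
                    collate_fn=lambda b: th.stack([img for img, _ in b]))

gan = MSG_GAN(depth=7, latent_size=512, device=device)
gen_optim = th.optim.Adam(gan.gen.parameters(), lr=3e-4, betas=(0, 0.99))
dis_optim = th.optim.Adam(gan.dis.parameters(), lr=3e-4, betas=(0, 0.99))
gan.train(loader, gen_optim, dis_optim,
          loss_fn=RelativisticAverageHingeGAN(gan.dis), num_epochs=10)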
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str,
                        required=False, help='which GPUs to use')
    parser.add_argument('--model_config',
                        default='config/model_config_small.json', type=str,
                        required=False, help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt',
                        type=str, required=False, help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json',
                        type=str, required=False, help='raw training corpus')
    parser.add_argument('--tokenized_data_path0', default='data/tokenized/',
                        type=str, required=False,
                        help='where the tokenized corpus is stored')
    parser.add_argument('--tokenized_data_path1', default='data/tokenized/',
                        type=str, required=False,
                        help='where the tokenized corpus is stored')
    parser.add_argument('--raw', action='store_true',
                        help='tokenize the raw corpus first')
    parser.add_argument('--epochs', default=5, type=int, required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size', default=64, type=int, required=False,
                        help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int,
                        required=False, help='number of warm-up steps')
    parser.add_argument('--log_step', default=1, type=int, required=False,
                        help='report loss every this many steps; set to a '
                             'multiple of gradient_accumulation')
    parser.add_argument('--stride', default=768, type=int, required=False,
                        help='window stride used when slicing the training data')
    parser.add_argument('--gradient_accumulation', default=1, type=int,
                        required=False, help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true',
                        help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str,
                        required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float,
                        required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False,
                        help='number of pieces to split the corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False,
                        help='minimum article length to include')
    parser.add_argument('--max_length', default=256, type=int, required=False,
                        help='maximum article length to include')
    parser.add_argument('--output_dir', default='model/', type=str,
                        required=False, help='model output path')
    parser.add_argument('--pretrained_model', default='', type=str,
                        required=False,
                        help='path of the pretrained model to start from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/',
                        type=str, required=False, help='Tensorboard path')
    parser.add_argument('--segment', action='store_true',
                        help='tokenize Chinese at the word level')
    parser.add_argument('--bpe_token', action='store_true', help='subword')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json",
                        type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe",
                        type=str, help="vocab.bpe")
    parser.add_argument('--max_steps_perEpoch_perPiece', default=1000000,
                        type=int, required=False)
    parser.add_argument('--steps_savemodel', default=10000, type=int,
                        required=False,
                        help='save the model every this many steps')
    parser.add_argument('--padding', action='store_true',
                        help='whether inputs are fixed-length')

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    # os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # select which GPUs the program uses

    model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
        args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx

    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(
            vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = 999999

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    log_step = args.log_step
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    output_dir = args.output_dir
    assert log_step % gradient_accumulation == 0

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if not args.pretrained_model:
        model = transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(
            args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    optimizer = transformers.AdamW(model.parameters(), lr=lr,
                                   correct_bias=True)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex "
                "to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(
            model, device_ids=[int(i) for i in args.device.split(',')])
        multi_gpu = True

    print('starting training')
    step_loss = 0
    running_loss = 10
    loss_ = 10
    iter0 = iterData(args.tokenized_data_path0, rate=0.045,
                     batch_size=batch_size, epochs=epochs)
    iter1 = iterData(args.tokenized_data_path1, rate=1.0,
                     batch_size=batch_size, epochs=epochs)
    step = 0
    epoch0 = -1
    while True:
        data0 = next(iter0)
        data1 = next(iter1)
        if data0 == '__STOP__' or data1 == '__STOP__':
            break
        epoch, epochs, idx_file0, nb_files0, batch_inputs0 = data0
        epoch, epochs, idx_file1, nb_files1, batch_inputs1 = data1
        batch_inputs = batch_inputs1 + batch_inputs0
        random.shuffle(batch_inputs)
        batch_inputs = torch.tensor(batch_inputs).long().to(device)

        # forward pass
        outputs = model.forward(input_ids=batch_inputs, labels=batch_inputs)
        loss, logits = outputs[:2]

        # get loss
        if multi_gpu:
            loss = loss.mean()
        if gradient_accumulation > 1:
            loss = loss / gradient_accumulation

        # loss backward
        if fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               max_grad_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        # optimizer step
        if (step + 1) % gradient_accumulation == 0:
            running_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()
            step_loss += 1
            # scheduler.step()

        if (step + 1) % log_step == 0:
            loss_ = (running_loss * gradient_accumulation
                     / (log_step / gradient_accumulation))
            print('now time: {}:{}. step: {}, progress-innerEpoch: {}/{}, '
                  'progress-outerEpoch: {}/{}, loss {}'.format(
                      datetime.now().hour, datetime.now().minute, step + 1,
                      str(idx_file0 + 1) + ':' + str(idx_file1 + 1),
                      str(nb_files0) + ':' + str(nb_files1),
                      epoch + 1, epochs, loss_))
            running_loss = 0

        if step % args.steps_savemodel == 0:
            print('saving model for epoch {}'.format(epoch + 1))
            output_dir_ = output_dir + 'model_epoch{}_step{}_loss-{}'.format(
                epoch + 1, step, '%0.2f' % loss_)
            if not os.path.exists(output_dir_):
                os.mkdir(output_dir_)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir_)
        step += 1

        if epoch != epoch0:
            if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
                os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(
                output_dir + 'model_epoch{}'.format(epoch + 1))
            epoch0 = epoch
            print('epoch {} finished'.format(epoch + 1))

    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
    print('training finished')
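
# `iterData` is referenced above but not shown. The unpacking at the call
# sites implies it yields (epoch, epochs, file_index, n_files, batch) tuples
# and a '__STOP__' sentinel once exhausted. A minimal sketch of that
# contract; the piece-file naming, file format, and fixed context window
# below are assumptions, not the project's actual implementation:
import os
import random

def iterData(tokenized_data_path, rate=1.0, batch_size=8, epochs=1):
    files = sorted(f for f in os.listdir(tokenized_data_path)
                   if f.endswith('.txt'))
    n_ctx = 1024  # assumed context window
    for epoch in range(epochs):
        for file_idx, name in enumerate(files):
            with open(os.path.join(tokenized_data_path, name)) as fin:
                ids = [int(t) for t in fin.read().split()]
            # draw non-overlapping windows, subsampled by `rate`
            starts = list(range(0, max(len(ids) - n_ctx, 1), n_ctx))
            random.shuffle(starts)
            starts = starts[:max(1, int(len(starts) * rate))]
            for i in range(0, len(starts), batch_size):
                batch = [ids[s:s + n_ctx] for s in starts[i:i + batch_size]]
                batch = [b for b in batch if len(b) == n_ctx]
                if batch:
                    yield epoch, epochs, file_idx, len(files), batch
    while True:
        # keep yielding the sentinel so callers can test both iterators
        yield '__STOP__'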
def train_model(train_dataset, train_num_each, val_dataset, val_num_each):
    num_train = len(train_dataset)
    num_val = len(val_dataset)

    train_idx = [i for i in range(num_train)]
    np.random.seed(0)
    np.random.shuffle(train_idx)
    val_idx = [i for i in range(num_val)]

    print('num of train dataset: {:6d}'.format(num_train))
    print('num of valid dataset: {:6d}'.format(num_val))

    train_loader = DataLoader(train_dataset, batch_size=train_batch_size,
                              sampler=train_idx, num_workers=workers,
                              pin_memory=False)
    val_loader = DataLoader(val_dataset, batch_size=val_batch_size,
                            sampler=val_idx, num_workers=workers,
                            pin_memory=False)

    # model = models.resnet50(pretrained=True)
    # num_ftrs = model.fc.in_features
    # model.fc = nn.Linear(num_ftrs, 7)
    model = multi_resnet()

    if use_gpu:
        model = model.cuda()
        model = DataParallel(model)

    # sum (rather than average) over the 7 binary labels
    criterion = nn.BCEWithLogitsLoss(reduction='sum')

    if multi_optim == 0:
        if optimizer_choice == 0:
            optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
            exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam(model.parameters())
    elif multi_optim == 1:
        if optimizer_choice == 0:
            optimizer = optim.SGD([
                {'params': model.module.share.parameters()},
                {'params': model.module.fc1.parameters(), 'lr': 1e-3},
            ], lr=1e-4, momentum=0.9)
            exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
        elif optimizer_choice == 1:
            optimizer = optim.Adam([
                {'params': model.module.share.parameters()},
                {'params': model.module.fc1.parameters(), 'lr': 1e-3},
            ], lr=1e-4)

    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_accuracy = 0.0
    correspond_train_acc = 0.0

    all_info = []
    all_train_accuracy = []
    all_train_loss = []
    all_val_accuracy = []
    all_val_loss = []
    sig_f = nn.Sigmoid()

    for epoch in range(epochs):
        train_idx = [i for i in range(num_train)]
        np.random.seed(0)
        np.random.shuffle(train_idx)
        train_loader = DataLoader(train_dataset, batch_size=train_batch_size,
                                  sampler=train_idx, num_workers=workers,
                                  pin_memory=False)

        model.train()
        train_loss = 0.0
        train_corrects = 0
        train_start_time = time.time()

        for data in train_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = inputs.cuda()
                labels = labels_1.cuda()
            else:
                labels = labels_1

            # if the optimizer was built from net.parameters(), this has the
            # same effect as net.zero_grad()
            optimizer.zero_grad()

            outputs = model(inputs)

            sig_out = sig_f(outputs.data.cpu())
            predict = (sig_out > 0.5).long()
            train_corrects += torch.sum(predict == labels.cpu()).item()

            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_elapsed_time = time.time() - train_start_time
        train_accuracy = train_corrects / num_train / 7
        train_average_loss = train_loss / num_train / 7

        model.eval()
        val_loss = 0.0
        val_corrects = 0
        val_start_time = time.time()

        for data in val_loader:
            inputs, labels_1, labels_2 = data
            if use_gpu:
                inputs = inputs.cuda()
                labels = labels_1.cuda()
            else:
                labels = labels_1

            outputs = model(inputs)

            sig_out = sig_f(outputs.data.cpu())
            predict = (sig_out > 0.5).long()
            val_corrects += torch.sum(predict == labels.cpu()).item()

            loss = criterion(outputs, labels.float())
            val_loss += loss.item()

        val_elapsed_time = time.time() - val_start_time
        val_accuracy = val_corrects / num_val / 7
        val_average_loss = val_loss / num_val / 7

        print('epoch: {:4d}'
              ' train in: {:2.0f}m{:2.0f}s'
              ' train loss: {:4.4f}'
              ' train accu: {:.4f}'
              ' valid in: {:2.0f}m{:2.0f}s'
              ' valid loss: {:4.4f}'
              ' valid accu: {:.4f}'.format(
                  epoch, train_elapsed_time // 60, train_elapsed_time % 60,
                  train_average_loss, train_accuracy,
                  val_elapsed_time // 60, val_elapsed_time % 60,
                  val_average_loss, val_accuracy))

        all_train_loss.append(train_average_loss)
        all_train_accuracy.append(train_accuracy)
        all_val_loss.append(val_average_loss)
        all_val_accuracy.append(val_accuracy)

        if optimizer_choice == 0:
            exp_lr_scheduler.step(val_average_loss)

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            correspond_train_acc = train_accuracy
            best_model_wts = copy.deepcopy(model.state_dict())
        elif val_accuracy == best_val_accuracy:
            if train_accuracy > correspond_train_acc:
                correspond_train_acc = train_accuracy
                best_model_wts = copy.deepcopy(model.state_dict())

    print('best accuracy: {:.4f} cor train accu: {:.4f}'.format(
        best_val_accuracy, correspond_train_acc))

    save_val = int("{:4.0f}".format(best_val_accuracy * 10000))
    save_train = int("{:4.0f}".format(correspond_train_acc * 10000))
    model_name = "tool" \
                 + "_epoch_" + str(epochs) \
                 + "_opt_" + str(optimizer_choice) \
                 + "_mulopt_" + str(multi_optim) \
                 + "_flip_" + str(use_flip) \
                 + "_crop_" + str(crop_type) \
                 + "_batch_" + str(train_batch_size) \
                 + "_train_" + str(save_train) \
                 + "_val_" + str(save_val) \
                 + ".pth"
    torch.save(best_model_wts, model_name)

    all_info.append(all_train_accuracy)
    all_info.append(all_train_loss)
    all_info.append(all_val_accuracy)
    all_info.append(all_val_loss)

    record_name = "tool" \
                  + "_epoch_" + str(epochs) \
                  + "_opt_" + str(optimizer_choice) \
                  + "_mulopt_" + str(multi_optim) \
                  + "_flip_" + str(use_flip) \
                  + "_crop_" + str(crop_type) \
                  + "_batch_" + str(train_batch_size) \
                  + "_train_" + str(save_train) \
                  + "_val_" + str(save_val) \
                  + ".pkl"
    with open(record_name, 'wb') as f:
        pickle.dump(all_info, f)
    print()
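
# train_model() reads several module-level globals that are defined elsewhere
# in the project. A hypothetical configuration block with illustrative values,
# so the function above is readable in isolation:
use_gpu = torch.cuda.is_available()
epochs = 25
workers = 4
train_batch_size = 32
val_batch_size = 32
optimizer_choice = 0   # 0: SGD + ReduceLROnPlateau, 1: Adam
multi_optim = 1        # 1: separate learning rate for the fc1 head
use_flip = 1           # recorded in the output file names only
crop_type = 0          # recorded in the output file names only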
class UNetTrainer(object):
    """UNet trainer"""

    def __init__(self, start_epoch=0, save_dir='', resume="", devices_num=2,
                 num_classes=2, color_dim=1):
        self.net = UNet(color_dim=color_dim, num_classes=num_classes)
        self.start_epoch = start_epoch if start_epoch != 0 else 1
        self.save_dir = os.path.join('../models', save_dir)
        self.loss = CrossEntropyLoss()
        self.num_classes = num_classes

        if resume:
            checkpoint = torch.load(resume)
            if self.start_epoch == 0:
                self.start_epoch = checkpoint['epoch'] + 1
            if not self.save_dir:
                self.save_dir = checkpoint['save_dir']
            self.net.load_state_dict(checkpoint['state_dir'])

        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)

        self.net.cuda()
        self.loss.cuda()

        if devices_num == 2:
            self.net = DataParallel(self.net, device_ids=[0, 1])
            # self.loss = DataParallel(self.loss, device_ids=[0, 1])

    def train(self, train_loader, val_loader, lr=0.001, weight_decay=1e-4,
              epochs=200, save_freq=10):
        self.logfile = os.path.join(self.save_dir, 'log')
        sys.stdout = Logger(self.logfile)
        self.epochs = epochs
        self.lr = lr

        optimizer = torch.optim.Adam(
            self.net.parameters(),
            # lr, momentum=0.9,
            weight_decay=weight_decay)

        for epoch in range(self.start_epoch, epochs + 1):
            self.train_(train_loader, epoch, optimizer, save_freq)
            self.validate_(val_loader, epoch)

    def train_(self, data_loader, epoch, optimizer, save_freq=10):
        start_time = time.time()

        self.net.train()
        # lr = self.get_lr(epoch)
        # for param_group in optimizer.param_groups:
        #     param_group['lr'] = lr

        metrics = []
        for i, (data, target) in enumerate(tqdm(data_loader)):
            data_t, target_t = data, target
            data = data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            output = self.net(data)  # UNet output
            output = output.transpose(1, 3).transpose(1, 2).contiguous().view(
                -1, self.num_classes)
            target = target.view(-1)
            loss_output = self.loss(output, target)

            optimizer.zero_grad()
            loss_output.backward()  # back-propagate the loss
            optimizer.step()

            loss_output = loss_output.item()  # scalar loss value
            acc = accuracy(output, target)
            metrics.append([loss_output, acc])

            if i == 0:
                batch_size = data.size(0)
                _, output = output.data.max(dim=1)
                output = output.view(batch_size, 1, 1, 320, 480).cpu()  # prediction map
                data_t = data_t[0, 0].unsqueeze(0).unsqueeze(0)  # input image
                target_t = target_t[0].unsqueeze(0)  # ground-truth map
                # concatenate prediction / input / ground truth into one grid
                t = torch.cat([output[0].float(), data_t, target_t.float()], 0)
                torchvision.utils.save_image(
                    t, "temp_image/%02d_train.jpg" % epoch, nrow=3)

        if epoch % save_freq == 0:
            if 'module' in dir(self.net):
                state_dict = self.net.module.state_dict()
            else:
                state_dict = self.net.state_dict()
            for key in state_dict.keys():
                state_dict[key] = state_dict[key].cpu()

            torch.save({
                'epoch': epoch,
                'save_dir': self.save_dir,
                'state_dir': state_dict
            }, os.path.join(self.save_dir, '%03d.ckpt' % epoch))

        end_time = time.time()
        metrics = np.asarray(metrics, np.float32)
        self.print_metrics(metrics, 'Train', end_time - start_time, epoch)

    def validate_(self, data_loader, epoch):
        start_time = time.time()
        self.net.eval()

        metrics = []
        with torch.no_grad():
            for i, (data, target) in enumerate(data_loader):
                data_t, target_t = data, target
                data = data.cuda(non_blocking=True)
                target = target.cuda(non_blocking=True)

                output = self.net(data)
                output = output.transpose(1, 3).transpose(1, 2).contiguous().view(
                    -1, self.num_classes)
                target = target.view(-1)
                loss_output = self.loss(output, target)

                loss_output = loss_output.item()
                acc = accuracy(output, target)
                metrics.append([loss_output, acc])

                if i == 0:
                    batch_size = data.size(0)
                    _, output = output.data.max(dim=1)
                    output = output.view(batch_size, 1, 1, 320, 480).cpu()
                    data_t = data_t[0, 0].unsqueeze(0).unsqueeze(0)
                    target_t = target_t[0].unsqueeze(0)
                    t = torch.cat([output[0].float(), data_t,
                                   target_t.float()], 0)
                    torchvision.utils.save_image(
                        t, "temp_image/%02d_val.jpg" % epoch, nrow=3)

        end_time = time.time()
        metrics = np.asarray(metrics, np.float32)
        self.print_metrics(metrics, 'Validation', end_time - start_time)

    def print_metrics(self, metrics, phase, time, epoch=-1):
        """metrics: [loss, acc]"""
        if epoch != -1:
            print("Epoch: {}".format(epoch))
        print(phase)
        print('loss %2.4f, accuracy %2.4f, time %2.2f' % (
            np.mean(metrics[:, 0]), np.mean(metrics[:, 1]), time))
        if phase != 'Train':
            print()

    def get_lr(self, epoch):
        if epoch <= self.epochs * 0.5:
            lr = self.lr
        elif epoch <= self.epochs * 0.8:
            lr = 0.1 * self.lr
        else:
            lr = 0.01 * self.lr
        return lr

    def save_py_files(self, path):
        """copy .py files in exps dir, cfgs dir and current dir into
           save_dir, and keep the file structure
        """
        # exps dir
        pyfiles = [f for f in os.listdir(path) if f.endswith('.py')]
        path = "/".join(path.split('/')[-2:])
        exp_save_path = os.path.join(self.save_dir, path)
        mkdir(exp_save_path)
        for f in pyfiles:
            shutil.copy(os.path.join(path, f), os.path.join(exp_save_path, f))

        # current dir
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(self.save_dir, f))

        # cfgs dir
        shutil.copytree('./cfgs', os.path.join(self.save_dir, 'cfgs'))
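
# Hypothetical driver for UNetTrainer; the dataset class and loader settings
# are placeholders (the 320x480 resolution matches the hard-coded .view()
# calls above), not the project's actual entry point:
from torch.utils.data import DataLoader

train_loader = DataLoader(SegDataset('train'), batch_size=4, shuffle=True)
val_loader = DataLoader(SegDataset('val'), batch_size=4, shuffle=False)

trainer = UNetTrainer(save_dir='unet_320x480', devices_num=2,
                      num_classes=2, color_dim=1)
trainer.train(train_loader, val_loader, lr=1e-3, epochs=200, save_freq=10)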
        correct += pred.eq(target.data).to('cpu').sum()
    print('Accuracy: %d %%' % (100 * correct / len(test_loader.dataset)))


if __name__ == '__main__':
    model = models.resnet50(pretrained=True)  # resnet50
    # model = models.resnet101(pretrained=True)  # resnet101
    num_features = model.fc.in_features
    model.fc = nn.Linear(num_features, 1222)
    # print(model)
    # model.cuda()
    model.to('cuda')
    model = DataParallel(model)

    traindir = '/faces_83/train_images'
    testdir = '/faces_83/test_images'
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
def train(args):
    # gpu init
    multi_gpus = False
    if len(args.gpus.split(',')) > 1:
        multi_gpus = True
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # log init
    save_dir = os.path.join(
        args.save_dir,
        args.model_pre + args.backbone.upper() + '_'
        + datetime.now().strftime('%Y%m%d_%H%M%S'))
    if os.path.exists(save_dir):
        raise NameError('model dir exists!')
    os.makedirs(save_dir)
    logging = init_log(save_dir)
    _print = logging.info

    # dataset loader
    transform = transforms.Compose([
        transforms.ToTensor(),  # range [0, 255] -> [0.0, 1.0]
        transforms.Normalize(mean=(0.5, 0.5, 0.5),
                             std=(0.5, 0.5, 0.5))  # range [0.0, 1.0] -> [-1.0, 1.0]
    ])
    # training dataset
    trainset = CASIAWebFace(args.train_root, args.train_file_list,
                            transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.batch_size,
                                              shuffle=True, num_workers=8,
                                              drop_last=False)
    # test dataset
    lfwdataset = LFW(args.lfw_test_root, args.lfw_file_list,
                     transform=transform)
    lfwloader = torch.utils.data.DataLoader(lfwdataset, batch_size=128,
                                            shuffle=False, num_workers=4,
                                            drop_last=False)

    # define backbone and margin layer
    if args.backbone == 'MobileFace':
        net = MobileFaceNet(feature_dim=args.feature_dim)
    elif args.backbone == 'Res50':
        net = ResNet50()
    elif args.backbone == 'Res101':
        net = ResNet101()
    elif args.backbone == 'Res50_IR':
        net = SEResNet_IR(50, feature_dim=args.feature_dim, mode='ir')
    elif args.backbone == 'SERes50_IR':
        net = SEResNet_IR(50, feature_dim=args.feature_dim, mode='se_ir')
    elif args.backbone == 'SphereNet':
        net = SphereNet(num_layers=64, feature_dim=args.feature_dim)
    else:
        raise ValueError(args.backbone + ' is not available!')

    if args.margin_type == 'ArcFace':
        margin = ArcMarginProduct(args.feature_dim, trainset.class_nums,
                                  s=args.scale_size)
    elif args.margin_type == 'CosFace':
        raise NotImplementedError('CosFace margin is not implemented here')
    elif args.margin_type == 'SphereFace':
        raise NotImplementedError('SphereFace margin is not implemented here')
    elif args.margin_type == 'InnerProduct':
        margin = InnerProduct(args.feature_dim, trainset.class_nums)
    else:
        raise ValueError(args.margin_type + ' is not available!')

    if args.resume:
        print('resume the model parameters from: ', args.net_path,
              args.margin_path)
        net.load_state_dict(torch.load(args.net_path)['net_state_dict'])
        margin.load_state_dict(torch.load(args.margin_path)['net_state_dict'])

    # define optimizers for different layers
    criterion_classi = torch.nn.CrossEntropyLoss().to(device)
    optimizer_classi = optim.SGD([
        {'params': net.parameters(), 'weight_decay': 5e-4},
        {'params': margin.parameters(), 'weight_decay': 5e-4}
    ], lr=0.1, momentum=0.9, nesterov=True)
    scheduler_classi = lr_scheduler.MultiStepLR(optimizer_classi,
                                                milestones=[35, 60, 85],
                                                gamma=0.1)

    criterion_center = AgentCenterLoss(trainset.class_nums, args.feature_dim,
                                       args.scale_size).to(device)
    optimizer_center = optim.SGD(criterion_center.parameters(), lr=0.5)
    scheduler_center = lr_scheduler.MultiStepLR(optimizer_center,
                                                milestones=[35, 60, 85],
                                                gamma=0.1)

    if multi_gpus:
        net = DataParallel(net).to(device)
        margin = DataParallel(margin).to(device)
    else:
        net = net.to(device)
        margin = margin.to(device)

    best_lfw_acc = 0.0
    best_lfw_iters = 0
    total_iters = 0
    for epoch in range(1, args.total_epoch + 1):
        scheduler_classi.step()
        scheduler_center.step()

        # train model
        _print('Train Epoch: {}/{} ...'.format(epoch, args.total_epoch))
        net.train()

        if args.plot:
            all_features, all_labels = [], []

        since = time.time()
        for data in trainloader:
            img, label = data[0].to(device), data[1].to(device)
            feature = net(img)
            output = margin(feature)
            loss_classi = criterion_classi(output, label)
            loss_center = criterion_center(feature, label)
            total_loss = loss_classi + loss_center * args.weight_center

            optimizer_classi.zero_grad()
            optimizer_center.zero_grad()
            total_loss.backward()
            optimizer_classi.step()
            # by doing so, weight_cent would not impact on the learning of centers
            # for param in criterion_center.parameters():
            #     param.grad.data *= (1. / args.weight_center)
            optimizer_center.step()

            total_iters += 1
            if args.plot:
                feat = feature.data.cpu().numpy()
                # for i in range(feat.shape[0]):
                #     feat[i] = feat[i] / np.sqrt(np.dot(feat[i], feat[i]))
                all_features.append(feat)
                all_labels.append(label.data.cpu().numpy())

            # print train information
            if total_iters % 10 == 0:
                # current training accuracy
                _, predict = torch.max(output.data, 1)
                total = label.size(0)
                correct = (np.array(predict.cpu())
                           == np.array(label.data.cpu())).sum()
                time_cur = (time.time() - since) / 10
                since = time.time()
                print("Iters: {:0>6d}/[{:0>2d}], loss_classi: {:.4f}, "
                      "loss_center: {:.4f}, train_accuracy: {:.4f}, "
                      "time: {:.2f} s/iter, learning rate: {}".format(
                          total_iters, epoch, loss_classi.item(),
                          loss_center.item(), correct / total, time_cur,
                          scheduler_classi.get_lr()[0]))

            # save model
            if total_iters % args.save_freq == 0:
                msg = 'Saving checkpoint: {}'.format(total_iters)
                _print(msg)
                if multi_gpus:
                    net_state_dict = net.module.state_dict()
                    margin_state_dict = margin.module.state_dict()
                else:
                    net_state_dict = net.state_dict()
                    margin_state_dict = margin.state_dict()
                if not os.path.exists(save_dir):
                    os.mkdir(save_dir)
                torch.save(
                    {'iters': total_iters,
                     'net_state_dict': net_state_dict},
                    os.path.join(save_dir, 'Iter_%06d_net.ckpt' % total_iters))
                torch.save(
                    {'iters': total_iters,
                     'net_state_dict': margin_state_dict},
                    os.path.join(save_dir,
                                 'Iter_%06d_margin.ckpt' % total_iters))
                # torch.save(
                #     {'iters': total_iters,
                #      'net_state_dict': criterion_center.state_dict()},
                #     os.path.join(save_dir,
                #                  'Iter_%06d_center.ckpt' % total_iters))

            # test accuracy
            if total_iters % args.test_freq == 0:
                # test model on lfw
                net.eval()
                getFeatureFromTorch('./result/cur_lfw_result.mat', net,
                                    device, lfwdataset, lfwloader)
                lfw_accs = evaluation_10_fold('./result/cur_lfw_result.mat')
                _print('LFW Ave Accuracy: {:.4f}'.format(
                    np.mean(lfw_accs) * 100))
                if best_lfw_acc < np.mean(lfw_accs) * 100:
                    best_lfw_acc = np.mean(lfw_accs) * 100
                    best_lfw_iters = total_iters
                net.train()

        if args.plot:
            all_features = np.concatenate(all_features, 0)
            all_labels = np.concatenate(all_labels, 0)
            plot_features(all_features, all_labels, trainset.class_nums,
                          epoch, save_dir)

    _print('Finally Best Accuracy: LFW: {:.4f} in iters: {}'.format(
        best_lfw_acc, best_lfw_iters))
    print('finishing training')
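
# For reference, the commented-out rescaling in the loop above is a trick
# seen in common center-loss implementations: dividing the center gradients
# by weight_center cancels the lambda factor introduced by weighting the
# loss, so the centers update at optimizer_center's own learning rate.
# A sketch under the same variable names (an assumption about intent, not
# this project's active code path):
#
#   total_loss = loss_classi + args.weight_center * loss_center
#   optimizer_classi.zero_grad()
#   optimizer_center.zero_grad()
#   total_loss.backward()
#   optimizer_classi.step()
#   for param in criterion_center.parameters():
#       param.grad.data *= (1.0 / args.weight_center)  # undo the lambda factor
#   optimizer_center.step()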
class Trainer(object):
    """ Trainer class """

    def __init__(self, chkpt_path: str, config: TrainerConfig, train: str,
                 test: str, dev: str = None,
                 disable_dataparallel: bool = False):
        """
        Instantiate trainer

        :param str chkpt_path: Path to checkpoint the model, optimizer and scheduler
        :param TrainerConfig config: Configuration instance for Trainer
        :param str train: Path to JSON-lines file which contains the training set
        :param str test: Path to JSON-lines file which contains the evaluation set
        :param str dev: Path to JSON-lines file which contains the development set (optional)
        :param bool disable_dataparallel: True if the module should not be
            parallelized across different GPU devices. False by default.
        """
        # Register configuration
        self._config = config
        self.disable_dataparallel = disable_dataparallel

        # Prepare internal states
        self._best_on_dev = 0.0  #: Best score on the development set
        self._ema_on_dev = None  #: Exponential Moving Average score on the development set.
        self._random_restored = False  #: Whether the RNG state was restored or not

        # Epoch & step information
        self._epoch = 0
        self._steps_to_go = 0
        self._step_per_epoch = 0
        self._minibatch_per_epoch = 0

        # Dictionaries that record the last performance metrics
        self._last_performances = {}
        self._last_metrics = {}

        # Prepare checkpointing
        self._chkpt_path = Path(chkpt_path)
        if not self._chkpt_path.exists():
            self._chkpt_path.mkdir(parents=True)

        # Logging file handler
        file_handler = logging.FileHandler(
            filename=Path(chkpt_path, 'train.log'), encoding='UTF-8')
        file_handler.setFormatter(
            logging.Formatter(
                '[%(asctime)s] %(levelname)s %(name)s: %(message)s',
                datefmt='%m/%d/%Y %H:%M:%S'))
        file_handler.setLevel(logging.INFO)

        # Set the logger
        self._logger = logging.getLogger(self.__class__.__name__
                                         + '_%s' % id(self))
        self._logger.addHandler(file_handler)
        self._logger.setLevel(logging.INFO)

        # If DEBUG is on, turn on the anomaly detection
        if 'DEBUG' in ENV:
            torch.autograd.set_detect_anomaly(True)

        # Prepare Tensorboard if available.
        try:
            from tensorboardX import SummaryWriter
            self._writer = SummaryWriter(logdir=str(self._chkpt_path),
                                         flush_secs=30)
        except ImportError:
            self._writer = None

        # Prepare data-parallel if available.
        if torch.cuda.is_available():
            devices = get_available_device_count()
            cuda_keys = list(range(devices))
            random.shuffle(cuda_keys)
            self.main_device = torch.device('cuda', cuda_keys[0])
            self.device_order = cuda_keys
        else:
            self.main_device = torch.device('cpu')
            self.device_order = [self.main_device]
        self._logger.info(
            "We will use [%s] device as a main device for training, "
            "with ordering [%s]", self.main_device, self.device_order)

        # Read the datasets
        self.set_seed()  #: Set seed before loading the datasets (because of shuffling in the training set)
        self.trainset, self.devset, self.evalset = \
            self._config.read_datasets(train=train, dev=dev, test=test)
        self._trainit = iter(self.trainset)

        # Log dataset statistics
        self._logger.info('From %s, we loaded %s mini-batch(es)', train,
                          len(self.trainset))
        self._logger.info('From %s, we loaded %s mini-batch(es)', dev,
                          len(self.devset))
        self._logger.info('From %s, we loaded %s mini-batch(es)', test,
                          len(self.evalset))
        self.trainset.print_item_statistics(self._logger)

        # Build or restore module
        self._module = None
        self._module_init = {}
        self._optimizer = None
        self._answer_checker = None
        self.restore_checkpoint()

    @property
    def checkpoints(self) -> List[Path]:
        """
        :rtype: List[Path]
        :return: List of checkpointed steps (directories)
        """
        checkpoints = sorted(Path(self._chkpt_path).glob('*'))
        checkpoints = [x for x in checkpoints
                       if x.is_dir() and x.name.isnumeric()]
        return checkpoints

    @property
    def last_checkpoint(self) -> Path:
        """
        :rtype: Path
        :return: The last checkpoint if it exists. Otherwise, None
        """
        return self.checkpoints[-1] if len(self.checkpoints) else None

    @property
    def current_epoch(self) -> int:
        """
        :rtype: int
        :return: Current epoch index
        """
        return self._epoch

    @property
    def is_done(self) -> bool:
        """
        :rtype: bool
        :return: True if the trainer already reached the maximum epoch specified.
        """
        return self._epoch == self._config.epoch

    def close(self):
        """ Close and clean up the trainer. """
        if self._writer is not None:
            # Close the TensorboardX writer
            self._writer.close()
            self._writer = None
        if self._answer_checker is not None:
            # Kill the answer checker child processes
            self._answer_checker.close()
            self._answer_checker = None

    def rotate_checkpoint(self, max_item: int = 10):
        """
        Rotate checkpoints

        :param int max_item: Maximum number of allowed checkpoints
        """
        # Check if we should delete older checkpoint(s)
        if len(self.checkpoints) <= max_item:
            return
        for chkpt in self.checkpoints[:-max_item]:
            # Remove old checkpoints
            self._logger.info("Deleting old checkpoint [%s]", chkpt)
            shutil.rmtree(chkpt)

    def checkpoint(self):
        """ Make a checkpoint """
        # Build the directory name format so that the lexicographic order of
        # directory names matches the order of epoch indices.
        directory_format = '%%0%dd' % int(
            math.ceil(math.log10(self._config.epoch + 1)))

        # If the directory exists, exit the method.
        output_dir = Path(self._chkpt_path, directory_format % self._epoch)
        if output_dir.exists():
            return

        # Prepare the directory for checkpointing
        self._logger.info("Save checkpoint to [%s]", output_dir)
        output_dir.mkdir(parents=True)

        # Save all the RNG states used in this trainer.
        torch.save({
            'numpy': numpy.random.get_state(),
            'random': random.getstate(),
            'trainset': self.trainset.get_rng_state(),
            'torch': {
                'cpu': torch.get_rng_state(),
                'cuda': torch.cuda.get_rng_state_all()
                        if torch.cuda.is_available() else None
            }
        }, Path(output_dir, 'random.pt'))

        # Save the trainer's internal states
        torch.save({
            '_best_on_dev': self._best_on_dev,
            '_ema_on_dev': self._ema_on_dev,
            '_last_performances': self._last_performances,
            '_last_metrics': self._last_metrics
        }, Path(output_dir, 'internal.pt'))

        # Save the model
        _unwrap_parallel(self._module).save_pretrained(output_dir)

        # Save the optimizer
        torch.save(self._optimizer.state_dict(),
                   Path(output_dir, 'optimizer.pt'))

        # Save the scheduler if available.
        if hasattr(self, '_scheduler'):
            torch.save(self._scheduler.state_dict(),
                       Path(output_dir, 'scheduler.pt'))

        # Write the configuration that has been used.
        self._config.save_pretrained(output_dir)

        # Rotate checkpoints.
        self.rotate_checkpoint()

    def restore_checkpoint(self):
        """ Restore from the last checkpoint if available.
            Otherwise, configure this trainer from scratch. """
        # Check if there exist any checkpoints.
        chkpt_path = self.last_checkpoint
        if chkpt_path:
            # Reload configuration from the checkpoint
            self._config = TrainerConfig.from_pretrained(str(chkpt_path))
            self._logger.info("TrainerConfig at [%s] is restored.",
                              chkpt_path)

            # Recover random number generator states
            self.set_seed()  # Set seed before restoring RNG
            random_path = Path(chkpt_path, 'random.pt')
            random_states = torch.load(random_path)
            numpy.random.set_state(random_states['numpy'])
            random.setstate(random_states['random'])
            self.trainset.set_rng_state(random_states['trainset'])
            torch.set_rng_state(random_states['torch']['cpu'])
            if torch.cuda.is_available():
                torch.cuda.set_rng_state_all(random_states['torch']['cuda'])

            # Record that the RNG is restored.
            self._logger.info(
                "State of random number generator is restored from [%s]",
                random_path)
            self._random_restored = True

            # Recover the trainer's internal states
            internal_states = torch.load(Path(chkpt_path, 'internal.pt'))
            for key, value in internal_states.items():
                if hasattr(self, key):
                    setattr(self, key, value)
        else:
            self.set_seed()  # Set seed.

        # Build/restore model
        self._config.model.set_chkpt_path(chkpt_path)
        self._module = Solver.from_pretrained(config=self._config.model)
        self._module_init = {id(p): p.clone()
                             for p in self._module.parameters()}
        self._module.to(self.main_device)
        self._logger.info("A network at [%s] is restored.", chkpt_path)

        # Compute the epoch/step information
        self._minibatch_per_epoch = len(self.trainset)
        self._step_per_epoch = int(
            math.ceil(self._minibatch_per_epoch
                      / self._config.gradient_accumulation_steps))
        self._steps_to_go = self._step_per_epoch * self._config.epoch
        self._logger.info("Steps / Epoch = %5d", self._step_per_epoch)
        self._logger.info("We will run %3d epoch(s) or %6d step(s)",
                          self._config.epoch, self._steps_to_go)
        self._logger.info(
            "Per a single step, %2d gradient(s) will be accumulated. "
            "(Total %2d mini-batch(es)/epoch)",
            self._config.gradient_accumulation_steps,
            self._minibatch_per_epoch)
        self._logger.info(
            "We will report TRAINING loss/accuracy for every %3d epoch(s)",
            self._config.epoch_report)
        self._logger.info(
            "We will report DEV ACC. and save CHKPTs for every %3d epoch(s)",
            self._config.epoch_chkpt)

        # Restore the number of epochs that were passed before
        if chkpt_path:
            self._epoch = int(chkpt_path.name)
            self._logger.info("Attempt to restore from the checkpoint [%s]",
                              chkpt_path)
            self._logger.info("Resume training from epoch %s", self._epoch)

        # Classify parameters to form parameter groups for the optimizer
        no_w_decay = {'bias', 'norm', 'Norm', '_embedding'}
        parameters = [((2 if 'text_model.model.embeddings' in n else
                        (1 if 'text_model' in n else 0),
                        any(t in n for t in no_w_decay)), p)
                      for n, p in self._module.named_parameters()]
        parameters = groupby(sorted(parameters, key=lambda t: t[0]),
                             key=lambda t: t[0])

        # Build optimizer groups
        optimizer_grouped_parameters = []
        for (encoder_type_flag, is_without_wd), group in parameters:
            group = {'params': [p for _, p in group]}
            if is_without_wd:
                group['weight_decay'] = 0.0
            if encoder_type_flag == 2 and self._config.fix_encoder_embedding:
                group['lr'] = 0.0
            elif encoder_type_flag == 1:
                group['lr'] = (self._config.optimizer.kwargs['lr']
                               * self._config.lr_multiplier_encoder)
            optimizer_grouped_parameters.append(group)

        # Build the optimizer before restoration
        self._optimizer = self._config.optimizer.build(
            optimizer_grouped_parameters)
        self._logger.info("We will use the following optimizer: %s",
                          self._optimizer)

        # Restore the optimizer if available.
        if chkpt_path:
            # Check if a saved optimizer exists
            optimizer_file = Path(chkpt_path, 'optimizer.pt')
            if optimizer_file.is_file():
                self._optimizer.load_state_dict(torch.load(optimizer_file))
                self._logger.info(
                    "An optimizer for module at [%s] is restored.",
                    optimizer_file)

        # Specify the warmup strategy if the warmup value is not negative
        warmup_steps = int(self._step_per_epoch * self._config.epoch_warmup)
        if warmup_steps >= 0:
            # Build the scheduler before restoration
            self._scheduler = get_linear_schedule_with_warmup(
                self._optimizer, num_warmup_steps=warmup_steps,
                num_training_steps=self._steps_to_go)
            self._logger.info(
                "We will use linear scheduling: warm up %s epochs or %s steps",
                self._config.epoch_warmup, warmup_steps)

            # Restore the scheduler if available
            if chkpt_path:
                # Check if a saved scheduler exists
                scheduler_file = Path(chkpt_path, 'scheduler.pt')
                if scheduler_file.is_file():
                    self._scheduler.load_state_dict(
                        torch.load(scheduler_file))
                    self._logger.info(
                        "A scheduler for module at [%s] is restored.",
                        scheduler_file)

        # Log the threshold of gradient clipping.
        if self._config.gradient_clip > 0:
            self._logger.info("We will use gradient clipping at %.3f",
                              self._config.gradient_clip)
        else:
            self._logger.info("We will not use gradient clipping")

        # Log the structure of the network.
        parameters_size = sum(p.numel() for p in self._module.parameters())
        disk_space = sum(required_space_param(p)
                         for p in self._module.parameters())
        self._logger.info('==== [Network Structure] ====\n%s',
                          str(self._module))
        self._logger.info(
            'There are %12d parameters in a network. '
            'Required space for checkpointing is %.3fMB.',
            parameters_size, disk_space / 1048576)

        # Wrap data parallel if we can use more than one GPU
        if len(self.device_order) > 1 and not self.disable_dataparallel:
            self._module = DataParallel(self._module,
                                        device_ids=self.device_order,
                                        output_device=self.device_order[0])
            self._logger.info(
                "We identified [%s] devices for parallel training",
                len(self.device_order))
        else:
            self._logger.info("We don't use DataParallel.")

        # Set the answer checker
        self._answer_checker = AnswerChecker(
            is_expression_type=_unwrap_parallel(
                self._module).is_expression_type,
            logger=self._logger)

    def set_seed(self):
        """ Set the random seeds """
        if self._random_restored:
            # Ignore seed setting when the RNG state was restored.
            return
        seed = self._config.seed
        self._logger.info("Seed for random number generation = %s", seed)
        random.seed(seed)
        numpy.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def get_evaluation_output(self, key: str):
        """
        Get the evaluation output of the specified key.

        :param str key: metric key to read
        :return: metric value of the specified key
        """
        return self._last_performances[key]

    def get_metrics(self) -> dict:
        """
        :return: The latest metric dictionary.
        """
        return self._last_metrics

    def run_a_chkpt_iter(self):
        """ Run epochs until checkpointing """
        try:
            accumulated_values = {}
            for _ in range(self._config.epoch_chkpt):
                # For each epoch (at most the number of checkpointing epochs)
                self._epoch += 1
                all_grad_applied = True

                for batch_step in range(self._minibatch_per_epoch):
                    # For each minibatch
                    self._module.eval()
                    self._module.zero_grad()

                    # Load a minibatch
                    batch = next(self._trainit)
                    batch = self._load_batch(batch)

                    # Execute training
                    self._module.train()
                    reported_values = self._step(**batch)
                    reported_values['Loss/generate'] = \
                        reported_values['total_loss']
                    reported_values['total_loss'].backward()
                    all_grad_applied = False

                    # Accumulate statistics and update the gradient
                    _accumulate_stats(reported_values, accumulated_values)
                    if ((batch_step + 1)
                            % self._config.gradient_accumulation_steps == 0):
                        self._update_grad()
                        all_grad_applied = True
                else:
                    # for-else: if gradients remain unapplied at the end of
                    # the epoch, apply them now
                    if not all_grad_applied:
                        self._update_grad()

                if (self._config.epoch_report > 0
                        and self._epoch % self._config.epoch_report == 0):
                    # Log metrics
                    if self._writer is not None:
                        for name, val in accumulated_values.items():
                            self._writer.add_scalar(name,
                                                    sum(val) / len(val),
                                                    self._epoch)
                        # Report the current optimizer status
                        self._report_optimizer()
                    accumulated_values.clear()

            # Evaluate the current result on the development set
            self.evaluate()
            self.checkpoint()
        except Exception as e:
            self._logger.error('Exception occurred!', exc_info=e)
            raise e

    def train(self):
        """ Do full-length training (until the maximum epoch) """
        # Set seed
        self.set_seed()

        # Prepare the estimated-time calculator
        eta = ExpectedTimeToFinishCalculator(self._config.epoch,
                                             current=self._epoch)
        while self._epoch < self._config.epoch:
            self.run_a_chkpt_iter()
            eta_time = eta.step(increase=self._config.epoch_chkpt)
            self._logger.info('Expected time to finish: %s', eta_time)

        # Evaluate performance on the evaluation set
        try:
            self.evaluate(is_development=False)
        except Exception as e:
            self._logger.error('Exception occurred!', exc_info=e)
            raise e
        finally:
            # Remove old checkpoints and close the Tensorboard writer
            self.rotate_checkpoint(1)

    def _update_grad(self):
        """ Update accumulated gradients """
        if self._config.gradient_clip > 0:
            # If a clipping threshold is set, then clip the gradient
            torch.nn.utils.clip_grad_norm_(self._module.parameters(),
                                           self._config.gradient_clip)
        if self._config.gradient_normalize:
            # If gradient normalization is set, then normalize the gradient
            _normalize_gradients(*self._module.parameters())

        # Apply optimizer & scheduler
        self._optimizer.step()
        if hasattr(self, '_scheduler'):
            self._scheduler.step()

        # Reset the gradient
        self._module.zero_grad()

    def _load_batch(self, batch: ProblemInstance, is_training=True,
                    max_len=0) -> dict:
        """
        Load a batch instance into a dictionary that can be fed into the model.

        :param ProblemInstance batch: A mini-batch
        :param bool is_training: True if this batch is used for training.
            True by default.
        :param int max_len: Maximum length of the equation to be generated.
            0 by default (i.e. depends on the current batch)
        :rtype: dict
        :return: Dictionary representing the mini-batch
        """
        # Prepare dictionary
        batch_dict = {
            'max_numbers': max(len(numbers)
                               for numbers in batch.text.number_value),
            IN_TXT: batch.text.token,
            IN_TPAD: batch.text.pad,
            IN_TNUM: batch.text.number
        }

        # Retrieve information about the target field
        required_field = _unwrap_parallel(self._module).required_field

        # Get the equation in terms of the target field
        equation = getattr(batch, required_field)
        if is_training:
            # If this is training, then directly provide the target equation
            # for teacher-forcing
            batch_dict[IN_EQN] = equation
        else:
            # Otherwise, just provide information about the maximum length of
            # generation & arity of operators
            batch_dict['max_len'] = max(equation.shape[-2], max_len) + 1
            if required_field.startswith('tuple'):
                batch_dict['function_arities'] = getattr(
                    self.evalset,
                    required_field + '_field').function_arities

        if not isinstance(self._module, DataParallel):
            # If we did not apply data parallel, move the values to the
            # main device (DataParallel scatters tensors by itself)
            batch_dict = {k: v.to(self.main_device)
                          if isinstance(v, torch.Tensor) else v
                          for k, v in batch_dict.items()}

        # The returned value is a dict.
        return batch_dict

    def _step(self, training: bool = True, **kwargs):
        """
        Execute the forward computation of the module

        :param bool training: True if this execution is for training.
            True by default.
        :param kwargs: Keyword arguments to execute the module.
        :return: Result of execution.
            - If training is True, the return value will be a dictionary
              mapping from string to accuracy/loss Tensors.
            - Otherwise, the return value will be a LongTensor indicating
              the generated tokens
        """
        result = self._module(**kwargs)
        if type(result) is dict and training:
            return {k: v.mean() if training else v
                    for k, v in result.items()}
        else:
            return result

    def _report_optimizer(self):
        """ Report the current state of the optimizer """
        # Classify parameters by their types
        param_type = {
            id(p): ('Enc' if 'text_model.' in n else 'Dec')
                   + ('Embed' if '_embedding' in n else 'Trans')
            for n, p in _unwrap_parallel(self._module).named_parameters()
        }

        # Dictionary for accumulating parameter information
        param_states = {
            key: {'weight_norm': [], 'acc_update': []}
            for key in set(param_type.values())
        }

        with torch.no_grad():
            # Without using gradients, accumulate information about weights
            # and their accumulated updates since initialization
            for gid, group in enumerate(self._optimizer.param_groups):
                for p in group['params']:
                    id_p = id(p)
                    states = param_states[param_type[id_p]]
                    w_init = self._module_init[id_p]
                    w_elem = p.numel()
                    w_norm = p.norm(2).item() / w_elem
                    delta_norm = ((w_init - p.clone().cpu()).norm(2).item()
                                  / w_elem)
                    states['weight_norm'].append(w_norm)
                    states['acc_update'].append(delta_norm)

        # Write the accumulated results
        if self._writer:
            for part, states in param_states.items():
                prefix = 'Optimizer_%s/%%s' % part
                for key, val in states.items():
                    if not len(val):
                        continue
                    # Track average & standard deviation
                    val = numpy.array(val)
                    self._writer.add_scalar(prefix % key, val.mean(),
                                            self._epoch)
                    self._writer.add_scalar(prefix % (key + '_std'),
                                            val.std(), self._epoch)

    def _check_equation(self, checker: AnswerChecker, outputs: torch.Tensor,
                        batch: ProblemInstance):
        """
        Verify whether the outputted equation is correct or not.

        :param AnswerChecker checker: AnswerChecker instance to compute the
            equation and check the answer
        :param torch.Tensor outputs: LongTensor containing generated equations.
            - If the model should generate op-tokens,
              Shape = [B, M, T], where B = batch size, M = beams, T = length
            - Otherwise, Shape = [B, M, T, 1+2A], where A = maximum arity.
        :param ProblemInstance batch: The mini-batch that produced `outputs`
        :return: list of (index, gold output, generated beams, correctness)
            tuples
        """
        # Retrieve size information
        batch_sz, beam_sz = outputs.shape[:2]

        # Get the target field information
        required_field = _unwrap_parallel(self._module).required_field

        # Retrieve the target field
        field = getattr(self.evalset, required_field + '_field')

        # Recover string representations of the gold set and generated beams
        golds = field.convert_ids_to_equations(getattr(batch, required_field))
        beams = [field.convert_ids_to_equations(outputs[i])
                 for i in range(batch_sz)]

        outputs = []
        for i in range(batch_sz):
            # For each batch item, retrieve information about written numbers
            # and expected answer tuples
            numbers = batch.text.number_value[i]
            expected = batch.expected[i]

            # Test whether the produced equation in each beam is correct
            results = [checker.check(beam, numbers, expected)
                       for beam in beams[i]]

            # Record outputs: (index, gold output, generated output, correctness)
            outputs.append((i, golds[i], beams[i], results))

        return outputs

    def evaluate(self, is_development: bool = True):
        """
        Evaluate the current model.

        :param bool is_development: True if the current evaluation is done on
            the development set. True by default.
        """
        # Shortcut for beam size
        beam_size = self._config.model.beam_size

        # Accumulator for output
        accumulator = []

        # Define log storage for information
        set_type = 'Dev' if is_development else 'Test'
        errored_path = Path(self._chkpt_path,
                            'error_sample_%s.log' % set_type)
        correct_path = Path(self._chkpt_path,
                            'correct_sample_%s.log' % set_type)
        result_path = Path(self._chkpt_path, 'results.csv')
        # Check whether we should write the header or not.
first_result_output = not result_path.exists() # Open file handlers errored_fp = errored_path.open('w+t', encoding='UTF-8') correct_fp = correct_path.open('w+t', encoding='UTF-8') result_fp = result_path.open('a+t', encoding='UTF-8') # Set module as evaluation phase self._module.eval() # Load dataset dataset = self.devset if is_development else self.evalset max_len = 0 if is_development else MEM_MAX for batch in dataset: # For each batch item, load it and produce outputs kwargs = self._load_batch(batch, is_training=False, max_len=max_len) outputs = self._step(**kwargs, training=False, beam=beam_size) # Convert text into string (for printing purpose) texts = dataset.problem_field.convert_ids_to_string( batch.text.token) # Check the result and print the result for each item. for i, gold, beams, results in self._check_equation( self._answer_checker, outputs, batch): # Record the best output of the beam search results result_dict = { 'Index': batch.index[i], 'Error': str(type(results[0][2])), 'correct': results[0][0], 'error_1_Parse': results[0][2] is not None, 'error_2_Empty': len(results[0][1]) == 0 and results[0][2] is None, 'error_3_Match': not results[0][0] and len(results[0][1]) > 0 and results[0][2] is None, 'correct_in_beam': any(r[0] for r in results) } # Accumulate the test result. accumulator.append(result_dict) # Select appropriate file handler fp = errored_fp if not result_dict['correct'] else correct_fp # Write problem & result fp.writelines([ '[Q] ', batch.index[i], '\n', texts[i], '\n', '---------------------------------------\n', '[EXPECTED]\t%s\n' % ' '.join(gold), '---ANSWER:\t%s\n' % batch.expected[i], '---------------------------------------\n' ]) fp.writelines([ '[BEAM#%3d]\t%s\n' '---ANSWER:\t%s\n%s' % (b, ' '.join(beam), res[1], '' if res[2] is None else '----ERROR:\t%s %s\n' % (type(res[2]), str(res[2]))) for b, (beam, res) in enumerate(zip(beams, results)) ]) fp.write('\n') # Close file handlers errored_fp.close() correct_fp.close() # Write CSV results sorted_keys = sorted(accumulator[0].keys()) # Write CSV header if first_result_output: _write_csv_line(result_fp, 'Set', 'GlobalStep', 'Beam', *sorted_keys) # Write CSV results for values in accumulator: _write_csv_line(result_fp, set_type, self._epoch, beam_size, *[values[key] for key in sorted_keys]) # Close CSV handler result_fp.close() # Average metric across items (correctness & errors) metric_dict = {} for key in sorted_keys: value = [item[key] for item in accumulator] if type(value[0]) is not str: average = sum(value) / len(value) # Write accumulated results self._logger.info('Evaluating on %s (beam %s): %s = %.6f', set_type, beam_size, key, average) metric_dict[set_type + '/' + key] = average # Reset the dataset (since dataset reached EOF) dataset.reset() # Write exponential moving average & maximum value into metric dict if is_development: self._best_on_dev = max(self._best_on_dev, metric_dict['Dev/correct']) if self._ema_on_dev is None: self._ema_on_dev = metric_dict['Dev/correct'] else: self._ema_on_dev = metric_dict[ 'Dev/correct'] * 0.6 + self._ema_on_dev * 0.4 metric_dict['Dev/correct_max'] = self._best_on_dev metric_dict['Dev/correct_ema'] = self._ema_on_dev # Record last output self._last_performances[set_type] = [ item['correct'] for item in sorted(accumulator, key=lambda d: d['Index']) ] self._last_metrics.update(metric_dict)
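
# NOTE (added sketch): the parameter-grouping logic above exempts bias/norm/embedding
# weights from weight decay and applies a separate learning rate to the text encoder.
# Below is a minimal, self-contained sketch of the same pattern for a plain PyTorch
# optimizer; the module name markers and hyperparameter values are illustrative
# assumptions, not part of the original trainer.
import torch
from torch import nn

def build_param_groups(module: nn.Module, base_lr=1e-4, encoder_lr=1e-5, weight_decay=0.01):
    no_decay_markers = ('bias', 'norm', 'Norm', '_embedding')
    groups = []
    for name, param in module.named_parameters():
        group = {'params': [param]}
        if any(marker in name for marker in no_decay_markers):
            group['weight_decay'] = 0.0          # no decay for bias/norm/embedding weights
        else:
            group['weight_decay'] = weight_decay
        group['lr'] = encoder_lr if 'text_model' in name else base_lr
        groups.append(group)
    return groups

# Usage: optimizer = torch.optim.AdamW(build_param_groups(model))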
def train(args):
    # gpu init
    multi_gpus = False
    best_lfw_acc = 0.0
    best_lfw_iters = 0
    best_agedb30_acc = 0.0
    best_agedb30_iters = 0
    best_cfp_fp_acc = 0.0
    best_cfp_fp_iters = 0
    if len(args.gpus.split(',')) > 1:
        multi_gpus = True
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # log init
    save_dir = os.path.join(
        args.save_dir,
        args.backbone.upper() + datetime.now().date().strftime('%Y%m%d'))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logging = init_log(save_dir)
    _print = logging.info

    # define backbone and margin layer
    if args.backbone == 'MobileFace':
        net = MobileFaceNet(512).to(config.device)
    elif args.backbone == 'MNasMobile':
        net = MnasNet(512).to(config.device)
    elif args.backbone == 'ProxyNas':
        net = ProxyNas(512).to(config.device)
    elif args.backbone == 'SERes50_IR':
        net = SE_IR(50, 0.6, 'ir_se').to(config.device)
    elif args.backbone == 'IR_50':
        net = SE_IR(50, 0.6, 'ir').to(config.device)
    else:
        raise ValueError('{} is not available!'.format(args.backbone))

    summary(net.to(config.device), (3, 112, 112))

    # define transform
    if args.backbone == 'ProxyNas':
        transform = transforms.Compose([
            # was Resize(112, 112): the second positional argument is interpolation,
            # so the size must be passed as a tuple
            transforms.Resize((112, 112)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
    else:
        transform = transforms.Compose([
            transforms.Resize((112, 112)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),  # range [0, 255] -> [0.0, 1.0]
            transforms.Normalize(mean=(0.5, 0.5, 0.5),
                                 std=(0.5, 0.5, 0.5))  # range [0.0, 1.0] -> [-1.0, 1.0]
        ])

    # training dataset
    trainset = VGG_FP(config=config, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=config.batch_size,
                                              shuffle=True,
                                              num_workers=8,
                                              drop_last=False)
    num_iter = len(trainset) // config.batch_size
    numclass = trainset.class_nums

    # validation datasets
    if args.has_test:
        lfwdataset = LFW(config=config, transform=transform)
        lfwloader = torch.utils.data.DataLoader(lfwdataset,
                                                batch_size=config.batch_size,
                                                shuffle=False,
                                                num_workers=8,
                                                drop_last=False)
        agedbdataset = AgeDB30(config=config, transform=transform)
        agedbloader = torch.utils.data.DataLoader(agedbdataset,
                                                  batch_size=config.batch_size,
                                                  shuffle=False,
                                                  num_workers=8,
                                                  drop_last=False)
        cfpfpdataset = CFP_FP(config=config, transform=transform)
        cfpfploader = torch.utils.data.DataLoader(cfpfpdataset,
                                                  batch_size=config.batch_size,
                                                  shuffle=False,
                                                  num_workers=8,
                                                  drop_last=False)

    if args.margin_type == 'ArcFace':
        margin = ArcMarginProduct(512, numclass, s=args.scale_size)
    elif args.margin_type == 'CosFace':
        pass  # TODO: CosFace margin is not implemented here
    elif args.margin_type == 'SphereFace':
        pass  # TODO: SphereFace margin is not implemented here
    else:
        raise ValueError('{} is not available!'.format(args.margin_type))

    if args.resume:
        print('resume the model parameters from: ', args.net_path, args.margin_path)
        net.load_state_dict(torch.load(args.net_path)['net_state_dict'])
        margin.load_state_dict(torch.load(args.margin_path)['net_state_dict'])

    # define the loss and one optimizer over both backbone and margin parameters
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer_ft = optim.SGD([{
        'params': net.parameters(),
        'weight_decay': 5e-4
    }, {
        'params': margin.parameters(),
        'weight_decay': 5e-4
    }], lr=0.001, momentum=0.9, nesterov=True)
    exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer_ft,
                                                milestones=config.milestones,
                                                gamma=0.1)

    if multi_gpus:
        net = DataParallel(net).to(device)
        margin = DataParallel(margin).to(device)
    else:
        net = net.to(device)
        margin = margin.to(device)

    total_iters = 1
    vis = Visualizer(env=args.backbone)
    if args.resume:
        total_iters = args.resume
        with open('result/log_vis_train.txt', 'r') as fw:
            for line in fw.readlines():
                nodes = line.split(':')
                vis.plot_curves({'softmax loss': float(nodes[1])},
                                iters=float(nodes[0]),
                                title='train loss',
                                xlabel='iters',
                                ylabel='train loss')
                vis.plot_curves({'train accuracy': float(nodes[2])},
                                iters=float(nodes[0]),
                                title='train accuracy',
                                xlabel='iters',
                                ylabel='train accuracy')
        with open('result/log_vis_test.txt', 'r') as fw2:
            for line in fw2.readlines():
                nodes = line.split(':')
                vis.plot_curves(
                    {
                        'lfw': float(nodes[1]),
                        'agedb-30': float(nodes[2]),
                        'cfp-fp': float(nodes[3])
                    },
                    iters=float(nodes[0]),
                    title='test accuracy',
                    xlabel='iters',
                    ylabel='test accuracy')
    # compute the resume epoch after total_iters is known
    # (the original computed it before the resume branch, so it was always 0)
    start_epoch = total_iters // num_iter

    for epoch in range(1, args.total_epoch + 1):
        if epoch < start_epoch:
            continue

        # train model
        _print('Train Epoch: {}/{} ...'.format(epoch, args.total_epoch))
        net.train()
        log_vis_train = open('result/log_vis_train.txt', 'a')
        log_vis_test = open('result/log_vis_test.txt', 'a')
        since = time.time()
        for data in trainloader:
            img, label = data[0].to(device), data[1].to(device)
            optimizer_ft.zero_grad()
            raw_logits = net(img)
            output = margin(raw_logits, label)
            total_loss = criterion(output, label)
            total_loss.backward()
            optimizer_ft.step()

            # print train information
            if total_iters % 200 == 0:
                # current training accuracy
                _, predict = torch.max(output.data, 1)
                total = label.size(0)
                # compare on-device: np.array() on CUDA tensors raises an error
                correct = (predict == label.data).sum().item()
                # average s/iter over the 200-iteration logging window (was / 100)
                time_cur = (time.time() - since) / 200
                since = time.time()
                vis.plot_curves({'softmax loss': total_loss.item()},
                                iters=total_iters,
                                title='train loss',
                                xlabel='iters',
                                ylabel='train loss')
                vis.plot_curves({'train accuracy': correct / total},
                                iters=total_iters,
                                title='train accuracy',
                                xlabel='iters',
                                ylabel='train accuracy')
                log_vis_train.write("%d:%f:%f\n" %
                                    (total_iters, total_loss.item(), correct / total))
                print(
                    "Iters: {:0>6d}/[{:0>2d}], loss: {:.4f}, train_accuracy: {:.4f}, "
                    "time: {:.2f} s/iter, learning rate: {}".format(
                        total_iters, epoch, total_loss.item(), correct / total,
                        time_cur, exp_lr_scheduler.get_lr()[0]))

            # save model
            if total_iters % args.save_freq == 0:
                msg = 'Saving checkpoint: {}'.format(total_iters)
                _print(msg)
                if multi_gpus:
                    net_state_dict = net.module.state_dict()
                    margin_state_dict = margin.module.state_dict()
                else:
                    net_state_dict = net.state_dict()
                    margin_state_dict = margin.state_dict()
                if not os.path.exists(save_dir):
                    os.mkdir(save_dir)
                torch.save(
                    {'iters': total_iters, 'net_state_dict': net_state_dict},
                    os.path.join(save_dir, 'Iter_%06d_net.ckpt' % total_iters))
                torch.save(
                    {'iters': total_iters, 'net_state_dict': margin_state_dict},
                    os.path.join(save_dir, 'Iter_%06d_margin.ckpt' % total_iters))

            # test accuracy
            if total_iters % args.test_freq == 0 and args.has_test:
                # test model on LFW
                net.eval()
                getFeatureFromTorch('./result/cur_lfw_result.mat', net, device,
                                    lfwdataset, lfwloader)
                lfw_accs = evaluation_10_fold('./result/cur_lfw_result.mat')
                _print('LFW Ave Accuracy: {:.4f}'.format(np.mean(lfw_accs) * 100))
                if best_lfw_acc <= np.mean(lfw_accs) * 100:
                    best_lfw_acc = np.mean(lfw_accs) * 100
                    best_lfw_iters = total_iters

                # test model on AgeDB-30
                getFeatureFromTorch('./result/cur_agedb30_result.mat', net, device,
                                    agedbdataset, agedbloader)
                age_accs = evaluation_10_fold('./result/cur_agedb30_result.mat')
                _print('AgeDB-30 Ave Accuracy: {:.4f}'.format(np.mean(age_accs) * 100))
                if best_agedb30_acc <= np.mean(age_accs) * 100:
                    best_agedb30_acc = np.mean(age_accs) * 100
                    best_agedb30_iters = total_iters

                # test model on CFP-FP
                getFeatureFromTorch('./result/cur_cfpfp_result.mat', net, device,
                                    cfpfpdataset, cfpfploader)
                cfp_accs = evaluation_10_fold('./result/cur_cfpfp_result.mat')
                _print('CFP-FP Ave Accuracy: {:.4f}'.format(np.mean(cfp_accs) * 100))
                if best_cfp_fp_acc <= np.mean(cfp_accs) * 100:
                    best_cfp_fp_acc = np.mean(cfp_accs) * 100
                    best_cfp_fp_iters = total_iters

                _print(
                    'Current Best Accuracy: LFW: {:.4f} in iters: {}, '
                    'AgeDB-30: {:.4f} in iters: {} and CFP-FP: {:.4f} in iters: {}'.format(
                        best_lfw_acc, best_lfw_iters, best_agedb30_acc,
                        best_agedb30_iters, best_cfp_fp_acc, best_cfp_fp_iters))
                vis.plot_curves(
                    {
                        'lfw': np.mean(lfw_accs),
                        'agedb-30': np.mean(age_accs),
                        'cfp-fp': np.mean(cfp_accs)
                    },
                    iters=total_iters,
                    title='test accuracy',
                    xlabel='iters',
                    ylabel='test accuracy')
                log_vis_test.write('%d:%f:%f:%f\n' %
                                   (total_iters, np.mean(lfw_accs),
                                    np.mean(cfp_accs), np.mean(age_accs)))
                net.train()
            total_iters += 1

        # step the LR schedule after the epoch's optimizer updates
        # (PyTorch >= 1.1 expects scheduler.step() after optimizer.step())
        exp_lr_scheduler.step()
        log_vis_train.close()
        log_vis_test.close()

    _print(
        'Finally Best Accuracy: LFW: {:.4f} in iters: {}, AgeDB-30: {:.4f} in iters: {} '
        'and CFP-FP: {:.4f} in iters: {}'.format(
            best_lfw_acc, best_lfw_iters, best_agedb30_acc, best_agedb30_iters,
            best_cfp_fp_acc, best_cfp_fp_iters))
    print('finishing training')
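
# NOTE (added sketch): the checkpointing branch above unwraps `.module` only when
# multi_gpus is set. A guard that handles both the wrapped and unwrapped case
# (illustrative helper, not part of the original script):
def unwrapped_state_dict(model):
    return model.module.state_dict() if hasattr(model, 'module') else model.state_dict()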
trainLoader = DataLoader(data_train, batch_size=64, shuffle=True, num_workers=6)
# valLoader = DataLoader(data_val, batch_size=16, shuffle=False, num_workers=6)

dataset_train_len = len(data_train)
# dataset_val_len = len(data_val)

# densenet = models.densenet121(num_classes=14)
densenet = models.densenet121(pretrained=True)
densenet.classifier = nn.Linear(1024, 14)
# densenet = pickle.load(open('../../../../media/data/yangliu/xrays/our_trained_densenet_epoch_14.pkl', 'rb'))
densenet = densenet.cuda()
densenet = DataParallel(densenet)
# with open(weight_dir + 'densenet_epoch_15.pkl', 'rb') as f:
#     densenet = pickle.load(f)

parameter = 0
for param in densenet.parameters():
    parameter += param.data.nelement()
print('Total parameters: {}'.format(parameter))  # counts all parameters, trainable or not

optimizer = optim.Adam(densenet.parameters(), lr=1e-3, betas=(0.9, 0.999),
                       eps=1e-8, weight_decay=0)
model_ft = train_model(densenet, optimizer, num_epochs=100)
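
# NOTE (added sketch): the count above includes frozen parameters as well; counting
# only trainable ones is a common variant (minimal example reusing the same model):
def count_trainable(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Usage: print(count_trainable(densenet))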
def main(args):
    print(f"\nModel: {args.arch}")

    # Set the N & M
    m.N = args.N
    m.M = args.M

    # Select the hardware device to use for inference.
    if torch.cuda.is_available():
        device = torch.device('cuda', torch.cuda.current_device())
        torch.backends.cudnn.benchmark = True
    else:
        device = torch.device('cpu')

    # Disable gradient calculations by default.
    torch.set_grad_enabled(False)

    # create checkpoint dir
    os.makedirs(args.checkpoint, exist_ok=True)

    if args.arch == 'hg1':
        model = hg1(pretrained=False)
    elif args.arch == 'hg2':
        model = hg2(pretrained=False)
    elif args.arch == 'hg3':
        model = hg3(pretrained=False)
    elif args.arch == 'hg4':
        model = hg4(pretrained=False)
    elif args.arch == 'hg5':
        model = hg5(pretrained=False)
    elif args.arch == 'hg6':
        model = hg6(pretrained=False)
    elif args.arch == 'hg7':
        model = hg7(pretrained=False)
    elif args.arch == 'hg8':
        model = hg8(pretrained=False)
    else:
        raise Exception('unrecognised model architecture: ' + args.arch)

    model = DataParallel(model).to(device)

    optimizer = RMSprop(model.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay)
    best_acc = 0

    # optionally resume from a checkpoint
    if args.resume:
        assert os.path.isfile(args.resume)
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        args.start_epoch = checkpoint['epoch']
        best_acc = checkpoint['best_acc']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'LR', 'Train Loss', 'Val Loss', 'Train Acc', 'Val Acc'])

    # create data loaders
    train_dataset = Mpii(args.image_path, is_train=True, inp_res=args.input_shape)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.train_batch,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)
    val_dataset = Mpii(args.image_path, is_train=False, inp_res=args.input_shape)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.test_batch,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)

    # train and eval
    lr = args.lr
    epoch_times = []
    end = time.time()
    f = open(f"{args.checkpoint}/epoch_times.txt", 'w')
    for epoch in trange(args.start_epoch, args.epochs, desc='Overall', ascii=True):
        start = end
        lr = adjust_learning_rate(optimizer, epoch, lr, args.schedule, args.gamma)

        # train for one epoch
        train_loss, train_acc = do_training_epoch(train_loader, model, device,
                                                  Mpii.DATA_INFO, optimizer,
                                                  acc_joints=Mpii.ACC_JOINTS)

        # evaluate on validation set
        valid_loss, valid_acc, predictions = do_validation_epoch(
            val_loader, model, device, Mpii.DATA_INFO, False,
            acc_joints=Mpii.ACC_JOINTS)

        # print metrics
        tqdm.write(
            f'[{epoch + 1:3d}/{args.epochs:3d}] lr={lr:0.2e} '
            f'train_loss={train_loss:0.4f} train_acc={100 * train_acc:0.2f} '
            f'valid_loss={valid_loss:0.4f} valid_acc={100 * valid_acc:0.2f}')

        # append logger file
        logger.append([epoch + 1, lr, train_loss, valid_loss, train_acc, valid_acc])
        logger.plot_to_file(os.path.join(args.checkpoint, 'log.svg'),
                            ['Train Acc', 'Val Acc'])

        # remember best acc and save checkpoint
        is_best = valid_acc > best_acc
        best_acc = max(valid_acc, best_acc)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            },
            predictions,
            is_best,
            checkpoint=args.checkpoint,
            snapshot=args.snapshot)

        end = time.time()
        epoch_times.append(end - start)
        print(f"Average Epoch Time After Epoch {epoch}: "
              f"{sum(epoch_times) / len(epoch_times)} sec", file=f)

    f.close()
    logger.close()
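
# NOTE (added sketch): adjust_learning_rate above is not shown in this excerpt. A
# common implementation decays the LR by `gamma` at each epoch listed in `schedule`;
# this is an assumption about its behavior, not the verified original:
def adjust_learning_rate(optimizer, epoch, lr, schedule, gamma):
    if epoch in schedule:
        lr *= gamma
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
    return lr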
    return acc


if __name__ == '__main__':
    opt = Config()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Get the relative paths of all test images
    identity_list = get_lfw_list(opt.lfw_test_list)
    # Build the absolute paths of the images
    img_paths = [os.path.join(opt.lfw_root, each) for each in identity_list]

    if opt.backbone == 'resnet18':
        model = resnet_face18(opt.use_se)
    elif opt.backbone == 'resnet34':
        model = resnet34()
    elif opt.backbone == 'resnet50':
        model = resnet50()

    # You can easily run your operations on multiple GPUs by making your model
    # run in parallel using DataParallel:
    # https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
    model = DataParallel(model)
    # load_model(model, opt.test_model_path)
    model.load_state_dict(torch.load(opt.test_model_path, map_location=device))
    model.to(device)
    model.eval()
    lfw_test(model, img_paths, identity_list, opt.lfw_test_list, opt.test_batch_size)
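
# NOTE (added sketch): checkpoints saved from a DataParallel model carry a 'module.'
# prefix on every key, which is why the model above is wrapped before loading. A
# minimal loader that works whether or not the current model is wrapped
# (illustrative helper, not part of the original script):
import torch

def load_flexible(model, ckpt_path, device='cpu'):
    state = torch.load(ckpt_path, map_location=device)
    wants_prefix = next(iter(model.state_dict())).startswith('module.')
    has_prefix = next(iter(state)).startswith('module.')
    if has_prefix and not wants_prefix:
        state = {k[len('module.'):]: v for k, v in state.items()}
    elif wants_prefix and not has_prefix:
        state = {'module.' + k: v for k, v in state.items()}
    model.load_state_dict(state)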
def main():
    parser = argparse.ArgumentParser("PyTorch Xview Pipeline")
    arg = parser.add_argument
    arg('--config', metavar='CONFIG_FILE', help='path to configuration file')
    arg('--workers', type=int, default=6, help='number of cpu threads to use')
    arg('--gpu', type=str, default='0', help='list of GPUs for parallel training, e.g. 0,1,2,3')
    arg('--output-dir', type=str, default='weights/')
    arg('--resume', type=str, default='')
    arg('--fold', type=int, default=0)
    arg('--prefix', type=str, default='classifier_')
    arg('--data-dir', type=str, default="/mnt/sota/datasets/deepfake")
    arg('--folds-csv', type=str, default='folds.csv')
    arg('--crops-dir', type=str, default='crops')
    arg('--label-smoothing', type=float, default=0.01)
    arg('--logdir', type=str, default='logs')
    arg('--zero-score', action='store_true', default=False)
    arg('--from-zero', action='store_true', default=False)
    arg('--distributed', action='store_true', default=False)
    arg('--freeze-epochs', type=int, default=0)
    arg("--local_rank", default=0, type=int)
    arg("--seed", default=777, type=int)
    arg("--padding-part", default=3, type=int)
    arg("--opt-level", default='O1', type=str)
    arg("--test_every", type=int, default=1)
    arg("--no-oversample", action="store_true")
    arg("--no-hardcore", action="store_true")
    arg("--only-changed-frames", action="store_true")
    args = parser.parse_args()
    os.makedirs(args.output_dir, exist_ok=True)

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    else:
        os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    cudnn.benchmark = True
    conf = load_config(args.config)
    model = classifiers.__dict__[conf['network']](encoder=conf['encoder'])
    model = model.cuda()
    if args.distributed:
        model = convert_syncbn_model(model)

    ohem = conf.get("ohem_samples", None)
    reduction = "mean"
    if ohem:
        reduction = "none"
    loss_fn = []
    weights = []
    for loss_name, weight in conf["losses"].items():
        loss_fn.append(losses.__dict__[loss_name](reduction=reduction).cuda())
        weights.append(weight)
    loss = WeightedLosses(loss_fn, weights)
    loss_functions = {"classifier_loss": loss}
    optimizer, scheduler = create_optimizer(conf['optimizer'], model)
    bce_best = 100
    start_epoch = 0
    batch_size = conf['optimizer']['batch_size']

    data_train = DeepFakeClassifierDataset(
        mode="train",
        oversample_real=not args.no_oversample,
        fold=args.fold,
        padding_part=args.padding_part,
        hardcore=not args.no_hardcore,
        crops_dir=args.crops_dir,
        data_path=args.data_dir,
        label_smoothing=args.label_smoothing,
        folds_csv=args.folds_csv,
        transforms=create_train_transforms(conf["size"]),
        normalize=conf.get("normalize", None))
    data_val = DeepFakeClassifierDataset(
        mode="val",
        fold=args.fold,
        padding_part=args.padding_part,
        crops_dir=args.crops_dir,
        data_path=args.data_dir,
        folds_csv=args.folds_csv,
        transforms=create_val_transforms(conf["size"]),
        normalize=conf.get("normalize", None))
    val_data_loader = DataLoader(data_val,
                                 batch_size=batch_size * 2,
                                 num_workers=args.workers,
                                 shuffle=False,
                                 pin_memory=False)
    os.makedirs(args.logdir, exist_ok=True)
    summary_writer = SummaryWriter(args.logdir + '/' +
                                   conf.get("prefix", args.prefix) +
                                   conf['encoder'] + "_" + str(args.fold))

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            state_dict = checkpoint['state_dict']
            # strip the 'module.' prefix left by (Distributed)DataParallel
            state_dict = {k[7:]: w for k, w in state_dict.items()}
            model.load_state_dict(state_dict, strict=False)
            if not args.from_zero:
                start_epoch = checkpoint['epoch']
                if not args.zero_score:
                    bce_best = checkpoint.get('bce_best', 0)
            print("=> loaded checkpoint '{}' (epoch {}, bce_best {})".format(
                args.resume, checkpoint['epoch'], checkpoint['bce_best']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    if args.from_zero:
        start_epoch = 0
    current_epoch = start_epoch

    if conf['fp16']:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.opt_level,
                                          loss_scale='dynamic')
    snapshot_name = "{}{}_{}_{}".format(conf.get("prefix", args.prefix),
                                        conf['network'], conf['encoder'], args.fold)
    if args.distributed:
        model = DistributedDataParallel(model, delay_allreduce=True)
    else:
        model = DataParallel(model).cuda()

    # register a hook on each encoder block, in order to extract its feature maps;
    # go through .module because the model is now wrapped in (Distributed)DataParallel
    for name, block in model.module.encoder.blocks.named_children():
        block.register_forward_hook(hook_function)

    data_val.reset(1, args.seed)
    max_epochs = conf['optimizer']['schedule']['epochs']
    for epoch in range(start_epoch, max_epochs):
        data_train.reset(epoch, args.seed)
        train_sampler = None
        if args.distributed:
            train_sampler = torch.utils.data.distributed.DistributedSampler(data_train)
            train_sampler.set_epoch(epoch)
        if epoch < args.freeze_epochs:
            print("Freezing encoder!!!")
            model.module.encoder.eval()
            for p in model.module.encoder.parameters():
                p.requires_grad = False
        else:
            model.module.encoder.train()
            for p in model.module.encoder.parameters():
                p.requires_grad = True

        train_data_loader = DataLoader(data_train,
                                       batch_size=batch_size,
                                       num_workers=args.workers,
                                       shuffle=train_sampler is None,
                                       sampler=train_sampler,
                                       pin_memory=False,
                                       drop_last=True)

        train_epoch(current_epoch, loss_functions, model, optimizer, scheduler,
                    train_data_loader, summary_writer, conf, args.local_rank,
                    args.only_changed_frames)
        model = model.eval()

        if args.local_rank == 0:
            torch.save(
                {
                    'epoch': current_epoch + 1,
                    'state_dict': model.state_dict(),
                    'bce_best': bce_best,
                }, os.path.join(args.output_dir, snapshot_name + "_last"))
            # join the path properly (the original concatenated without a separator here)
            torch.save(
                {
                    'epoch': current_epoch + 1,
                    'state_dict': model.state_dict(),
                    'bce_best': bce_best,
                }, os.path.join(args.output_dir,
                                snapshot_name + "_{}".format(current_epoch)))
            if (epoch + 1) % args.test_every == 0:
                bce_best = evaluate_val(args,
                                        val_data_loader,
                                        bce_best,
                                        model,
                                        snapshot_name=snapshot_name,
                                        current_epoch=current_epoch,
                                        summary_writer=summary_writer)
        current_epoch += 1
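
# NOTE (added sketch): hook_function is not defined in this excerpt. A typical
# implementation stores each block's output feature map for later inspection;
# the storage list here is an illustrative assumption:
feature_maps = []

def hook_function(module, inputs, output):
    # Called after the block's forward pass; detach to avoid holding the graph.
    feature_maps.append(output.detach())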
def main():
    if raw:
        build_files(data_path=RAW_DATA_PATH)
        return  # tokenization-only run; nothing left to do

    model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    MULTI_GPU = False
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        MULTI_GPU = True
    model.to(device)

    total_lines = 0
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            total_lines += len(f.readlines())
    total_steps = int(total_lines * EPOCHS / BATCH_SIZE)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=LR, correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer,
                                                          warmup_steps=WARMUP_STEPS,
                                                          t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    print('starting training')
    for epoch in range(EPOCHS):
        print('epoch {}'.format(epoch))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                running_loss = 0
                sub_lines = f.readlines()
                sub_lines = [line.split()[:n_ctx] for line in sub_lines]
                random.shuffle(sub_lines)
                for step in range(len(sub_lines) // BATCH_SIZE):
                    batch = sub_lines[step * BATCH_SIZE:(step + 1) * BATCH_SIZE]
                    batch_labels = []
                    batch_inputs = []
                    for ids in batch:
                        int_ids_for_labels = [int(x) for x in ids]
                        int_ids_for_inputs = [int(x) for x in ids]
                        batch_labels.append(int_ids_for_labels)
                        batch_inputs.append(int_ids_for_inputs)
                    batch_labels = torch.tensor(batch_labels).long().to(device)
                    batch_inputs = torch.tensor(batch_inputs).long().to(device)

                    optimizer.zero_grad()
                    outputs = model(input_ids=batch_inputs, labels=batch_labels)
                    loss, logits = outputs[:2]
                    if MULTI_GPU:
                        loss = loss.mean()
                    if fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                                       max_grad_norm)
                    else:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                    running_loss += loss.item()
                    # step the optimizer first, then the LR schedule (the original
                    # called scheduler.step() before optimizer.step(), which shifts
                    # the warmup schedule by one step)
                    optimizer.step()
                    scheduler.step()
                    if (step + 1) % LOG_STEP == 0:
                        print('step {} of piece {} of epoch {}, loss {}'.format(
                            step + 1, piece_num, epoch + 1, running_loss / LOG_STEP))
                        running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch))
        os.makedirs('./model/model_epoch{}'.format(epoch + 1), exist_ok=True)
        # unwrap DataParallel before saving; save_pretrained lives on the raw model
        model_to_save = model.module if MULTI_GPU else model
        model_to_save.save_pretrained('./model/model_epoch{}'.format(epoch + 1))
        torch.save(scheduler.state_dict(),
                   './model/model_epoch{}/scheduler.pt'.format(epoch + 1))
        torch.save(optimizer.state_dict(),
                   './model/model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    os.makedirs('./model/final_model', exist_ok=True)
    model_to_save = model.module if MULTI_GPU else model
    model_to_save.save_pretrained('./model/final_model')
    torch.save(scheduler.state_dict(), './model/final_model/scheduler.pt')
    torch.save(optimizer.state_dict(), './model/final_model/optimizer.pt')
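
# NOTE (added sketch): WarmupLinearSchedule ramps the LR linearly from 0 over
# `warmup_steps` optimizer steps, then decays it linearly to 0 at `t_total`.
# Equivalent LR multiplier for reference (illustrative, intended to mirror the
# library's behavior rather than replace it):
def warmup_linear_lambda(step, warmup_steps, t_total):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))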
base_params = filter(lambda p: id(p) not in ignored_params, net.parameters())
optimizer_ft = optim.SGD([
    {'params': base_params, 'weight_decay': 4e-5},
    {'params': net.linear1.parameters(), 'weight_decay': 4e-4},
    {'params': ArcMargin.weight, 'weight_decay': 4e-4},
    {'params': prelu_params, 'weight_decay': 0.0}
], lr=0.1, momentum=0.9, nesterov=True)
exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer_ft,
                                            milestones=[36, 52, 58],
                                            gamma=0.1)

net = net.cuda()
ArcMargin = ArcMargin.cuda()
if multi_gpus:
    net = DataParallel(net)
    ArcMargin = DataParallel(ArcMargin)
criterion = torch.nn.CrossEntropyLoss()

best_acc = 0.0
best_epoch = 0
for epoch in range(start_epoch, TOTAL_EPOCH + 1):
    exp_lr_scheduler.step()

    # train model
    _print('Train Epoch: {}/{} ...'.format(epoch, TOTAL_EPOCH))
    net.train()

    train_total_loss = 0.0
    total = 0
    since = time.time()
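
# NOTE (added sketch): `ignored_params` and `prelu_params` come from code above this
# excerpt. A typical construction separates them by parameter identity; this is an
# illustrative reconstruction, not the verified original:
import torch.nn as nn

def split_params(net, arc_margin):
    prelu = [p for m in net.modules() if isinstance(m, nn.PReLU)
             for p in m.parameters()]
    ignored = list(map(id, net.linear1.parameters())) + [id(arc_margin.weight)]
    ignored += list(map(id, prelu))
    return ignored, prelu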
def make_network(configs):
    PoseNet = importNet(configs['network'])
    train_cfg = configs['train']
    config = configs['inference']

    poseNet = PoseNet(**config)
    forward_net = DataParallel(poseNet.cuda())

    def calc_loss(*args, **kwargs):
        return poseNet.calc_loss(*args, **kwargs)

    config['net'] = Trainer(forward_net, configs['inference']['keys'], calc_loss)
    train_cfg['optimizer'] = torch.optim.Adam(config['net'].parameters(),
                                              train_cfg['learning_rate'])

    exp_path = os.path.join('exp', configs['opt'].exp)
    if not os.path.exists(exp_path):
        os.mkdir(exp_path)
    logger = open(os.path.join(exp_path, 'log'), 'a+')

    def make_train(batch_id, config, phase, **inputs):
        for i in inputs:
            inputs[i] = make_input(inputs[i])

        net = config['inference']['net']
        config['batch_id'] = batch_id

        if phase != 'inference':
            result = net(inputs['imgs'],
                         **{i: inputs[i] for i in inputs if i != 'imgs'})
            num_loss = len(config['train']['loss'])

            # The last `num_loss` outputs are used as the losses; their weights
            # are controlled by config['train']['loss'].
            losses = {
                i[0]: result[-num_loss + idx] * i[1]
                for idx, i in enumerate(config['train']['loss'])
            }

            loss = 0
            toprint = '\n{}: '.format(batch_id)
            for i in losses:
                loss = loss + torch.mean(losses[i])
                my_loss = make_output(losses[i])
                my_loss = my_loss.mean(axis=0)
                if my_loss.size == 1:
                    toprint += ' {}: {}'.format(i, format(my_loss.mean(), '.8f'))
                else:
                    toprint += '\n{}'.format(i)
                    for j in my_loss:
                        toprint += ' {}'.format(format(j.mean(), '.8f'))
            logger.write(toprint)
            logger.flush()

            # Fetch the optimizer before any use (the original only assigned it
            # after referencing it in the LR-decay branch, which would raise an
            # UnboundLocalError at batch 200000).
            optimizer = train_cfg['optimizer']
            if batch_id == 200000:
                # decrease the learning rate after 200000 iterations
                for param_group in optimizer.param_groups:
                    param_group['lr'] = 1e-5

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            return None
        else:
            out = {}
            net = net.eval()
            result = net(**inputs)
            if type(result) != list and type(result) != tuple:
                result = [result]
            out['preds'] = [make_output(i) for i in result]
            return out

    return make_train
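
# NOTE (added sketch): the loss block above takes the last `num_loss` network outputs
# and scales each by its configured weight. Equivalent standalone logic, with
# illustrative names:
def combine_losses(outputs, loss_spec):
    # loss_spec: list of (name, weight) pairs; outputs: network outputs, losses last
    num_loss = len(loss_spec)
    total, named = 0.0, {}
    for idx, (name, weight) in enumerate(loss_spec):
        named[name] = outputs[-num_loss + idx] * weight
        total = total + named[name].mean()
    return total, named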
def main():
    global args
    args = parser.parse_args()
    torch.manual_seed(0)

    ##################################
    nodmodel = import_module(args.model1)
    config1, nod_net, loss, get_pbb = nodmodel.get_model()
    args.lr_stage = config1['lr_stage']
    args.lr_preset = config1['lr']
    save_dir = args.save_dir

    ##################################
    casemodel = import_module(args.model2)
    config2 = casemodel.config
    args.lr_stage2 = config2['lr_stage']
    args.lr_preset2 = config2['lr']
    topk = config2['topk']
    case_net = casemodel.CaseNet(topk=topk, nodulenet=nod_net)
    args.miss_ratio = config2['miss_ratio']
    args.miss_thresh = config2['miss_thresh']
    if args.debug:
        args.save_dir = 'debug'

    ################################
    start_epoch = args.start_epoch
    if args.resume:
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results', save_dir)
        case_net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model1 + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)
    if args.epochs is None:
        end_epoch = args.lr_stage2[-1]
    else:
        end_epoch = args.epochs

    ################################
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    if args.test1 != 1 and args.test2 != 1:
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))

    ################################
    torch.cuda.set_device(0)
    # nod_net = nod_net.cuda()
    case_net = case_net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    if not args.debug:
        case_net = DataParallel(case_net)
        nod_net = DataParallel(nod_net)

    ################################
    if args.test1 == 1:
        testsplit = np.load('full.npy')
        dataset = DataBowl3Classifier(testsplit, config2, phase='test')
        predlist = test_casenet(case_net, dataset).T
        anstable = np.concatenate([[testsplit], predlist], 0).T
        df = pandas.DataFrame(anstable)
        df.columns = ['id', 'cancer']  # was a set literal, which has no defined order
        df.to_csv('allstage1.csv', index=False)
        return

    if args.test2 == 1:
        testsplit = np.load('test.npy')
        dataset = DataBowl3Classifier(testsplit, config2, phase='test')
        predlist = test_casenet(case_net, dataset).T
        anstable = np.concatenate([[testsplit], predlist], 0).T
        df = pandas.DataFrame(anstable)
        df.columns = ['id', 'cancer']
        df.to_csv('quick', index=False)
        return

    if args.test3 == 1:
        testsplit3 = np.load('stage2.npy')
        dataset = DataBowl3Classifier(testsplit3, config2, phase='test')
        predlist = test_casenet(case_net, dataset).T
        anstable = np.concatenate([[testsplit3], predlist], 0).T
        df = pandas.DataFrame(anstable)
        df.columns = ['id', 'cancer']
        df.to_csv('stage2_ans.csv', index=False)
        return

    print(save_dir)
    print(args.save_freq)

    trainsplit = np.load('kaggleluna_full.npy')
    valsplit = np.load('valsplit.npy')
    testsplit = np.load('test.npy')

    dataset = DataBowl3Detector(trainsplit, config1, phase='train')
    train_loader_nod = DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.workers,
                                  pin_memory=True)
    dataset = DataBowl3Detector(valsplit, config1, phase='val')
    val_loader_nod = DataLoader(dataset,
                                batch_size=args.batch_size,
                                shuffle=False,
                                num_workers=args.workers,
                                pin_memory=True)
    optimizer = torch.optim.SGD(nod_net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)

    trainsplit = np.load('full.npy')
    dataset = DataBowl3Classifier(trainsplit, config2, phase='train')
    train_loader_case = DataLoader(dataset,
                                   batch_size=args.batch_size2,
                                   shuffle=True,
                                   num_workers=args.workers,
                                   pin_memory=True)
    dataset = DataBowl3Classifier(valsplit, config2, phase='val')
    val_loader_case = DataLoader(dataset,
                                 batch_size=max([args.batch_size2, 1]),
                                 shuffle=False,
                                 num_workers=args.workers,
                                 pin_memory=True)
    dataset = DataBowl3Classifier(trainsplit, config2, phase='val')
    all_loader_case = DataLoader(dataset,
                                 batch_size=max([args.batch_size2, 1]),
                                 shuffle=False,
                                 num_workers=args.workers,
                                 pin_memory=True)
    optimizer2 = torch.optim.SGD(case_net.parameters(),
                                 args.lr,
                                 momentum=0.9,
                                 weight_decay=args.weight_decay)

    for epoch in range(start_epoch, end_epoch + 1):
        if epoch == start_epoch:
            # run one pass of the case net with lr set to 0 and debug mode on
            # (a dry run before normal training starts), then restore the settings
            lr = args.lr
            debug = args.debug
            args.lr = 0.0
            args.debug = True
            train_casenet(epoch, case_net, train_loader_case, optimizer2, args)
            args.lr = lr
            args.debug = debug

        if epoch < args.lr_stage[-1]:
            train_nodulenet(train_loader_nod, nod_net, loss, epoch, optimizer, args)
            validate_nodulenet(val_loader_nod, nod_net, loss)
        if epoch > config2['startepoch']:
            train_casenet(epoch, case_net, train_loader_case, optimizer2, args)
            val_casenet(epoch, case_net, val_loader_case, args)
            val_casenet(epoch, case_net, all_loader_case, args)

        if epoch % args.save_freq == 0:
            state_dict = case_net.module.state_dict()
            for key in state_dict.keys():
                state_dict[key] = state_dict[key].cpu()
            torch.save(
                {
                    'epoch': epoch,
                    'save_dir': save_dir,
                    'state_dict': state_dict,
                    'args': args
                }, os.path.join(save_dir, '%03d.ckpt' % epoch))
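
# NOTE (added sketch): building the submission frame via np.concatenate plus a set of
# column names (as above, now fixed) is fragile; an explicit dict construction is
# clearer. Illustrative alternative, not part of the original pipeline:
import pandas

def predictions_to_csv(ids, preds, path):
    pandas.DataFrame({'id': ids, 'cancer': preds.ravel()}).to_csv(path, index=False)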
def main():
    global args
    args = parser.parse_args()
    torch.manual_seed(0)
    torch.cuda.set_device(0)

    model = import_module(args.model)
    config, net, loss, get_pbb = model.get_model()
    start_epoch = args.start_epoch
    save_dir = args.save_dir

    if args.resume:
        checkpoint = torch.load(args.resume)
        if start_epoch == 0:
            start_epoch = checkpoint['epoch'] + 1
        if not save_dir:
            save_dir = checkpoint['save_dir']
        else:
            save_dir = os.path.join('results', save_dir)
        net.load_state_dict(checkpoint['state_dict'])
    else:
        if start_epoch == 0:
            start_epoch = 1
        if not save_dir:
            exp_id = time.strftime('%Y%m%d-%H%M%S', time.localtime())
            save_dir = os.path.join('results', args.model + '-' + exp_id)
        else:
            save_dir = os.path.join('results', save_dir)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    logfile = os.path.join(save_dir, 'log')
    if args.test != 1:
        sys.stdout = Logger(logfile)
        pyfiles = [f for f in os.listdir('./') if f.endswith('.py')]
        for f in pyfiles:
            shutil.copy(f, os.path.join(save_dir, f))

    n_gpu = setgpu(args.gpu)
    args.n_gpu = n_gpu
    net = net.cuda()
    loss = loss.cuda()
    cudnn.benchmark = True
    net = DataParallel(net)
    datadir = config_training['preprocess_result_path']
    print(net)

    if args.test == 1:
        margin = 32
        sidelen = 144
        split_comber = SplitComb(sidelen, config['max_stride'], config['stride'],
                                 margin, config['pad_value'])
        dataset = data.DataBowl3Detector(datadir,
                                         'full.npy',
                                         config,
                                         phase='test',
                                         split_comber=split_comber)
        test_loader = DataLoader(dataset,
                                 batch_size=1,
                                 shuffle=False,
                                 num_workers=args.workers,
                                 collate_fn=data.collate,
                                 pin_memory=False)
        with torch.no_grad():
            test(test_loader, net, get_pbb, save_dir, config)
        return

    dataset = data.DataBowl3Detector(datadir, 'kaggleluna_full.npy', config, phase='train')
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.workers,
                              pin_memory=True)
    dataset = data.DataBowl3Detector(datadir, 'valsplit.npy', config, phase='val')
    val_loader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.workers,
                            pin_memory=True)
    optimizer = torch.optim.SGD(net.parameters(),
                                args.lr,
                                momentum=0.9,
                                weight_decay=args.weight_decay)

    def get_lr(epoch):
        # piecewise-constant schedule: full LR for the first half of training,
        # then 0.1x until 80%, then 0.01x
        if epoch <= args.epochs * 0.5:
            lr = args.lr
        elif epoch <= args.epochs * 0.8:
            lr = 0.1 * args.lr
        else:
            lr = 0.01 * args.lr
        return lr

    for epoch in range(start_epoch, args.epochs + 1):
        train(train_loader, net, loss, epoch, optimizer, get_lr, args.save_freq, save_dir)
        with torch.no_grad():
            validate(val_loader, net, loss)
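
# NOTE (added sketch): `train` above receives `get_lr` but its body is not shown in
# this excerpt. A typical use sets the rate on every param group at the start of each
# epoch (assumed behavior, for illustration):
def apply_lr(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

# Usage inside an epoch loop: apply_lr(optimizer, get_lr(epoch))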