def training_obj(self, train, train_target, weights, model_opt, val, val_target, global_step):
    """Compute the architecture-search training objective.

    Returns a tuple ``(loss, accuracy, loss1, loss2)`` where ``loss1`` is the
    plain training loss and ``loss2`` is the generalization-error penalty
    (zero when ``gen_error_alpha`` is disabled).
    NOTE(review): ``model_opt`` is accepted but never used here — presumably
    kept for interface parity with a sibling objective; confirm with callers.
    """
    if not self.gen_error_alpha:
        # Plain objective: criterion on the training split only.
        logits = self.model(train, weights)
        loss = self.criterion(logits, train_target)
        accuracy = utils.accuracy(logits, train_target)[0]
        loss1, loss2 = loss, torch.zeros_like(loss)
    else:
        # Penalize |val_loss - train_loss| as a generalization-gap proxy.
        logits_train = self.model(train, weights)
        loss_train = self.criterion(logits_train, train_target)
        logits_val = self.model(val, weights)
        loss_val = self.criterion(logits_val, val_target)
        loss2 = torch.abs(loss_val - loss_train)
        # Track how often validation loss exceeds training loss.
        self.loss_diff_sign.update(
            torch.mean(((loss_val - loss_train) > 0).float()).data)
        loss1 = loss_train
        loss = loss1 + self.gen_error_alpha_lambda * loss2
        accuracy = utils.accuracy(logits_train, train_target)[0]
    if self.alpha_loss:
        # Optional regularizer on the architecture parameters.
        alpha_loss = self.alpha.module.alpha_loss(weights)
        loss += self.args.alpha_loss_lambda * alpha_loss
        # Log only every report_freq-th call to keep the event file small.
        if self.count % self.report_freq == 0:
            self.writer.add_scalar('meta/alpha_loss',
                                   torch.mean(alpha_loss), global_step)
    return loss, accuracy, loss1, loss2
def evaluate(self, cfg):
    """Run one full pass over the validation loader and return the mean accuracy."""
    self.phase = 'test'
    # switch to evaluate mode
    self.net.eval()
    self.imgs_all = []
    self.pred_index_all = []
    self.target_index_all = []
    self.fake_image_num = 0
    with torch.no_grad():
        print('# Cls val images num = {0}'.format(self.val_image_num))
        for _, data in enumerate(self.val_loader):
            self.set_input(data, self.cfg.DATA_TYPE)
            self._forward()
            self._process_fc()
            # per-batch top-1 accuracy
            top1 = util.accuracy(self.cls.data, self.label, topk=(1, ))
            self.loss_meters['VAL_CLS_ACC'].update(top1[0].item(),
                                                   self.batch_size)
        # mean per-class accuracy over the whole split
        mean_acc = self._cal_mean_acc(cfg=cfg, data_loader=self.val_loader)
        print('mean_acc: [{0}]'.format(mean_acc))
    return mean_acc
def test():
    """Evaluate the model on the held-out test indices and print metrics.

    Fixes two defects:
    * accuracy was computed over *all* nodes while the loss used only
      ``idx_test`` — both now use the test split;
    * ``tensor.data[0]`` raises on 0-dim tensors in modern PyTorch —
      replaced with ``tensor.item()``.
    """
    model.eval()
    output = model(features, adj)
    loss_test = F.mse_loss(output[idx_test], labels[idx_test])
    # Restrict accuracy to the same split as the loss.
    acc_test = accuracy(output[idx_test], labels[idx_test])
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test.item()))
def test(test_feature, test_label):
    """Evaluate the model on an explicit test split and print loss/accuracy.

    Fix: ``tensor.data[0]`` raises on 0-dim tensors in modern PyTorch —
    replaced with ``tensor.item()``.
    """
    model.eval()
    output = model(test_feature, adj)
    # Predicted class per sample and ground truth, printed for inspection.
    print(output.max(1)[1].data)
    print(test_label.data)
    acc_test = accuracy(output, test_label)
    loss_test = F.nll_loss(output, test_label)
    print("Test set results: loss={:.4f} test acc={:.4f}".format(
        loss_test.item(), acc_test.item()))
def test_step(self, batch, batch_idx):
    """Lightning test step: return test loss and top-1 accuracy.

    Fix (consistency with ``validation_step``): ``accuracy(...)`` is indexed
    with ``[0]`` everywhere else in this file (it returns a per-k list); the
    original stored the whole list under 'test_accuracy'.
    """
    inp_img, target = batch
    preds = self.forward(inp_img)
    # NOTE: attribute name 'criterrion' (sic) matches the rest of the class.
    loss = self.criterrion(preds, target)
    accu = accuracy(preds, target)[0]
    results = {'test_loss': loss, 'test_accuracy': accu}
    return results
def validation_step(self, batch, batch_idx):
    """Lightning validation step: return validation loss and top-1 accuracy."""
    images, labels = batch
    outputs = self.forward(images)
    # NOTE: attribute name 'criterrion' (sic) matches the rest of the class.
    val_loss = self.criterrion(outputs, labels)
    val_acc = accuracy(outputs, labels)[0]
    return {'val_loss': val_loss, 'val_accuracy': val_acc}
def _construct_TRAIN_G_LOSS(self, epoch=None):
    """Accumulate the generator's training loss from the enabled loss terms.

    Each term is gated both by its name in ``cfg.LOSS_TYPES`` and (for the
    content/pix2pix/GAN terms) by an epoch window; meters are updated with
    rounded scalar copies so logging never keeps the graph alive.
    Returns the total loss tensor (on GPU when ``use_gpu``).
    """
    loss_total = torch.zeros(1)
    if self.use_gpu:
        loss_total = loss_total.cuda()
    if self.gen is not None:
        # Sanity check: generated image must match the configured crop size.
        assert (self.gen.size(-1) == self.cfg.FINE_SIZE)
    if 'CLS' in self.cfg.LOSS_TYPES:
        cls_loss = self.loss['cls_loss'].mean()
        loss_total = loss_total + cls_loss
        # Rebind to a rounded float for the meter; tensor already summed above.
        cls_loss = round(cls_loss.item(), 4)
        self.loss_meters['TRAIN_CLS_LOSS'].update(cls_loss, self.batch_size)
        prec1 = util.accuracy(self.cls.data, self.label, topk=(1, ))
        self.loss_meters['TRAIN_CLS_ACC'].update(prec1[0].item(),
                                                 self.batch_size)
    # content supervised (semantic/perceptual) term, only in its epoch window
    if self.cfg.NITER_START_CONTENT <= epoch <= self.cfg.NITER_END_CONTENT:
        if 'SEMANTIC' in self.cfg.LOSS_TYPES:
            content_loss = self.loss['content_loss'].mean()
            loss_total = loss_total + content_loss
            content_loss = round(content_loss.item(), 4)
            self.loss_meters['TRAIN_SEMANTIC_LOSS'].update(
                content_loss, self.batch_size)
    # pixel-wise reconstruction term, only in its epoch window
    if self.cfg.NITER_START_PIX2PIX <= epoch <= self.cfg.NITER_END_PIX2PIX:
        if 'PIX2PIX' in self.cfg.LOSS_TYPES:
            pix2pix_loss = self.loss['pix2pix_loss'].mean()
            loss_total = loss_total + pix2pix_loss
            pix2pix_loss = round(pix2pix_loss.item(), 4)
            self.loss_meters['TRAIN_PIXEL_LOSS'].update(
                pix2pix_loss, self.batch_size)
    # adversarial term: run D on non-detached fake so G receives gradients
    if self.cfg.NITER_START_GAN <= epoch <= self.cfg.NITER_END_GAN:
        if 'GAN' in self.cfg.LOSS_TYPES:
            self.forward_D(detach=False)
            loss_GAN = self.criterion_GAN(self.pred_fake,
                                          self._real) * self.cfg.ALPHA_GAN
            loss_total += loss_GAN
            loss_G_GAN = round(loss_GAN.item(), 4)
            self.loss_meters['TRAIN_G_LOSS'].update(
                loss_G_GAN, self.batch_size)
    # total loss
    return loss_total
def train(iteration):
    """Run one full-batch training step; report validation metrics every 10th call.

    Fixes:
    * the log line referenced an undefined name ``epoch`` (the parameter is
      ``iteration``) — a NameError on every 10th call;
    * ``tensor.data[0]`` raises on 0-dim tensors in modern PyTorch —
      replaced with ``tensor.item()``;
    * dropped the unused timing variable.

    Returns ``(train_loss, train_acc, val_losses)`` where ``val_losses``
    holds at most one entry.
    """
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    loss_train = F.nll_loss(output, labels)
    acc_train = accuracy(output, labels)
    loss_train.backward()
    optimizer.step()
    if not fastmode:
        # Re-run a clean forward pass in eval mode for validation metrics.
        model.eval()
        output = model(features, adj)
    los_val_ = []
    if (iteration + 1) % 10 == 0:
        loss_val = F.nll_loss(output, labels)
        acc_val = accuracy(output, labels)
        los_val_.append(loss_val.item())
        print('Epoch:{:04d} Val loss:{:.4f} Val acc:{:.4f}'.format(
            iteration + 1, loss_val.item(), acc_val.item()))
    return loss_train.item(), acc_train.item(), los_val_
def _cal_loss(self, epoch=None):
    """Accumulate classification and (optionally) semantic-content losses.

    Fix: the original scaled the *list* of per-layer losses with
    ``[...] * self.cfg.ALPHA_CONTENT`` — Python list replication, which only
    works (by accident, via ``sum``) for positive integer alphas and raises
    ``TypeError`` for float alphas.  The scalar weight now multiplies the
    summed loss, which is numerically identical for integer alphas.
    Returns the total loss tensor (on GPU when ``use_gpu``).
    """
    loss_total = torch.zeros(1)
    if self.use_gpu:
        loss_total = loss_total.cuda()
    if self.gen is not None:
        # Sanity check: generated image must match the configured crop size.
        assert (self.gen.size(-1) == self.cfg.FINE_SIZE)
    if 'CLS' in self.cfg.LOSS_TYPES:
        cls_loss = self.criterion_cls(self.cls, self.label) * self.cfg.ALPHA_CLS
        loss_total = loss_total + cls_loss
        cls_loss = round(cls_loss.item(), 4)
        self.loss_meters['TRAIN_CLS_LOSS'].update(cls_loss, self.batch_size)
        prec1 = util.accuracy(self.cls.data, self.label, topk=(1, ))
        self.loss_meters['TRAIN_CLS_ACC'].update(prec1[0].item(),
                                                 self.batch_size)
    # content supervised (perceptual) term, only in its epoch window
    if self.cfg.NITER_START_CONTENT <= epoch <= self.cfg.NITER_END_CONTENT:
        if 'SEMANTIC' in self.cfg.LOSS_TYPES:
            # Images are in [-1, 1]; the content model expects [0, 1].
            source_features = self.content_model(
                (self.gen + 1) / 2, layers=self.content_layers)
            target_features = self.content_model(
                (self.target_modal + 1) / 2, layers=self.content_layers)
            len_layers = len(self.content_layers)
            loss_fns = [self.criterion_content] * len_layers
            alpha = [1] * len_layers
            layer_wise_losses = [
                alpha[i] * loss_fns[i](source_feature, target_features[i])
                for i, source_feature in enumerate(source_features)
            ]
            # Scale the summed loss by the scalar weight (see docstring).
            content_loss = sum(layer_wise_losses) * self.cfg.ALPHA_CONTENT
            loss_total = loss_total + content_loss
            self.loss_meters['TRAIN_SEMANTIC_LOSS'].update(
                content_loss.item(), self.batch_size)
    # total loss
    return loss_total
def _cal_loss(self, epoch=None):
    """Compute the weighted classification loss and update the training meters."""
    total = torch.zeros(1)
    if self.use_gpu:
        total = total.cuda()
    cls_loss = self.criterion_cls(self.cls, self.label) * self.cfg.ALPHA_CLS
    total = total + cls_loss
    # Meters receive rounded scalar copies, not graph-attached tensors.
    rounded_cls = round(cls_loss.item(), 4)
    self.loss_meters['TRAIN_CLS_LOSS'].update(rounded_cls, self.batch_size)
    prec1 = util.accuracy(self.cls.data, self.label, topk=(1,))
    self.loss_meters['TRAIN_CLS_ACC'].update(prec1[0].item(), self.batch_size)
    # total loss
    return total
def train_epoch(train_loader, net, criterion, optimizer, scheduler, cur_epoch, rank):
    """Train ``net`` for one epoch under distributed data parallelism.

    Sets the epoch LR and the sampler epoch, then for each batch runs
    forward/backward/step, all-reduces loss and accuracies across ranks,
    and updates the progress meters; rank 0 prints periodically.
    """
    batch_time, data_time, losses, top1, topk = utils.get_meters()
    progress = utils.ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, topk],
        prefix=" = TRAIN: [{}]".format(cur_epoch),
    )
    lr = utils.get_epoch_lr(cur_epoch)
    utils.set_lr(optimizer, lr)
    # set sampler epoch so DistributedSampler reshuffles deterministically
    train_loader.sampler.set_epoch(cur_epoch)
    # switch to train mode
    net.train()
    end = time.time()
    for idx, (inputs, targets) in enumerate(train_loader):
        data_time.update(time.time() - end)
        inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True)
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # per-iteration scheduler step (not per-epoch)
        scheduler.step()
        batch_size = inputs.size(0)
        acc_1, acc_k = utils.accuracy(outputs, targets, topk=(1, 5))
        # average metrics across all ranks before logging
        loss, acc_1, acc_k = utils.scaled_all_reduce([loss, acc_1, acc_k])
        losses.update(loss.item(), batch_size)
        top1.update(acc_1[0], batch_size)
        topk.update(acc_k[0], batch_size)
        batch_time.update(time.time() - end)
        end = time.time()
        # only rank 0 prints, at PRINT_FEQ intervals and on the last batch
        if rank == 0 and (idx % cfg.TRAIN.PRINT_FEQ == 0
                          or (idx + 1) == len(train_loader)):
            progress.display(idx)
def validate(model, criterion, device, valid_loader):
    """Evaluate ``model`` on ``valid_loader``.

    Returns ``(mean_loss, mean_accuracy)`` averaged over batches.
    Fix: guards against an empty loader, which previously raised
    ``ZeroDivisionError``; returns ``(0.0, 0.0)`` in that case.
    """
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    steps = 0
    with torch.no_grad():
        for images_batch, targets_batch in valid_loader:
            images_batch = images_batch.to(device)
            targets_batch = targets_batch.to(device)
            predicts = model(images_batch)
            loss = criterion(predicts, targets_batch)
            epoch_loss += loss.item()
            epoch_acc += accuracy(predicts, targets_batch)[0].item()
            steps += 1
    if steps == 0:
        # Empty loader: nothing to average.
        return 0.0, 0.0
    epoch_loss /= steps
    epoch_acc /= steps
    return epoch_loss, epoch_acc
def train(model, optimizer, criterion, device, train_loader, valid_loader,
          args, information, checkpoints_path):
    """Train ``model`` for ``args.epochs`` epochs with per-epoch validation.

    After each epoch the metrics are appended to the ``information[0]``
    DataFrame (written to ``info/<name>.csv``) and a checkpoint with model
    and optimizer state is saved under ``checkpoints_path``.
    """
    model.to(device)
    print('Train started...')
    # Row index into the metrics DataFrame (one row per epoch).
    locer = 0
    for epoch in tqdm(range(args.start_epoch, args.epochs)):
        print(f'Epoch {epoch+1} started')
        model.train()
        epoch_loss = 0
        epoch_acc = 0
        steps = 0
        for images_batch, targets_batch in train_loader:
            images_batch = images_batch.to(device)
            targets_batch = targets_batch.to(device)
            predicts = model(images_batch)
            loss = criterion(predicts, targets_batch)
            # Accumulate scalar copies; averaged over steps after the loop.
            epoch_loss += loss.item()
            epoch_acc += accuracy(predicts, targets_batch)[0].item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            steps += 1
        epoch_loss /= steps
        epoch_acc /= steps
        val_loss, val_accuracy = validate(model, criterion, device,
                                          valid_loader)
        epoch_info = f'Epoch finished! Train loss: {epoch_loss}, ' \
            f'Train acc: {epoch_acc}, Val loss: {val_loss}, Val acc: {val_accuracy}'
        print(epoch_info)
        information[0].loc[locer] = [
            epoch_loss, epoch_acc, val_loss, val_accuracy
        ]
        # Rewrite the whole CSV each epoch so progress survives interruption.
        information[0].to_csv(f'info/{information[1]}.csv', index=False)
        torch.save(
            {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }, os.path.join(checkpoints_path, f'epoch_{epoch + 1}.pth'))
        locer += 1
def evaluate(self, cfg, epoch=None): self.phase = 'test' # switch to evaluate mode self.net.eval() self.imgs_all = [] self.pred_index_all = [] self.target_index_all = [] with torch.no_grad(): print('# Cls val images num = {0}'.format(self.val_image_num)) # batch_index = int(self.val_image_num / cfg.BATCH_SIZE) # random_id = random.randint(0, batch_index) for i, data in enumerate(self.val_loader): self.set_input(data, self.cfg.DATA_TYPE) self._forward() self._process_fc() if not cfg.INFERENCE: # loss if self.loss['cls_loss'] is not None: cls_loss = self.loss['cls_loss'].mean() self.loss_meters['VAL_CLS_LOSS'].update( round(cls_loss.item(), 4), self.batch_size) # accuracy prec1 = util.accuracy(self.cls.data, self.label, topk=(1, )) self.loss_meters['VAL_CLS_ACC'].update(prec1[0].item(), self.batch_size) # Mean ACC mean_acc = self._cal_mean_acc(cfg=cfg, data_loader=self.val_loader) print('mean_acc:', mean_acc) return mean_acc
def test(model, criterion, device, test_loader, args, information,
         checkpoints_path):
    """Evaluate on ``test_loader``, print the metrics, and persist them to CSV.

    Fix: guards the per-batch averaging against an empty loader, which
    previously raised ``ZeroDivisionError`` (consistent with ``validate``).
    """
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    steps = 0
    with torch.no_grad():
        for images_batch, targets_batch in tqdm(test_loader):
            images_batch = images_batch.to(device)
            targets_batch = targets_batch.to(device)
            predicts = model(images_batch)
            loss = criterion(predicts, targets_batch)
            epoch_loss += loss.item()
            epoch_acc += accuracy(predicts, targets_batch)[0].item()
            steps += 1
    if steps:
        epoch_loss /= steps
        epoch_acc /= steps
    print(f'Test finished! Test loss: {epoch_loss}, Test acc: {epoch_acc}')
    information[0].loc[0] = [epoch_loss, epoch_acc]
    information[0].to_csv(f'test_info/{information[1]}.csv', index=False)
def validate(val_loader, net, criterion, cur_epoch, rank):
    """Evaluate ``net`` on ``val_loader`` under distributed data parallelism.

    All-reduces loss and accuracies across ranks before meter updates;
    rank 0 prints periodically.  Returns ``(top1.avg, topk.avg)``.
    """
    batch_time, data_time, losses, top1, topk = utils.get_meters()
    progress = utils.ProgressMeter(
        len(val_loader),
        [batch_time, data_time, losses, top1, topk],
        prefix=" = VAL: [{}]".format(cur_epoch),
    )
    # switch to evaluate mode
    net.eval()
    with torch.no_grad():
        end = time.time()
        for idx, (inputs, targets) in enumerate(val_loader):
            data_time.update(time.time() - end)
            inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True)
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            acc_1, acc_k = utils.accuracy(outputs, targets, topk=(1, 5))
            # average metrics across all ranks before logging
            loss, acc_1, acc_k = utils.scaled_all_reduce([loss, acc_1, acc_k])
            batch_size = inputs.size(0)
            losses.update(loss.item(), batch_size)
            top1.update(acc_1[0], batch_size)
            topk.update(acc_k[0], batch_size)
            batch_time.update(time.time() - end)
            end = time.time()
            # only rank 0 prints, at PRINT_FEQ intervals and on the last batch
            if rank == 0 and (idx % cfg.TEST.PRINT_FEQ == 0
                              or (idx + 1) == len(val_loader)):
                progress.display(idx)
    return top1.avg, topk.avg
def OneEpoch(epoch, train_loader, OPTIMIZER, DISP_FREQ, NUM_EPOCH_WARM_UP, NUM_BATCH_WARM_UP):
    """Train the face-recognition model for one epoch, then validate and checkpoint.

    Uses module-level globals (BACKBONE, HEAD, LOSS, DEVICE, LR, NUM_EPOCH,
    writer, MODEL_ROOT, validation sets).  Applies per-batch LR warm-up
    during the first NUM_EPOCH_WARM_UP epochs.
    NOTE(review): the source was whitespace-collapsed; the validation and
    checkpoint section is reconstructed as part of this function because it
    reads ``epoch`` and ``batch`` from its scope — confirm against the
    original file.
    """
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    batch = 0
    #iterator = iter(train_loader)
    start = time.time()
    for inputs, labels in train_loader:
        if (epoch + 1 <= NUM_EPOCH_WARM_UP) and (batch + 1 <= NUM_BATCH_WARM_UP):
            # adjust LR for each training batch during warm up
            warm_up_lr(batch + 1, NUM_BATCH_WARM_UP, LR, OPTIMIZER)
        # compute output
        inputs = inputs.to(DEVICE, non_blocking=True)
        labels = labels.to(DEVICE, non_blocking=True).long()
        features = BACKBONE(inputs)
        outputs = HEAD(features, labels)
        loss = LOSS(outputs, labels)
        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs.data, labels, topk = (1, 5))
        losses.update(loss.data.item(), inputs.size(0))
        top1.update(prec1.data.item(), inputs.size(0))
        top5.update(prec5.data.item(), inputs.size(0))
        # compute gradient and do SGD step
        OPTIMIZER.zero_grad()
        loss.backward()
        OPTIMIZER.step()
        # display training loss & acc every DISP_FREQ batches
        if ((batch + 1) % DISP_FREQ == 0) and batch != 0:
            print("=" * 60)
            print('Epoch {}/{} Batch {}/{}\t'
                  'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch + 1, NUM_EPOCH, batch + 1,
                      len(train_loader) * NUM_EPOCH,
                      loss = losses, top1 = top1, top5 = top5))
            print("Running speed in the last 100 batches: {:.3f} iter/s.".format(DISP_FREQ / (time.time() - start)))
            start = time.time()
            print("=" * 60)
        batch += 1
    epoch_loss = losses.avg
    epoch_acc = top1.avg
    writer.add_scalar("Training_Loss", epoch_loss, epoch + 1)
    writer.add_scalar("Training_Accuracy", epoch_acc, epoch + 1)
    print("=" * 60)
    print('Epoch: {}/{}\t'
          'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
          'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
          'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
              epoch + 1, NUM_EPOCH,
              loss = losses, top1 = top1, top5 = top5))
    print("=" * 60)
    # perform validation & save checkpoints per epoch
    # validation statistics per epoch (buffer for visualization)
    print("=" * 60)
    print("Perform Evaluation on LFW, CFP_FF, CFP_FP, AgeDB, CALFW, CPLFW and VGG2_FP, and Save Checkpoints...")
    accuracy_lfw, best_threshold_lfw, roc_curve_lfw = perform_val(MULTI_GPU, DEVICE, EMBEDDING_SIZE, BATCH_SIZE, BACKBONE, lfw, lfw_issame)
    buffer_val(writer, "LFW", accuracy_lfw, best_threshold_lfw, roc_curve_lfw, epoch + 1)
    # accuracy_cfp_ff, best_threshold_cfp_ff, roc_curve_cfp_ff = perform_val(MULTI_GPU, DEVICE, EMBEDDING_SIZE, BATCH_SIZE, BACKBONE, cfp_ff, cfp_ff_issame)
    # buffer_val(writer, "CFP_FF", accuracy_cfp_ff, best_threshold_cfp_ff, roc_curve_cfp_ff, epoch + 1)
    # accuracy_cfp_fp, best_threshold_cfp_fp, roc_curve_cfp_fp = perform_val(MULTI_GPU, DEVICE, EMBEDDING_SIZE, BATCH_SIZE, BACKBONE, cfp_fp, cfp_fp_issame)
    # buffer_val(writer, "CFP_FP", accuracy_cfp_fp, best_threshold_cfp_fp, roc_curve_cfp_fp, epoch + 1)
    # accuracy_agedb, best_threshold_agedb, roc_curve_agedb = perform_val(MULTI_GPU, DEVICE, EMBEDDING_SIZE, BATCH_SIZE, BACKBONE, agedb, agedb_issame)
    # buffer_val(writer, "AgeDB", accuracy_agedb, best_threshold_agedb, roc_curve_agedb, epoch + 1)
    # accuracy_calfw, best_threshold_calfw, roc_curve_calfw = perform_val(MULTI_GPU, DEVICE, EMBEDDING_SIZE, BATCH_SIZE, BACKBONE, calfw, calfw_issame)
    # buffer_val(writer, "CALFW", accuracy_calfw, best_threshold_calfw, roc_curve_calfw, epoch + 1)
    # accuracy_cplfw, best_threshold_cplfw, roc_curve_cplfw = perform_val(MULTI_GPU, DEVICE, EMBEDDING_SIZE, BATCH_SIZE, BACKBONE, cplfw, cplfw_issame)
    # buffer_val(writer, "CPLFW", accuracy_cplfw, best_threshold_cplfw, roc_curve_cplfw, epoch + 1)
    accuracy_vgg2_fp, best_threshold_vgg2_fp, roc_curve_vgg2_fp = perform_val(MULTI_GPU, DEVICE, EMBEDDING_SIZE, BATCH_SIZE, BACKBONE, vgg2_fp, vgg2_fp_issame)
    buffer_val(writer, "VGGFace2_FP", accuracy_vgg2_fp, best_threshold_vgg2_fp, roc_curve_vgg2_fp, epoch + 1)
    print("=" * 60)
    # save checkpoints per epoch; unwrap .module under DataParallel
    if MULTI_GPU:
        torch.save(BACKBONE.module.state_dict(), os.path.join(MODEL_ROOT, "Backbone_{}_Epoch_{}_Batch_{}_Time_{}_checkpoint.pth".format(BACKBONE_NAME, epoch + 1, batch, get_time())))
        torch.save(HEAD.state_dict(), os.path.join(MODEL_ROOT, "Head_{}_Epoch_{}_Batch_{}_Time_{}_checkpoint.pth".format(HEAD_NAME, epoch + 1, batch, get_time())))
    else:
        torch.save(BACKBONE.state_dict(), os.path.join(MODEL_ROOT, "Backbone_{}_Epoch_{}_Batch_{}_Time_{}_checkpoint.pth".format(BACKBONE_NAME, epoch + 1, batch, get_time())))
        torch.save(HEAD.state_dict(), os.path.join(MODEL_ROOT, "Head_{}_Epoch_{}_Batch_{}_Time_{}_checkpoint.pth".format(HEAD_NAME, epoch + 1, batch, get_time())))
model[index].nfe = 0 nfe_backward = nfe_backward / len(odelayer_indexes) logger.info(f'nfe_backward is: {nfe_backward}') batch_time_meter.update(time.time() - end) if is_odenet: f_nfe_meter.update(nfe_forward) b_nfe_meter.update(nfe_backward) end = time.time() if iterations % batches_per_epoch == 0: train_loss /= batches_per_epoch epoch += 1 with torch.no_grad(): val_acc = accuracy(model, test_loader, args) logger.info( "Epoch {:04d} | Time {:.3f} ({:.3f}) | NFE-F {:.1f} | NFE-B {:.1f} | " "Test Acc {:.4f} | Training Loss {:.4f}".format( iterations // batches_per_epoch, batch_time_meter.val, batch_time_meter.avg, f_nfe_meter.avg, b_nfe_meter.avg, val_acc, train_loss)) writer.writerow( [f'{epoch}', f'{iterations}', f'{train_loss}', f'{val_acc}']) csv_file.flush() train_loss = 0. # Save state to file
0]: # adjust LR for each training stage after warm up, you can also choose to adjust LR manually (with slight modification) once plaueau observed schedule_lr(OPTIMIZER) if batch == STAGES[1]: schedule_lr(OPTIMIZER) if batch == STAGES[2]: schedule_lr(OPTIMIZER) # compute output inputs = inputs.to(DEVICE) labels = labels.to(DEVICE).long() features = BACKBONE(inputs) outputs = HEAD(features, labels) loss = LOSS(outputs, labels) # measure accuracy and record loss prec1, prec5 = accuracy(outputs.data, labels, topk=(1, 5)) losses.update(loss.data.item(), inputs.size(0)) top1.update(prec1.data.item(), inputs.size(0)) top5.update(prec5.data.item(), inputs.size(0)) # compute gradient and do SGD step OPTIMIZER.zero_grad() loss.backward() OPTIMIZER.step() # dispaly training loss & acc every DISP_FREQ if batch % 2000 == 0 and batch != 0: print("=" * 60) print('Epoch {}/{} Batch {}/{}\t' 'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
def train(train_loader, backbone, head, criterion, optimizer, epoch, cfg, writer):
    """Train backbone+head for one epoch; rank 0 logs epoch metrics to TensorBoard."""
    DISP_FREQ = 100  # print every 100 batches
    batch = 0  # batch index
    backbone.train()  # set to training mode
    head.train()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    for inputs, labels in tqdm(iter(train_loader)):
        # compute output
        start_time = time.time()
        inputs = inputs.cuda(cfg['GPU'], non_blocking=True)
        labels = labels.cuda(cfg['GPU'], non_blocking=True)
        features, conv_features = backbone(inputs)
        outputs, original_logits = head(features, labels)
        loss = criterion(outputs, labels)
        end_time = time.time()
        duration = end_time - start_time
        if ((batch + 1) % DISP_FREQ == 0) and batch != 0:
            print("batch inference time", duration)
        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # measure accuracy (on the un-margined logits) and record loss
        prec1, prec5 = accuracy(original_logits.data, labels, topk=(1, 5))
        losses.update(loss.data.item(), inputs.size(0))
        top1.update(prec1.data.item(), inputs.size(0))
        top5.update(prec5.data.item(), inputs.size(0))
        # display training loss & acc every DISP_FREQ batches (and at batch 0)
        if ((batch + 1) % DISP_FREQ == 0) or batch == 0:
            print("=" * 60)
            print('Epoch {}/{} Batch {}/{}\t'
                  'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch + 1, cfg['NUM_EPOCH'], batch + 1,
                      len(train_loader),
                      loss=losses, top1=top1, top5=top5))
            print("=" * 60)
            sys.stdout.flush()
        batch += 1  # batch index
    epoch_loss = losses.avg
    epoch_acc = top1.avg
    print("=" * 60)
    print('Epoch: {}/{}\t'
          'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t'
          'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
          'Training Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
              epoch + 1, cfg['NUM_EPOCH'],
              loss=losses, top1=top1, top5=top5))
    sys.stdout.flush()
    print("=" * 60)
    # only rank 0 writes TensorBoard scalars
    if cfg['RANK'] == 0:
        writer.add_scalar("Training_Loss", epoch_loss, epoch + 1)
        writer.add_scalar("Training_Accuracy", epoch_acc, epoch + 1)
        writer.add_scalar("Top1", top1.avg, epoch + 1)
        writer.add_scalar("Top5", top5.avg, epoch + 1)
def main():
    """Entry point: BERT fine-tuning on CoLA-style data with optional
    distributed/fp16 training, periodic evaluation, and inference."""
    args = run_args()
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()
    # Device setup: single/multi-GPU locally, or one GPU per process under DDP.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    # Effective per-step batch size shrinks under gradient accumulation.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    if not args.do_train and not args.do_infer:
        raise ValueError(
            "At least one of `do_train` or `do_infer` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    processor = ColaProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            # Each DDP process only sees 1/world_size of the data.
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )
    # Prepare model: choice of classification head on top of BERT.
    if args.upper_model == "Linear":
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    elif args.upper_model == "CNN":
        model = BertCnn.from_pretrained(args.bert_model,
                                        num_labels=num_labels,
                                        seq_len=args.max_seq_length)
    else:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)
    # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                predictions = model(input_ids, segment_ids, input_mask,
                                    label_ids)
                # One logits chunk per GPU under DataParallelModel.
                for i in range(len(predictions)):
                    predictions[i] = predictions[i].view(-1, num_labels)
                loss_fct = CrossEntropyLoss()
                loss_fct_parallel = DataParallelCriterion(loss_fct)
                loss = loss_fct_parallel(predictions, label_ids.view(-1))
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # Only step the optimizer once per accumulation window.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                # do eval periodically during training
                if global_step % args.eval_freq == 0 and global_step > 0:
                    logger.info("***** Running evaluation *****")
                    logger.info(" Num examples = %d", len(eval_examples))
                    logger.info(" Batch size = %d", args.eval_batch_size)
                    all_input_ids = torch.tensor(
                        [f.input_ids for f in eval_features],
                        dtype=torch.long)
                    all_input_mask = torch.tensor(
                        [f.input_mask for f in eval_features],
                        dtype=torch.long)
                    all_segment_ids = torch.tensor(
                        [f.segment_ids for f in eval_features],
                        dtype=torch.long)
                    all_label_ids = torch.tensor(
                        [f.label_id for f in eval_features], dtype=torch.long)
                    eval_data = TensorDataset(all_input_ids, all_input_mask,
                                              all_segment_ids, all_label_ids)
                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(
                        eval_data,
                        sampler=eval_sampler,
                        batch_size=args.eval_batch_size)
                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_ids in tqdm(
                            eval_dataloader, desc="Evaluating"):
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)
                        with torch.no_grad():
                            eval_preds = model(input_ids, segment_ids,
                                               input_mask, label_ids)
                        # compute loss
                        for i in range(len(eval_preds)):
                            eval_preds[i] = eval_preds[i].view(-1, num_labels)
                        loss = loss_fct_parallel(eval_preds,
                                                 label_ids.view(-1))
                        if n_gpu > 1:
                            loss = loss.mean(
                            )  # mean() to average on multi-gpu.
                        if args.gradient_accumulation_steps > 1:
                            loss = loss / args.gradient_accumulation_steps
                        tmp_eval_loss = loss
                        eval_preds = torch.cat(
                            eval_preds)  # shape: [batch_size, num_labels]
                        logits = eval_preds.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        tmp_eval_accuracy = accuracy(logits, label_ids)
                        eval_loss += tmp_eval_loss.mean().item()
                        eval_accuracy += tmp_eval_accuracy
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1
                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = eval_accuracy / nb_eval_examples
                    loss = tr_loss / nb_tr_steps if args.do_train else None
                    result = {
                        'eval_loss': eval_loss,
                        'eval_accuracy': eval_accuracy,
                        'global_step': global_step,
                        'loss': loss
                    }
                    logger.info("***** Eval results *****")
                    for key in sorted(result.keys()):
                        logger.info(" %s = %s", key, str(result[key]))
    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
    if args.do_infer:
        infer_examples = processor.get_infer_examples(args.data_dir)
        infer_features = convert_examples_to_features(infer_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running Inference *****")
        logger.info(" Num examples = %d", len(infer_examples))
        logger.info(" Batch size = %d", args.infer_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in infer_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in infer_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in infer_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in infer_features],
                                     dtype=torch.long)
        infer_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        # Run prediction for full data
        infer_sampler = SequentialSampler(infer_data)
        infer_dataloader = DataLoader(infer_data,
                                      sampler=infer_sampler,
                                      batch_size=args.infer_batch_size)
        model.eval()
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                infer_dataloader, desc="Inference"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                infer_preds = model(input_ids, segment_ids, input_mask,
                                    label_ids)
            for i in range(len(infer_preds)):
                infer_preds[i] = infer_preds[i].view(-1, num_labels)
            infer_preds = torch.cat(
                infer_preds)  # shape: [batch_size, num_labels]
            logits = infer_preds.detach().cpu().numpy()
            outputs = np.argmax(logits, axis=1)
            print(outputs)
        logger.info("***** Infer finished *****")
0]: # adjust LR for each training stage after warm up, you can also choose to adjust LR manually (with slight modification) once plaueau observed schedule_lr(OPTIMIZER) if batch == STAGES[1]: schedule_lr(OPTIMIZER) if batch == STAGES[2]: schedule_lr(OPTIMIZER) # compute output inputs = inputs.to(DEVICE) labels = labels.to(DEVICE).long() features = BACKBONE(inputs) outputs = HEAD(features, labels) loss = LOSS(outputs, labels) # measure accuracy and record loss prec1, prec5 = accuracy(outputs.data, labels, topk=(1, 5)) losses.update(loss.data.item(), inputs.size(0)) top1.update(prec1.data.item(), inputs.size(0)) top5.update(prec5.data.item(), inputs.size(0)) # compute gradient and do SGD step OPTIMIZER.zero_grad() loss.backward() OPTIMIZER.step() # dispaly training loss & acc every DISP_FREQ if batch % 2000 == 0 and batch != 0: print("=" * 60) print('Epoch {}/{} Batch {}/{}\t' 'Training Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Training Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
def test(features, adj, labels, index=None):
    """Evaluate the model on the given node indices; return (loss, acc).

    Fix: the original default ``index=range(features.shape[0])`` was
    evaluated once at definition time against a *module-level* ``features``,
    not the argument — wrong (or a NameError) whenever the caller passes a
    different feature matrix.  A ``None`` sentinel now defaults to all rows
    of the actual argument, which is backward-compatible.
    """
    if index is None:
        index = range(features.shape[0])
    model.eval()
    output = model(features, adj)
    loss = F.nll_loss(output[index], labels[index])
    acc = utils.accuracy(output[index], labels[index])
    return loss, acc