def multi_infer(params):
    model = torch.load(
        params['init_model'],
        map_location="cuda" if torch.cuda.is_available() else "cpu")
    print('Load model', params['init_model'])
    # model = model.to(device)
    model = model.cuda()
    model.eval()

    infer_loader = get_iwildcam_loader(params, mode='infer')
    infer_loader = data_prefetcher(infer_loader)
    y_preds, y_scores, y_ids = [], [], []
    logits_preds = []
    t1 = time()
    print('Begin to infer')
    with torch.no_grad():
        inputs, labels, ids = infer_loader.next()
        i = 0
        while inputs is not None:  # iterate over the images in each batch
            _, concat_logits, _, _, _ = model(inputs)  # NTS-Net returns a tuple of logits
            output = torch.nn.functional.softmax(concat_logits, dim=-1)
            output = output.cpu().detach().numpy()
            logits_preds.extend(output)  # post-softmax probability vectors
            y_preds.extend(np.argmax(output, axis=1))  # list[class_id]
            y_scores.extend(np.max(output, axis=1))
            y_ids.extend(ids)  # image names: list[xxx_00000x.jpg]
            if (i + 1) % params['print_step'] == 0:
                print("iter: %d, time_cost_per_iter: %.4f s" %
                      (i, (time() - t1) / params['print_step']))
                t1 = time()
            i += 1
            inputs, labels, ids = infer_loader.next()

    O_ids = list(map(lambda x: x.split('_')[0], y_ids))
    # 'Id': [xxx_000.jpg, yyy_000.jpg], 'O_Id': [xxx, yyy], 'Class': [class_id], 'Score': [0.1]
    pred_df = {'Id': y_ids, 'O_Id': O_ids, 'Class': y_preds, 'Score': y_scores}
    pred_df = pd.DataFrame(pred_df)
    save_path = params['save_pred_dir'] + params['init_model'].split(
        '/')[-1].split('.')[0] + '_preds.csv'
    pred_df.to_csv(save_path, index=False)
    print("Save {} to {}".format(
        params['init_model'].split('/')[-1].split('.')[0] + '_preds.csv',
        save_path))

    logits_df = {
        'Id': y_ids,
        'Class': y_preds,
        'Logits': list(logits_preds)
    }  # probability vectors, used for model embedding / ensembling
    logits_df = pd.DataFrame(logits_df)
    save_path = params['save_pred_dir'] + params['init_model'].split(
        '/')[-1].split('.')[0] + '_logits.csv'
    logits_df.to_csv(save_path, index=False)
    print("Save {} to {}".format(
        params['init_model'].split('/')[-1].split('.')[0] + '_logits.csv',
        save_path))
    print('pred done', pred_df.shape)
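# --- Hypothetical follow-up (not part of the original pipeline): the 'O_Id'
# column written above groups frames of the same camera-trap sequence
# ('<sequence>_<frame>.jpg'). One simple way to turn per-frame predictions
# into a single label per sequence is to keep the highest-scoring frame; the
# helper name and the aggregation rule are illustrative assumptions.
def aggregate_by_sequence(pred_df):
    """Return one row per O_Id, keeping the frame with the maximum Score."""
    idx = pred_df.groupby('O_Id')['Score'].idxmax()
    return pred_df.loc[idx, ['O_Id', 'Class', 'Score']].reset_index(drop=True)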
def infer(params):
    model = torch.load(params['init_model'])
    print('load model', params['init_model'])
    model = model.to(device)
    model.eval()

    infer_loader = get_iwildcam_loader(params, mode='infer')
    infer_loader = data_prefetcher(infer_loader)
    y_preds, y_ids = [], []
    logits_preds = []
    t1 = time()
    print('begin to infer')
    with torch.no_grad():
        inputs, labels, ids = infer_loader.next()
        i = 0
        while inputs is not None:
            output = model(inputs)
            output = torch.nn.functional.softmax(output, dim=-1)
            output = output.cpu().detach().numpy()
            logits_preds.extend(output)
            y_preds.extend(np.argmax(output, axis=1))
            y_ids.extend(ids)
            if (i + 1) % params['print_step'] == 0:
                print("iter: %d, time_cost_per_iter: %.4f s" %
                      (i, (time() - t1) / params['print_step']))
                t1 = time()
            i += 1
            inputs, labels, ids = infer_loader.next()

    pred_df = {'Id': y_ids, 'Predicted': y_preds}
    pred_df = pd.DataFrame(pred_df)
    save_path = params['save_pred_dir'] + params['init_model'].split(
        '/')[-1].split('.')[0] + '_preds.csv'
    pred_df.to_csv(save_path, index=False)

    logits_df = {
        'Id': y_ids,
        'Predicted': y_preds,
        'Logits': list(logits_preds)
    }
    logits_df = pd.DataFrame(logits_df)
    save_path = params['save_pred_dir'] + params['init_model'].split(
        '/')[-1].split('.')[0] + '_logits.csv'
    logits_df.to_csv(save_path, index=False)
    print('pred done', pred_df.shape)
    return pred_df
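# --- Minimal sketch of the iteration contract the loops above assume from
# data_prefetcher: next() yields (inputs, labels, ids) batches and a triple
# of Nones once the loader is exhausted. The repo's real prefetcher also
# overlaps host-to-device copies on a side CUDA stream; this simplified
# class (an assumption, not the actual implementation) only shows the
# interface.
class SimplePrefetcher:
    def __init__(self, loader, label_type='float'):
        self.it = iter(loader)
        self.label_type = label_type

    def next(self):
        try:
            inputs, labels, ids = next(self.it)
        except StopIteration:
            return None, None, None
        inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda(non_blocking=True)
        labels = labels.float() if self.label_type == 'float' else labels.long()
        return inputs, labels, ids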
def multi_inferv2(params):
    model = torch.load(params['init_model'])
    print('=> Load model', params['init_model'])
    model = model.cuda()
    model.eval()

    infer_loader = get_iwildcam_loader(params, mode=params['mode'])
    infer_loader = data_prefetcher(infer_loader)
    y_preds, y_scores, y_ids = [], [], []
    logits_preds = []
    t1 = time()
    print('=> Begin to infer')
    with torch.no_grad():
        inputs, labels, ids = infer_loader.next()
        i = 0
        while inputs is not None:  # iterate over the images in each batch
            output = model(inputs)  # logits vector
            output = torch.nn.functional.softmax(output, dim=-1)
            output = output.cpu().detach().numpy()
            logits_preds.extend(output)
            y_preds.extend(np.argmax(output, axis=1))  # list[class_id]
            y_scores.extend(np.max(output, axis=1))
            y_ids.extend(ids)  # image names: list[xxx_00000x.jpg]
            if (i + 1) % params['print_step'] == 0:
                print("iter: %d, time_cost_per_iter: %.4f s" %
                      (i, (time() - t1) / params['print_step']))
                t1 = time()
            i += 1
            inputs, labels, ids = infer_loader.next()

    O_ids = list(map(lambda x: x.split('_')[0], y_ids))
    # 'Id': [xxx.jpg, yyy.jpg], 'Class': [class_id], 'Score': [0.1]
    print("=> Pred Data Len: {}".format(len(y_ids)))
    pred_df = {'Id': y_ids, 'Class': y_preds, 'Score': y_scores}
    pred_df = pd.DataFrame(pred_df)
    save_path = os.path.join(
        params['save_pred_dir'],
        params['init_model'].split('/')[-1].split('.')[0] + '_preds.csv')
    pred_df.to_csv(save_path, index=False)
    print("=> Save {} to {}".format(
        params['init_model'].split('/')[-1].split('.')[0] + '_preds.csv',
        save_path))

    logits_df = {'Id': y_ids, 'Class': y_preds, 'Logits': list(logits_preds)}
    # probability vectors, used for model embedding / ensembling
    logits_df = pd.DataFrame(logits_df)
    save_path = os.path.join(
        params['save_pred_dir'],
        params['init_model'].split('/')[-1].split('.')[0] + '_logits.csv')
    logits_df.to_csv(save_path, index=False)
    print("=> Save {} to {}".format(
        params['init_model'].split('/')[-1].split('.')[0] + '_logits.csv',
        save_path))
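# --- Hypothetical use of the Score column produced above: keep only
# high-confidence predictions, e.g. before recycling them as pseudo-labels.
# The threshold value and helper name are illustrative assumptions.
def filter_confident(pred_df, thresh=0.9):
    """Drop rows whose max softmax probability is below thresh."""
    return pred_df[pred_df['Score'] >= thresh].reset_index(drop=True)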
def train(params):
    if params['init_model'] is not None:
        model = torch.load(params['init_model'])
        print('load model', params['init_model'])
    else:
        model = create_model(params['Net'],
                             pretrained=params['pretrained'],
                             num_classes=params['num_classes'],
                             drop_rate=params['drop_rate'],
                             global_pool='avg',
                             bn_tf=False,
                             bn_momentum=0.99,
                             bn_eps=1e-3,
                             checkpoint_path=params['init_model'],
                             in_chans=3)
    optimizer = get_optimizer(params, model)
    param_num = sum([p.data.nelement() for p in model.parameters()])
    print("Number of model parameters: {} M".format(param_num / 1024 / 1024))
    model = model.to(device)
    model.train()

    if params['lr_schedule']:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=params['lr_decay_epochs'], gamma=0.2)

    if params['loss'] == 'ce' or params['loss'] == 'cross_entropy':
        criterion = cross_entropy().to(device)
        label_type = 'float'
    elif params['loss'] == 'focal':
        criterion = focal_loss(gamma=1.0, alpha=1.0).to(device)
        label_type = 'long'
    else:
        raise ValueError('unsupported loss: {}'.format(params['loss']))

    train_data_loader, dev_data_loader = get_iwildcam_loader(
        params, mode=params['mode'])

    train_log = []
    dev_log = []
    best_acc, best_f1, best_epoch = 0, 0, 0
    t1 = time()
    print('begin to train')
    use_onehot = params['loss'] != 'focal'
    for epoch in range(params['epochs']):
        train_loader = data_prefetcher(train_data_loader, label_type)
        inputs, labels, ids = train_loader.next()
        i = 0
        while inputs is not None:
            mixup_now = np.random.random() < params['aug_proba']
            if params['mixup'] and mixup_now:
                inputs, labels_a, labels_b, lam = mixup_data(
                    inputs, labels, params['mixup_alpha'])

            optimizer.zero_grad()
            output = model(inputs)
            if params['mixup'] and mixup_now:
                loss = mixup_criterion(criterion, output, labels_a, labels_b,
                                       lam)
            else:
                loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            if i % params['print_step'] == 0:
                preds = np.argmax(output.cpu().detach().numpy(), axis=1)
                if use_onehot:
                    targets = np.argmax(labels.cpu().detach().numpy(), axis=1)
                else:
                    targets = labels.cpu().detach().numpy()
                acc = metrics.accuracy_score(targets, preds)
                loss_val = loss.cpu().detach().numpy()
                f1 = metrics.f1_score(targets, preds, average='macro')
                train_log.append([epoch, i, loss_val, acc, f1])
                print(
                    "epoch: %d, iter: %d, train_loss: %.4f, train_acc: %.4f, train_f1: %.4f, time_cost_per_iter: %.4f s"
                    % (epoch, i, loss_val, acc, f1,
                       (time() - t1) / params['print_step']))
                with open(params['log_dir'] + 'train.tsv', 'a') as f:
                    f.write('%05d\t%05d\t%f\t%f\t%f\n' %
                            (epoch, i, loss_val, acc, f1))
                t1 = time()

            if (i + 1) % params['save_step'] == 0:
                save_model_path = os.path.join(
                    params['save_dir'], 'model_%d_%d.pkl' % (epoch, i))
                torch.save(model, save_model_path)
                print('save model to', save_model_path)

            if (i + 1) % params['eval_step'] == 0:
                t2 = time()
                model.eval()
                data_loader = data_prefetcher(dev_data_loader, label_type)
                loss_val, acc, f1 = evaluate(model, data_loader, criterion,
                                             use_onehot)
                model.train()
                dev_log.append([epoch, i, loss_val, acc, f1])
                if f1 > best_f1:
                    best_acc, best_f1, best_epoch = acc, f1, epoch
                print('[Evaluation] -------------------------------')
                print(
                    "epoch: %d, test acc: %.4f, f1-score: %.4f, loss: %.4f, best-f1-score: %.4f, eval_time: %.4f s"
                    % (epoch, acc, f1, loss_val, best_f1, time() - t2))
                print('[Evaluation] -------------------------------')
                with open(params['log_dir'] + 'eval.tsv', 'a') as f:
                    f.write('%05d\t%05d\t%f\t%f\t%f\n' %
                            (epoch, i, loss_val, acc, f1))

            inputs, labels, ids = train_loader.next()
            i += 1
        if params['lr_schedule']:
            scheduler.step(epoch)
    return model
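# --- For reference, a minimal sketch of the standard mixup helpers that
# train() calls (mixup_data / mixup_criterion), following Zhang et al.,
# "mixup: Beyond Empirical Risk Minimization". The repo's own helpers may
# differ in detail; the _sketch suffix marks these as illustrative.
def mixup_data_sketch(inputs, labels, alpha=1.0):
    # sample the mixing coefficient and blend each image with a random peer
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    index = torch.randperm(inputs.size(0), device=inputs.device)
    mixed = lam * inputs + (1 - lam) * inputs[index]
    return mixed, labels, labels[index], lam

def mixup_criterion_sketch(criterion, output, labels_a, labels_b, lam):
    # the loss is the same convex combination applied to the two label sets
    return lam * criterion(output, labels_a) + \
           (1 - lam) * criterion(output, labels_b)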
def main(cfg, kcross=-1, K=-1):
    tensorboard_dir = os.path.join(cfg.SAVE_DIR, "tb_event")
    if not os.path.exists(cfg.SAVE_DIR):
        os.makedirs(cfg.SAVE_DIR)
    else:
        print("This directory already exists; please remember to modify your configs")
        if not click.confirm(
                "\033[1;31;40mContinue and override the former directory?\033[0m",
                default=False,
        ):
            exit(0)
        if tensorboard_dir is not None and os.path.exists(tensorboard_dir):
            shutil.rmtree(tensorboard_dir)
    print("=> output model will be saved in {}".format(cfg.SAVE_DIR))
    tb_writer = SummaryWriter(tensorboard_dir)

    model = create_model(
        cfg.NET.TYPE,
        pretrained=cfg.NET.PRETRAINED,
        num_classes=cfg.NUM_CLASSES,
        drop_rate=cfg.NET.DROP_RATE,
        global_pool='avg',
        bn_tf=False,
        bn_momentum=0.99,
        bn_eps=1e-3,
        checkpoint_path=cfg.INIT_MODEL if cfg.INIT_MODEL != "" else None,
        in_chans=3)
    print(model)
    optimizer = get_optimizer(cfg, model)
    # use torchvision.models
    # model = models.__dict__[params['Net']](num_classes=params['num_classes'])
    param_num = sum([p.data.nelement() for p in model.parameters()])
    print("=> Number of model parameters: {} M".format(param_num / 1024 / 1024))
    model = model.cuda()
    # summary(model, (3, cfg.INPUT_SIZE[0], cfg.INPUT_SIZE[1]))
    model = DataParallel(model)

    train_data_loader, dev_data_loader = get_iwildcam_loader(
        cfg, mode=cfg.MODE)  # dataloaders for train/eval

    if cfg.TRAIN.LR_SCHEDULE == 'Step':
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.2)
    elif cfg.TRAIN.LR_SCHEDULE == 'Cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06)
    else:
        raise NotImplementedError(
            "Only the 'Step' and 'Cosine' lr schedules are supported")

    best_acc, best_f1, best_epoch, start_epoch = 0, 0, 0, 1
    # ------ Begin Resume -------
    if cfg.RESUME:
        load_ckpt(cfg.SAVE_DIR)  # read history parameters from json
        ckpt = torch.load(
            cfg.INIT_MODEL,
            map_location="cuda")  # already specified in load_params()
        print('=> Load checkpoint from ', cfg.INIT_MODEL)
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        scheduler.load_state_dict(ckpt['scheduler'])
        start_epoch = ckpt['epoch'] + 1
        # best_acc = ckpt['best_acc']
        best_f1 = ckpt['best_f1']
        best_epoch = ckpt['best_epoch']

    if cfg.LOSS.LOSS_TYPE == 'CE':
        criterion = cross_entropy(func_type='softmax').to(device)
        if cfg.LOSS.WEIGHT_PER_CLS:
            CE = torch.nn.CrossEntropyLoss(weight=torch.from_numpy(
                cfg.LOSS.WEIGHT_PER_CLS).float().to(device))
        label_type = 'float'
    elif cfg.LOSS.LOSS_TYPE == 'Sigmoid_CE':
        criterion = cross_entropy(func_type='sigmoid').to(device)
        label_type = 'float'
    elif cfg.LOSS.LOSS_TYPE == 'Focal':
        criterion = focal_loss(gamma=1.0, alpha=1.0).to(device)
        label_type = 'long'
    elif cfg.LOSS.LOSS_TYPE == 'CB_loss':
        # FIXME: this implementation is unverified and scores low
        criterion = cb_loss(cfg.LOSS.SAMPLES_PER_CLS, cfg.NUM_CLASSES,
                            'softmax').to(device)
        label_type = 'float'
    else:
        raise NotImplementedError("Not accessible loss type for: {}".format(
            cfg.LOSS.LOSS_TYPE))

    t0 = time.time()
    t1 = time.time()
    print('[INFO]Begin to train')
    use_onehot = cfg.LOSS.LOSS_TYPE != 'Focal'
    for epoch in range(start_epoch, cfg.TRAIN.EPOCHS + 1):
        print('=> Current Lr {:.5e}'.format(optimizer.param_groups[0]['lr']))
        if cfg.TRAIN.LR_SCHEDULE:
            scheduler.step()
        if cfg.LOSS.CLASS_WEIGHT:
            train_loss, train_acc, train_f1 = \
                train(train_data_loader, model, CE, optimizer, epoch, cfg,
                      label_type, use_onehot)
        else:
            train_loss, train_acc, train_f1 = \
                train(train_data_loader, model, criterion, optimizer, epoch,
                      cfg, label_type, use_onehot)
        val_acc, val_f1 = validate(dev_data_loader, model, criterion, cfg,
                                   label_type, use_onehot)
        # TODO: this should also be done with the ProgressMeter
        print('=> [Epoch-{}] * Acc {:.3f} F1 {:.3f}'.format(
            epoch, val_acc, val_f1))

        is_best = val_f1 > best_f1
        best_f1 = max(val_f1, best_f1)
        best_epoch = epoch if is_best else best_epoch

        tb_writer.add_scalar('train_loss', train_loss, epoch)
        tb_writer.add_scalar('val_metrics/val_acc', val_acc, epoch)
        tb_writer.add_scalar('val_metrics/val_f1-score', val_f1, epoch)

        save_model_path = os.path.join(cfg.SAVE_DIR,
                                       'model_{:03d}.pkl'.format(epoch))
        torch.save(
            {
                'state_dict': model.state_dict(),
                'scheduler': scheduler.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
                # 'best_acc': best_acc,
                'best_f1': best_f1,
                'best_epoch': best_epoch,
            }, save_model_path)
        print('=> save model to', save_model_path)

    print("=> Train is over, Time cost: {:.1f} hours...".format(
        (time.time() - t0) / 3600))

    # copy the best-f1 checkpoint to model_best.pkl
    source = 'model_{:03d}.pkl'.format(best_epoch)
    source_path = os.path.join(cfg.SAVE_DIR, source)
    target = 'model_best.pkl'
    target_path = os.path.join(cfg.SAVE_DIR, target)
    try:
        shutil.copy(source_path, target_path)
        print("Save best model to {}: [Epoch: {:d} / f1-score: {:.4f}]".format(
            target_path, best_epoch, best_f1))
    except IOError as e:
        print("Unable to copy file. %s" % e)
    except:
        print("Unexpected error:", sys.exc_info())

    # ---- Delete useless checkpoints
    ckpts = sorted(name for name in os.listdir(cfg.SAVE_DIR)
                   if name.startswith('model'))
    ckpts = ckpts[:-1]
    print("=> Start to clean checkpoint from {} to {}".format(
        ckpts[0], ckpts[-1]))
    for name in ckpts:
        os.remove(os.path.join(cfg.SAVE_DIR, name))

    if cfg.CROSS_VALIDATION:
        ksave_path = os.path.join(cfg.SAVE_DIR, 'kcross_model')
        if not os.path.exists(ksave_path):
            os.makedirs(ksave_path)
        kmodel_path = os.path.join(ksave_path, 'kcross_{}.pkl'.format(kcross))
        shutil.copy(target_path, kmodel_path)
        print("=> Save K-best model to {}...".format(kmodel_path))
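# --- Minimal sketch of a soft-target cross-entropy consistent with how
# cross_entropy(func_type='softmax' / 'sigmoid') is used above with float
# (one-hot or mixup-weighted) labels; the class name is an assumption and
# the repo's actual loss module may differ.
import torch.nn.functional as F

class SoftTargetCrossEntropySketch(torch.nn.Module):
    def __init__(self, func_type='softmax'):
        super().__init__()
        self.func_type = func_type

    def forward(self, logits, targets):
        if self.func_type == 'softmax':
            # cross-entropy of the predicted distribution against soft targets
            logp = F.log_softmax(logits, dim=-1)
            return -(targets * logp).sum(dim=-1).mean()
        # 'sigmoid': treat each class as an independent binary problem
        return F.binary_cross_entropy_with_logits(logits, targets)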
def multi_infer(cfg):
    model = create_model(
        cfg.NET.TYPE,
        pretrained=cfg.NET.PRETRAINED,
        num_classes=cfg.NUM_CLASSES,
        drop_rate=cfg.NET.DROP_RATE,
        global_pool='avg',
        bn_tf=False,
        bn_momentum=0.99,
        bn_eps=1e-3,
        checkpoint_path=cfg.INIT_MODEL if cfg.INIT_MODEL != "" else None,
        in_chans=3)
    print(model)
    # model = torch.load(cfg.INIT_MODEL, map_location="cuda" if torch.cuda.is_available() else "cpu")
    checkpoint = torch.load(
        cfg.INIT_MODEL,
        map_location="cuda" if torch.cuda.is_available() else "cpu")
    print('Load model', cfg.INIT_MODEL)
    # strip the 'module.' prefix that DataParallel adds to parameter names
    state_dict = checkpoint['state_dict']
    for k in list(state_dict.keys()):
        if k.startswith('module'):
            state_dict[k[len("module."):]] = state_dict[k]
            del state_dict[k]
    msg = model.load_state_dict(state_dict, strict=False)
    # model = model.to(device)
    model = model.cuda()
    model.eval()

    infer_loader = get_iwildcam_loader(cfg, mode='infer')
    infer_loader = data_prefetcher(infer_loader)
    y_preds, y_scores, y_ids = [], [], []
    logits_preds = []
    t1 = time()
    print('Begin to infer')
    with torch.no_grad():
        inputs, labels, ids = infer_loader.next()
        i = 0
        while inputs is not None:  # loop over the images in each batch
            output = model(inputs)  # logits vector
            output = torch.nn.functional.softmax(output, dim=-1)
            output = output.cpu().detach().numpy()  # probabilities
            logits_preds.extend(output)
            y_preds.extend(np.argmax(output, axis=1))  # list[class_id]
            y_scores.extend(np.max(output, axis=1))
            y_ids.extend(ids)  # image names: list[xxx_00000x.jpg]
            if (i + 1) % 40 == 0:
                print("iter: %d, time_cost_per_iter: %.4f s" %
                      (i, (time() - t1) / 40))
                t1 = time()
            i += 1
            inputs, labels, ids = infer_loader.next()

    O_ids = list(map(lambda x: x.split('_')[0], y_ids))
    # 'Id': [xxx_000.jpg, yyy_000.jpg], 'O_Id': [xxx, yyy], 'Class': [class_id], 'Score': [0.1]
    pred_df = {'Id': y_ids, 'O_Id': O_ids, 'Class': y_preds, 'Score': y_scores}
    pred_df = pd.DataFrame(pred_df)
    save_path = os.path.join(
        cfg.SAVE_PRED_DIR,
        cfg.INIT_MODEL.split('/')[-1].split('.')[0] + '_preds.csv')
    pred_df.to_csv(save_path, index=False)
    print("=> Save {} to {}".format(
        cfg.INIT_MODEL.split('/')[-1].split('.')[0] + '_preds.csv',
        save_path))

    logits_df = {
        'Id': y_ids,
        'Class': y_preds,
        'Logits': list(logits_preds)
    }  # probability vectors, used for model embedding / ensembling
    logits_df = pd.DataFrame(logits_df)
    save_path = os.path.join(
        cfg.SAVE_PRED_DIR,
        cfg.INIT_MODEL.split('/')[-1].split('.')[0] + '_logits.csv')
    logits_df.to_csv(save_path, index=False)
    print("=> Save {} to {}".format(
        cfg.INIT_MODEL.split('/')[-1].split('.')[0] + '_logits.csv',
        save_path))
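# --- Optional test-time augmentation sketch (not used by the code above):
# average softmax probabilities over the raw batch and its horizontal flip.
# torch.flip(x, dims=[3]) mirrors NCHW images left-right; the helper name is
# an illustrative assumption.
def tta_probs(model, inputs):
    p = torch.nn.functional.softmax(model(inputs), dim=-1)
    p_flip = torch.nn.functional.softmax(
        model(torch.flip(inputs, dims=[3])), dim=-1)
    return (p + p_flip) / 2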
def multi_inferv2(params):
    # read the parameters saved by the training script: parameters.json
    with open(os.path.join(params['save_pred_dir'],
                           'parameters.json')) as file:
        train_params = json.load(file)
    params['backbone'] = train_params['backbone']
    params['CAT_NUM'] = train_params['CAT_NUM']
    params['PROPOSAL_NUM'] = train_params['PROPOSAL_NUM']

    ckpt = torch.load(params['init_model'])
    print('=> Load model', params['init_model'])
    model = NTS.attention_net(params,
                              CAT_NUM=params['CAT_NUM'],
                              topN=params['PROPOSAL_NUM'])
    model.load_state_dict(ckpt['state_dict'])
    model = model.cuda()
    model.eval()

    infer_loader = get_iwildcam_loader(params, mode=params['mode'])
    infer_loader = data_prefetcher(infer_loader)
    y_preds, y_scores, y_ids = [], [], []
    logits_preds = []
    t1 = time()
    print('=> Begin to infer')
    with torch.no_grad():
        inputs, labels, ids = infer_loader.next()
        i = 0
        while inputs is not None:  # iterate over the images in each batch
            _, concat_logits, _, _, _ = model(inputs)
            output = torch.nn.functional.softmax(concat_logits, dim=-1)
            output = output.cpu().detach().numpy()
            logits_preds.extend(output)  # post-softmax probability vectors
            y_preds.extend(np.argmax(output, axis=1))  # list[class_id]
            y_scores.extend(np.max(output, axis=1))
            y_ids.extend(ids)  # image names: list[xxx_00000x.jpg]
            if (i + 1) % params['print_step'] == 0:
                print("iter: %d, time_cost_per_iter: %.4f s" %
                      (i, (time() - t1) / params['print_step']))
                t1 = time()
            i += 1
            inputs, labels, ids = infer_loader.next()

    O_ids = list(map(lambda x: x.split('_')[0], y_ids))
    # 'Id': [xxx.jpg, yyy.jpg], 'Class': [class_id], 'Score': [0.1]
    print("=> Pred Data Len: {}".format(len(y_ids)))
    pred_df = {'Id': y_ids, 'Class': y_preds, 'Score': y_scores}
    pred_df = pd.DataFrame(pred_df)
    save_path = params['save_pred_dir'] + params['init_model'].split(
        '/')[-1].split('.')[0] + '_preds.csv'
    pred_df.to_csv(save_path, index=False)
    print("=> Save {} to {}".format(
        params['init_model'].split('/')[-1].split('.')[0] + '_preds.csv',
        save_path))

    logits_df = {
        'Id': y_ids,
        'Class': y_preds,
        'Logits': list(logits_preds)
    }  # probability vectors, used for model embedding / ensembling
    logits_df = pd.DataFrame(logits_df)
    save_path = params['save_pred_dir'] + params['init_model'].split(
        '/')[-1].split('.')[0] + '_logits.csv'
    logits_df.to_csv(save_path, index=False)
    print("=> Save {} to {}".format(
        params['init_model'].split('/')[-1].split('.')[0] + '_logits.csv',
        save_path))
    print('pred done', pred_df.shape)
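# --- For reference, a minimal sketch of the training-side counterpart that
# would produce the parameters.json consumed above; the function name is an
# assumption, and only the keys multi_inferv2 actually reads are written.
def save_train_params_sketch(params, save_dir):
    keep = {k: params[k] for k in ('backbone', 'CAT_NUM', 'PROPOSAL_NUM')}
    with open(os.path.join(save_dir, 'parameters.json'), 'w') as f:
        json.dump(keep, f, indent=2)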
def main(cfg):
    tensorboard_dir = os.path.join(cfg.SAVE_DIR, "tb_event")
    if not os.path.exists(cfg.SAVE_DIR):
        os.makedirs(cfg.SAVE_DIR)
    else:
        print("This directory already exists; please remember to modify your configs")
        if not click.confirm(
                "\033[1;31;40mContinue and override the former directory?\033[0m",
                default=False,
        ):
            exit(0)
        if tensorboard_dir is not None and os.path.exists(tensorboard_dir):
            shutil.rmtree(tensorboard_dir)
    print("=> output model will be saved in {}".format(cfg.SAVE_DIR))
    tb_writer = SummaryWriter(tensorboard_dir)

    model = NTS.attention_net(cfg,
                              CAT_NUM=cfg.NET.CAT_NUM,
                              topN=cfg.NET.PROPOSAL_NUM)
    print(model)

    # special for NTS: four parameter groups, each with its own SGD optimizer
    raw_parameters = list(model.pretrained_model.parameters())
    part_parameters = list(model.proposal_net.parameters())
    concat_parameters = list(model.concat_net.parameters())
    partcls_parameters = list(model.partcls_net.parameters())
    raw_optimizer = torch.optim.SGD(raw_parameters,
                                    lr=cfg.TRAIN.LR,
                                    momentum=0.9,
                                    weight_decay=cfg.TRAIN.WEIGHT_DECAY)
    concat_optimizer = torch.optim.SGD(concat_parameters,
                                       lr=cfg.TRAIN.LR,
                                       momentum=0.9,
                                       weight_decay=cfg.TRAIN.WEIGHT_DECAY)
    part_optimizer = torch.optim.SGD(part_parameters,
                                     lr=cfg.TRAIN.LR,
                                     momentum=0.9,
                                     weight_decay=cfg.TRAIN.WEIGHT_DECAY)
    partcls_optimizer = torch.optim.SGD(partcls_parameters,
                                        lr=cfg.TRAIN.LR,
                                        momentum=0.9,
                                        weight_decay=cfg.TRAIN.WEIGHT_DECAY)

    param_num = sum([p.data.nelement() for p in model.parameters()])
    print("Number of model parameters: {} M".format(param_num / 1024 / 1024))
    model = model.cuda()
    model = DataParallel(model)
    model.train()

    train_data_loader, dev_data_loader = get_iwildcam_loader(
        cfg, mode=cfg.MODE)  # dataloaders for train/eval

    if cfg.TRAIN.LR_SCHEDULE == "Step":
        schedulers = [
            MultiStepLR(raw_optimizer,
                        milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.1),
            MultiStepLR(concat_optimizer,
                        milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.1),
            MultiStepLR(part_optimizer,
                        milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.1),
            MultiStepLR(partcls_optimizer,
                        milestones=cfg.TRAIN.LR_DECAY_EPOCHS, gamma=0.1)
        ]
    elif cfg.TRAIN.LR_SCHEDULE == "Cosine":
        schedulers = [
            CosineAnnealingLR(raw_optimizer,
                              T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06),
            CosineAnnealingLR(concat_optimizer,
                              T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06),
            CosineAnnealingLR(part_optimizer,
                              T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06),
            CosineAnnealingLR(partcls_optimizer,
                              T_max=(cfg.TRAIN.EPOCHS // 9) + 1, eta_min=1e-06)
        ]
    else:
        raise NotImplementedError(
            "Only the 'Step' and 'Cosine' lr schedules are supported")

    best_acc, best_f1, best_epoch, best_iter, start_epoch = 0, 0, 0, 0, 1
    dev_log = []
    # ------ Begin Resume -------
    if cfg.RESUME:
        load_ckpt(cfg.SAVE_DIR)  # read history parameters from json
        ckpt = torch.load(
            cfg.INIT_MODEL,
            map_location="cuda")  # already specified in load_params()
        print('=> Load checkpoint from ', cfg.INIT_MODEL)
        model.load_state_dict(ckpt['state_dict'])
        raw_optimizer.load_state_dict(ckpt['raw_optimizer'])
        part_optimizer.load_state_dict(ckpt['part_optimizer'])
        concat_optimizer.load_state_dict(ckpt['concat_optimizer'])
        partcls_optimizer.load_state_dict(ckpt['partcls_optimizer'])
        for scheduler, state in zip(schedulers, ckpt['schedulers']):
            scheduler.load_state_dict(state)
        start_epoch = ckpt['epoch'] + 1
        # best_acc = ckpt['best_acc']
        best_f1 = ckpt['best_f1']
        best_epoch = ckpt['best_epoch']

    if cfg.LOSS.LOSS_TYPE == 'CE':
        criterion = cross_entropy(func_type='softmax').to(device)
        if cfg.LOSS.WEIGHT_PER_CLS:
            CE = torch.nn.CrossEntropyLoss(weight=torch.from_numpy(
                cfg.LOSS.WEIGHT_PER_CLS).float().to(device))
        label_type = 'float'
    elif cfg.LOSS.LOSS_TYPE == 'Sigmoid_CE':
        criterion = cross_entropy(func_type='sigmoid').to(device)
        label_type = 'float'
    elif cfg.LOSS.LOSS_TYPE == 'Focal':
        criterion = focal_loss(gamma=1.0, alpha=1.0).to(device)
        label_type = 'long'
    elif cfg.LOSS.LOSS_TYPE == 'CB_loss':
        # FIXME: this implementation is unverified and scores low
        criterion = cb_loss(cfg.LOSS.SAMPLES_PER_CLS, cfg.NUM_CLASSES,
                            'softmax').to(device)
        label_type = 'float'
    else:
        raise NotImplementedError("Not accessible loss type for: {}".format(
            cfg.LOSS.LOSS_TYPE))

    t0 = time()
    t1 = time()
    it = 0
    print('[INFO]Begin to train')
    use_onehot = cfg.LOSS.LOSS_TYPE != 'Focal'
    for epoch in range(start_epoch, cfg.TRAIN.EPOCHS + 1):
        print('=> Current Lr {:.5e}'.format(
            raw_optimizer.param_groups[0]['lr']))
        if cfg.TRAIN.LR_SCHEDULE:
            for scheduler in schedulers:
                scheduler.step()

        train_loader = data_prefetcher(train_data_loader, label_type)
        inputs, labels, ids = train_loader.next()  # ids are not used
        i = 0
        batch_time = AverageMeter('Time', ':6.3f')
        data_time = AverageMeter('Data', ':6.3f')
        losses = AverageMeter('Loss', ':.4e')
        train_acc = AverageMeter('Acc', ':6.2f')
        train_f1 = AverageMeter('F1', ':6.2f')
        progress = ProgressMeter(
            len(train_data_loader),
            [batch_time, data_time, losses, train_acc, train_f1],
            prefix="Epoch: [{}]".format(epoch))
        while inputs is not None:
            bs = inputs.size(0)
            # mixup is not implemented for NTS yet (see TODO below), so the
            # mixup path stays disabled
            mixup_now = False
            # mixup_now = np.random.random() < cfg.AUG.AUG_PROBA
            # if cfg.AUG.MIXUP and mixup_now:
            #     inputs, labels_a, labels_b, lam = mixup_data(inputs, labels, cfg.AUG.MIXUP_ALPHA)

            raw_optimizer.zero_grad()
            part_optimizer.zero_grad()
            concat_optimizer.zero_grad()
            partcls_optimizer.zero_grad()
            raw_logits, concat_logits, part_logits, _, top_n_prob = model(inputs)

            if cfg.AUG.MIXUP and mixup_now:
                # TODO: implement NTS with mixup; the mixed images must also
                # be scored against the mixup objective, e.g.
                # loss = mixup_criterion(criterion, output, labels_a, labels_b, lam)
                pass
            else:
                part_loss = NTS.list_loss(
                    part_logits.view(bs * cfg.NET.PROPOSAL_NUM, -1),
                    labels.max(axis=1)[1].unsqueeze(1).repeat(
                        1, cfg.NET.PROPOSAL_NUM).view(-1)).view(
                            bs, cfg.NET.PROPOSAL_NUM)
                raw_loss = criterion(raw_logits, labels)
                concat_loss = criterion(concat_logits, labels)
                rank_loss = NTS.ranking_loss(
                    top_n_prob, part_loss, proposal_num=cfg.NET.PROPOSAL_NUM)
                CE = torch.nn.CrossEntropyLoss()
                partcls_loss = CE(
                    part_logits.view(bs * cfg.NET.PROPOSAL_NUM, -1),
                    labels.max(axis=1)[1].unsqueeze(1).repeat(
                        1, cfg.NET.PROPOSAL_NUM).view(-1))
                # part_logits: (256, 6, 209) => (1536, 209); labels: (1536,)
                total_loss = raw_loss + rank_loss + concat_loss + partcls_loss

            total_loss.backward()
            raw_optimizer.step()
            part_optimizer.step()
            concat_optimizer.step()
            partcls_optimizer.step()

            if i % cfg.PRINT_STEP == 0:
                preds = np.argmax(concat_logits.cpu().detach().numpy(),
                                  axis=1)  # argmax on logits
                if use_onehot:
                    targets = np.argmax(labels.cpu().detach().numpy(), axis=1)
                else:
                    targets = labels.cpu().detach().numpy()
                acc = metrics.accuracy_score(targets, preds)
                loss = concat_loss
                loss_val = loss.item()
                f1 = metrics.f1_score(targets, preds, average='macro')
                # print("epoch: %d, iter: %d, train_loss: %.4f, train_acc: %.4f, "
                #       "train_f1: %.4f, lr_rate: %.1e, time_cost_per_iter: %.4f s"
                #       % (epoch, i, loss_val, acc, f1,
                #          raw_optimizer.param_groups[0]['lr'],
                #          (time() - t1) / cfg.PRINT_STEP))
                tb_writer.add_scalar('train_loss', loss_val, it)
                t1 = time()

            if (i + 1) % cfg.EVAL_STEP == 0:  # cfg.EVAL_STEP assumed, e.g. 95
                t2 = time()
                model.eval()
                data_loader = data_prefetcher(dev_data_loader, label_type)
                loss_val, acc, f1 = evaluate(model, data_loader, criterion,
                                             use_onehot)
                model.train()
                dev_log.append([epoch, i, acc, f1])
                if f1 > best_f1:
                    best_acc, best_f1, best_iter, best_epoch = acc, f1, i, epoch
                print('[Evaluation] -------------------------------')
                print(
                    "epoch: %d, test acc: %.4f, f1-score: %.4f, best-f1-score: %.4f, eval_time: %.4f s"
                    % (epoch, acc, f1, best_f1, time() - t2))
                print('[Evaluation] -------------------------------')
                tb_writer.add_scalar('val_metrics/val_acc', acc, it)
                tb_writer.add_scalar('val_metrics/val_f1-score', f1, it)
                tb_writer.add_scalar('val_metrics/val_loss', loss_val, it)
                # write eval metrics next to the checkpoints
                with open(os.path.join(cfg.SAVE_DIR, 'eval.tsv'), 'a') as f:
                    f.write('%05d\t%05d\t%f\t%f\n' % (epoch, i, acc, f1))

                save_model_path = os.path.join(
                    cfg.SAVE_DIR, 'model_%d_%d.pkl' % (epoch, i))
                # torch.save(model, save_model_path)  # FIXME: bad for multi-gpu; save state dicts instead
                torch.save(
                    {
                        'state_dict': model.module.state_dict(),
                        'schedulers': [s.state_dict() for s in schedulers],
                        'raw_optimizer': raw_optimizer.state_dict(),
                        'part_optimizer': part_optimizer.state_dict(),
                        'concat_optimizer': concat_optimizer.state_dict(),
                        'partcls_optimizer': partcls_optimizer.state_dict(),
                        'epoch': epoch,  # needed by the resume logic above
                        'best_f1': best_f1,
                        'best_epoch': best_epoch,
                    }, save_model_path)
                print('[INFO]save model to', save_model_path)

            inputs, labels, ids = train_loader.next()
            i += 1
            it += 1

    print("[INFO]Train is over, Time cost: %.1f hours..." %
          ((time() - t0) / 3600))

    # copy the best-f1 checkpoint to model_best.pkl
    source = 'model_%d_%d.pkl' % (best_epoch, best_iter)
    source_path = os.path.join(cfg.SAVE_DIR, source)
    target = 'model_best.pkl'
    target_path = os.path.join(cfg.SAVE_DIR, target)
    try:
        shutil.copy(source_path, target_path)
        print("Save best model to {}: [epoch-iter: {:d}-{:d} / f1-score: {:.4f}]".format(
            target_path, best_epoch, best_iter, best_f1))
    except IOError as e:
        print("Unable to copy file. %s" % e)
    except:
        print("Unexpected error:", sys.exc_info())

    # ---- Delete useless checkpoints
    ckpts = sorted(name for name in os.listdir(cfg.SAVE_DIR)
                   if name.startswith('model'))
    ckpts = ckpts[:-1]
    print("=> Start to clean checkpoint from {} to {}".format(
        ckpts[0], ckpts[-1]))
    for name in ckpts:
        os.remove(os.path.join(cfg.SAVE_DIR, name))
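# --- Minimal sketches of the two NTS-Net auxiliary losses used above,
# consistent with the public NTS-Net reference implementation; the repo's
# NTS module may differ in detail. list_loss scores each proposal by the
# negative log-likelihood of the true class; ranking_loss is a pairwise
# hinge that pushes the navigator's proposal scores (top_n_prob) to rank
# proposals the same way the teacher's per-proposal confidences do.
import torch.nn.functional as F

def list_loss_sketch(logits, targets):
    # logits: (bs * proposal_num, num_classes); targets: (bs * proposal_num,)
    logp = F.log_softmax(logits, dim=-1)
    return -logp[torch.arange(logits.size(0)), targets]

def ranking_loss_sketch(score, targets, proposal_num):
    # score, targets: (bs, proposal_num); higher target = worse proposal
    loss = score.new_zeros(())
    for i in range(proposal_num):
        # mask of proposals that are worse (higher teacher loss) than proposal i
        worse = (targets > targets[:, i].unsqueeze(1)).float()
        pivot = score[:, i].unsqueeze(1)
        # hinge: a worse proposal should not out-score the pivot within margin 1
        loss = loss + (F.relu(1 - pivot + score) * worse).sum()
    return loss / score.size(0)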