def forward2(self, inputs, bboxes):
    """Extract features and predict mask probabilities for externally supplied boxes.

    inputs: input batch tensor, presumably (N, C, D, H, W) 3-D volumes — confirm.
    bboxes: per-sample sequence of boxes in center format.

    Side effects: sets ``self.crop_boxes`` and ``self.mask_probs``.
    """
    features = data_parallel(self.feature_net, (inputs))
    # print('fs[-1] ', fs[-1].shape)
    fs = features[-1]

    # Build rows of (batch_index, box...) so boxes of all samples form one flat array.
    self.crop_boxes = []
    for b in range(len(bboxes)):
        # FIX: the first column must carry the batch index b and have exactly
        # len(bboxes[b]) rows. The original `np.zeros((len(bboxes[b]) + b, 1))`
        # produced a zero column with the wrong row count, which makes
        # np.column_stack raise for every sample after the first and loses
        # the batch index entirely.
        self.crop_boxes.append(
            np.column_stack((np.zeros((len(bboxes[b]), 1)) + b, bboxes[b])))
    self.crop_boxes = np.concatenate(self.crop_boxes, 0)

    # Center format -> corner format, snap extents to a multiple of 8, clip to
    # the input volume. Column 0 is the batch index, so only 1:-1 are coords.
    # NOTE(review): assumes the last column is not a coordinate — confirm box layout.
    self.crop_boxes[:, 1:-1] = center_box_to_coord_box(self.crop_boxes[:, 1:-1])
    self.crop_boxes = self.crop_boxes.astype(np.int32)
    self.crop_boxes[:, 1:-1] = ext2factor(self.crop_boxes[:, 1:-1], 8)
    self.crop_boxes[:, 1:-1] = clip_boxes(self.crop_boxes[:, 1:-1],
                                          inputs.shape[2:])
    # self.mask_targets = make_mask_target(self.cfg, self.mode, inputs, self.crop_boxes,
    #                                      truth_boxes, truth_labels, masks)

    # Make sure to keep feature maps not splitted by data parallel: replicate
    # each feature map once per GPU so data_parallel does not slice it along
    # the batch dimension.
    features = [
        t.unsqueeze(0).expand(torch.cuda.device_count(), -1, -1, -1, -1, -1)
        for t in features
    ]

    self.mask_probs = data_parallel(
        self.mask_head, (torch.from_numpy(self.crop_boxes).cuda(), features))
    self.mask_probs = crop_mask_regions(self.mask_probs, self.crop_boxes)
def forward_mask(self, inputs, truth_boxes, truth_labels, truth_masks, masks,
                 split_combiner=None, nzhw=None):
    """Extract features, build ground-truth crop boxes and predict mask probabilities.

    In 'train'/'valid' mode also builds ``self.mask_targets`` from the ground truth.
    ``split_combiner`` / ``nzhw`` are accepted for interface compatibility but unused here.

    Side effects: sets ``self.crop_boxes``, optionally ``self.mask_targets``,
    and ``self.mask_probs``.
    """
    features, feat_4 = data_parallel(self.feature_net, (inputs))
    # print('fs[-1] ', fs[-1].shape)
    fs = features[-1]

    # keep batch index, z, y, x, d, h, w, class
    self.crop_boxes = []
    for b in range(len(truth_boxes)):
        # FIX: the first column must carry the batch index b and have exactly
        # len(truth_boxes[b]) rows. The original
        # `np.zeros((len(truth_boxes[b]) + b, 1))` had the wrong row count
        # (np.column_stack raises for b > 0) and dropped the batch index.
        self.crop_boxes.append(
            np.column_stack((np.zeros((len(truth_boxes[b]), 1)) + b,
                             truth_boxes[b], truth_labels[b])))
    self.crop_boxes = np.concatenate(self.crop_boxes, 0)

    # Center format -> corner format, snap to a multiple of 4, clip to volume.
    self.crop_boxes[:, 1:-1] = center_box_to_coord_box(self.crop_boxes[:, 1:-1])
    self.crop_boxes = self.crop_boxes.astype(np.int32)
    self.crop_boxes[:, 1:-1] = ext2factor(self.crop_boxes[:, 1:-1], 4)
    self.crop_boxes[:, 1:-1] = clip_boxes(self.crop_boxes[:, 1:-1],
                                          inputs.shape[2:])
    # if self.mode in ['eval', 'test']:
    #     self.crop_boxes = top1pred(self.crop_boxes)
    # else:
    #     self.crop_boxes = random1pred(self.crop_boxes)

    # Mask targets only exist when ground truth is available.
    if self.mode in ['train', 'valid']:
        self.mask_targets = make_mask_target(self.cfg, self.mode, inputs,
                                             self.crop_boxes, truth_boxes,
                                             truth_labels, masks)

    # Make sure to keep feature maps not splitted by data parallel: replicate
    # each feature map once per GPU so data_parallel does not slice it along
    # the batch dimension.
    features = [
        t.unsqueeze(0).expand(torch.cuda.device_count(), -1, -1, -1, -1, -1)
        for t in features
    ]

    self.mask_probs = data_parallel(
        self.mask_head, (torch.from_numpy(self.crop_boxes).cuda(), features))
    # NOTE: the mask_nms post-processing for eval/test was disabled upstream:
    # if self.mode in ['eval', 'test']:
    #     mask_keep = mask_nms(self.cfg, self.mode, self.mask_probs, self.crop_boxes, inputs)
    #     self.crop_boxes = self.crop_boxes[mask_keep]
    #     self.detections = self.detections[mask_keep]
    #     self.mask_probs = self.mask_probs[mask_keep]
    self.mask_probs = crop_mask_regions(self.mask_probs, self.crop_boxes)
def forward_mask(self, images, rcnn_proposals):
    """Predict instance masks for the given RCNN proposals.

    Side effects: sets self.rcnn_proposals, self.detections, self.masks and,
    when proposals exist, self.mask_logits / self.mask_instances /
    self.mask_proposals.
    """
    self.rcnn_proposals = rcnn_proposals

    feature_maps = data_parallel(self.feature_net, images)
    self.detections = self.rcnn_proposals
    self.masks = make_empty_masks(self.cfg, self.mode, images)

    # Nothing to segment — keep the empty masks and bail out early.
    if len(self.rcnn_proposals) == 0:
        return

    crops = self.mask_crop(feature_maps, self.detections)
    self.mask_logits = data_parallel(self.mask_head, crops)
    self.masks, self.mask_instances, self.mask_proposals = mask_nms(
        self.cfg, images, self.rcnn_proposals, self.mask_logits)
    # Detections are replaced by the NMS-filtered mask proposals.
    self.detections = self.mask_proposals
def test_multi_gpu(self):
    """Smoke-test torch's functional data_parallel on a tiny 3-D conv graph over 4 GPUs."""
    import torch
    import torch.nn as nn
    from torch.nn.parallel.data_parallel import data_parallel
    from inferno.extensions.containers.graph import Graph

    batch_shape = [8, 1, 3, 128, 128]

    # Two stacked 3-D convolutions wired input -> conv0 -> conv1 -> output.
    graph = Graph() \
        .add_input_node('input') \
        .add_node('conv0', nn.Conv3d(1, 10, 3, padding=1), previous='input') \
        .add_node('conv1', nn.Conv3d(10, 1, 3, padding=1), previous='conv0') \
        .add_output_node('output', previous='conv1')
    graph.cuda()

    batch = torch.rand(*batch_shape).cuda()
    # Forward only — the test passes if no device error is raised.
    data_parallel(graph, batch, device_ids=[0, 1, 2, 3])
def get_features(dataloader_embed, model, num_classes):
    """Collect embedding features for every sample in ``dataloader_embed``.

    Returns ``(ids, feats)`` where ids excludes the placeholder label equal to
    ``num_classes`` ('new_whale' — not a proper identity, it groups all whales
    with no known ID), and feats keeps two feature vectors per retained id.

    NOTE(review): relies on a module-level flag ``test`` defined elsewhere in
    this file — confirm it exists at call time.
    """
    # Acquire the features to use for embedding.
    out_ids = []
    out_features = []
    for store in dataloader_embed:
        images, labels, names = store
        images = images.cuda()
        labels = labels.cuda().long()
        if not (test):
            # Multi-GPU forward through the model.
            global_feat, local_feat, results = data_parallel(model, images)
        else:
            global_feat, local_feat, results = model.forward(images)
        out_ids.extend(labels.cpu().long().numpy())
        out_features.extend(global_feat.data.cpu().numpy())

    # Drop every 'new_whale' entry; those are not proper IDs, i.e. different
    # whales are grouped under this label when they have no known identity.
    ids, feats = [], []
    for ind, x in enumerate(out_ids):
        if int(x) == num_classes:
            continue
        ids.append(x)
        # Assumes exactly two feature rows per id (e.g. original + flipped
        # image emitted by the dataloader) — TODO confirm against the loader.
        y = out_features[ind * 2]
        y2 = out_features[ind * 2 + 1]
        feats.append(y)
        feats.append(y2)
    return ids, feats
def valid_eval(config, model, dataLoader_valid):
    """Run one validation pass.

    Returns (mean loss, mean top-1 accuracy, mean map@5), each averaged over
    the number of batches in the loader.
    """
    total_loss = 0.
    total_top1 = 0.
    total_map5 = 0.
    with torch.no_grad():
        if config.train.enable_eval_on_val:
            model.eval()
        model.mode = 'valid'
        for step, (images, labels) in enumerate(dataLoader_valid):
            images = images.cuda()
            labels = labels.cuda().long()
            global_feat, local_feat, logits = data_parallel(model, images)
            # getLoss stores the batch loss on model.loss; verbose printout
            # only every config.loss.verbose_interval batches.
            model.getLoss(global_feat, local_feat, logits, labels, config,
                          verbose=(step % config.loss.verbose_interval == 0))
            total_loss += model.loss
            probs = torch.sigmoid(logits)
            total_top1 += accuracy(probs, labels, topk=[1])[0]
            total_map5 += mapk(labels, probs, k=5)
    n_batches = len(dataLoader_valid)
    return (total_loss / n_batches, total_top1 / n_batches,
            total_map5 / n_batches)
def predict(args):
    """Run inference on args.test_file with a checkpointed PepCNN.

    Prints overall accuracy and, when args.predict_file is set, writes a TSV
    with the top-k class indices and their probabilities per row.
    """
    model = PepCNN(num_class=args.num_classes)
    load_checkpoint(args.checkpoint_path, model)
    model.cuda()
    model.eval()

    probs = []
    topks = []
    predict_data = PepseqDataset(args.test_file)
    data_loader = data.DataLoader(predict_data, batch_size=args.batch_size)
    corrects = 0
    for batch in tqdm.tqdm(data_loader):
        feature, target = batch[0], batch[1]
        # feature.data.t_(), target.data.sub_(1)  # batch first, index align
        feature, target = feature.cuda(), target.cuda()
        with torch.no_grad():
            logit = data_parallel(model, feature)  # multi-GPU forward
            prob = F.softmax(logit, 1)
        # FIX: removed stray debug `print(logit)` that dumped the raw logits
        # tensor on every batch.
        corrects += (torch.max(prob, 1)[1].view(
            target.size()).data == target.data).sum()
        # logit_5 actually holds the top-k *probabilities* (softmax output).
        logit_5, top5 = torch.topk(prob.data.cpu(), args.topk)
        for i, l in enumerate(logit_5):
            probs.append(l.numpy())
            topks.append(top5[i].numpy())

    size = len(data_loader.dataset)
    accuracy = 100 * corrects.data.cpu().numpy() / size
    print("acc: {:.4f}%({}/{})".format(accuracy, corrects, size))

    if args.predict_file:
        df = pd.read_csv(args.test_file, sep='\t', header=None)
        df["probs"] = probs
        df["topk"] = topks
        # NOTE(review): assumes the test TSV has at least 3 unnamed columns
        # (indices 0 and 2 are written out) — confirm the file layout.
        df.to_csv(args.predict_file, columns=[2, 0, "topk", "probs"])
def train(self, data_loader, set_log=True):
    """Train self.model for one epoch over data_loader.

    Logs loss/accuracy every config.log_step batches to stdout, the logger
    (when set_log) and the summary writer, then bumps self.current_epoch.
    """
    self.model.train()
    for step, (inputs, targets) in enumerate(data_loader):
        inputs = inputs.to(self.device)
        targets = targets.to(self.device)

        # logits = self.model(x)
        logits = data_parallel(self.model, inputs, device_ids=self.device_ids)
        loss = self.criteria(logits, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Targets appear to be one-hot (argmax over y) — accuracy is the
        # fraction of matching argmaxes.
        hits = torch.argmax(logits, dim=1) == torch.argmax(targets, dim=1)
        acc = hits.type(torch.float32).mean()

        if step % config.log_step == 0:
            lin = 'epoch: {:0>2}, i: {:0>5}, loss: {:.3}, accuracy: {:.3}'.format(
                self.current_epoch, step, loss.item(), acc.item())
            print(lin)
            if set_log:
                self.logger.info(lin)
            global_step = self.current_epoch * config.batch_size + step
            self.writer.add_scalar('/train/loss', loss.item(),
                                   global_step=global_step)
            self.writer.add_scalar('/train/accuracy', acc.item(),
                                   global_step=global_step)
    self.current_epoch += 1
def get_probas(id, net, tile_image, tile, flip_predict, start_timer, log,
               tile_size, tile_average_step, tile_scale, tile_min_score):
    """Predict a probability mask for one image from its tiles.

    Tiles are processed in chunks of ~4; predictions are optionally averaged
    over x/y flips (test-time augmentation), then stitched back to the small
    image resolution with `to_mask`.

    Note: `id` (kept for interface compatibility) shadows the builtin.
    Returns the aggregated probability map (height x width).
    """
    tile_probability = []
    # FIX: guard against fewer than 4 tiles — `len(tile_image) // 4` would be
    # 0 and np.array_split raises on zero sections.
    batch = np.array_split(tile_image, max(1, len(tile_image) // 4))
    for t, m in enumerate(batch):
        print('\r %s %d / %d %s' %
              (id, t, len(batch), time_to_str(timer() - start_timer, 'sec')),
              end='',
              flush=True)
        m = torch.from_numpy(m).cuda()

        p = []
        with torch.no_grad():
            # inference on the base image
            logit = data_parallel(net, m)
            p.append(torch.sigmoid(logit))

            if flip_predict:
                # inference on images flipped along the x / y axes
                for _dim in [(2, ), (3, ), (2, 3)]:
                    _logit = data_parallel(net, m.flip(dims=_dim))
                    # FIX: apply sigmoid before un-flipping — the original
                    # averaged raw logits with probabilities, corrupting the
                    # TTA mean (compare the single-image variant of this
                    # function, which applies sigmoid).
                    p.append(torch.sigmoid(_logit).flip(dims=_dim))
            p = torch.stack(p).mean(0)

        tile_probability.append(p.data.cpu().numpy())
    print('\r', end='', flush=True)
    log.write('%s %d / %d %s\n' %
              (id, t, len(batch), time_to_str(timer() - start_timer, 'sec')))

    # before squeeze, dimension = N_tiles x 1 x tile_x x tile_y
    tile_probability = np.concatenate(tile_probability).squeeze(
        1)  # N_tiles x tile_x x tile_y
    height, width = tile['image_small'].shape[:2]
    probability = to_mask(
        tile_probability,  # height x width
        tile['coord'],
        height,
        width,
        tile_scale,
        tile_size,
        tile_average_step,
        tile_min_score,
        aggregate='mean')
    return probability
def get_probas(net, tile_image, flip_predict):
    """Compute per-pixel probabilities for a single image.

    When flip_predict is set, predictions are averaged over the image flipped
    along the x and y axes (test-time augmentation).
    Returns the probability map as a numpy array.
    """
    batch = torch.from_numpy(tile_image[np.newaxis, ...]).cuda()

    with torch.no_grad():
        # Prediction on the unmodified image.
        predictions = [torch.sigmoid(data_parallel(net, batch))]

        if flip_predict:
            # Predictions on the flipped images, un-flipped before averaging.
            for axes in [(2, ), (3, ), (2, 3)]:
                flipped_logit = data_parallel(net, batch.flip(dims=axes))
                predictions.append(torch.sigmoid(flipped_logit).flip(dims=axes))

        mean_prob = torch.stack(predictions).mean(0)

    mean_prob = mean_prob.squeeze()
    return mean_prob.data.cpu().numpy()
def do_valid(net, valid_loader):
    """Run validation over the whole loader.

    Returns [dice, binary-cross-entropy loss, tp accuracy, tn accuracy]
    computed on the concatenated predictions and masks.
    """
    net = net.eval()

    seen = 0
    prob_chunks = []
    mask_chunks = []
    start_timer = timer()
    with torch.no_grad():
        for step, batch in enumerate(valid_loader):
            n = len(batch['index'])
            mask = batch['mask']
            images = batch['image'].cuda()

            logit = data_parallel(net, images)  # net(input)#
            prob_chunks.append(torch.sigmoid(logit).data.cpu().numpy())
            mask_chunks.append(mask.data.cpu().numpy())
            seen += n

            # Progress line, overwritten in place.
            print('\r %8d / %d %s' %
                  (seen, len(valid_loader.dataset),
                   time_to_str(timer() - start_timer, 'sec')),
                  end='',
                  flush=True)

    assert (seen == len(valid_loader.dataset))

    probability = np.concatenate(prob_chunks)
    mask = np.concatenate(mask_chunks)
    loss = np_binary_cross_entropy_loss(probability, mask)
    dice = np_dice_score(probability, mask)
    tp, tn = np_accuracy(probability, mask)
    return [dice, loss, tp, tn]
def eval(model, dataLoader_valid):
    """Validate a whale-identification model (NOTE: shadows builtin ``eval``).

    Accumulates flip-averaged sigmoid scores over the loader, then sweeps a
    'new_whale' confidence threshold t in [0.1, 0.9] and keeps the t with the
    best map@5. Returns (valid_loss, top1, top5, map5, best_t).

    Assumes the loader yields image pairs (original + flip interleaved) so
    rows [::2] / [1::2] belong to the same sample, and that 5004 is the number
    of known identities (logits are 2*5004 wide) — TODO confirm.
    """
    with torch.no_grad():
        model.eval()
        model.mode = 'valid'
        valid_loss, index_valid = 0, 0
        all_results, all_labels = [], []
        for valid_data in dataLoader_valid:
            images, labels, names = valid_data
            print('labels:{} names:{}'.format(labels, names))
            images = images.cuda()
            labels = labels.cuda().long()
            feature, results = data_parallel(model, images)
            # Loss on the even rows only (one member of each flip pair).
            model.getLoss(feature[::2], results[::2], labels)
            print('results before sigmoid: {}'.format(results))
            results = torch.sigmoid(results)
            print('results after sigmoid: {}'.format(results))
            # Average the first 5004 scores of the original with the second
            # 5004 scores of the flipped copy — presumably the two halves of
            # the head are trained on original/flipped views; verify.
            results_zeros = (results[::2, :5004] + results[1::2, 5004:]) / 2
            all_results.append(results_zeros)
            all_labels.append(labels)
            b = len(labels)
            # Sample-weighted running loss.
            valid_loss += model.loss.data.cpu().numpy() * b
            index_valid += b
        all_results = torch.cat(all_results, 0)
        all_labels = torch.cat(all_labels, 0)
        map5s, top1s, top5s = [], [], []
        ts = np.linspace(0.1, 0.9, 9)
        for t in ts:
            # Append a constant column t acting as the 'new_whale' score.
            results_t = torch.cat([
                all_results,
                torch.ones_like(all_results[:, :1]).float().cuda() * t
            ], 1)
            # Remap the 'new_whale' label (2*5004) onto the appended column.
            all_labels[all_labels == 5004 * 2] = 5004
            top1_, top5_ = accuracy(results_t, all_labels)
            map5_ = mapk(all_labels, results_t, k=5)
            map5s.append(map5_)
            top1s.append(top1_)
            top5s.append(top5_)
        # Keep the threshold with the best map@5.
        map5 = max(map5s)
        i_max = map5s.index(map5)
        top1 = top1s[i_max]
        top5 = top5s[i_max]
        best_t = ts[i_max]
        valid_loss /= index_valid
    return valid_loss, top1, top5, map5, best_t
def eval(model, dataLoader_valid):
    """Validate an identification model (NOTE: shadows builtin ``eval``).

    Same scheme as the 5004-class variant but with 2233 known identities:
    accumulate flip-averaged sigmoid scores, sweep a 'new_whale' threshold t
    in [0.1, 0.9], keep the t with the best map@5.
    Returns (valid_loss, top1, top5, map5, best_t).

    Assumes the loader interleaves original/flipped pairs so [::2] and [1::2]
    index the same samples — TODO confirm.
    """
    with torch.no_grad():
        model.eval()
        model.mode = 'valid'
        valid_loss, index_valid = 0, 0
        all_results = []
        all_labels = []
        for valid_data in dataLoader_valid:
            images, labels, names = valid_data
            images = images.cuda()
            labels = labels.cuda().long()
            # feature, local_feat, results = model(images)
            feature, local_feat, results = data_parallel(model, images)
            # Loss on the even rows only (one member of each flip pair).
            model.getLoss(feature[::2], local_feat[::2], results[::2], labels)
            results = torch.sigmoid(results)
            # Average first-half scores of the original with second-half
            # scores of the flipped copy (2233 classes per half) — verify.
            results_zeros = (results[::2, :2233] + results[1::2, 2233:]) / 2
            all_results.append(results_zeros)
            all_labels.append(labels)
            b = len(labels)
            # Sample-weighted running loss.
            valid_loss += model.loss.data.cpu().numpy() * b
            index_valid += b
        all_results = torch.cat(all_results, 0)
        all_labels = torch.cat(all_labels, 0)
        map5s, top1s, top5s = [], [], []
        if 1:  # kept from the original; the sweep was presumably toggleable
            ts = np.linspace(0.1, 0.9, 9)
            for t in ts:
                # Constant column t acts as the 'new_whale' score.
                results_t = torch.cat([
                    all_results,
                    torch.ones_like(all_results[:, :1]).float().cuda() * t
                ], 1)
                # Remap the 'new_whale' label (2*2233) onto that column.
                all_labels[all_labels == 2233 * 2] = 2233
                top1_, top5_ = accuracy(results_t, all_labels)
                map5_ = mapk(all_labels, results_t, k=5)
                map5s.append(map5_)
                top1s.append(top1_)
                top5s.append(top5_)
        # Keep the threshold with the best map@5.
        map5 = max(map5s)
        i_max = map5s.index(map5)
        top1 = top1s[i_max]
        top5 = top5s[i_max]
        best_t = ts[i_max]
        valid_loss /= index_valid
    return valid_loss, top1, top5, map5, best_t
def train_epoch(training_input, training_target, batch_size, mod='train',
                mean=0.0, std=1.0):
    """
    Runs one epoch over the given data, training or evaluating.

    :param training_input: inputs of shape (num_samples, num_nodes, num_timesteps_train, num_features).
    :param training_target: targets of shape (num_samples, num_nodes, num_timesteps_predict).
    :param batch_size: Batch size to use.
    :param mod: 'train' updates the weights over shuffled batches; any other
        value evaluates over sequential batches without gradients.
    :param mean, std: de-normalisation applied before computing the metrics.
    :return: (average loss for this epoch, [mae, mse] on de-normalised values).

    NOTE(review): relies on module-level `net`, `optimizer`, `loss_criterion`,
    `args`, `mae`, `mse` — confirm they are defined in this file.
    """
    permutation = torch.randperm(training_input.shape[0])
    epoch_training_losses, preds, labels = [], [], []
    is_train = mod == 'train'
    # Mode is constant for the whole epoch — set it once, not per batch.
    if is_train:
        net.train()
    else:
        net.eval()

    for i in range(0, training_input.shape[0], batch_size):
        begin = i
        end = min(begin + batch_size, training_input.shape[0])
        if is_train:
            # Shuffled mini-batches during training.
            indices = permutation[i:i + batch_size]
        else:
            indices = range(begin, end)
        X_batch, y_batch = training_input[indices], training_target[indices]
        X_batch = X_batch.to(device=args.device)
        y_batch = y_batch.to(device=args.device)

        if is_train:
            # FIX: zero_grad only when we actually step the optimizer.
            optimizer.zero_grad()
            out = data_parallel(net, X_batch)
            loss = loss_criterion(out, y_batch)
            loss.backward()
            optimizer.step()
        else:
            # FIX: evaluation no longer builds the autograd graph.
            with torch.no_grad():
                out = data_parallel(net, X_batch)
                loss = loss_criterion(out, y_batch)

        if i / batch_size % 10 == 0:
            print('After training %d batches, loss = %lf' %
                  (i / batch_size, loss.item()))
        epoch_training_losses.append(loss.detach().cpu().numpy())
        preds.append(out.detach().cpu().numpy())
        labels.append(y_batch.detach().cpu().numpy())

    # De-normalise before computing the error metrics.
    preds = np.concatenate(preds, axis=0).flatten() * std + mean
    labels = np.concatenate(labels, axis=0).flatten() * std + mean
    metrics = [mae(labels, preds), mse(labels, preds)]
    return sum(epoch_training_losses) / len(epoch_training_losses), metrics
def prediction(model, validation_loader, num_classes=5, batch_size=36):
    """Evaluate `model` over the validation loader.

    Returns (mean loss per batch, accuracy over all samples actually seen).
    `num_classes` and `batch_size` are kept for interface compatibility;
    accuracy no longer depends on the `batch_size` argument.
    """
    with torch.no_grad():
        model.eval()
        # model.mode = 'valid'
        valid_loss, correctpred = 0, 0
        num_batch = len(validation_loader)
        num_samples = 0  # true sample count, robust to a partial last batch
        for valid_data in validation_loader:
            images, labels = valid_data
            images = images.cuda()
            labels = labels.cuda()
            out = data_parallel(model, images)
            # out = model(images)
            valid_loss += getLoss(out, labels)
            scores = F.softmax(out, dim=1)
            _, results = torch.max(scores, 1)
            correctpred += torch.sum(results == labels)
            num_samples += labels.size(0)
        valid_loss /= num_batch
        # FIX: divide by the number of samples actually seen. The old
        # `num_batch * batch_size` over-counted whenever the last batch was
        # partial or the loader's batch size differed from the default arg,
        # deflating the reported accuracy.
        accuracy = correctpred.float() / max(num_samples, 1)
    return valid_loss, accuracy
def predict_model(config, num_classes=1108):
    """Predict per-image class scores on the test set, summing over both sites.

    Loads the checkpointed model, runs inference for site 1 and site 2,
    accumulates the sigmoid scores per image id, and returns them stacked in
    the order of the test csv.
    """
    model = model_whale(num_classes=num_classes, inchannels=6,
                        model_name=config.train.model_name).cuda()
    model.load_pretrain(os.path.join(config.test.checkpoints_path,
                                     '%08d_model.pth' % (config.test.epoch)),
                        skip=[])

    # FIX: accumulators must start at zero. `partial(np.ndarray,
    # (num_classes,))` allocates an *uninitialised* array, so the `+=` below
    # accumulated on top of garbage memory.
    result = defaultdict(partial(np.zeros, (num_classes,)))
    for site in [1, 2]:
        test_dataset = CustomDataset(
            config.test.csv_file,
            config.test.img_dir,
            mode='test',
            site=site,
            # NOTE(review): 'augmetations' is the (misspelled) config key —
            # keep in sync with the config schema.
            transforms=transforms[config.test.augmetations])
        dataloader_test = DataLoader(test_dataset,
                                     batch_size=config.train.batch_size,
                                     num_workers=config.train.num_workers)
        with torch.no_grad():
            if config.test.enable_eval:
                model.eval()
            else:
                model.train()
            for data in tqdm(dataloader_test):
                images, names = data
                images = images.cuda()
                _, _, outs = data_parallel(model, images)
                outs = torch.sigmoid(outs)
                for name, out in zip(names, outs):
                    result[name] += out.cpu().numpy()

    test_csv = pd.read_csv(config.test.csv_file)
    test_csv['result'] = test_csv['id_code'].map(result)
    return np.vstack(test_csv['result'].values)
def train(train_loader, val_loader, model, optimizer, args):
    """Train for args.epochs epochs, evaluating after each epoch and
    checkpointing whenever validation accuracy improves.
    """
    model.cuda()
    # FIX: dropped dead locals `steps` and `last_step` (never read).
    best_acc = 0
    for epoch in range(1, args.epochs + 1):
        model.train()
        print("Epoch {}/{}".format(epoch, args.epochs))
        losses = []
        for batch in tqdm.tqdm(train_loader):
            feature, target = batch[0], batch[1]
            # feature.data.t_(), target.data.sub_(1)  # batch first, index align
            feature, target = feature.cuda(), target.cuda()
            optimizer.zero_grad()
            logit = data_parallel(model, feature)  # multi-GPU forward
            loss = F.cross_entropy(logit, target)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

        accuracy, val_losses, corrects, size = eval(val_loader, model)
        print(
            '\nTrain - loss: {:.6f} Evaluation - loss: {:.6f} acc: {:.4f}%({}/{}) \n'
            .format(np.mean(losses), np.mean(val_losses), accuracy, corrects,
                    size))
        if accuracy > best_acc:
            best_acc = accuracy
            save_checkpoint(args.checkpoint_path, model, optimizer)
            print("Best accuracy is {:.4f}".format(best_acc))
def run_train(show_valid_images=False,
              sha='',
              fold=None,
              loss_type='bce',
              tile_scale=0.25,
              tile_size=320,
              *args,
              **kwargs):
    """Full training loop for the baseline segmentation net on one CV fold.

    Builds the datasets/loaders for `fold`, a (possibly mixed-precision) net
    and a Lookahead(RAdam) optimizer, then trains for `num_iteration` batches,
    validating and checkpointing every `iter_valid` iterations.

    kwargs: start_lr (0.001), batch_size (16), num_iteration (5000),
    iter_log (250), first_iter_save (0).

    NOTE(review): relies on module-level names (Logger, IDENTIFIER,
    COMMON_STRING, HuDataset, make_image_id, train_augment, null_collate,
    is_mixed_precision, AmpNet/Net, Lookahead, RAdam, CheckpointUpdate,
    do_valid, get_learning_rate, get_loss, time_to_str, timer,
    draw_contour_overlay, image_show_norm) — confirm they exist in this file.
    """
    out_dir = f"result/Baseline/fold{'_'.join(map(str, fold))}"
    initial_checkpoint = None
    start_lr = kwargs.get('start_lr', 0.001)
    batch_size = kwargs.get('batch_size', 16)

    ##################################################
    ## setup ----------------------------------------
    ##################################################
    # One sub-directory per artifact kind, tagged with the git sha.
    for f in [
            f'checkpoint_{sha}',
            f'predictions_{sha}',
    ]:
        os.makedirs(out_dir + '/' + f, exist_ok=True)

    log = Logger()
    log.open(out_dir + f'/log.train_{sha}.txt', mode='a')
    log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\t__file__ = %s\n' % __file__)
    log.write('\tout_dir = %s\n' % out_dir)
    log.write('\n')

    ##################################################
    ## dataset ---------------------------------------
    ##################################################
    log.write('** dataset setting **\n')

    train_dataset = HuDataset(
        image_id=[
            make_image_id('train', fold),
        ],
        image_dir=[
            f'{tile_scale}_{tile_size}_train',
        ],
        augment=train_augment,
    )
    train_loader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=batch_size,
                              drop_last=True,
                              num_workers=8,
                              pin_memory=True,
                              collate_fn=null_collate)

    valid_dataset = HuDataset(
        image_id=[make_image_id('valid', fold)],
        image_dir=[
            f'{tile_scale}_{tile_size}_train',
        ],
    )
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=8,
                              drop_last=False,
                              num_workers=4,
                              pin_memory=True,
                              collate_fn=null_collate)

    log.write('fold = %s\n' % ' '.join(map(str, fold)))
    log.write('train_dataset : \n%s\n' % train_dataset)
    log.write('valid_dataset : \n%s\n' % valid_dataset)
    log.write('\n')

    ##################################################
    ## net -------------------------------------------
    ##################################################
    log.write('** net setting **\n')

    if is_mixed_precision:
        # AMP path: gradient scaler + AMP-aware net.
        scaler = amp.GradScaler()
        net = AmpNet().cuda()
    else:
        net = Net().cuda()

    if initial_checkpoint is not None:
        # Resume iteration/epoch counters and weights from the checkpoint.
        f = torch.load(initial_checkpoint,
                       map_location=lambda storage, loc: storage)
        start_iteration = f['iteration']
        start_epoch = f['epoch']
        state_dict = f['state_dict']
        net.load_state_dict(state_dict, strict=False)
    else:
        start_iteration = 0
        start_epoch = 0
        # net.load_pretrain(is_print=False)

    log.write('\tinitial_checkpoint = %s\n' % initial_checkpoint)
    log.write('\n')

    ## optimiser ----------------------------------
    if 0:  ##freeze (disabled)
        for p in net.stem.parameters():
            p.requires_grad = False
        pass

    def freeze_bn(net):
        # Freeze every BatchNorm2d (stats and affine params). Currently unused.
        for m in net.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
                m.weight.requires_grad = False
                m.bias.requires_grad = False
    # freeze_bn(net)

    # -----------------------------------------------
    # Alternatives tried previously: AdamW, Adam, RMSprop, SGD, Over9000,
    # Lookahead(SGD), Lookahead(Adam).
    optimizer = Lookahead(RAdam(filter(lambda p: p.requires_grad,
                                       net.parameters()),
                                lr=start_lr),
                          alpha=0.5,
                          k=5)

    num_iteration = kwargs.get(
        'num_iteration', 5000)  # total nb. of batch used to train the net
    iter_log = kwargs.get('iter_log', 250)  # show results every iter_log
    first_iter_save = kwargs.get('first_iter_save', 0)  # first checkpoint kept
    iter_valid = iter_log  # validate every iter_valid
    # iter_save = list(range(0, num_iteration + 1, iter_log))

    log.write('optimizer\n %s\n' % optimizer)
    # log.write('schduler\n %s\n'%(schduler))
    log.write('\n')

    ######################################################################
    ## start training here! ##############################################
    ######################################################################
    # Round num_iteration up to a whole number of epochs.
    num_iteration = (num_iteration // len(train_loader) + 1) * len(train_loader)
    log.write('** start training here! **\n')
    log.write(' is_mixed_precision = %s \n' % str(is_mixed_precision))
    log.write(' loss_type = %s \n' % loss_type)
    log.write(' batch_size = %d \n' % batch_size)
    log.write(' num_iterations = %d \n' % num_iteration)
    log.write(' experiment = %s\n' % str(__file__.split('/')[-2:]))
    log.write(
        ' |-------------- VALID---------|---- TRAIN/BATCH ----------------\n'
    )
    log.write(
        'rate iter epoch | dice loss tp tn | loss | time \n'
    )
    log.write(
        '-------------------------------------------------------------------------------------\n'
    )
    # 0.00100 0.50 0.80 | 0.891 0.020 0.000 0.000 | 0.000 0.000 | 0 hr 02 min

    def message(mode='print'):
        # Format one progress line; closes over the loop state below
        # (rate, iteration, epoch, batch_loss, train_loss, valid_loss).
        # NOTE(review): `iter_save` is only bound when the first condition
        # holds — safe because mode='log' is only used at iteration % iter_log
        # == 0 and iter_valid == iter_log, but fragile if that changes.
        if iteration % iter_valid == 0 and iteration > 0:
            iter_save = True
        if mode == 'print':
            asterisk = ' '
            loss = batch_loss
        if mode == 'log':
            asterisk = '*' if iter_save else ' '
            loss = train_loss
        text = \
            '%0.5f %5.2f%s %4.2f | ' % (rate, iteration / 1000, asterisk, epoch,) + \
            '%4.3f %4.3f %4.3f %4.3f | ' % (*valid_loss,) + \
            '%4.3f %4.3f | ' % (*loss,) + \
            '%s' % (time_to_str(timer() - start_timer, 'min'))
        return text

    # ----
    valid_loss = np.zeros(4, np.float32)
    train_loss = np.zeros(2, np.float32)
    batch_loss = np.zeros_like(train_loss)
    sum_train_loss = np.zeros_like(train_loss)
    sum_train = 0
    loss = torch.FloatTensor([0]).sum()
    start_timer = timer()
    iteration = start_iteration
    epoch = start_epoch
    rate = 0
    bookeeping = CheckpointUpdate(
        net=net,
        first_iter_save=first_iter_save,
        # iter_save=iter_save,
        out_dir=out_dir,
        sha=sha,
        nbest=5)

    while iteration < num_iteration:
        for t, batch in enumerate(train_loader):
            # Periodic validation + checkpoint bookkeeping (keyed on dice).
            if iteration % iter_valid == 0 and iteration > 0:
                valid_loss = do_valid(net, valid_loader)
                bookeeping.update(iteration=iteration,
                                  epoch=epoch,
                                  score=valid_loss[0])
                # sys.exit()
            if iteration % iter_log == 0 and iteration > 0:
                print('\r', end='', flush=True)
                log.write(message(mode='log') + '\n')

            # learning rate schduler -------------
            # adjust_learning_rate(optimizer, schduler(iteration))
            rate = get_learning_rate(optimizer)

            # one iteration update -------------
            batch_size = len(batch['index'])
            mask = batch['mask'].cuda()
            image = batch['image'].cuda()

            net.train()
            optimizer.zero_grad()

            ################################################
            ### Compute the loss ---------------------------
            ################################################
            if is_mixed_precision:
                # assert (False)
                image = image.half()
                with amp.autocast():
                    logit = data_parallel(net, image)
                    loss = get_loss(loss_type, logit, mask)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                # logit = data_parallel(net, image)
                logit = net(image)
                loss = get_loss(loss_type, logit, mask)
                loss.backward()
                optimizer.step()

            ##############################
            # print statistics ----------
            ##############################
            epoch += 1 / len(train_loader)
            iteration += 1
            batch_loss = np.array([loss.item(), 0])
            sum_train_loss += batch_loss
            sum_train += 1
            # Fold the running sum into train_loss every 100 iterations.
            if iteration % 100 == 0:
                train_loss = sum_train_loss / (sum_train + 1e-12)
                sum_train_loss[...] = 0
                sum_train = 0

            print('\r', end='', flush=True)
            print(message(mode='print'), end='', flush=True)

            # debug
            if show_valid_images:
                # if iteration%50==1:
                pass  # buggy code ????
                # Visualise prediction vs ground truth for every image in the
                # batch and dump the overlays to disk.
                probability = torch.sigmoid(logit)
                image = image.data.cpu().float().numpy()
                mask = mask.data.cpu().float().numpy().squeeze(1)
                probability = probability.data.cpu().float().numpy().squeeze(1)
                image = np.ascontiguousarray(image.transpose(0, 2, 3, 1))
                batch_size, h, w, _ = image.shape
                for b in range(batch_size):
                    m = image[b]
                    t = mask[b]
                    p = probability[b]
                    # contour = mask_to_inner_contour(p)
                    m = draw_contour_overlay(m, t, color=(0, 0, 1), thickness=3)
                    m = draw_contour_overlay(m, p, color=(0, 1, 0), thickness=3)
                    overlay = np.hstack([
                        m,
                        np.tile(t.reshape(h, w, 1), (1, 1, 3)),
                        np.tile(p.reshape(h, w, 1), (1, 1, 3)),
                        np.stack([np.zeros_like(p), p, t], 2),
                    ])
                    image_show_norm('overlay', overlay, min=0, max=1)
                    # image_show_norm('m',m,min=0,max=1)
                    # image_show_norm('t',t,min=0,max=1)
                    # image_show_norm('p',p,min=0,max=1)
                    cv2.waitKey(1)
                    cv2.imwrite(out_dir + '/train/%05d.png' % (b),
                                (overlay * 255).astype(np.uint8))

    log.write('\n')
def run_train(
        show_valid_images=False,
        sha='',
        fold=None,
        loss_type='bce',
        tile_size=320,  # overall size of the input images
        image_size=320,  # overall size of the input images
        tile_scale=1,
        backbone='resnet34',
        include_test=False,
        *args,
        **kwargs):
    """Train the Layer-2 segmentation net on centered tile crops.

    Builds train/valid datasets from a fixed image-id CV split, trains with
    Lookahead(RAdam) + step-decay LambdaLR, validates every ``iter_valid``
    iterations and checkpoints the best models via ``CheckpointUpdate``.

    Args:
        show_valid_images: if True, render per-batch overlay images for debugging.
        sha: experiment hash, used in output sub-directory and log file names.
        fold: iterable of fold indices; only used to build ``out_dir`` / logging.
        loss_type: key passed to ``get_loss`` (default 'bce').
        tile_size, image_size, tile_scale: tile geometry used to locate the
            positive/negative sample directories and size the dataset crops.
        backbone: encoder name for ``Net`` (ignored when mixed precision uses AmpNet).
        include_test: if True, also train on the held-out test image ids.
        kwargs: recognized keys: start_lr, batch_size, num_iteration, iter_log,
            first_iter_save.
    """
    out_dir = f"result/Layer_2/fold{'_'.join(map(str, fold))}"
    initial_checkpoint = None
    start_lr = kwargs.get('start_lr', 0.001)
    batch_size = kwargs.get('batch_size', 16)

    ##################################################
    ## setup ----------------------------------------
    ##################################################
    for f in [
            f'checkpoint_{sha}',
            f'predictions_{sha}',
    ]:
        os.makedirs(out_dir + '/' + f, exist_ok=True)

    log = Logger()
    log.open(out_dir + f'/log.train_{sha}.txt', mode='a')
    log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\t__file__ = %s\n' % __file__)
    log.write('\tout_dir = %s\n' % out_dir)
    log.write('\n')

    ##################################################
    ## dataset ---------------------------------------
    ##################################################
    log.write(30 * '-' + '\n' + '*** TRAIN dataset setting ***\n' + 30 * '-' +
              '\n')

    # -----------------------------
    ### Create CV scheme ----------
    # -----------------------------
    # Hard-coded image ids: 0-14 are the labelled training slides,
    # 15-19 the public-test slides (optionally folded in below).
    train_image_id = {
        0: '0486052bb',
        1: '095bf7a1f',
        2: '1e2425f28',
        3: '26dc41664',
        4: '2f6ecfcdf',
        5: '4ef6695ce',
        6: '54f2eec69',
        7: '8242609fa',
        8: 'aaa6a05cc',
        9: 'afa5e8098',
        10: 'b2dc8411c',
        11: 'b9a3865fc',
        12: 'c68fe75ea',
        13: 'cb2d976f4',
        14: 'e79de561c',
    }
    test_image_id = {
        15: '2ec3f1bb9',
        16: '3589adb90',
        17: '57512b7f1',
        18: 'aa05346ff',
        19: 'd488c759a',
    }
    if include_test:
        image_ids = {**train_image_id, **test_image_id}
    else:
        image_ids = train_image_id

    # Directories of true-positive / true-negative tiles (incl. pseudo-labels).
    true_positives_dir = [
        f'mask_{tile_size}_{tile_scale}_centroids',
        f"TP_{tile_scale}_{tile_size}_pseudolabels_f765ca3ec",
    ]
    false_positives_dir = [
        f"TN_{tile_scale}_{tile_size}_train",
        f"TN_{tile_scale}_{tile_size}_pseudolabels_f765ca3ec",
    ]
    train_set, val_set = split_dataset(sha=sha,
                                       train_image_id=image_ids,
                                       true_positives_dir=true_positives_dir,
                                       false_positives_dir=false_positives_dir)

    # ------------
    ### TRAIN SET
    # ------------
    train_dataset = CenteredHuDataset(images=train_set,
                                      image_size=image_size,
                                      augment=train_albu_augment_layer2,
                                      logger=log)
    train_loader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=batch_size,
                              drop_last=True,
                              num_workers=8,
                              pin_memory=True,
                              collate_fn=null_collate)

    # ------------
    ### VALID SET
    # ------------
    log.write(30 * '-' + '\n' + '*** VALID dataset setting ***\n' + 30 * '-' +
              '\n')
    valid_dataset = CenteredHuDataset(images=val_set,
                                      image_size=image_size,
                                      augment=None,
                                      logger=log)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=8,
                              drop_last=False,
                              num_workers=8,
                              pin_memory=True,
                              collate_fn=null_collate)

    log.write(30 * '-' + '\n' + '*** dataset setting SUMMARY***\n' + 30 * '-' +
              '\n')
    log.write('fold = %s\n' % ' '.join(map(str, fold)))
    log.write('train_dataset : \n%s\n' % train_dataset)
    log.write('valid_dataset : \n%s\n' % valid_dataset)
    log.write('\n')

    ##################################################
    ## net -------------------------------------------
    ##################################################
    log.write('** net setting **\n')
    if is_mixed_precision:
        scaler = amp.GradScaler()
        net = AmpNet().cuda()
    else:
        net = Net(backbone).cuda()

    # initial_checkpoint is always None here; resume path kept for manual use.
    if initial_checkpoint is not None:
        f = torch.load(initial_checkpoint,
                       map_location=lambda storage, loc: storage)
        start_iteration = f['iteration']
        start_epoch = f['epoch']
        state_dict = f['state_dict']
        net.load_state_dict(state_dict, strict=False)
    else:
        start_iteration = 0
        start_epoch = 0
        # net.load_pretrain(is_print=False)

    log.write('\tinitial_checkpoint = %s\n' % initial_checkpoint)
    log.write('\n')

    ## optimiser ----------------------------------
    if 0:  ##freeze
        for p in net.stem.parameters():
            p.requires_grad = False
        pass

    def freeze_bn(net):
        # Freeze all BatchNorm2d layers (stats and affine params); unused by default.
        for m in net.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
                m.weight.requires_grad = False
                m.bias.requires_grad = False

    # freeze_bn(net)
    # -----------------------------------------------
    # optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, net.parameters()),lr=schduler(0))
    # optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()),lr=start_lr)
    ##optimizer  = torch.optim.RMSprop(net.parameters(), lr =0.0005, alpha = 0.95)
    # optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=start_lr, momentum=0.5, weight_decay=0.0)
    # optimizer = Over9000(filter(lambda p: p.requires_grad, net.parameters()), lr=0.001, )
    # optimizer = Lookahead(torch.optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=start_lr, momentum=0.0, weight_decay=0.0))
    # optimizer = Lookahead(torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()),lr=start_lr))
    optimizer = Lookahead(RAdam(filter(lambda p: p.requires_grad,
                                       net.parameters()),
                                lr=start_lr),
                          alpha=0.5,
                          k=5)

    # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2000, gamma=0.5)
    # Halve the LR every 2000 iterations, floor at 5% after 8000 iterations.
    lambda1 = lambda epoch: 0.5**(epoch // 2000 + 1) if epoch < 8000 else 0.05
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

    num_iteration = kwargs.get(
        'num_iteration', 5000)  # total nb. of batch used to train the net
    iter_log = kwargs.get('iter_log', 250)  # show results every iter_log
    first_iter_save = kwargs.get('first_iter_save', 0)  # first checkpoint kept
    iter_valid = iter_log  # validate every iter_valid
    # iter_save = list(range(0, num_iteration + 1, iter_log))

    log.write('optimizer\n %s\n' % optimizer)
    # log.write('schduler\n %s\n'%(schduler))
    log.write('\n')

    ######################################################################
    ## start training here! ##############################################
    ######################################################################
    # Round num_iteration up to a whole number of epochs.
    num_iteration = (num_iteration // len(train_loader) +
                     1) * len(train_loader)
    log.write('** start training here! **\n')
    log.write('   is_mixed_precision = %s \n' % str(is_mixed_precision))
    log.write('   loss_type = %s \n' % loss_type)
    log.write('   batch_size = %d \n' % batch_size)
    log.write('   num_iterations = %d \n' % num_iteration)
    log.write('   experiment = %s\n' % str(__file__.split('/')[-2:]))
    log.write(
        '                      |-------------- VALID---------|---- TRAIN/BATCH ----------------\n'
    )
    log.write(
        'rate     iter  epoch | dice   loss   tp     tn     | loss           | time           \n'
    )
    log.write(
        '-------------------------------------------------------------------------------------\n'
    )
    # 0.00100   0.50  0.80 | 0.891  0.020  0.000  0.000  | 0.000  0.000   |  0 hr 02 min

    def message(mode='print'):
        # Format one status line for console ('print') or log file ('log').
        # NOTE(review): ``iter_save`` is only bound when the validation
        # condition holds; 'log' mode outside that condition would raise
        # NameError. It works because iter_valid == iter_log — confirm if
        # those ever diverge.
        if iteration % iter_valid == 0 and iteration > 0:
            iter_save = True
        if mode == 'print':
            asterisk = ' '
            loss = batch_loss
        if mode == 'log':
            asterisk = '*' if iter_save else ' '
            loss = train_loss
        text = \
            '%0.5f  %5.2f%s %4.2f | ' % (rate, iteration / 1000, asterisk, epoch,) + \
            '%4.3f  %4.3f  %4.3f  %4.3f | ' % (*valid_loss,) + \
            '%4.3f  %4.3f  | ' % (*loss,) + \
            '%s' % (time_to_str(timer() - start_timer, 'min'))
        return text

    # ----
    # Running statistics: valid_loss = [dice, loss, tp, tn]; train/batch = [loss, 0].
    valid_loss = np.zeros(4, np.float32)
    train_loss = np.zeros(2, np.float32)
    batch_loss = np.zeros_like(train_loss)
    sum_train_loss = np.zeros_like(train_loss)
    sum_train = 0
    loss = torch.FloatTensor([0]).sum()
    start_timer = timer()
    iteration = start_iteration
    epoch = start_epoch
    rate = 0
    bookeeping = CheckpointUpdate(
        net=net,
        first_iter_save=first_iter_save,
        # iter_save=iter_save,
        out_dir=out_dir,
        sha=sha,
        nbest=5)
    while iteration < num_iteration:
        for t, batch in enumerate(train_loader):
            # Periodic validation + best-checkpoint bookkeeping.
            if iteration % iter_valid == 0 and iteration > 0:
                valid_loss = do_valid(net, valid_loader)
                bookeeping.update(iteration=iteration,
                                  epoch=epoch,
                                  score=valid_loss[0])
                # sys.exit()
            if iteration % iter_log == 0 and iteration > 0:
                print('\r', end='', flush=True)
                log.write(message(mode='log') + '\n')

            # learning rate scheduler -------------
            rate = scheduler.get_last_lr()[0]

            # one iteration update  -------------
            batch_size = len(batch['index'])
            mask = batch['mask'].cuda()
            image = batch['image'].cuda()
            net.train()
            optimizer.zero_grad()

            ################################################
            ### Compute the loss ---------------------------
            ################################################
            if is_mixed_precision:
                # assert (False)
                image = image.half()
                with amp.autocast():
                    logit = data_parallel(net, image)
                    loss = get_loss(loss_type, logit, mask)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                # logit = data_parallel(net, image)
                logit = net(image)
                loss = get_loss(loss_type, logit, mask)
                loss.backward()
                optimizer.step()
            scheduler.step()

            ##############################
            # print statistics  ----------
            ##############################
            epoch += 1 / len(train_loader)
            iteration += 1
            batch_loss = np.array([loss.item(), 0])
            sum_train_loss += batch_loss
            sum_train += 1
            # Smooth the reported train loss over ~100 iterations.
            if iteration % 100 == 0:
                train_loss = sum_train_loss / (sum_train + 1e-12)
                sum_train_loss[...] = 0
                sum_train = 0

            print('\r', end='', flush=True)
            print(message(mode='print'), end='', flush=True)

            # debug
            if show_valid_images:
                # if iteration%50==1:
                pass  # buggy code ????
                # NOTE(review): the ``pass`` above does not guard anything —
                # the overlay rendering below always runs when
                # show_valid_images is True; confirm intent.
                probability = torch.sigmoid(logit)
                image = image.data.cpu().float().numpy()
                mask = mask.data.cpu().float().numpy().squeeze(1)
                probability = probability.data.cpu().float().numpy().squeeze(1)
                image = np.ascontiguousarray(image.transpose(0, 2, 3, 1))
                batch_size, h, w, _ = image.shape
                for b in range(batch_size):
                    m = image[b]
                    t = mask[b]
                    p = probability[b]
                    # contour = mask_to_inner_contour(p)
                    m = draw_contour_overlay(m, t, color=(0, 0, 1), thickness=3)
                    m = draw_contour_overlay(m, p, color=(0, 1, 0), thickness=3)
                    # Panels: overlay | target | prediction | composite (B=t, G=p).
                    overlay = np.hstack([
                        m,
                        np.tile(t.reshape(h, w, 1), (1, 1, 3)),
                        np.tile(p.reshape(h, w, 1), (1, 1, 3)),
                        np.stack([np.zeros_like(p), p, t], 2),
                    ])
                    image_show_norm('overlay', overlay, min=0, max=1)
                    # image_show_norm('m',m,min=0,max=1)
                    # image_show_norm('t',t,min=0,max=1)
                    # image_show_norm('p',p,min=0,max=1)
                    cv2.waitKey(1)
                    cv2.imwrite(out_dir + '/train/%05d.png' % (b),
                                (overlay * 255).astype(np.uint8))
        log.write('\n')
def forward(self,
            inputs,
            truth_boxes,
            truth_labels,
            truth_masks,
            masks,
            split_combiner=None,
            nzhw=None):
    """Full detector forward pass: backbone -> RPN -> (RCNN) -> (mask head).

    Populates results as attributes on ``self`` (rpn_logits_flat,
    rpn_proposals, detections, mask_probs, ...) rather than returning them.
    In 'train'/'valid' mode it also builds RPN/RCNN/mask training targets;
    in 'eval'/'test' mode it runs NMS and mask filtering instead.

    Args:
        inputs: input volume batch; spatial dims taken from inputs.shape[2:].
        truth_boxes, truth_labels, truth_masks, masks: ground truth used for
            target building in train/valid modes.
        split_combiner, nzhw: unused here (kept for interface compatibility).
    """
    features, feat_4 = data_parallel(self.feature_net, (inputs))
    #print('fs[-1] ', fs[-1].shape)
    fs = features[-1]

    self.rpn_logits_flat, self.rpn_deltas_flat = data_parallel(self.rpn, fs)

    b, D, H, W, _, num_class = self.rpn_logits_flat.shape

    # Flatten per-anchor outputs: one objectness logit and 6 box deltas each.
    self.rpn_logits_flat = self.rpn_logits_flat.view(b, -1, 1)
    #print('rpn_logit ', self.rpn_logits_flat.shape)
    self.rpn_deltas_flat = self.rpn_deltas_flat.view(b, -1, 6)
    #print('rpn_delta ', self.rpn_deltas_flat.shape)

    self.rpn_window = make_rpn_windows(fs, self.cfg)
    self.rpn_proposals = []
    if self.use_rcnn or self.mode in ['eval', 'test']:
        self.rpn_proposals = rpn_nms(self.cfg, self.mode, inputs,
                                     self.rpn_window, self.rpn_logits_flat,
                                     self.rpn_deltas_flat)
        # print 'length of rpn proposals', self.rpn_proposals.shape

    if self.mode in ['train', 'valid']:
        # self.rpn_proposals = torch.zeros((0, 8)).cuda()
        self.rpn_labels, self.rpn_label_assigns, self.rpn_label_weights, self.rpn_targets, self.rpn_target_weights = \
            make_rpn_target(self.cfg, self.mode, inputs, self.rpn_window, truth_boxes, truth_labels )

        if self.use_rcnn:
            # self.rpn_proposals = torch.zeros((0, 8)).cuda()
            # Resamples proposals and builds RCNN classification/regression targets.
            self.rpn_proposals, self.rcnn_labels, self.rcnn_assigns, self.rcnn_targets = \
                make_rcnn_target(self.cfg, self.mode, inputs, self.rpn_proposals,
                    truth_boxes, truth_labels, truth_masks)

    #rcnn proposals
    # Deep copies so later NMS / ensembling does not mutate rpn_proposals.
    self.detections = copy.deepcopy(self.rpn_proposals)
    self.ensemble_proposals = copy.deepcopy(self.rpn_proposals)

    self.mask_probs = []
    if self.use_rcnn:
        if len(self.rpn_proposals) > 0:
            rcnn_crops = self.rcnn_crop(feat_4, inputs, self.rpn_proposals)
            self.rcnn_logits, self.rcnn_deltas = data_parallel(
                self.rcnn_head, rcnn_crops)
            self.detections, self.keeps = rcnn_nms(self.cfg, self.mode, inputs,
                                                   self.rpn_proposals,
                                                   self.rcnn_logits,
                                                   self.rcnn_deltas)

        if self.mode in ['eval']:
            # Ensemble
            # Average the RPN score with the RCNN false-positive-reduction score.
            fpr_res = get_probability(self.cfg, self.mode, inputs,
                                      self.rpn_proposals, self.rcnn_logits,
                                      self.rcnn_deltas)
            self.ensemble_proposals[:, 1] = (self.ensemble_proposals[:, 1] +
                                             fpr_res[:, 0]) / 2

        if self.use_mask and len(self.detections):
            # keep batch index, z, y, x, d, h, w, class
            self.crop_boxes = []
            if len(self.detections):
                self.crop_boxes = self.detections[:, [0, 2, 3, 4, 5, 6, 7, 8
                                                      ]].cpu().numpy().copy()
            # Convert center boxes to corner coords, snap to a multiple of 4
            # (mask head stride) and clip to the input volume.
            self.crop_boxes[:, 1:-1] = center_box_to_coord_box(
                self.crop_boxes[:, 1:-1])
            self.crop_boxes = self.crop_boxes.astype(np.int32)
            self.crop_boxes[:, 1:-1] = ext2factor(self.crop_boxes[:, 1:-1], 4)
            self.crop_boxes[:, 1:-1] = clip_boxes(self.crop_boxes[:, 1:-1],
                                                  inputs.shape[2:])
            # if self.mode in ['eval', 'test']:
            #     self.crop_boxes = top1pred(self.crop_boxes)
            # else:
            #     self.crop_boxes = random1pred(self.crop_boxes)
            if self.mode in ['train', 'valid']:
                self.mask_targets = make_mask_target(
                    self.cfg, self.mode, inputs, self.crop_boxes, truth_boxes,
                    truth_labels, masks)

            # Make sure to keep feature maps not splitted by data parallel
            features = [
                t.unsqueeze(0).expand(torch.cuda.device_count(), -1, -1, -1,
                                      -1, -1) for t in features
            ]
            self.mask_probs = data_parallel(
                self.mask_head,
                (torch.from_numpy(self.crop_boxes).cuda(), features))

            if self.mode in ['eval', 'test']:
                mask_keep = mask_nms(self.cfg, self.mode, self.mask_probs,
                                     self.crop_boxes, inputs)
                # self.crop_boxes = torch.index_select(self.crop_boxes, 0, mask_keep)
                # self.detections = torch.index_select(self.detections, 0, mask_keep)
                # self.mask_probs = torch.index_select(self.mask_probs, 0, mask_keep)
                self.crop_boxes = self.crop_boxes[mask_keep]
                self.detections = self.detections[mask_keep]
                self.mask_probs = self.mask_probs[mask_keep]

            self.mask_probs = crop_mask_regions(self.mask_probs,
                                                self.crop_boxes)
def main():
    """Train an LSTM sequence tagger on the corpus given in sys.argv[1],
    validating on sys.argv[2].

    Runs an endless training loop: per-timestep cross-entropy summed over the
    sequence, RMSprop with a decaying LR when train loss stalls, periodic
    evaluation every ``step_per_epoch`` steps, and checkpointing of the best
    dev-loss model to ``model.dat``.

    Relies on module-level helpers: Vocab, read_data, LstmTag, get_batch,
    criterion, evaluate, max_len.
    """
    v1 = Vocab()
    v1.build_vocab(sys.argv[1])
    train_data, train_tag = read_data(sys.argv[1], v1)
    dev_data, dev_tag = read_data(sys.argv[2], v1)

    # Hyper-parameters.
    ntag = 2
    batch_size = 64
    init_range = 0.08
    step_per_epoch = 500
    learning_rate = 0.01
    learning_rate_decay = 0.98
    decay_rate = 0.95  # RMSprop smoothing constant (alpha)
    batch_first = True
    checkpoint_after = 500  # start saving only after this many epochs
    device_num = 2

    model = LstmTag(v1.size(),
                    100,
                    100,
                    max_len,
                    batch_size,
                    ntag,
                    3,
                    batch_first=batch_first)
    model_path = "model.dat"
    if os.path.exists(model_path):
        saved_state = torch.load(model_path)
        model.load_state_dict(saved_state)
    else:
        model.init_weights(init_range)
    # BUG FIX: move the model to GPU regardless of whether a checkpoint was
    # loaded; previously a restored model stayed on CPU while its inputs were
    # moved to CUDA below, crashing the forward pass.
    if torch.cuda.is_available():
        model.cuda()

    optimizer = optim.RMSprop(model.parameters(),
                              lr=learning_rate,
                              alpha=decay_rate)
    train_loss = 0.0
    last_train_loss = 10
    loss_count = 0
    best_dev_loss = 10
    step = 0
    begin_time = time.time()
    while True:
        inputs, tags = get_batch(train_data, train_tag,
                                 batch_size * device_num, batch_first)
        if len(inputs) == 0:
            continue
        if torch.cuda.is_available():
            inputs, tags = inputs.cuda(), tags.cuda()
        # NOTE(review): GPU ids are hard-coded; verify devices 0 and 2 exist.
        pred = data_parallel(model, inputs, device_ids=[0, 2])

        # Sum the per-timestep cross-entropy over the whole sequence.
        input_len = inputs.size()[1] if batch_first else inputs.size()[0]
        total_loss = None
        for time_step in range(input_len):  # BUG FIX: xrange is Python 2 only
            if batch_first:
                y_pred = pred[:, time_step]
                target = tags[:, time_step]
            else:
                y_pred = pred[time_step]
                target = tags[time_step]
            loss = criterion(y_pred, target)
            total_loss = loss if total_loss is None else total_loss + loss

        optimizer.zero_grad()
        total_loss /= batch_size
        total_loss.backward()
        optimizer.step()
        # BUG FIX: accumulate a python float instead of the loss tensor —
        # the old ``train_loss += total_loss`` kept autograd history alive
        # across steps, and ``.data[0]`` is removed in modern PyTorch.
        train_loss += total_loss.item()

        if step % step_per_epoch == 0:
            # BUG FIX: integer epochs (``/`` yields a float under Python 3,
            # which broke the ``epoch > checkpoint_after`` bookkeeping intent).
            epoch = step // step_per_epoch
            dev_loss = evaluate(model, dev_data, dev_tag, batch_size,
                                batch_first)
            train_loss = train_loss / step_per_epoch
            # Decay the LR after three consecutive epochs of rising train loss.
            if train_loss > last_train_loss:
                loss_count += 1
                if loss_count == 3:
                    learning_rate *= learning_rate_decay
                    optimizer = optim.RMSprop(model.parameters(),
                                              lr=learning_rate,
                                              alpha=decay_rate)
                    loss_count = 0
            else:
                loss_count = 0
            last_train_loss = train_loss
            epoch_time = (time.time() - begin_time) / epoch if epoch > 0 else 0
            print(
                "Epoch time: {0}\tEpoch: {1}\tLR: {2}, Train loss: {3}\tDev loss: {4}"
                .format(epoch_time, epoch, learning_rate, train_loss,
                        dev_loss))
            train_loss = 0
            if epoch > checkpoint_after and dev_loss < best_dev_loss:
                # BUG FIX: record the new best score — previously
                # best_dev_loss was never updated, so every qualifying epoch
                # overwrote the checkpoint regardless of improvement.
                best_dev_loss = dev_loss
                state_to_save = model.state_dict()
                torch.save(state_to_save, model_path)
        step += 1
def train(min_num_class=10, checkPoint_start=0, lr=3e-4, batch_size=36):
    """Train the whale-identification model on one CV fold.

    Reads train/valid splits from ./input/*_split_{fold_index}.csv, trains
    with Adam, evaluates every ``iter_valid`` iterations and checkpoints model
    + optimizer every ``iter_save`` iterations under ./result/.

    Args:
        min_num_class: minimum images per class for WhaleDataset sampling.
        checkPoint_start: iteration to resume from (0 = fresh start).
        lr: Adam learning rate (re-applied on resume).
        batch_size: training batch size.

    Relies on module-level names: net, model_name, fold_index, freeze,
    transform_train, transform_valid, train_batch, valid_batch, Logger,
    accuracy, mapk, eval, time_to_str.
    """
    # num_classes doubled: each identity also has a flipped-image counterpart.
    num_classes = 5004 * 2
    model = net(num_classes=num_classes, inchannels=4).cuda()
    i = 0  # global iteration counter
    iter_smooth = 50  # smooth train metrics over this many iterations
    iter_valid = 200
    iter_save = 200
    epoch = 0
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 betas=(0.9, 0.99),
                                 weight_decay=0.0002)
    resultDir = './result/{}_{}'.format(model_name, fold_index)
    ImageDir = resultDir + '/image'
    checkPoint = os.path.join(resultDir, 'checkpoint')
    os.makedirs(checkPoint, exist_ok=True)
    os.makedirs(ImageDir, exist_ok=True)
    log = Logger()
    log.open(os.path.join(resultDir, 'log_train.txt'), mode='a')
    log.write(' start_time :{} \n'.format(
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    log.write(' batch_size :{} \n'.format(batch_size))
    # Image,Id
    data_train = pd.read_csv('./input/train_split_{}.csv'.format(fold_index))
    names_train = data_train['Image'].tolist()
    labels_train = data_train['Id'].tolist()
    data_valid = pd.read_csv('./input/valid_split_{}.csv'.format(fold_index))
    names_valid = data_valid['Image'].tolist()
    labels_valid = data_valid['Id'].tolist()
    num_data = len(names_train)
    dst_train = WhaleDataset(names_train,
                             labels_train,
                             mode='train',
                             transform_train=transform_train,
                             min_num_classes=min_num_class)
    dataloader_train = DataLoader(dst_train,
                                  shuffle=True,
                                  drop_last=True,
                                  batch_size=batch_size,
                                  num_workers=16,
                                  collate_fn=train_batch)
    print(dst_train.__len__())
    dst_valid = WhaleTestDataset(names_valid,
                                 labels_valid,
                                 mode='valid',
                                 transform=transform_valid)
    dataloader_valid = DataLoader(dst_valid,
                                  shuffle=False,
                                  batch_size=batch_size * 2,
                                  num_workers=8,
                                  collate_fn=valid_batch)
    # Running metric accumulators.
    train_loss = 0.0
    valid_loss = 0.0
    top1, top5, map5 = 0, 0, 0
    top1_train, top5_train, map5_train = 0, 0, 0
    top1_batch, top5_batch, map5_batch = 0, 0, 0
    batch_loss = 0.0
    train_loss_sum = 0
    train_top1_sum = 0
    train_map5_sum = 0
    sum = 0  # NOTE: shadows the builtin ``sum``; counts smoothed iterations
    skips = []
    if not checkPoint_start == 0:
        # Resume: restore model + optimizer state and force the requested lr.
        # NOTE(review): ``freeze`` is not defined in this function — presumably
        # a module-level global; verify before resuming.
        log.write(' start from{}, l_rate ={} \n'.format(checkPoint_start, lr))
        log.write('freeze={}, batch_size={}, min_num_class={} \n'.format(
            freeze, batch_size, min_num_class))
        model.load_pretrain(os.path.join(checkPoint,
                                         '%08d_model.pth' % checkPoint_start),
                            skip=skips)
        ckp = torch.load(
            os.path.join(checkPoint,
                         '%08d_optimizer.pth' % checkPoint_start))
        optimizer.load_state_dict(ckp['optimizer'])
        for params in optimizer.param_groups:
            params['lr'] = lr
        i = checkPoint_start
        epoch = ckp['epoch']
    log.write(' rate     iter   epoch | valid   top@1    top@5    map@5 | '
              'train    top@1    top@5    map@5 |'
              ' batch    top@1    top@5    map@5 |  time          \n')
    log.write(
        '--------------------------------------------------------------------------------------------------------'
        '-------------------------------------------------------------------\n'
    )
    start = timer()
    start_epoch = epoch
    best_t = 0
    cycle_epoch = 0
    while i < 10000000:
        for data in dataloader_train:
            # Fractional epoch; the ``* 4`` presumably reflects image flips /
            # augmentation multiplying effective samples — TODO confirm.
            epoch = start_epoch + (
                i - checkPoint_start) * 4 * batch_size / num_data
            if i % iter_valid == 0:
                # NOTE(review): unpacks 5 values from ``eval`` — the 6-value
                # variant elsewhere in this repo would not match; confirm which
                # eval() is in scope here.
                valid_loss, top1, top5, map5, best_t = \
                    eval(model, dataloader_valid)
                print('\r', end='', flush=True)
                log.write(
                    '%0.5f %5.2f k %5.2f  | %0.3f    %0.3f    %0.3f    %0.4f    %0.4f | %0.3f    %0.3f    %0.3f | %0.3f'
                    '    %0.3f    %0.3f | %s \n' % (
                        lr, i / 1000, epoch, valid_loss, top1, top5, map5,
                        best_t, train_loss, top1_train, map5_train, batch_loss,
                        top1_batch, map5_batch, time_to_str(
                            (timer() - start) / 60)))
                time.sleep(0.01)
            if i % iter_save == 0 and not i == checkPoint_start:
                torch.save(model.state_dict(),
                           resultDir + '/checkpoint/%08d_model.pth' % i)
                torch.save(
                    {
                        'optimizer': optimizer.state_dict(),
                        'iter': i,
                        'epoch': epoch,
                        'best_t': best_t,
                    }, resultDir + '/checkpoint/%08d_optimizer.pth' % (i))

            model.train()
            model.mode = 'train'
            images, labels = data
            images = images.cuda()
            labels = labels.cuda().long()
            global_feat, local_feat, results = data_parallel(model, images)
            model.getLoss(global_feat, local_feat, results, labels)
            batch_loss = model.loss
            optimizer.zero_grad()
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=5.0,
                                           norm_type=2)
            optimizer.step()
            # Append a constant 0.5 "new_whale" column before scoring metrics.
            results = torch.cat([
                torch.sigmoid(results),
                torch.ones_like(results[:, :1]).float().cuda() * 0.5
            ], 1)
            top1_batch = accuracy(results, labels, topk=(1, ))[0]
            map5_batch = mapk(labels, results, k=5)
            batch_loss = batch_loss.data.cpu().numpy()
            sum += 1
            train_loss_sum += batch_loss
            train_top1_sum += top1_batch
            train_map5_sum += map5_batch
            # Periodically fold the accumulators into smoothed train metrics.
            if (i + 1) % iter_smooth == 0:
                train_loss = train_loss_sum / sum
                top1_train = train_top1_sum / sum
                map5_train = train_map5_sum / sum
                train_loss_sum = 0
                train_top1_sum = 0
                train_map5_sum = 0
                sum = 0
            print(
                '\r%0.5f %5.2f k %5.2f  | %0.3f    %0.3f    %0.3f    %0.4f    %0.4f | %0.3f    %0.3f    %0.3f | %0.3f'
                '    %0.3f    %0.3f | %s  %d %d' % (
                    lr, i / 1000, epoch, valid_loss, top1, top5, map5, best_t,
                    train_loss, top1_train, map5_train, batch_loss, top1_batch,
                    map5_batch, time_to_str(
                        (timer() - start) / 60), checkPoint_start, i),
                end='',
                flush=True)
            i += 1
        pass
def eval_embed(model, dataloader_valid, embeddings, ind_train_toID):
    """Evaluate by nearest-neighbour distance to precomputed train embeddings.

    For each validation batch, computes euclidean distances from the model's
    features to ``embeddings``, averages the original/flipped pairs, rescales
    distances to pseudo-probabilities (1 - normalized distance), then sweeps
    'new_whale' thresholds to report the best map@10 / top-k accuracies.

    Args:
        model: network exposing getLoss() and producing (features, local, outs).
        dataloader_valid: yields (images, labels, _) batches.
        embeddings: training-set embedding matrix; rows are assumed to be
            original/flipped image pairs interleaved — TODO confirm.
        ind_train_toID: maps embedding row index to whale ID.

    Returns:
        (valid_loss, top1, top5, top10, map10, best_t)
    """
    # Setup torch and model for evaluation and efficiency
    with torch.no_grad():
        model.eval()
        # TODO: Test this. Do not think this is necessary
        #model.mode = 'valid'
        #valid_loss, index_valid= 0, 0
        all_feats = []
        all_local = []
        # Embedded
        all_results_e = []
        all_labels = []
        # Run dataset through model and get accuracy
        for data in dataloader_valid:
            images, labels, _ = data
            images = images.cuda()
            labels = labels.cuda().long()
            features, local_feat, outs = data_parallel(model, images)
            # NEW loss
            #model.getLoss(features[::2], local_feat[::2], outs[::2], labels)
            """
            print(type(features), features.dtype)
            print(features.size())
            print(type(embeddings), embeddings.dtype)
            print(embeddings.size())
            """
            # Get distances to embeddings
            # dist_mat: pytorch Variable, with shape [N, M]
            dist_mat = euclidean_dist(features.cuda(), embeddings.cuda())
            #dist_mat = euclidean_dist(outs.cuda(), embeddings.cuda())
            # TODO: trying new!
            #dist_mat = features.cuda().mm(embeddings.cuda().t())
            #dist_mat = outs.mm(embeddings.cuda().t())
            """
            print(dist_mat)
            print(dist_mat.size())
            """
            flip_lr = True
            if flip_lr:
                # TODO: Test and decide which to do
                # Will do nearest neighbors first then try "center".
                # OR
                # Right now, I am trying to compare my image features to embeddings of every
                # single training image. But what I need is to compare them to embeddings
                # of the LABELS... So, I need to get a "center" vector for each label and
                # compare my features to those instead...
                known = embeddings.size()[0] // 2
                assert known == ind_train_toID.size()[0]
                # TODO: TRYING THIS OUT.. THINK WAS A MISTAKE...???
                # Labels are no longer 1...n, then flipped. But 1,1f,...n,nf right next
                # since I am not using model outputs but batch input to get embeddings
                #dist_mat = (dist_mat[::2, :known] + dist_mat[1::2, known:])/2
                # Average distances over the (original, flipped) pair in both
                # the query rows and the embedding columns.
                dist_mat = (dist_mat[::2, ::2] + dist_mat[1::2, 1::2]) / 2

            # Currently, what we have is a mapping of distances to images' index but we want
            # a mapping of distances to the ID that corresponds to those images.
            # Instead of creating a new accuracy fcn, if I change the range of the
            # distance to [0..1], and then do 1 - distances, then I can just use the rest of
            # the set up just like if it were classification. Accuracy fcn was just
            # modified to convert the indices to IDs
            # Trying to make sure to get the resuts to be like probabilities
            # so acuracy function can be used as well
            # TODO: remove this to see how it influences accuracy.
            # TODO: JUST REPLACED THIS AND NEXT LINE WITH NEW WAY AS TEST
            #dist_mat = torch.sigmoid(dist_mat)
            #all_results_e.append(1 - dist_mat)
            all_results_e.append(dist_mat)
            all_labels.append(labels)
            all_feats.append(features[::2])
            all_local.append(local_feat[::2])
            #b = len(labels)
            # NEW loss
            #valid_loss += model.loss.data.cpu().numpy() * b
            #index_valid += b

        all_results_pre = torch.cat(all_results_e, 0)
        all_labels = torch.cat(all_labels, 0)
        all_feats = torch.cat(all_feats, 0)
        all_local = torch.cat(all_local, 0)
        #print("In eval:")
        print("Min and Max distances seen: {} {}".format(
            all_results_pre.min(), all_results_pre.max()))
        #"""
        #print("Some results visualized:")
        #see, _ = all_results_e.topk(5, 1, True, True)
        #print(see[:20])

        # Rescale distances into [0, 1] so that ``1 - d`` behaves like a
        # probability; empirically chosen min/max are widened when exceeded.
        min_dist = 0.8
        #min_dist = -0.3
        #min_dist = 0 #1900
        #min_dist = 2000
        min_found = all_results_pre.min()
        if min_found < min_dist:
            print("#" * 20)
            print("Lower min found: {}".format(min_found))
            min_dist = min_found
        results3 = all_results_pre - min_dist
        # Don't necessarily need this second part if max_dist is close to 1
        max_dist = 0.9  # 1.7 - 0.8 OR # 0.6 + 0.3
        #max_dist = 200 #200-0
        # ??? 8400-1900=6500
        #max_dist = 6200 # 8200-2000
        max_found = results3.max()
        if max_found > max_dist:
            print("#" * 20)
            print("Higher max found: {}".format(max_found))
            max_dist = max_found
        results2 = results3 / max_dist
        all_results_e = 1 - results2
        #"""
        #all_results_e = all_results_pre
    # End of with
    # NEW loss
    #valid_loss /= index_valid
    model.getLoss(all_feats, all_local, all_results_e, all_labels,
                  ind_train_toID)
    valid_loss = model.loss.data.cpu().numpy()

    # TODO: implement giving each 'new_whale' their own ID so accuracy metric is more
    # accurate.. (as done on paper w/ explanation to Hilal) if min occ. to include is 1.
    # This won't relly be helpful with 'new_whale's as I can't test for re-identification
    # but I will def need this to include 'new_whale's with more than 1 occurence
    # to test that the embedding approach works
    map10s, top1s, top5s, top10s = [], [], [], []
    new_res, known_res = [], []
    # Adding the mapping for the 'label_map',
    # as 'new_whale' threshold added to results_t is not
    # a part of the training set data
    label_map = torch.cat(
        [ind_train_toID, torch.Tensor([CLASSES]).cuda().long()], 0)
    if 1:
        ts = np.linspace(0.1, 0.9, 9)
        for t in ts:
            # Guess that the id is 'new_whale' if nothing else has a high enough probability
            # We don't know what a good threshold is here, so test out a bunch
            results_t = torch.cat([
                all_results_e,
                torch.ones_like(all_results_e[:, :1]).float().cuda() * t
            ], 1)
            # The guess for new_whale is appended to the end. So the last id is for 'new_whale'
            # We convert the 'new_whale' ID from its original state: 'the last ID after all the flipped IDs'
            all_labels[all_labels == CLASSES * 2] = CLASSES
            #new_whale, known, top1_, top5_ = accuracy(results_t, all_labels, label_map=label_map, sep=CLASSES)
            # MOD 9
            new_whale, known, top1_, top5_, top10_ = accuracy(
                results_t,
                all_labels,
                topk=(1, 5, 10),
                label_map=label_map,
                sep=CLASSES)
            #map5_ = mapk(all_labels, results_t, k=5, label_map=label_map)
            map10_ = mapk(all_labels, results_t, k=10, label_map=label_map)
            """
            print("t:", t, "map:", map10_)
            print("e: known: {}, new: {}".format(known, new_whale))
            #print("top1: {}, top5: {}".format(top1_, top5_))
            print("top1: {}, top5: {}, top10: {}".format(top1_, top5_, top10_))
            """
            map10s.append(map10_)
            new_res.append(new_whale)
            known_res.append(known)
            top1s.append(top1_)
            top5s.append(top5_)
            top10s.append(top10_)

    # Keep the metrics from the threshold with the best map@10.
    map10 = max(map10s)
    i_max = map10s.index(map10)
    new_b = new_res[i_max]
    known_b = known_res[i_max]
    top1 = top1s[i_max]
    top5 = top5s[i_max]
    top10 = top10s[i_max]
    best_t = ts[i_max]
    # TODO: Output the results when separated by known vs. 'new_whale' as well
    print("Best in eval")
    print("e: known: {}, new: {}".format(known_b, new_b))
    print("top1: {}, top5: {}".format(top1, top5))
    #return valid_loss, top1, top5, map5, best_t
    return valid_loss, top1, top5, top10, map10, best_t
def eval(model, dataLoader_valid):
    """Validate the classifier, sweeping 'new_whale' thresholds.

    Averages sigmoid outputs of each original/flipped image pair, appends a
    constant 'new_whale' column for each threshold t in [0.1, 0.9], and picks
    the threshold maximizing mapk.

    NOTE: shadows the builtin ``eval`` (kept for compatibility with callers).

    Args:
        model: network exposing getLoss(); forward yields (feature, local, results).
        dataLoader_valid: yields (images, labels, names) batches.

    Returns:
        (valid_loss, top1, top5, top10, map10, best_t)
    """
    with torch.no_grad():
        model.eval()
        model.mode = 'valid'
        valid_loss, index_valid = 0, 0
        all_results = []
        all_labels = []
        for valid_data in dataLoader_valid:
            images, labels, names = valid_data
            images = images.cuda()
            labels = labels.cuda().long()
            feature, local_feat, results = data_parallel(model, images)
            model.getLoss(feature[::2], local_feat[::2], results[::2], labels)
            results = torch.sigmoid(results)
            # Combine the guesses for the results: since the outputs
            # are arranged so that an image and its flipped counterpart
            # are right after one another. And the flipped version's
            # correct classification is the original's ID + classes.
            # So, we add the 2 (out of 4) quadrants that contain the
            # correct intersection and ignore the rest (guess it doesn't matter
            # if it's wrong anyways?... )
            # (This is not necessarily an accurate representation
            # of the overall accuracy. But it does boost the accuracy...
            # Seems the network can sometimes better recognize an image as
            # an individual when its flipped.. interesting)
            results_zeros = (results[::2, :classes] +
                             results[1::2, classes:]) / 2
            all_results.append(results_zeros)
            all_labels.append(labels)
            # Accumulate loss weighted by batch size for a proper mean.
            b = len(labels)
            valid_loss += model.loss.data.cpu().numpy() * b
            index_valid += b
        all_results = torch.cat(all_results, 0)
        all_labels = torch.cat(all_labels, 0)
        map10s, top1s, top5s, top10s = [], [], [], []
        if 1:
            ts = np.linspace(0.1, 0.9, 9)
            for t in ts:
                # Guess that the id is 'new_whale' if nothing else has a high enough probability
                # We don't know what a good threshold is here, so test out a bunch
                results_t = torch.cat([
                    all_results,
                    torch.ones_like(all_results[:, :1]).float().cuda() * t
                ], 1)
                # The guess for new_whale is appended to the end. So the last id is for 'new_whale'
                all_labels[all_labels == classes * 2] = classes
                top1_, top5_, top10_ = accuracy(results_t,
                                                all_labels,
                                                topk=(1, 5, 10))
                # NOTE(review): computed with k=5 but stored/returned as
                # map10 — the sibling eval_embed uses k=10; confirm intended k.
                map10_ = mapk(all_labels, results_t, k=5)
                map10s.append(map10_)
                top1s.append(top1_)
                top5s.append(top5_)
                top10s.append(top10_)
        # Report the metrics at the threshold with the best map score.
        map10 = max(map10s)
        i_max = map10s.index(map10)
        top1 = top1s[i_max]
        top5 = top5s[i_max]
        top10 = top10s[i_max]
        best_t = ts[i_max]
        valid_loss /= index_valid
        #return valid_loss, top1, top5, map5, best_t
        return valid_loss, top1, top5, top10, map10, best_t
def train(freeze=False,
          fold_index=1,
          model_name='seresnext50',
          min_num_class=10,
          checkPoint_start=0,
          lr=3e-4,
          batch_size=36,
          kaggle=False):
    """Train the whale-classifier on the WC dataset split `fold_index`.

    Runs an open-ended iteration loop (not epoch-bounded), validating and
    checkpointing every `iter_valid`/`iter_save` steps. When resuming
    (`checkPoint_start != 0`), `kaggle=True` loads a model pretrained on the
    Kaggle 5004-class dataset and re-initializes the final FC layer for the
    2233-class WC dataset; otherwise the WC checkpoint plus optimizer state
    is restored.
    """
    # Doubled classes: each identity also has a "flipped" class ID.
    num_classes = 2233 * 2
    model = model_whale(num_classes=num_classes,
                        inchannels=4,
                        model_name=model_name).cuda()
    i = 0
    iter_smooth = 50  # iterations between train-metric averaging
    iter_valid = 200  # iterations between validation passes
    iter_save = 200   # iterations between checkpoints
    epoch = 0
    if freeze:
        model.freeze()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 betas=(0.9, 0.99),
                                 weight_decay=0.0002)
    # optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=0.0002)
    resultDir = './WC_result/{}_{}'.format(model_name, fold_index)
    ImageDir = resultDir + '/image'
    checkPoint = os.path.join(resultDir, 'checkpoint')
    os.makedirs(checkPoint, exist_ok=True)
    os.makedirs(ImageDir, exist_ok=True)
    log = Logger()
    log.open(os.path.join(resultDir, 'log_train.txt'), mode='a')
    log.write(' start_time :{} \n'.format(
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    log.write(' batch_size :{} \n'.format(batch_size))
    # CSV columns: Image,Id
    data_train = pd.read_csv(
        './WC_input/train_split_{}.csv'.format(fold_index), dtype='object')
    if test_train:
        # Fold the validation split (and optionally pseudo-labels) into the
        # training data for a final train-on-everything run.
        data_valid = pd.read_csv(
            './WC_input/valid_split_{}.csv'.format(fold_index),
            dtype='object')
        data_train = data_train.append(data_valid)
        if pseudo_labels:
            data_test_pseudo_label = pd.read_csv('./WC_input/pseudo_labels.csv',
                                                 dtype='object')
            data_train = data_train.append(data_test_pseudo_label)
        train_mode = 'data'
    else:
        train_mode = 'train'
    names_train = data_train['Image'].tolist()
    labels_train = data_train['Id'].tolist()
    data_valid = pd.read_csv(
        './WC_input/valid_split_{}.csv'.format(fold_index), dtype='object')
    names_valid = data_valid['Image'].tolist()
    labels_valid = data_valid['Id'].tolist()
    num_data = len(names_train)
    dst_train = WhaleDataset(names_train,
                             labels_train,
                             mode=train_mode,
                             transform_train=transform_train,
                             min_num_classes=min_num_class)
    dataloader_train = DataLoader(dst_train,
                                  shuffle=True,
                                  drop_last=True,
                                  batch_size=batch_size,
                                  num_workers=12,
                                  collate_fn=train_collate)
    print(dst_train.__len__())
    dst_valid = WhaleTestDataset(names_valid,
                                 labels_valid,
                                 mode='valid',
                                 transform=transform_valid)
    dataloader_valid = DataLoader(dst_valid,
                                  shuffle=False,
                                  batch_size=batch_size * 2,
                                  num_workers=8,
                                  collate_fn=valid_collate)
    # Running metric state shared between the logging branches below.
    train_loss = 0.0
    valid_loss = 0.0
    top1, top5, map5 = 0, 0, 0
    top1_train, top5_train, map5_train = 0, 0, 0
    top1_batch, top5_batch, map5_batch = 0, 0, 0
    batch_loss = 0.0
    train_loss_sum = 0
    train_top1_sum = 0
    train_map5_sum = 0
    sum = 0  # NOTE: shadows the builtin `sum`; counts batches since last smooth
    skips = []
    if not checkPoint_start == 0:
        log.write(' start from{}, l_rate ={} \n'.format(checkPoint_start, lr))
        log.write('freeze={}, batch_size={}, min_num_class={} \n'.format(
            freeze, batch_size, min_num_class))
        if kaggle:
            # Warm-start from a model trained on the Kaggle whale dataset
            # (5004 identities), then swap in a fresh FC head for this
            # dataset's 2233 identities. Optimizer state is NOT restored here.
            print('LOAD FROM Pretrained Model on Kaggle')
            num_classes = 5004 * 2
            model = model_whale(num_classes=num_classes,
                                inchannels=4,
                                model_name=model_name).cuda()
            if freeze:
                model.freeze()
            checkPoint_kaggle = checkPoint.replace('WC_result', 'result')
            model.load_pretrain(os.path.join(
                checkPoint_kaggle, '%08d_model.pth' % (checkPoint_start)),
                                skip=skips)
            planes = 2048  # backbone feature width feeding the FC head
            num_classes = 2233 * 2
            model.fc = nn.Linear(planes, num_classes)
            init.normal_(model.fc.weight, std=0.001)
            init.constant_(model.fc.bias, 0)
            model.cuda()
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=lr,
                                         betas=(0.9, 0.99),
                                         weight_decay=0.0002)
        else:
            # Resume a previous run on this dataset: restore weights AND
            # optimizer state, then force the requested learning rate.
            print('GDSC Dataset')
            print('checkpoint:', checkPoint)
            num_classes = 2233 * 2
            model = model_whale(num_classes=num_classes,
                                inchannels=4,
                                model_name=model_name).cuda()
            if freeze:
                model.freeze()
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=lr,
                                         betas=(0.9, 0.99),
                                         weight_decay=0.0002)
            model.load_pretrain(os.path.join(
                checkPoint, '%08d_model.pth' % (checkPoint_start)),
                                skip=skips)
            ckp = torch.load(
                os.path.join(checkPoint,
                             '%08d_optimizer.pth' % (checkPoint_start)))
            optimizer.load_state_dict(ckp['optimizer'])
            adjust_learning_rate(optimizer, lr)
        i = checkPoint_start
        epoch = 0  # ckp['epoch']
    log.write(' rate iter epoch | valid top@1 top@5 map@5 | '
              'train top@1 top@5 map@5 |'
              ' batch top@1 top@5 map@5 | time \n')
    log.write(
        '---------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n'
    )
    start = timer()
    start_epoch = epoch
    best_t = 0
    cycle_epoch = 0
    # Open-ended loop: stop by killing the process; checkpoints persist state.
    while i < 10000000:
        for data in dataloader_train:
            # Fractional epoch; the *4 accounts for the augmented copies
            # produced per sample by the collate/dataset pipeline —
            # TODO confirm against train_collate.
            epoch = start_epoch + (
                i - checkPoint_start) * 4 * batch_size / num_data
            if i % iter_valid == 0:
                valid_loss, top1, top5, map5, best_t = \
                    eval(model, dataloader_valid)
                print('\r', end='', flush=True)
                log.write(
                    '%0.5f %5.2f k %5.2f |'
                    ' %0.3f %0.3f %0.3f %0.4f %0.4f | %0.3f %0.3f %0.3f | %0.3f %0.3f %0.3f | %s \n' % ( \
                    lr, i / 1000, epoch,
                    valid_loss, top1, top5, map5, best_t,
                    train_loss, top1_train, map5_train,
                    batch_loss, top1_batch, map5_batch,
                    time_to_str((timer() - start) / 60)))
                time.sleep(0.01)
            if i % iter_save == 0 and not i == checkPoint_start:
                torch.save(model.state_dict(),
                           resultDir + '/checkpoint/%08d_model.pth' % (i))
                torch.save(
                    {
                        'optimizer': optimizer.state_dict(),
                        'iter': i,
                        'epoch': epoch,
                        'best_t': best_t,
                    }, resultDir + '/checkpoint/%08d_optimizer.pth' % (i))
            # Validation switched the model to eval mode; switch back.
            model.train()
            model.mode = 'train'
            images, labels = data
            images = images.cuda()
            labels = labels.cuda().long()
            # global_feat, local_feat, results = model(images)
            global_feat, local_feat, results = data_parallel(model, images)
            model.getLoss(global_feat, local_feat, results, labels)
            batch_loss = model.loss
            optimizer.zero_grad()
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=5.0,
                                           norm_type=2)
            optimizer.step()
            # Append a constant 0.5 'new_whale' column before scoring.
            results = torch.cat([
                torch.sigmoid(results),
                torch.ones_like(results[:, :1]).float().cuda() * 0.5
            ], 1)
            top1_batch = accuracy(results, labels, topk=(1, ))[0]
            map5_batch = mapk(labels, results, k=5)
            batch_loss = batch_loss.data.cpu().numpy()
            sum += 1
            train_loss_sum += batch_loss
            train_top1_sum += top1_batch
            train_map5_sum += map5_batch
            # Average training metrics over the last iter_smooth batches.
            if (i + 1) % iter_smooth == 0:
                train_loss = train_loss_sum / sum
                top1_train = train_top1_sum / sum
                map5_train = train_map5_sum / sum
                train_loss_sum = 0
                train_top1_sum = 0
                train_map5_sum = 0
                sum = 0
            print('\r%0.5f %5.2f k %5.2f | %0.3f %0.3f %0.3f %0.4f %0.4f | %0.3f %0.3f %0.3f | %0.3f %0.3f %0.3f | %s %d %d' % ( \
                lr, i / 1000, epoch,
                valid_loss, top1, top5, map5, best_t,
                train_loss, top1_train, map5_train,
                batch_loss, top1_batch, map5_batch,
                time_to_str((timer() - start) / 60), checkPoint_start, i)
                , end='', flush=True)
            i += 1
        pass
def train(config, num_classes=1108):
    """Train a classifier driven entirely by a `config` object.

    Splits one CustomDataset into train/validation via index samplers, trains
    with RAdam wrapped in Lookahead, logs every `config.train.verbose_interval`
    iterations, validates once per epoch, and checkpoints every
    `config.train.save_period` epochs. Resumes from
    `config.train.start_epoch` when `config.train.load_cpk` is set.
    """
    model = model_whale(num_classes=num_classes,
                        inchannels=6,
                        model_name=config.train.model_name,
                        pretrained=config.train.pretrained).cuda()
    if config.train.freeze:
        model.freeze()
    # Lookahead wraps the inner RAdam optimizer.
    base_opt = RAdam(model.parameters(), lr=config.train.lr)
    optimizer = Lookahead(base_opt)
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.train.lr, betas=(0.9, 0.99), weight_decay=0.0002)
    resultDir = config.train.result_dir
    checkPoint = join(resultDir, 'checkpoint')
    # if not config.train.in_colab:
    #     os.makedirs(checkPoint, exist_ok=True)
    train_dataset = CustomDataset(config.train.csv_file,
                                  config.train.img_dir,
                                  transforms=transforms['train'])
    # Train/validation split by shuffled index lists over ONE dataset.
    dataset_size = len(train_dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(config.train.validation_split * dataset_size))
    if config.train.shuffle_dataset:
        np.random.seed(config.train.random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]
    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.train.batch_size,
        sampler=train_sampler,
        num_workers=config.train.num_workers)
    validation_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.train.batch_size,
        sampler=valid_sampler,
        num_workers=config.train.num_workers)
    train_loss = 0.
    # load from cpk:
    if config.train.load_cpk:
        model.load_pretrain(os.path.join(
            checkPoint, '%08d_model.pth' % (config.train.start_epoch)),
                            skip=[])
        cpk = torch.load(
            os.path.join(checkPoint,
                         '%08d_optimizer.pth' % (config.train.start_epoch)))
        optimizer.load_state_dict(cpk['optimizer'])
        adjust_learning_rate(optimizer, config.train.lr)
        start_epoch = cpk['epoch']
    else:
        start_epoch = 0
    top1_batch, map5_batch = 0, 0
    for epoch in range(start_epoch + 1, config.train.epochs):
        print('Starting:', epoch, 'Iterations:', len(train_loader))
        for i, data in enumerate(train_loader):
            model.train()
            model.mode = 'train'
            images, labels = data
            images = images.cuda()
            labels = labels.cuda().long()
            global_feat, local_feat, results = data_parallel(model, images)
            model.getLoss(global_feat,
                          local_feat,
                          results,
                          labels,
                          config,
                          verbose=(i % config.loss.verbose_interval == 0))
            batch_loss = model.loss
            optimizer.zero_grad()
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           max_norm=5.0,
                                           norm_type=2)
            optimizer.step()
            results = torch.sigmoid(results)
            # Accumulate metrics; reset after each verbose report below.
            train_loss += batch_loss.data.cpu().numpy()
            top1_batch += accuracy(results, labels, topk=[1])[0]
            map5_batch += mapk(labels, results, k=5)
            if i % config.train.verbose_interval == 0:
                print(
                    'epoch: %03d, iter: %05d, train_loss: %f, top1_batch: %f, map5_batch: %f'
                    % (epoch, i,
                       float(train_loss / config.train.verbose_interval),
                       float(top1_batch / config.train.verbose_interval),
                       float(map5_batch / config.train.verbose_interval)))
                # print(f'epoch: {epoch}, iter: {i}, train_loss: {float(train_loss / config.train.verbose_interval)}, top1_batch: {float(top1_batch / config.train.verbose_interval)}, map5_batch: {float(map5_batch / config.train.verbose_interval)}')
                train_loss, top1_batch, map5_batch = 0, 0, 0
        # End-of-epoch validation over the held-out sampler indices.
        valid_loss, top1_valid, map5_valid = valid_eval(
            config, model, validation_loader)
        print(
            'epoch: %03d, iter: %05d, valid_loss: %f, valid_top1_batch: %f, valid_map5_batch: %f'
            % (epoch, i, valid_loss, top1_valid, map5_valid))
        # print(f'epoch: {epoch}, iter: {i}, valid_loss: {valid_loss}, top1_batch: {top1_valid}, map5_batch: {map5_valid}')
        if epoch % config.train.save_period == 0:
            # Touch the files first — presumably to pre-create them on a
            # mounted filesystem (e.g. Colab/Drive) before torch.save writes.
            os.system("touch " + resultDir + "/checkpoint/%08d_model.pth" %
                      (epoch))
            os.system("touch " + resultDir + "/checkpoint/%08d_optimizer.pth" %
                      (epoch))
            time.sleep(1)
            torch.save(model.state_dict(),
                       resultDir + '/checkpoint/%08d_model.pth' % (epoch))
            torch.save({
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }, resultDir + '/checkpoint/%08d_optimizer.pth' % (epoch))
def train(train_loader, val_loader, model, optimizer, args, model_path):
    """Train `model` with apex mixed precision (opt level O1).

    Validates every `args.log_interval` iterations, keeping the checkpoint
    with the lowest validation loss, and appends a metrics row to
    `model_path + '.csv'` after every validation.
    """
    model.cuda()
    steps = 0
    best_acc = 0
    best_loss = float('inf')
    # Wrap model/optimizer for automatic mixed precision.
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level="O1",
                                      verbosity=0)
    train_info = {
        'epoch': [],
        'train_loss': [],
        'val_loss': [],
        'metric': [],
        'best': []
    }
    print(
        'epoch | lr | % | loss | avg |val loss| top1 | top3 | best | time | save |'
    )
    bg = time.time()
    train_iter = 0
    model.train()
    for epoch in range(1, args.epochs + 1):
        losses = []
        train_loss = 0
        last_val_iter = 0
        current_lr = get_lrs(optimizer)
        for batch_idx, batch in enumerate(train_loader):
            train_iter += 1
            feature, target = batch[0], batch[1]
            # feature.data.t_(), target.data.sub_(1)  # batch first, index align
            feature, target = feature.cuda(), target.cuda()
            optimizer.zero_grad()
            logit = data_parallel(model, feature)
            loss = criterion(logit, target)
            # Scale the loss for fp16-safe gradients before backward.
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            optimizer.step()
            train_loss += loss.item()
            losses.append(loss.item())
            # Progress line; train_loss is averaged since the last validation.
            print('\r {:4d} | {:.5f} | {:4d}/{} | {:.4f} | {:.4f} |'.format(
                epoch, float(current_lr[0]),
                args.batch_size * (batch_idx + 1), train_loader.num,
                loss.item(), train_loss / (train_iter - last_val_iter)),
                  end='')
            if train_iter > 0 and train_iter % args.log_interval == 0:
                top_1, top_3, val_loss, size = validate(val_loader, model)
                # test_top_1, tst_top_3, test_loss, _ = validate(test_loader, model)
                _save_ckp = ' '
                # Checkpoint on best validation LOSS; best_acc tracks the
                # top-1 achieved at that point.
                if val_loss < best_loss:
                    best_acc = top_1
                    best_loss = val_loss
                    save_checkpoint(model_path, model, optimizer)
                    _save_ckp = '*'
                print(' {:.4f} | {:.4f} | {:.4f} | {:.4f} | {:.2f} | {:4s} |'.
                      format(val_loss, top_1, top_3, best_acc,
                             (time.time() - bg) / 60, _save_ckp))
                train_info['epoch'].append(args.batch_size * (batch_idx + 1) /
                                           train_loader.num + epoch)
                train_info['train_loss'].append(train_loss / (batch_idx + 1))
                train_info['val_loss'].append(val_loss)
                train_info['metric'].append(top_1)
                train_info['best'].append(best_acc)
                log_df = pd.DataFrame(train_info)
                log_df.to_csv(model_path + '.csv')
                train_loss = 0
                last_val_iter = train_iter
                # validate() left the model in eval mode; restore training.
                model.train()
    # Final flush of the metrics log.
    log_df = pd.DataFrame(train_info)
    log_df.to_csv(model_path + '.csv')
    print("Best accuracy is {:.4f}".format(best_acc))
def train(debug,
          freeze=False,
          fold_index=1,
          model_name='seresnext50',
          min_num_class=10,
          checkpoint_start=0,
          lr=3e-4,
          batch_size=36,
          num_classes=10008,
          embed=False):
    """Experimental whale-ID training loop with an optional embedding mode.

    With `embed=False` this is a standard classification loop. With
    `embed=True`, training-set feature embeddings are recomputed every
    iteration and batch logits are derived from (rescaled) euclidean
    distances to those embeddings; the embeddings of the best-validating
    model are dumped to CSV. Relies on module-level flags `withMask`,
    `test`, `oldModel`, `newWhale`, `classes`.
    """
    import ipdb
    device = torch.device('cuda')
    # 4 input channels when a mask channel is stacked onto RGB.
    if withMask:
        #model = model_whale(num_classes=num_classes, inchannels=4, model_name=model_name).cuda()
        model = model_whale(num_classes=num_classes,
                            inchannels=4,
                            model_name=model_name).to(device)
    else:
        model = model_whale(num_classes=num_classes,
                            inchannels=3,
                            model_name=model_name).cuda()
    if debug:
        print(model)
        print("\n\n\n\n\n\n\n\n\n")
        #print(" 999999999 ")
        #summary(model.cuda(), (3, width, height))
    i = 0
    # We want more valid/saving for embed as the training times are much,
    # much longer.
    if embed:
        # Changed this since this method is much slower, helps to keep track
        iter_smooth = 10
        #iter_valid = 50
        # Valid and save should be the same so I know the state at the time
        # of save!
        iter_valid = 20
        iter_save = 20
    else:
        iter_smooth = 50
        iter_valid = 200
        iter_save = 200
    epoch = 0
    if freeze:
        model.freeze()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 betas=(0.9, 0.99),
                                 weight_decay=0.0002)
    # optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=0.0002)
    resultDir = './result/{}_{}'.format(model_name, fold_index)
    ImageDir = resultDir + '/image'
    checkPoint = os.path.join(resultDir, 'checkpoint')
    os.makedirs(checkPoint, exist_ok=True)
    os.makedirs(ImageDir, exist_ok=True)
    log = Logger()
    log.open(os.path.join(resultDir, 'log_train.txt'), mode='a')
    log.write(' start_time :{} \n'.format(
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    log.write(' batch_size :{} \n'.format(batch_size))
    # CSV columns: Image,Id
    # if debug: ipdb.set_trace(context=5)
    data_train = pd.read_csv('./input/train_split_{}.csv'.format(fold_index))
    names_train = data_train['Image'].tolist()
    labels_train = data_train['Id'].tolist()
    data_valid = pd.read_csv('./input/valid_split_{}.csv'.format(fold_index))
    names_valid = data_valid['Image'].tolist()
    labels_valid = data_valid['Id'].tolist()
    num_data = len(names_train)
    num_vld = len(names_valid)
    ind1 = len(set(labels_train))
    ind2 = len(set(labels_valid))
    print()
    print("Initial incoming image analysis")
    log.write("\nLooking at {} total images.\n".format(num_data + num_vld))
    log.write("{} train and {} validation images\n".format(num_data, num_vld))
    log.write("{} train and {} valid individuals\n".format(ind1, ind2))
    # Get the dataset for training
    if not (test):
        # For retraining with older model, unfinalized ideas
        if oldModel:
            dst_train = WhaleDataset(names_train,
                                     labels_train,
                                     mode='train',
                                     transform_train=transform_train,
                                     min_num_classes=min_num_class,
                                     newWhale=newWhale)
            dataloader_train = DataLoader(dst_train,
                                          shuffle=True,
                                          drop_last=True,
                                          batch_size=batch_size,
                                          num_workers=16,
                                          collate_fn=train_collate)
            # No dataloader for use to store embeddings
        else:
            dst_train = WhaleDataset(names_train,
                                     labels_train,
                                     mode='train',
                                     transform_train=transform_train,
                                     min_num_classes=min_num_class)
            dataloader_train = DataLoader(dst_train,
                                          shuffle=True,
                                          drop_last=True,
                                          batch_size=batch_size,
                                          num_workers=16,
                                          collate_fn=train_collate)
            # Create the dataloader for use to store embeddings
            dst_embed = WhaleTestDataset(names_train,
                                         labels_train,
                                         mode='train',
                                         transform=transform_valid)
            dataloader_embed = DataLoader(dst_embed,
                                          shuffle=False,
                                          drop_last=False,
                                          batch_size=batch_size,
                                          num_workers=16,
                                          collate_fn=valid_collate)
            # num_workers was previously 4 TODO: TEST THIS!!!
    else:
        dst_train = WhaleDataset(names_train,
                                 labels_train,
                                 mode='train',
                                 transform_train=transform_train,
                                 min_num_classes=min_num_class,
                                 newWhale=newWhale)
        # Change drop_last back to True if there is an issue...
        # Thought this would cause an error but apparently not
        dataloader_train = DataLoader(dst_train,
                                      shuffle=True,
                                      drop_last=False,
                                      batch_size=batch_size,
                                      num_workers=0,
                                      collate_fn=train_collate)
        # Create the dataloader for use to store embeddings
        dst_embed = WhaleTestDataset(names_train,
                                     labels_train,
                                     mode='train',
                                     transform=transform_embed)
        dataloader_embed = DataLoader(dst_embed,
                                      shuffle=False,
                                      drop_last=False,
                                      batch_size=3,
                                      num_workers=0)
    # Get the dataset for validation
    dst_valid = WhaleTestDataset(names_valid,
                                 labels_valid,
                                 mode='valid',
                                 transform=transform_valid)
    if not (test):
        dataloader_valid = DataLoader(dst_valid,
                                      shuffle=False,
                                      batch_size=batch_size * 2,
                                      num_workers=8,
                                      collate_fn=valid_collate)
        # num_workers was previously 4 TODO: TEST THIS!!
    else:
        dataloader_valid = DataLoader(dst_valid,
                                      shuffle=False,
                                      batch_size=1,
                                      num_workers=0,
                                      collate_fn=valid_collate)
    #ipdb.set_trace(context=5)
    print()
    print("Analysis of images to be used")
    print("Train & valid image analysis:")
    print("Number of training images with at least {} instances: {}".format(
        min_num_class, dst_train.num_images))
    print("Number of validation images with at least {} instances: {}".format(
        0, dst_valid.__len__()))
    # Right now it is important for everything in valid to be in train
    # (this should be changed later on)
    trn = set(dst_train.labels)
    vld = set(dst_valid.labels)
    extra = vld - trn
    i2 = len(vld)
    i1 = i2 - len(extra)
    print("There are {} individuals found in train".format(len(trn)))
    print("{} out of {} ids in valid are found in train".format(i1, i2))
    if (i1 < i2):
        print("The missing ones are: {}".format(extra))
    print()
    # Running metric state; map/top metrics use a k=10 cutoff in this version.
    train_loss = 0.0
    valid_loss = 0.0
    #top1, top5, map5 = 0, 0, 0
    top1, top5, top10, map10 = 0, 0, 0, 0
    top1_train, top5_train, map10_train = 0, 0, 0
    #top1_batch, top5_batch, map5_batch = 0, 0, 0
    top1_batch, top5_batch, top10_batch, map10_batch = 0, 0, 0, 0
    batch_loss = 0.0
    train_loss_sum = 0
    train_top1_sum = 0
    train_map10_sum = 0
    sum_ = 0  # batches since last metric smoothing
    skips = []
    if not checkpoint_start == 0:
        # Resume: restore weights, optimizer state and epoch, re-apply lr.
        log.write('Starting from iter {}, l_rate = {} \n'.format(
            checkpoint_start, lr))
        log.write('freeze={}, batch_size={}, min_num_class={} \n\n'.format(
            freeze, batch_size, min_num_class))
        model.load_pretrain(os.path.join(checkPoint,
                                         '%08d_model.pth' % (checkpoint_start)),
                            skip=skips)
        ckp = torch.load(
            os.path.join(checkPoint,
                         '%08d_optimizer.pth' % (checkpoint_start)))
        optimizer.load_state_dict(ckp['optimizer'])
        adjust_learning_rate(optimizer, lr)
        i = checkpoint_start
        epoch = ckp['epoch']
    #log.write(
    #    ' rate iter epoch | valid top@1 top@5 map@5 | '
    #    'train top@1 top@5 map@5 |'
    #    ' batch top@1 top@5 map@5 | time \n')
    log.write(
        'format: \n'
        'rate iter k epoch | valid top@1 top@5 map@5 best_t |'
        ' train top@1 map@5 |'
        ' batch top@1 map@5 | time \n')
    log.write(
        '--------------------------------------------------------------------------------------------------------------------------------------\n'
    )
    start = timer()
    start_epoch = epoch
    best_t = 0
    cycle_epoch = 0
    curMax = 0     # best top-1 seen in THIS run
    max_valid = 0  # best top-1 seen across ALL runs (persisted to acc_file)
    if not (oldModel):
        # Load the previously stored features with max validation accuracy.
        # This features is to stop runs from overwriting previously gotten,
        # good features because the system has no concept of what was
        # previously achieved.
        if embed:
            acc_file = "maxValAcc_{}_{}_embed.txt".format(
                model_name, fold_index)
        else:
            acc_file = "maxValAcc_{}_{}.txt".format(model_name, fold_index)
        # To avoid overwriting previously gotten results that were better
        try:
            max_valid = pd.read_csv(acc_file)['0'][0]
            print("Found max valid of:", max_valid)
        except:
            print("No existing max valid found. Creating one.")
            handle = open(acc_file, "w+")
            handle.close()
    #from ipdb import launch_ipdb_on_exception
    #ipdb.set_trace(context=5)
    # with launch_ipdb_on_exception():
    if True:
        while i < 10000000:
            print("\nAt iteration:", i)
            for data in dataloader_train:
                #print("\nIteration:",i)
                # Starting model learning process
                epoch = start_epoch + (
                    i - checkpoint_start) * 4 * batch_size / num_data
                # Temporarily addded to test convergence when running with
                # 'old' embeddings
                #if i % iter_valid == 0:
                if embed:
                    # Recompute training-set embeddings every iteration
                    # (expensive — hence the smaller iter_* values above).
                    ids, feats = get_features(dataloader_embed, model,
                                              num_classes)
                # Validation phase
                if i % iter_valid == 0:
                    if not (embed):
                        valid_loss, top1, top5, top10, map10, best_t = \
                            eval(model, dataloader_valid)
                    # Run embedding validation function
                    if embed:
                        valid_loss, top1, top5, top10, map10, best_t = \
                            eval_embed(model, dataloader_valid, \
                            torch.Tensor(feats).float(),
                            torch.Tensor(ids).cuda().long())
                    if top1 > curMax:
                        curMax = top1
                        print("New max valid for this run")
                    # Store the features of the best model version for
                    # embedding
                    if top1 > max_valid and not (oldModel):
                        if not (embed):
                            ids, feats = get_features(dataloader_embed, model,
                                                      num_classes)
                        max_valid = top1
                        print(
                            "Saving current features. Best valid acc. found.")
                        # Save data
                        # They are not being saved together as they are not
                        # the same length. Could be though.
                        addition = ""
                        if embed:
                            addition = "2"
                        outfile = "train_ids_{}{}.csv".format(
                            fold_index, addition)
                        outfile2 = "train_features_{}{}.csv".format(
                            fold_index, addition)
                        df1 = pd.DataFrame(ids)
                        df2 = pd.DataFrame(feats)
                        # Keep track of id, vector and some info about where
                        # this was gotten (model, fold, iteration?)
                        df1.to_csv(outfile, index=None)
                        df2.to_csv(outfile2, index=None)
                        pd.DataFrame([max_valid.cpu().numpy()
                                      ]).to_csv(acc_file, index=None)
                        #np.savetxt(outfile, out_ids)
                        #np.savetxt(outfile2, out_features)
                    # Output some statistics
                    print('\r', end='', flush=True)
                    log.write(
                        '%0.5f %5.2f v %5.2f |'
                        ' %0.3f %0.3f %0.3f %0.3f %0.4f %0.4f | %0.3f %0.3f %0.3f | %0.3f %0.3f %0.3f | %s\n' % ( \
                        lr, i / 1000, epoch,
                        valid_loss, top1, top5, top10, map10, best_t,
                        train_loss, top1_train, map10_train,
                        batch_loss, top1_batch, map10_batch,
                        time_to_str((timer() - start) / 60)))
                    # Don't know if this played a role but potentially slows
                    # it down..
                    #time.sleep(0.01)
                # End valid if statement
                #print("check1")
                # Save the state of the model and some attributes for
                # possible reuse
                if i % iter_save == 0 and not i == checkpoint_start:
                    torch.save(model.state_dict(),
                               resultDir + '/checkpoint/%08d_model.pth' % (i))
                    torch.save(
                        {
                            'optimizer': optimizer.state_dict(),
                            'iter': i,
                            'epoch': epoch,
                            'best_t': best_t,
                        }, resultDir + '/checkpoint/%08d_optimizer.pth' % (i))
                # This doesn't actually do any training but sets up the
                # modules in the model so that they are ready for training
                model.train()
                # Get training results
                model.mode = 'train'
                images, labels = data
                images = images.cuda()
                labels = labels.cuda().long()
                global_feat, local_feat, results = data_parallel(model, images)
                if embed:
                    embeddings = torch.Tensor(feats).float()
                    # Create a modified target ids with the mirrored training
                    # images included
                    dist_mat = euclidean_dist(global_feat, embeddings.cuda())
                    #dist_mat = euclidean_dist(results, embeddings.cuda())
                    # TODO: trying new!
                    #dist_mat = global_feat.mm(embeddings.cuda().t())
                    #dist_mat = results.mm(embeddings.cuda().t())
                    # Here we are trying to make sure our 1-sigmoid(x) scheme
                    # is appropriate
                    print("Min and Max distances seen: {} {}".format(
                        dist_mat.min(), dist_mat.max()))
                    #results = 1 - torch.sigmoid(dist_mat)
                    # Upper and lower bounds for distance results from above;
                    # distances are min-shifted then max-scaled into [0, 1]
                    # and inverted so that smaller distance => higher score.
                    min_dist = 0.8
                    #min_dist = -0.3
                    #min_dist = 0 # 1900
                    #min_dist = 2000
                    min_found = dist_mat.min()
                    if min_found < min_dist:
                        print("#" * 20)
                        print("Lower min found: {}".format(min_found))
                        min_dist = min_found
                    results3 = dist_mat - min_dist
                    # TODO: ATTEMPT THIS WITH TORCH.NORM OR SOMETHING AUTOMATIC
                    # AND COMPARE RESULTS AS WELL
                    # Don't necessarily need this second part if max_dist is
                    # close to 1
                    max_dist = 0.9  # 1.7 - 0.8
                    #max_dist = 0.9 # 0.6 + 0.3
                    #max_dist = 200 #200-0
                    # ??? 8400-1900=6500
                    #max_dist = 6200 # 8200-2000
                    max_found = results3.max()
                    if max_found > max_dist:
                        print("#" * 20)
                        print("Higher max found: {}".format(max_found))
                        max_dist = max_found
                    results2 = results3 / max_dist
                    results = 1 - results2
                    #"""
                    #results = dist_mat
                    # For training, we do need to add the second, mirrored
                    # label to the ids as those are considered different
                    # classes and not combined as they are in validation and
                    # testing
                    ids_ = []
                    for elem in ids:
                        ids_.extend([elem, elem + classes])
                    ind_train_toID = torch.Tensor(ids_).cuda().long()
                    #model.getLoss(global_feat, local_feat, results, labels, ind_train_toID)
                    model.getLoss(global_feat, local_feat, None, labels)
                else:
                    model.getLoss(global_feat, local_feat, results, labels)
                batch_loss = model.loss
                #print("check3")
                # Backpropagation and accuracy measurement
                optimizer.zero_grad()
                batch_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_norm=5.0,
                                               norm_type=2)
                optimizer.step()
                #print("check4")
                if embed:
                    results = torch.cat([
                        results,
                        torch.ones_like(results[:, :1]).float().cuda() * 0.5
                    ], 1)
                    # This makes sure that there are no images classified as
                    # 'new_whale' in the set of seen images. They should ALL
                    # be known (ie. have an ID)
                    assert ind_train_toID.ne(num_classes).all()
                    label_map = torch.cat([
                        ind_train_toID,
                        torch.Tensor([num_classes]).cuda().long()
                    ], 0)
                    # Mode here was to be potentially used if needed to remove
                    # embeddings of the same image for the training set.
                    # Initially thought it screwed the accuracy.. not used tho
                    new_whale, known, top1_batch, top5_batch, top10_batch = accuracy(
                        results,
                        labels,
                        topk=(1, 5, 10),
                        label_map=label_map,
                        sep=num_classes,
                        mode="train")
                    #map5_batch = mapk(labels, results, k=5, label_map=label_map)
                    map10_batch = mapk(labels, results, k=10,
                                       label_map=label_map)
                    #print("Just ran accuracy")
                    print("In train")
                    print("e: known: {}, new: {}, top5: {}, top10: {}".format(
                        known, new_whale, top5_batch, top10_batch))
                else:
                    results = torch.cat([
                        torch.sigmoid(results),
                        torch.ones_like(results[:, :1]).float().cuda() * 0.5
                    ], 1)
                    #top1_batch = accuracy(results, labels, topk=(1,))[0]
                    new_whale, known, top1_batch, top5_batch, top10_batch = accuracy(
                        results, labels, topk=(1, 5, 10), sep=num_classes)
                    map10_batch = mapk(labels, results, k=5)
                #print("check5")
                # Nothing after this should need to change
                batch_loss = batch_loss.data.cpu().numpy()
                sum_ += 1
                train_loss_sum += batch_loss
                train_top1_sum += top1_batch
                train_map10_sum += map10_batch
                # Aggregrate the training data over the last 50 iterations
                # and show the average loss, accuracy & map
                if (i + 1) % iter_smooth == 0:
                    print("\nSmoothing after {}".format(iter_smooth))
                    train_loss = train_loss_sum / sum_
                    top1_train = train_top1_sum / sum_
                    map10_train = train_map10_sum / sum_
                    train_loss_sum = 0
                    train_top1_sum = 0
                    train_map10_sum = 0
                    sum_ = 0
                    if 1:
                        #if embed:
                        print("e: known: {}, new: {}, top5: {}".format(
                            known, new_whale, top5_batch))
                    # This was indented as I do not care to see this output
                    # so often...
                    print('%0.5f %5.2f l %5.2f | %0.3f %0.3f %0.3f %0.4f %0.4f | %0.3f %0.3f %0.3f | %0.3f %0.3f %0.3f | %s %d %d\n' % ( \
                        lr, i / 1000, epoch,
                        valid_loss, top1, top5, map10, best_t,
                        train_loss, top1_train, map10_train,
                        batch_loss, top1_batch, map10_batch,
                        time_to_str((timer() - start) / 60), checkpoint_start, i)
                        , end='', flush=True)
                i += 1
            pass
def train(fold_index=3,
          num_classes=5,
          model_name='resnet101',
          checkPoint_start=0,
          lr=3e-4,
          batch_size=36):
    """Train the blindness-grading classifier.

    Open-ended batch loop: validates every `iter_valid` batches, checkpoints
    roughly every two passes over the dataset, and resumes model + optimizer
    state when `checkPoint_start` is non-zero.

    Fixes vs. the previous revision:
      * the forward pass ran twice (data_parallel result was immediately
        overwritten by a plain `model(images)` call) — now a single pass;
      * the resume log line referenced an undefined name `freeze`
        (NameError on resume) — it now logs only batch_size;
      * `num_classes` was accepted but ignored (5 was hard-coded in two
        calls) — the parameter is now honored; the default is unchanged.
    """
    # Build the model (honor the num_classes argument; default keeps the
    # original hard-coded 5).
    model = model_blindness(num_classes=num_classes,
                            inchannels=3,
                            model_name=model_name).cuda()
    # Training parameters:
    epoch = 0
    iter_smooth = 100  # batches between train-loss averaging
    iter_valid = 200   # batches between validation passes
    MAX_BATCH = 10000000
    # Choose the optimizer:
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 betas=(0.9, 0.99),
                                 weight_decay=0.0002)
    # optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=0.0002)
    # Results and log:
    resultDir = './result/{}_{}'.format(model_name, fold_index)
    os.makedirs(resultDir, exist_ok=True)
    ImageDir = resultDir + '/image'
    checkPoint = os.path.join(resultDir, 'checkpoint')
    os.makedirs(checkPoint, exist_ok=True)
    os.makedirs(ImageDir, exist_ok=True)
    log = Logger()
    log.open(os.path.join(resultDir, 'log_train.txt'), mode='a')
    log.write(' start_time :{} \n'.format(
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    log.write(' batch_size :{} \n'.format(batch_size))
    # Prepare the dataset (80/20 shuffled train/validation split):
    dataset = BlindnessDataset(mode='train', transform=None)
    train_loader, validation_loader = train_valid_dataset(
        dataset, batch_size, 0.2, True)
    num_image = len(dataset)
    # Checkpoint roughly every two full passes over the data.
    iter_save = 2 * num_image // batch_size
    # Initialize the losses:
    train_loss = 0.0
    valid_loss = 0.0
    batch_loss = 0.0
    train_loss_sum = 0
    iter_num = 0
    i = 1  # batch index
    # If we need to load a previous model:
    skips = []
    if not checkPoint_start == 0:
        log.write(' start from{}, l_rate ={} \n'.format(checkPoint_start, lr))
        # BUG FIX: previously formatted an undefined name `freeze` here.
        log.write('batch_size={}\n'.format(batch_size))
        model.load_pretrain(os.path.join(checkPoint,
                                         '%08d_model.pth' % (checkPoint_start)),
                            skip=skips)
        ckp = torch.load(
            os.path.join(checkPoint,
                         '%08d_optimizer.pth' % (checkPoint_start)))
        optimizer.load_state_dict(ckp['optimizer'])
        adjust_learning_rate(optimizer, lr)
        i = checkPoint_start
        epoch = ckp['epoch']
    log.write(
        ' rate iter epoch | valid train batch |Accuracy time \n'
    )
    log.write(
        '----------------------------------------------------------------------------\n'
    )
    start = timer()
    start_epoch = epoch
    cycle_epoch = 0
    # Freeze base-model layers:
    model.freeze()
    # Set to train mode:
    model.train()
    while i < MAX_BATCH:
        for batchdata in train_loader:
            epoch = start_epoch + (i -
                                   checkPoint_start) * batch_size / num_image
            # Check the validation set every iter_valid batches:
            if i % iter_valid == 0:
                valid_loss, valid_accuracy = prediction(
                    model,
                    validation_loader,
                    num_classes=num_classes,
                    batch_size=batch_size)
                print('\r', end='', flush=True)
                log.write(
                    '%0.5f %5.2f k %5.2f | %0.3f %0.3f %0.3f |%0.3f %s \n' % ( \
                    lr, i / 1000, epoch,
                    valid_loss, train_loss, batch_loss, valid_accuracy,
                    time_to_str((timer() - start) / 60)))
                print('epoch=', epoch, 'valid_loss=', valid_loss)
                time.sleep(0.01)
                # Set model back to train mode after validation:
                model.train()
            # Save the training state every iter_save batches:
            if i % iter_save == 0 and (not i == checkPoint_start):
                torch.save(model.state_dict(),
                           resultDir + '/checkpoint/%08d_model.pth' % (i))
                torch.save(
                    {
                        'optimizer': optimizer.state_dict(),
                        'iter': i,
                        'epoch': epoch,
                    }, resultDir + '/checkpoint/%08d_optimizer.pth' % (i))
            images, labels = batchdata
            # We use GPU in GCP:
            images = images.cuda()
            labels = labels.cuda()
            # BUG FIX: a single forward pass; the previous revision ran
            # data_parallel and then discarded its output with a second
            # plain model(images) call.
            out = data_parallel(model, images)
            batch_loss = getLoss(out, labels)
            # Backpropagation:
            optimizer.zero_grad()
            batch_loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0, norm_type=2)
            optimizer.step()
            # Move back to CPU and convert to numpy for logging:
            batch_loss = batch_loss.data.cpu().numpy()
            train_loss_sum += batch_loss
            iter_num += 1
            # Average the train loss over the last iter_smooth batches:
            if (i + 1) % iter_smooth == 0:
                train_loss = train_loss_sum / iter_num
                train_loss_sum = 0
                iter_num = 0
            print('\r%0.5f %5.2f k %5.2f | %0.3f %0.3f %0.3f | %s %d %d' % ( \
                lr, i / 1000, epoch,
                valid_loss, train_loss, batch_loss,
                time_to_str((timer() - start) / 60), checkPoint_start, i)
                , end='', flush=True)
            i += 1
        pass
def forward(self, inputs, truth_boxes, truth_labels, truth_masks, masks):
    """Run the full detection pipeline: backbone -> RPN -> (optional) RCNN -> (optional) mask head.

    All results are stored as attributes on ``self`` (``rpn_*``, ``detections``,
    ``mask_probs``, ...) rather than returned.

    Args:
        inputs: input batch tensor; spatial size is taken from ``inputs.shape[2:]``,
            so presumably laid out as (N, C, D, H, W) — TODO confirm against caller.
        truth_boxes: per-sample ground-truth boxes; used only when
            ``self.mode`` is 'train' or 'valid'. In test/eval mode pass None.
        truth_labels: per-sample ground-truth class labels; same mode rule.
        truth_masks: per-sample ground-truth masks for RCNN target making; same mode rule.
        masks: ground-truth masks consumed by ``make_mask_target``; same mode rule.
    """
    # Feature extraction backbone. NOTE(review): ``(inputs)`` is just a
    # parenthesized expression, not a 1-tuple — data_parallel receives the
    # tensor directly.
    features = data_parallel(self.feature_net, (inputs))

    # Last feature map of the backbone ("feature_map_8"); this is the map
    # the RPN and the RCNN crop operate on.
    fs = features[-1]

    # RPN branch: per-location classification logits and box-regression deltas.
    self.rpn_logits_flat, self.rpn_deltas_flat = data_parallel(
        self.rpn, fs)

    b, D, H, W, _, num_class = self.rpn_logits_flat.shape

    # Flatten spatial/anchor dims: (b, num_anchors_total, 1) for objectness
    # and (b, num_anchors_total, 6) for the 6 box-regression terms (z, y, x, d, h, w).
    # NOTE(review): logits collapse to a single channel even though
    # ``num_class`` was unpacked above — presumably class-agnostic objectness;
    # confirm against the RPN head.
    self.rpn_logits_flat = self.rpn_logits_flat.view(b, -1, 1)
    self.rpn_deltas_flat = self.rpn_deltas_flat.view(b, -1, 6)

    # Generate the fixed anchor-box grid ("windows") for this feature map size.
    self.rpn_window = make_rpn_windows(fs, self.cfg)
    self.rpn_proposals = []

    # NMS over RPN outputs is only needed when the proposals will actually be
    # consumed: either by the RCNN branch, or as final output in eval/test mode.
    if self.use_rcnn or self.mode in ['eval', 'test']:
        self.rpn_proposals = rpn_nms(self.cfg, self.mode, inputs,
                                     self.rpn_window, self.rpn_logits_flat,
                                     self.rpn_deltas_flat)

    # Training/validation only: build supervision targets.
    #  - anchor-level labels + regression targets for the RPN loss
    #  - proposal-level labels + regression targets for the RCNN loss
    if self.mode in ['train', 'valid']:
        self.rpn_labels, self.rpn_label_assigns, self.rpn_label_weights, self.rpn_targets, self.rpn_target_weights = \
            make_rpn_target(self.cfg, self.mode, inputs, self.rpn_window,
                            truth_boxes, truth_labels)

        if self.use_rcnn:
            # NOTE: make_rcnn_target also re-samples/filters the proposals,
            # so self.rpn_proposals is reassigned here.
            self.rpn_proposals, self.rcnn_labels, self.rcnn_assigns, self.rcnn_targets = \
                make_rcnn_target(self.cfg, self.mode, inputs,
                                 self.rpn_proposals, truth_boxes,
                                 truth_labels, truth_masks)

    # RCNN branch. Deep-copy so refinements below don't mutate the RPN output;
    # if RCNN is disabled, detections are the raw RPN proposals.
    self.detections = copy.deepcopy(self.rpn_proposals)
    self.mask_probs = []

    if self.use_rcnn:
        if len(self.rpn_proposals) > 0:
            # Crop per-proposal features, classify and refine each proposal.
            rcnn_crops = self.rcnn_crop(fs, inputs, self.rpn_proposals)
            self.rcnn_logits, self.rcnn_deltas = data_parallel(
                self.rcnn_head, rcnn_crops)
            self.detections, self.keeps = rcnn_nms(self.cfg, self.mode,
                                                   inputs,
                                                   self.rpn_proposals,
                                                   self.rcnn_logits,
                                                   self.rcnn_deltas)

        # Mask branch (only reachable when RCNN is enabled).
        if self.use_mask:
            # crop_boxes rows: [batch_index, z, y, x, d, h, w, class]
            self.crop_boxes = []
            if len(self.detections):
                # Column 1 of detections (dropped here) presumably holds the
                # detection score — TODO confirm against rcnn_nms output layout.
                self.crop_boxes = self.detections[:, [0, 2, 3, 4, 5, 6, 7, 8
                                                      ]].cpu().numpy().copy()
            # NOTE(review): if there are no detections, self.crop_boxes is
            # still the empty Python list and the slicing below would raise —
            # callers presumably guarantee at least one detection here; verify.

            # Convert from center/size form to corner form:
            # [batch_id, z0, y0, x0, z1, y1, x1, class]
            self.crop_boxes[:, 1:-1] = center_box_to_coord_box(
                self.crop_boxes[:, 1:-1])
            self.crop_boxes = self.crop_boxes.astype(np.int32)
            # Expand the corner coordinates outward to multiples of 8
            # (matches the backbone stride of the feature map used for cropping).
            self.crop_boxes[:, 1:-1] = ext2factor(
                self.crop_boxes[:, 1:-1], 8)
            # Clip so (0, 0, 0) <= (z0, y0, x0) and (z1, y1, x1) < (D, H, W)
            # of the input volume.
            self.crop_boxes[:, 1:-1] = clip_boxes(
                self.crop_boxes[:, 1:-1], inputs.shape[2:])

            # One box per organ-at-risk class: highest-scoring detection in
            # eval/test, a random one in training (augmentation).
            if self.mode in ['eval', 'test']:
                self.crop_boxes = top1pred(self.crop_boxes)
            else:
                self.crop_boxes = random1pred(self.crop_boxes)

            # Training/validation only: build mask supervision targets.
            if self.mode in ['train', 'valid']:
                self.mask_targets = make_mask_target(
                    self.cfg, self.mode, inputs, self.crop_boxes,
                    truth_boxes, truth_labels, masks)

            # Replicate each feature map along a new leading device axis so
            # data_parallel's scatter gives every GPU the FULL (un-split)
            # feature maps; expand() creates views, not copies.
            features = [
                t.unsqueeze(0).expand(torch.cuda.device_count(), -1, -1,
                                      -1, -1, -1) for t in features
            ]

            self.mask_probs = data_parallel(
                self.mask_head,
                (torch.from_numpy(self.crop_boxes).cuda(), features))

            # Trim each predicted mask back to its own crop region.
            self.mask_probs = crop_mask_regions(self.mask_probs,
                                                self.crop_boxes)