def main():
    start_time = datetime.datetime.now()
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.DEBUG,
    )
    logger = logging.getLogger(__name__)

    # load config file
    model_args, training_args = load_or_parse_args(
        (ModelArgs, TrainingArgs), verbose=True,
        json_path=os.path.join(CHECKPOINT_DIR, 'config.json'))

    # load label_encoder
    logger.info(f'Loading persisted LabelEncoder and num_classes from checkpoints {CHECKPOINT_DIR}')
    label_enc: LabelEncoder = joblib.load(filename=os.path.join(
        CHECKPOINT_DIR, training_args.label_encoder_filename))
    num_classes = len(label_enc.classes_)

    # create model and load weights from checkpoint
    model = LandmarkModel(
        model_name=model_args.model_name,
        n_classes=num_classes,
        loss_module=model_args.loss_module,
        pooling_name=model_args.pooling_name,
        args_pooling=model_args.args_pooling,
        use_fc=model_args.use_fc,
        fc_dim=model_args.fc_dim,
        dropout=model_args.dropout,
    )
    logger.info("Model params:")
    logger.info(pformat(model_args))

    logger.info('Loading model weights from checkpoint')
    model = load_model_state_from_checkpoint(
        os.path.join(CHECKPOINT_DIR, CHECKPOINT_NAME), model)

    # create test dataloader
    submission_df = pd.read_csv(training_args.data_path / 'sample_submission.csv')
    test_loader = get_test_data_loader(submission_df,
                                       image_dir=training_args.data_path,
                                       batch_size=BATCH_SIZE,
                                       num_workers=NUM_WORKERS)

    # load index
    index = faiss.read_index(os.path.join(CHECKPOINT_DIR, 'flat.index'))

    # extract feature vectors for test images
    if not LOAD_VECTORS_FROM_CHECKPOINT:
        img_mapping_test, img_vectors_test = extract_features(
            model, test_loader, mode='test', device=DEVICE,
            normalize=NORMALIZE_VECTORS, dir_to_save=CHECKPOINT_DIR)
    else:
        logger.info('Loading vectors from checkpoint')
        img_mapping_test = joblib.load(
            os.path.join(CHECKPOINT_DIR, 'meta_vectors_test.pkl'))
        img_vectors_test = joblib.load(
            os.path.join(CHECKPOINT_DIR, 'vectors_test.pkl'))

    logger.info('Loading train vectors mapping')
    train_vec_mapping = joblib.load(
        os.path.join(CHECKPOINT_DIR, 'meta_vectors_train.pkl'))
    # train_vec_image_ids = train_vec_mapping['image_ids']
    train_vec_targets = train_vec_mapping[TARGET_NAME]

    # predict kNN for each test image (topk = 3)
    logger.info('Searching for nearest neighbours')
    # this predicts the nearest train vector id, which needs to be converted into the label
    pred_dist, pred_vec_id = index.search(img_vectors_test, TOPK)

    logger.info('Extracting label encoded target class_id')
    preds_vec = np.vectorize(lambda x: train_vec_targets[x])(pred_vec_id)

    logger.info('Picking up most common labels for each vector')
    pred_mode, pred_cnt = mode(preds_vec, axis=1)
    # threshold_dist = 0.007
    # pred_mode_final = np.where(pred_dist >= threshold_dist, preds_vec, np.nan)

    # inverse_transform predicted labels by label encoder
    logger.info('Label encoder inverse transform labels')
    pred_labels = label_enc.inverse_transform(pred_mode[:, 0])

    # rerank (optional)

    # save submit file
    logger.info('Saving the predictions to submission.csv')
    submission_df['labels'] = pred_labels
    submission_df['cnt'] = pred_cnt / pred_cnt.max()
    submission_df['landmarks'] = submission_df.apply(
        lambda x: str(x['labels']) + ' ' + str(x['cnt']), axis=1)
    del submission_df['labels']
    del submission_df['cnt']
    submission_df.to_csv(SUBMISSION_PATH, index=False)

    end_time = datetime.datetime.now()
    logger.info('Duration: {}'.format(end_time - start_time))
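# --- Sketch (not part of the original script): how 'flat.index' might have been built. ---
# The script above only reads a prebuilt FAISS index. A minimal way to build it from the
# persisted train vectors could look like the function below. It assumes a 'vectors_train.pkl'
# file next to 'meta_vectors_train.pkl' and L2-normalized vectors (so inner product equals
# cosine similarity); both the file name and that layout are assumptions, not confirmed by
# the code above. It reuses the script's module-level imports and CHECKPOINT_DIR constant.
def build_flat_index_sketch():
    vectors = joblib.load(os.path.join(CHECKPOINT_DIR, 'vectors_train.pkl'))
    vectors = np.ascontiguousarray(vectors, dtype=np.float32)
    faiss.normalize_L2(vectors)                   # no-op if vectors are already normalized
    index = faiss.IndexFlatIP(vectors.shape[1])   # exact (flat) inner-product search
    index.add(vectors)
    faiss.write_index(index, os.path.join(CHECKPOINT_DIR, 'flat.index'))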
LOAD_VECTORS_FROM_CHECKPOINT = False

if __name__ == "__main__":
    SEED = 17
    fix_seed(SEED)
    start_time = datetime.datetime.now()
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    if not LOAD_VECTORS_FROM_CHECKPOINT:
        model_args, training_args = load_or_parse_args(
            (ModelArgs, TrainingArgs), verbose=True,
            json_path=os.path.join(CHECKPOINT_DIR, 'config.json'))
        train_orig_df, _ = load_train_dataframe(
            training_args.data_train,
            min_class_samples=training_args.min_class_samples)

        logger.info(f'Loading persisted LabelEncoder and num_classes from checkpoints {CHECKPOINT_DIR}')
        label_enc = joblib.load(filename=training_args.checkpoints_path / training_args.label_encoder_filename)
        num_classes = len(label_enc.classes_)
        assert train_orig_df.landmark_id.nunique() == num_classes, \
            "Num classes should be the same in DF and loaded obj"

        model = LandmarkModel(model_name=model_args.model_name,
                              n_classes=num_classes,
                              loss_module=model_args.loss_module,
                              pooling_name=model_args.pooling_name,
                              args_pooling=model_args.args_pooling,
                              use_fc=model_args.use_fc,
                              fc_dim=model_args.fc_dim,
                              dropout=model_args.dropout)
def main():
    logger = logging.getLogger(__name__)
    start_time = datetime.datetime.now()

    model_args, training_args = load_or_parse_args((ModelArgs, TrainingArgs),
                                                   verbose=True,
                                                   json_path=CONFIG_PATH)
    train_orig_df, label_enc = load_train_dataframe(
        training_args.data_train,
        min_class_samples=training_args.min_class_samples)

    # assert training_args.test_size % training_args.batch_size == 0, "Test size should be multiple of batch size"
    # TODO: split DFs once and keep those on the disk. Reload label_enc from disk on resume.
    train_df, valid_df = split_dataframe_train_test(
        train_orig_df,
        test_size=training_args.test_size,
        stratify=train_orig_df.landmark_id,
        random_state=SEED)

    num_classes = (train_df.landmark_id.nunique()
                   if training_args.min_class_samples is None
                   else len(label_enc.classes_))
    logger.info(f'Num classes train: {num_classes}')
    logger.info(f'Num classes valid: {valid_df.landmark_id.nunique()}')

    logger.info('Initializing the model')
    model = LandmarkModel(model_name=model_args.model_name,
                          n_classes=num_classes,
                          loss_module=model_args.loss_module,
                          pooling_name=model_args.pooling_name,
                          args_pooling=model_args.args_pooling,
                          normalize=model_args.normalize,
                          use_fc=model_args.use_fc,
                          fc_dim=model_args.fc_dim,
                          dropout=model_args.dropout)
    logger.info("Model params:")
    logger.info(pformat(model_args))

    # save checkpoints
    training_args.checkpoints_dir.mkdir(exist_ok=True, parents=True)
    joblib.dump(label_enc, filename=training_args.checkpoints_dir / training_args.label_encoder_filename)
    logger.info(f'Persisted LabelEncoder to {training_args.label_encoder_filename}')
    save_config_checkpoint(training_args.checkpoints_dir, json_path=CONFIG_PATH)

    # Stage 1 - train the full model on low resolution
    stage1_start_time = datetime.datetime.now()
    lit_module = LandmarksPLBaseModule(hparams={**model_args.__dict__, **training_args.__dict__},
                                       model=model,
                                       optimizer=training_args.optimizer,
                                       loss=model_args.loss_module)
    # init data
    dm = LandmarksDataModule(train_df,
                             valid_df,
                             hparams=training_args,
                             image_dir=training_args.data_path,
                             batch_size=training_args.batch_size,
                             num_workers=training_args.num_workers,
                             use_weighted_sampler=training_args.use_weighted_sampler)

    # train
    dt_str = datetime.datetime.now().strftime("%y%m%d_%H-%M")
    wandb_logger = WandbLogger(name=f'{model_args.model_name.capitalize()}_GeM_ArcFace_{dt_str}',
                               save_dir='logs/',
                               project='landmarks',
                               tags=['TPU'])
    checkpoint_callback = ModelCheckpoint(monitor='val_acc',
                                          mode='max',
                                          save_top_k=2,
                                          save_last=True,
                                          verbose=True)
    # hack around to change only the filename, not provide the full path (which is generated by W&B)
    checkpoint_callback.filename = '{epoch}-{val_acc:.3f}'
    early_stopping_callback = EarlyStopping('val_acc', verbose=True, mode='max')

    trainer = pl.Trainer(gpus=training_args.gpus,
                         tpu_cores=training_args.tpu_cores,
                         logger=wandb_logger,
                         max_epochs=training_args.n_epochs,
                         val_check_interval=training_args.val_check_interval,
                         checkpoint_callback=checkpoint_callback,
                         progress_bar_refresh_rate=100,
                         resume_from_checkpoint=training_args.resume_checkpoint,
                         gradient_clip_val=training_args.gradient_clip_val,
                         accumulate_grad_batches=training_args.accumulate_grad_batches,
                         early_stop_callback=early_stopping_callback,
                         fast_dev_run=DEBUG_ENABLED,
                         limit_train_batches=3,
                         limit_val_batches=2)
    trainer.fit(lit_module, datamodule=dm)

    try:
        training_args.checkpoints_dir = get_wandb_logger_checkpoints_path(wandb_logger)
        logger.info(f'Saving checkpoints to the current directory: {training_args.checkpoints_dir}')
    except (NotADirectoryError, FileNotFoundError) as e:
        logger.warning(f'Unable to get current checkpoints directory, using default one: '
                       f'{training_args.checkpoints_dir}')
        logger.debug('Traceback:', exc_info=e)
    except Exception as e:
        logger.warning('Unknown error', exc_info=e)

    # save checkpoints (saved twice - in the default directory above and in the wandb current run folder)
    training_args.checkpoints_dir.mkdir(exist_ok=True, parents=True)
    joblib.dump(label_enc, filename=training_args.checkpoints_dir / training_args.label_encoder_filename)
    logger.info(f'Persisted LabelEncoder to {training_args.label_encoder_filename}')
    save_config_checkpoint(training_args.checkpoints_dir, json_path=CONFIG_PATH)

    stage1_end_time = datetime.datetime.now()
    logger.info('Stage 1 duration: {}'.format(stage1_end_time - stage1_start_time))

    # Stage 2: fine-tuning with a frozen backbone on higher resolution.
    # Changes: lr=0.01, image_size=512 / crop_size=448 (-> DataLoader), margin=0.3, freeze_backbone
    model_args.margin = 0.3
    model_args.freeze_backbone = True
    training_args.data_path = "data/orig"
    training_args.lr = 0.01
    training_args.image_size = 512
    training_args.crop_size = 448

    lit_module = LandmarksPLBaseModule(hparams={**model_args.__dict__, **training_args.__dict__},
                                       model=model,
                                       optimizer=training_args.optimizer,
                                       loss=model_args.loss_module)
    dm = LandmarksDataModule(train_df,
                             valid_df,
                             hparams=training_args,
                             image_dir=training_args.data_path,
                             batch_size=training_args.batch_size,
                             num_workers=training_args.num_workers,
                             use_weighted_sampler=training_args.use_weighted_sampler)
    trainer.fit(lit_module, datamodule=dm)

    # Wrap-up
    end_time = datetime.datetime.now()
    logger.info('Training duration: {}'.format(end_time - start_time))
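# --- Sketch (not part of the original script): what `freeze_backbone` could amount to. ---
# Stage 2 above sets model_args.freeze_backbone = True, but the mechanism lives inside
# LandmarkModel / the Lightning module and is not shown here. A common implementation,
# assuming the model keeps its feature extractor in a `backbone` attribute (an assumption),
# is simply to switch off gradients for those parameters:
def freeze_backbone_sketch(model):
    for param in model.backbone.parameters():
        param.requires_grad = False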
def main(): """ Use this class if anything in trainer checkpoint changed and only model weights are required to be preloaded. """ logger = logging.getLogger(__name__) start_time = datetime.datetime.now() model_args, training_args = load_or_parse_args((ModelArgs, TrainingArgs), verbose=True) train_orig_df, label_enc = load_train_dataframe(training_args.data_train, min_class_samples=training_args.min_class_samples) # assert training_args.test_size % training_args.batch_size == 0, "Test size should be multiple of batch size" # TODO: split DFs once and keep those on the disk. Reload label_enc from disk on resume. train_df, valid_df = split_dataframe_train_test(train_orig_df, test_size=training_args.test_size, stratify=train_orig_df.landmark_id, random_state=SEED) num_classes = train_df.landmark_id.nunique() if training_args.min_class_samples is None else len(label_enc.classes_) logger.info(f'Num classes train: {num_classes}') logger.info(f'Num classes valid: {valid_df.landmark_id.nunique()}') # save checkpoints training_args.checkpoints_dir.mkdir(exist_ok=True, parents=True) joblib.dump(label_enc, filename=training_args.checkpoints_dir / training_args.label_encoder_filename) logger.info(f'Persisted LabelEncoder to {training_args.label_encoder_filename}') save_config_checkpoint(training_args.checkpoints_dir) logger.info('Initializing the model') model = LandmarkModel(model_name=model_args.model_name, n_classes=num_classes, loss_module=model_args.loss_module, pooling_name=model_args.pooling_name, args_pooling=model_args.args_pooling, normalize=model_args.normalize, use_fc=model_args.use_fc, fc_dim=model_args.fc_dim, dropout=model_args.dropout ) logger.info("Model params:") logger.info(pformat(model_args)) model = load_model_state_from_checkpoint(net=model, checkpoint_path=training_args.resume_checkpoint) lit_module = LandmarksPLBaseModule(hparams=training_args.__dict__, model=model, optimizer=training_args.optimizer, loss=model_args.loss_module) # init data dm = LandmarksDataModule(train_df, valid_df, hparams=training_args, image_dir=training_args.data_path, batch_size=training_args.batch_size, num_workers=training_args.num_workers, use_weighted_sampler=training_args.use_weighted_sampler ) # train dt_str = datetime.datetime.now().strftime("%y%m%d_%H-%M") wandb_logger = WandbLogger(name=f'{model_args.model_name.capitalize()}_GeM_ArcFace_{dt_str}', save_dir='logs/', project='landmarks') checkpoint_callback = ModelCheckpoint(monitor='val_acc', mode='max', save_top_k=2, save_last=True, verbose=True) # hack around to change only filename, not provide the full path (which is generated by W&B) checkpoint_callback.filename = '{epoch}-{val_acc:.3f}' early_stopping_callback = EarlyStopping('val_acc', verbose=True, mode='max') trainer = pl.Trainer(gpus=training_args.gpus, logger=wandb_logger, max_epochs=training_args.n_epochs, val_check_interval=training_args.val_check_interval, checkpoint_callback=checkpoint_callback, progress_bar_refresh_rate=100, gradient_clip_val=training_args.gradient_clip_val, accumulate_grad_batches=training_args.accumulate_grad_batches, early_stop_callback=early_stopping_callback # fast_dev_run=True, # limit_train_batches=5, # limit_val_batches=5 ) trainer.fit(lit_module, datamodule=dm) try: training_args.checkpoints_dir = get_wandb_logger_checkpoints_path(wandb_logger) logger.info(f'Saving checkpoints to the current directory: {training_args.checkpoints_dir}') except: logger.warning(f'Unable to get current checkpoints directory, using default one: ' 
f'{training_args.checkpoints_dir}') # save checkpoints (saved twice - in default directory above and in wandb current run folder) training_args.checkpoints_dir.mkdir(exist_ok=True, parents=True) joblib.dump(label_enc, filename=training_args.checkpoints_dir / training_args.label_encoder_filename) logger.info(f'Persisted LabelEncoder to {training_args.label_encoder_filename}') save_config_checkpoint(training_args.checkpoints_dir) end_time = datetime.datetime.now() logger.info('Duration: {}'.format(end_time - start_time))
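# --- Sketch (not part of the original script): a plausible load_model_state_from_checkpoint. ---
# The helper is imported from elsewhere in the repo; given how it is called in these scripts
# (checkpoint path plus model, or the net=/checkpoint_path= keywords), it likely restores plain
# model weights from a PyTorch Lightning checkpoint. The 'model.' prefix handling below is an
# assumption based on how Lightning prefixes submodule weights, not the repo's actual code.
def load_model_state_from_checkpoint_sketch(checkpoint_path, net):
    import torch

    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    state_dict = checkpoint.get('state_dict', checkpoint)
    state_dict = {k.replace('model.', '', 1): v for k, v in state_dict.items()}
    net.load_state_dict(state_dict, strict=False)
    return net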
def main():
    start_time = datetime.datetime.now()
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.DEBUG,
    )
    logger = logging.getLogger(__name__)

    # load config file
    model_args, training_args = load_or_parse_args(
        (ModelArgs, TrainingArgs), verbose=True,
        json_path=os.path.join(CHECKPOINT_DIR, CONFIG_FILE))

    # load label_encoder
    logger.info(f'Loading persisted LabelEncoder and num_classes from checkpoints {CHECKPOINT_DIR}')
    label_enc: LabelEncoder = joblib.load(filename=os.path.join(
        CHECKPOINT_DIR, training_args.label_encoder_filename))
    num_classes = len(label_enc.classes_)

    # create model and load weights from checkpoint
    model = LandmarkModel(
        model_name=model_args.model_name,
        pretrained=False,
        n_classes=num_classes,
        loss_module=model_args.loss_module,
        pooling_name=model_args.pooling_name,
        args_pooling=model_args.args_pooling,
        use_fc=model_args.use_fc,
        fc_dim=model_args.fc_dim,
        dropout=model_args.dropout,
    )
    logger.info("Model params:")
    logger.info(pformat(model_args))

    logger.info('Loading model weights from checkpoint')
    model = load_model_state_from_checkpoint(
        os.path.join(CHECKPOINT_DIR, CHECKPOINT_NAME), model)

    # create test dataloader
    submission_df = pd.read_csv(training_args.data_path / 'sample_submission.csv')
    test_loader = get_test_data_loader(submission_df,
                                       image_dir=training_args.data_path,
                                       image_size=training_args.image_size,
                                       crop_size=training_args.crop_size,
                                       batch_size=BATCH_SIZE,
                                       num_workers=NUM_WORKERS)

    # predict on test dataset
    model.eval()
    model.to(DEVICE)
    activation = nn.Softmax(dim=1)

    confs_list = []
    preds_list = []
    with torch.no_grad():
        for batch in test_loader:
            y_hat = model(batch[FEATURE_NAME].to(DEVICE))
            # y_hat = activation(y_hat)
            confs_batch, preds_batch = torch.topk(y_hat, TOPK)
            # confs_batch = activation(confs_batch)
            confs_list.append(confs_batch)
            preds_list.append(preds_batch)

    confs = torch.cat(confs_list).cpu().numpy()
    preds = torch.cat(preds_list).cpu().numpy()

    if TOPK == 1:
        pred_labels = label_enc.inverse_transform(preds.ravel())
        confidence_score = confs.ravel()
    else:
        # solely for debug & analysis (if TOPK > 1)
        pred_labels = [label_enc.inverse_transform(pred) for pred in preds]
        pred_labels = [label[0] for label in pred_labels]
        confidence_score = [score[0] for score in confs]
        # TODO: save debug file

    # save submit file
    logger.info('Saving the predictions to submission.csv')
    submission_df['labels'] = pred_labels
    submission_df['scores'] = confidence_score
    submission_df['landmarks'] = submission_df.apply(postprocessing_omit_low_scores, axis=1)
    del submission_df['labels']
    del submission_df['scores']
    submission_df.to_csv(SUBMISSION_PATH, index=False)

    end_time = datetime.datetime.now()
    logger.info('Duration: {}'.format(end_time - start_time))
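# --- Sketch (not part of the original script): a plausible postprocessing_omit_low_scores. ---
# The row-wise helper used above is defined elsewhere. Matching how it is applied (building
# the 'landmarks' column from the temporary 'labels' and 'scores' columns), a plausible version
# is below; the threshold name and value are purely hypothetical.
def postprocessing_omit_low_scores_sketch(row, score_threshold=0.5):
    # drop low-confidence predictions, otherwise emit the Kaggle "label score" format
    if row['scores'] < score_threshold:
        return ''
    return f"{row['labels']} {row['scores']}"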
def get_similarities(config_path, image_size=None, crop_size=None):
    # load config file
    m_args, tr_args = load_or_parse_args((ModelArgs, TrainingArgs),
                                         verbose=True,
                                         json_path=config_path)
    image_size = image_size if image_size is not None else tr_args.image_size
    crop_size = crop_size if crop_size is not None else tr_args.crop_size

    logger.debug(f'Loading persisted LabelEncoder and num_classes from checkpoints {CHECKPOINT_DIR}')
    label_enc: LabelEncoder = joblib.load(
        filename=os.path.join(CHECKPOINT_DIR, tr_args.label_encoder_filename))
    num_classes = len(label_enc.classes_)

    train_df, _ = load_train_dataframe(tr_args.data_train, min_class_samples=None)
    sub_df = pd.read_csv(tr_args.data_path / 'sample_submission.csv')

    # create model and load weights from checkpoint
    model = LandmarkModel(
        model_name=m_args.model_name,
        pretrained=False,
        n_classes=num_classes,
        loss_module=m_args.loss_module,
        pooling_name=m_args.pooling_name,
        args_pooling=m_args.args_pooling,
        use_fc=m_args.use_fc,
        fc_dim=m_args.fc_dim,
        dropout=m_args.dropout,
    )
    logger.info("Model params:")
    logger.info(pformat(m_args))

    logger.info('Loading model weights from checkpoint')
    model = load_model_state_from_checkpoint(
        os.path.join(CHECKPOINT_DIR, CHECKPOINT_NAME), model)

    # create train dataloader
    train_dataset = LandmarksImageDataset(train_df,
                                          image_dir=tr_args.data_path,
                                          mode="train",
                                          get_img_id=True,
                                          # transform=transforms.ToTensor(),  # in case no rescaling is required
                                          image_size=image_size,
                                          crop_size=crop_size)
    train_loader = DataLoader(train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=False,  # due to using sampler
                              sampler=None,
                              num_workers=NUM_WORKERS,
                              collate_fn=CollateBatchFn(),
                              drop_last=False)

    # create test dataloader
    test_loader = get_test_data_loader(sub_df,
                                       image_dir=tr_args.data_path,
                                       image_size=image_size,
                                       crop_size=crop_size,
                                       batch_size=BATCH_SIZE,
                                       num_workers=NUM_WORKERS)

    # generate embeddings
    train_ids, train_embeddings = generate_embeddings(model, train_loader)
    test_ids, test_embeddings = generate_embeddings(model, test_loader)

    # map train image id -> landmark label (assumes the standard 'id'/'landmark_id' columns)
    train_id_to_label = dict(zip(train_df['id'], train_df['landmark_id']))

    train_ids_labels_and_scores = [None] * test_embeddings.shape[0]
    # Using a (slow) for-loop, as the full distance matrix doesn't fit in memory
    for test_idx in range(test_embeddings.shape[0]):
        distances = spatial.distance.cdist(test_embeddings[np.newaxis, test_idx, :],
                                           train_embeddings, 'cosine')[0]
        # Get the indices of the closest images
        top_k = np.argpartition(distances, NUM_TO_RERANK)[:NUM_TO_RERANK]
        # Get the nearest ids and distances using the previous indices
        nearest = sorted([(train_ids[p], distances[p]) for p in top_k],
                         key=lambda x: x[1])
        # Get the labels and score results
        train_ids_labels_and_scores[test_idx] = [
            (train_id_to_label[train_id], 1.0 - cosine_distance)
            for train_id, cosine_distance in nearest
        ]

    del test_embeddings
    del train_embeddings
    gc.collect()

    return test_ids, train_ids_labels_and_scores
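# --- Sketch (not part of the original script): one way get_similarities() might be consumed. ---
# Aggregate the per-neighbour (label, score) pairs into a single prediction per test image by
# summing scores per label and keeping the best one. The output columns follow the
# sample_submission format used above; the aggregation scheme itself is an assumption.
def aggregate_similarities_sketch(config_path):
    from collections import defaultdict

    test_ids, ids_labels_scores = get_similarities(config_path)
    rows = []
    for test_id, labels_scores in zip(test_ids, ids_labels_scores):
        totals = defaultdict(float)
        for label, score in labels_scores:
            totals[label] += score
        best_label, best_score = max(totals.items(), key=lambda kv: kv[1])
        rows.append({'id': test_id, 'landmarks': f'{best_label} {best_score:.4f}'})
    return pd.DataFrame(rows)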