def main():
    start_time = datetime.datetime.now()
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.DEBUG,
    )
    logger = logging.getLogger(__name__)

    # load config file
    model_args, training_args = load_or_parse_args(
        (ModelArgs, TrainingArgs),
        verbose=True,
        json_path=os.path.join(CHECKPOINT_DIR, 'config.json'))

    # load label_encoder
    logger.info(
        f'Loading persisted LabelEncoder and num_classes from checkpoints {CHECKPOINT_DIR}'
    )
    label_enc: LabelEncoder = joblib.load(filename=os.path.join(
        CHECKPOINT_DIR, training_args.label_encoder_filename))
    num_classes = len(label_enc.classes_)

    # create model and load weights from checkpoint
    model = LandmarkModel(
        model_name=model_args.model_name,
        n_classes=num_classes,
        loss_module=model_args.loss_module,
        pooling_name=model_args.pooling_name,
        args_pooling=model_args.args_pooling,
        use_fc=model_args.use_fc,
        fc_dim=model_args.fc_dim,
        dropout=model_args.dropout,
    )
    logger.info("Model params:")
    logger.info(pformat(model_args))
    logger.info('Loading model weights from checkpoint')
    model = load_model_state_from_checkpoint(
        os.path.join(CHECKPOINT_DIR, CHECKPOINT_NAME), model)

    # create test dataloader
    submission_df = pd.read_csv(training_args.data_path /
                                'sample_submission.csv')
    test_loader = get_test_data_loader(submission_df,
                                       image_dir=training_args.data_path,
                                       batch_size=BATCH_SIZE,
                                       num_workers=NUM_WORKERS)
    # load index
    index = faiss.read_index(os.path.join(CHECKPOINT_DIR, 'flat.index'))

    # extract feature vectors for test images
    if not LOAD_VECTORS_FROM_CHECKPOINT:
        img_mapping_test, img_vectors_test = extract_features(
            model,
            test_loader,
            mode='test',
            device=DEVICE,
            normalize=NORMALIZE_VECTORS,
            dir_to_save=CHECKPOINT_DIR)
    else:
        logger.info('Loading vectors from checkpoint')
        img_mapping_test = joblib.load(
            os.path.join(CHECKPOINT_DIR, 'meta_vectors_test.pkl'))
        img_vectors_test = joblib.load(
            os.path.join(CHECKPOINT_DIR, 'vectors_test.pkl'))

    logger.info('Loading train vectors mapping')
    train_vec_mapping = joblib.load(
        os.path.join(CHECKPOINT_DIR, 'meta_vectors_train.pkl'))

    # train_vec_image_ids = train_vec_mapping['image_ids']
    train_vec_targets = train_vec_mapping[TARGET_NAME]

    # predict kNN for each test image (topk = 3)
    logger.info('Searching for nearest neighbours')
    # index.search returns nearest train vector indices, which still need to be mapped to labels
    pred_dist, pred_vec_id = index.search(img_vectors_test, TOPK)

    logger.info('Extracting label encoded target class_id')
    preds_vec = np.vectorize(lambda x: train_vec_targets[x])(pred_vec_id)

    logger.info('Picking up most common labels for each vector')
    pred_mode, pred_cnt = mode(preds_vec, axis=1)
    # flatten, since scipy.stats.mode may return (n, 1)-shaped arrays depending on the version
    pred_mode, pred_cnt = np.ravel(pred_mode), np.ravel(pred_cnt)
    # threshold_dist = 0.007
    # pred_mode_final = np.where(pred_dist >= threshold_dist, preds_vec, np.nan)

    # inverse_transform predicted labels by label encoder
    logger.info('Label encoder inverse transform labels')
    pred_labels = label_enc.inverse_transform(pred_mode)

    # rerank (optional)

    # save submit file
    logger.info('Saving the predictions to submission.csv')
    submission_df['labels'] = pred_labels
    submission_df['cnt'] = pred_cnt / pred_cnt.max()
    submission_df['landmarks'] = submission_df.apply(
        lambda x: str(x['labels']) + ' ' + str(x['cnt']), axis=1)
    del submission_df['labels']
    del submission_df['cnt']
    submission_df.to_csv(SUBMISSION_PATH, index=False)

    end_time = datetime.datetime.now()
    logger.info('Duration: {}'.format(end_time - start_time))
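
# Hedged sketch (not part of this script): how the 'flat.index' file read by
# faiss.read_index above could have been built from the persisted train vectors.
# The exact index type is an assumption -- an inner-product index over normalized
# vectors would work equally well -- and the 'vectors_train.pkl' filename simply
# mirrors the test-side naming used above.
import os

import faiss
import joblib
import numpy as np


def build_flat_index_sketch(checkpoint_dir):
    """Build a brute-force FAISS index over the persisted train embeddings."""
    vectors_train = joblib.load(os.path.join(checkpoint_dir, 'vectors_train.pkl'))
    vectors_train = np.ascontiguousarray(vectors_train, dtype=np.float32)
    index = faiss.IndexFlatL2(vectors_train.shape[1])  # exact L2 search, no training step needed
    index.add(vectors_train)
    faiss.write_index(index, os.path.join(checkpoint_dir, 'flat.index'))
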
Example #2
LOAD_VECTORS_FROM_CHECKPOINT = False

if __name__ == "__main__":
    SEED = 17
    fix_seed(SEED)
    start_time = datetime.datetime.now()
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    if not LOAD_VECTORS_FROM_CHECKPOINT:
        model_args, training_args = load_or_parse_args((ModelArgs, TrainingArgs), verbose=True,
                                                       json_path=os.path.join(CHECKPOINT_DIR, 'config.json'))

        train_orig_df, _ = load_train_dataframe(training_args.data_train,
                                                min_class_samples=training_args.min_class_samples)

        logger.info(f'Loading persisted LabelEncoder and num_classes from checkpoints {CHECKPOINT_DIR}')
        label_enc = joblib.load(filename=training_args.checkpoints_path / training_args.label_encoder_filename)
        num_classes = len(label_enc.classes_)
        assert train_orig_df.landmark_id.nunique() == num_classes, "Num classes should be the same in DF and loaded obj"

        model = LandmarkModel(model_name=model_args.model_name,
                              n_classes=num_classes,
                              loss_module=model_args.loss_module,
                              pooling_name=model_args.pooling_name,
                              args_pooling=model_args.args_pooling,
                              use_fc=model_args.use_fc,
                              fc_dim=model_args.fc_dim,
                              dropout=model_args.dropout)
Example #3
def main():
    logger = logging.getLogger(__name__)
    start_time = datetime.datetime.now()
    model_args, training_args = load_or_parse_args((ModelArgs, TrainingArgs),
                                                   verbose=True,
                                                   json_path=CONFIG_PATH)
    train_orig_df, label_enc = load_train_dataframe(
        training_args.data_train,
        min_class_samples=training_args.min_class_samples)

    # assert training_args.test_size % training_args.batch_size == 0, "Test size should be multiple of batch size"

    # TODO: split DFs once and keep those on the disk. Reload label_enc from disk on resume.
    train_df, valid_df = split_dataframe_train_test(
        train_orig_df,
        test_size=training_args.test_size,
        stratify=train_orig_df.landmark_id,
        random_state=SEED)
    num_classes = (train_df.landmark_id.nunique()
                   if training_args.min_class_samples is None
                   else len(label_enc.classes_))
    logger.info(f'Num classes train: {num_classes}')
    logger.info(f'Num classes valid: {valid_df.landmark_id.nunique()}')

    logger.info('Initializing the model')
    model = LandmarkModel(model_name=model_args.model_name,
                          n_classes=num_classes,
                          loss_module=model_args.loss_module,
                          pooling_name=model_args.pooling_name,
                          args_pooling=model_args.args_pooling,
                          normalize=model_args.normalize,
                          use_fc=model_args.use_fc,
                          fc_dim=model_args.fc_dim,
                          dropout=model_args.dropout)
    logger.info("Model params:")
    logger.info(pformat(model_args))

    # save checkpoints
    training_args.checkpoints_dir.mkdir(exist_ok=True, parents=True)
    joblib.dump(label_enc,
                filename=training_args.checkpoints_dir /
                training_args.label_encoder_filename)
    logger.info(
        f'Persisted LabelEncoder to {training_args.label_encoder_filename}')
    save_config_checkpoint(training_args.checkpoints_dir,
                           json_path=CONFIG_PATH)

    # Stage 1 - train full model with low resolution
    stage1_start_time = datetime.datetime.now()

    lit_module = LandmarksPLBaseModule(hparams={
        **model_args.__dict__,
        **training_args.__dict__
    },
                                       model=model,
                                       optimizer=training_args.optimizer,
                                       loss=model_args.loss_module)
    # init data
    dm = LandmarksDataModule(
        train_df,
        valid_df,
        hparams=training_args,
        image_dir=training_args.data_path,
        batch_size=training_args.batch_size,
        num_workers=training_args.num_workers,
        use_weighted_sampler=training_args.use_weighted_sampler)
    # train
    dt_str = datetime.datetime.now().strftime("%y%m%d_%H-%M")
    wandb_logger = WandbLogger(
        name=f'{model_args.model_name.capitalize()}_GeM_ArcFace_{dt_str}',
        save_dir='logs/',
        project='landmarks',
        tags=['TPU'],
    )
    checkpoint_callback = ModelCheckpoint(monitor='val_acc',
                                          mode='max',
                                          save_top_k=2,
                                          save_last=True,
                                          verbose=True)
    # workaround: override only the checkpoint filename; the full path itself is generated by W&B
    checkpoint_callback.filename = '{epoch}-{val_acc:.3f}'
    early_stopping_callback = EarlyStopping('val_acc',
                                            verbose=True,
                                            mode='max')
    trainer = pl.Trainer(
        gpus=training_args.gpus,
        tpu_cores=training_args.tpu_cores,
        logger=wandb_logger,
        max_epochs=training_args.n_epochs,
        val_check_interval=training_args.val_check_interval,
        checkpoint_callback=checkpoint_callback,
        progress_bar_refresh_rate=100,
        resume_from_checkpoint=training_args.resume_checkpoint,
        gradient_clip_val=training_args.gradient_clip_val,
        accumulate_grad_batches=training_args.accumulate_grad_batches,
        early_stop_callback=early_stopping_callback,
        fast_dev_run=DEBUG_ENABLED,
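        # NOTE: limit_train_batches/limit_val_batches below cap each epoch at 3 training and 2 validation batches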
        limit_train_batches=3,
        limit_val_batches=2)
    trainer.fit(lit_module, datamodule=dm)
    try:
        training_args.checkpoints_dir = get_wandb_logger_checkpoints_path(
            wandb_logger)
        logger.info(
            f'Saving checkpoints to the current directory: {training_args.checkpoints_dir}'
        )
    except (NotADirectoryError, FileNotFoundError) as e:
        logger.warning(
            f'Unable to get current checkpoints directory, using default one: '
            f'{training_args.checkpoints_dir}')
        logger.debug('Failed to resolve the W&B checkpoints directory', exc_info=e)
    except Exception as e:
        logger.warning('Unknown error', exc_info=e)
    # save checkpoints (saved twice - in default directory above and in wandb current run folder)
    training_args.checkpoints_dir.mkdir(exist_ok=True, parents=True)
    joblib.dump(label_enc,
                filename=training_args.checkpoints_dir /
                training_args.label_encoder_filename)
    logger.info(
        f'Persisted LabelEncoder to {training_args.label_encoder_filename}')
    save_config_checkpoint(training_args.checkpoints_dir,
                           json_path=CONFIG_PATH)
    stage1_end_time = datetime.datetime.now()
    logger.info('Stage 1 duration: {}'.format(stage1_end_time -
                                              stage1_start_time))

    # Stage 2: Fine-tuning with frozen backbone on higher resolution
    # Change:
    # lr=0.01, image_size=512/crop_size=448 (-> DataLoader), margin=0.3, freeze_backbone

    model_args.margin = 0.3
    model_args.freeze_backbone = True
    training_args.data_path = "data/orig"
    training_args.lr = 0.01
    training_args.image_size = 512
    training_args.crop_size = 448

    lit_module = LandmarksPLBaseModule(hparams={
        **model_args.__dict__,
        **training_args.__dict__
    },
                                       model=model,
                                       optimizer=training_args.optimizer,
                                       loss=model_args.loss_module)

    dm = LandmarksDataModule(
        train_df,
        valid_df,
        hparams=training_args,
        image_dir=training_args.data_path,
        batch_size=training_args.batch_size,
        num_workers=training_args.num_workers,
        use_weighted_sampler=training_args.use_weighted_sampler)

    trainer.fit(lit_module, datamodule=dm)

    # Wrap-up
    end_time = datetime.datetime.now()
    logger.info('Training duration: {}'.format(end_time - start_time))
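
# Hedged sketch (not from the original module): `model_args.freeze_backbone = True`
# above only flips a flag; a typical implementation inside the model or the
# Lightning module would stop gradients on the CNN trunk roughly like this
# (the attribute name `backbone` is an assumption):
import torch


def freeze_backbone_sketch(model):
    """Freeze backbone weights so only the pooling/FC/margin head keeps training."""
    for param in model.backbone.parameters():
        param.requires_grad = False
    # optionally keep BatchNorm running statistics fixed as well
    for module in model.backbone.modules():
        if isinstance(module, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d)):
            module.eval()
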
Example #4
def main():
    """
    Use this class if anything in trainer checkpoint changed and only model weights are required to be preloaded.
    """
    logger = logging.getLogger(__name__)
    start_time = datetime.datetime.now()
    model_args, training_args = load_or_parse_args((ModelArgs, TrainingArgs), verbose=True)
    train_orig_df, label_enc = load_train_dataframe(training_args.data_train,
                                                    min_class_samples=training_args.min_class_samples)

    # assert training_args.test_size % training_args.batch_size == 0, "Test size should be multiple of batch size"

    # TODO: split DFs once and keep those on the disk. Reload label_enc from disk on resume.
    train_df, valid_df = split_dataframe_train_test(train_orig_df, test_size=training_args.test_size,
                                                    stratify=train_orig_df.landmark_id, random_state=SEED)
    num_classes = train_df.landmark_id.nunique() if training_args.min_class_samples is None else len(label_enc.classes_)
    logger.info(f'Num classes train: {num_classes}')
    logger.info(f'Num classes valid: {valid_df.landmark_id.nunique()}')

    # save checkpoints
    training_args.checkpoints_dir.mkdir(exist_ok=True, parents=True)
    joblib.dump(label_enc, filename=training_args.checkpoints_dir / training_args.label_encoder_filename)
    logger.info(f'Persisted LabelEncoder to {training_args.label_encoder_filename}')
    save_config_checkpoint(training_args.checkpoints_dir)

    logger.info('Initializing the model')
    model = LandmarkModel(model_name=model_args.model_name,
                          n_classes=num_classes,
                          loss_module=model_args.loss_module,
                          pooling_name=model_args.pooling_name,
                          args_pooling=model_args.args_pooling,
                          normalize=model_args.normalize,
                          use_fc=model_args.use_fc,
                          fc_dim=model_args.fc_dim,
                          dropout=model_args.dropout
                          )
    logger.info("Model params:")
    logger.info(pformat(model_args))
    model = load_model_state_from_checkpoint(net=model, checkpoint_path=training_args.resume_checkpoint)

    lit_module = LandmarksPLBaseModule(hparams=training_args.__dict__,
                                       model=model,
                                       optimizer=training_args.optimizer,
                                       loss=model_args.loss_module)
    # init data
    dm = LandmarksDataModule(train_df, valid_df,
                             hparams=training_args,
                             image_dir=training_args.data_path,
                             batch_size=training_args.batch_size,
                             num_workers=training_args.num_workers,
                             use_weighted_sampler=training_args.use_weighted_sampler
                             )
    # train
    dt_str = datetime.datetime.now().strftime("%y%m%d_%H-%M")
    wandb_logger = WandbLogger(name=f'{model_args.model_name.capitalize()}_GeM_ArcFace_{dt_str}',
                               save_dir='logs/',
                               project='landmarks')
    checkpoint_callback = ModelCheckpoint(monitor='val_acc',
                                          mode='max',
                                          save_top_k=2,
                                          save_last=True,
                                          verbose=True)
    # workaround: override only the checkpoint filename; the full path itself is generated by W&B
    checkpoint_callback.filename = '{epoch}-{val_acc:.3f}'

    early_stopping_callback = EarlyStopping('val_acc', verbose=True, mode='max')

    trainer = pl.Trainer(gpus=training_args.gpus,
                         logger=wandb_logger,
                         max_epochs=training_args.n_epochs,
                         val_check_interval=training_args.val_check_interval,
                         checkpoint_callback=checkpoint_callback,
                         progress_bar_refresh_rate=100,
                         gradient_clip_val=training_args.gradient_clip_val,
                         accumulate_grad_batches=training_args.accumulate_grad_batches,
                         early_stop_callback=early_stopping_callback
                         # fast_dev_run=True,
                         # limit_train_batches=5,
                         # limit_val_batches=5
                         )
    trainer.fit(lit_module, datamodule=dm)

    try:
        training_args.checkpoints_dir = get_wandb_logger_checkpoints_path(wandb_logger)
        logger.info(f'Saving checkpoints to the current directory: {training_args.checkpoints_dir}')
    except Exception:
        logger.warning(f'Unable to get current checkpoints directory, using default one: '
                       f'{training_args.checkpoints_dir}')
    # save checkpoints (saved twice - in default directory above and in wandb current run folder)
    training_args.checkpoints_dir.mkdir(exist_ok=True, parents=True)
    joblib.dump(label_enc, filename=training_args.checkpoints_dir / training_args.label_encoder_filename)
    logger.info(f'Persisted LabelEncoder to {training_args.label_encoder_filename}')
    save_config_checkpoint(training_args.checkpoints_dir)

    end_time = datetime.datetime.now()
    logger.info('Duration: {}'.format(end_time - start_time))
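
# Hedged sketch (the real load_model_state_from_checkpoint is defined elsewhere
# in the repo): pulling bare model weights out of a Lightning trainer checkpoint
# usually means taking its 'state_dict' and stripping the LightningModule
# attribute prefix ('model.' is an assumption here) before load_state_dict:
import torch


def load_model_state_from_checkpoint_sketch(checkpoint_path, net, prefix='model.'):
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    state_dict = checkpoint.get('state_dict', checkpoint)
    # drop the wrapper prefix added by the PL module so keys match the bare model
    stripped = {k[len(prefix):]: v for k, v in state_dict.items() if k.startswith(prefix)}
    net.load_state_dict(stripped or state_dict)
    return net
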
Example #5
def main():
    start_time = datetime.datetime.now()
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.DEBUG,
    )
    logger = logging.getLogger(__name__)

    # load config file
    model_args, training_args = load_or_parse_args(
        (ModelArgs, TrainingArgs),
        verbose=True,
        json_path=os.path.join(CHECKPOINT_DIR, CONFIG_FILE))

    # load label_encoder
    logger.info(
        f'Loading persisted LabelEncoder and num_classes from checkpoints {CHECKPOINT_DIR}'
    )
    label_enc: LabelEncoder = joblib.load(filename=os.path.join(
        CHECKPOINT_DIR, training_args.label_encoder_filename))
    num_classes = len(label_enc.classes_)

    # create model and load weights from checkpoint
    model = LandmarkModel(
        model_name=model_args.model_name,
        pretrained=False,
        n_classes=num_classes,
        loss_module=model_args.loss_module,
        pooling_name=model_args.pooling_name,
        args_pooling=model_args.args_pooling,
        use_fc=model_args.use_fc,
        fc_dim=model_args.fc_dim,
        dropout=model_args.dropout,
    )
    logger.info("Model params:")
    logger.info(pformat(model_args))
    logger.info('Loading model weights from checkpoint')
    model = load_model_state_from_checkpoint(
        os.path.join(CHECKPOINT_DIR, CHECKPOINT_NAME), model)

    # create test dataloader
    submission_df = pd.read_csv(training_args.data_path /
                                'sample_submission.csv')
    test_loader = get_test_data_loader(submission_df,
                                       image_dir=training_args.data_path,
                                       image_size=training_args.image_size,
                                       crop_size=training_args.crop_size,
                                       batch_size=BATCH_SIZE,
                                       num_workers=NUM_WORKERS)

    # predict on test dataset
    model.eval()
    model.to(DEVICE)
    activation = nn.Softmax(dim=1)  # defined for optional probability calibration; the softmax calls below are left commented out
    confs_list = []
    preds_list = []
    with torch.no_grad():
        for batch in test_loader:
            y_hat = model(batch[FEATURE_NAME].to(DEVICE))
            # y_hat = activation(y_hat)

            confs_batch, preds_batch = torch.topk(y_hat, TOPK)
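            # confs_batch holds the raw top-k logit values, preds_batch the class indices (softmax above is left disabled)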
            # confs_batch = activation(confs_batch)
            confs_list.append(confs_batch)
            preds_list.append(preds_batch)
        confs = torch.cat(confs_list).cpu().numpy()
        preds = torch.cat(preds_list).cpu().numpy()

    if TOPK == 1:
        pred_labels = label_enc.inverse_transform(preds.ravel())
        confidence_score = confs.ravel()
    else:  # solely for debug&analysis (if TOPK > 1)
        pred_labels = [label_enc.inverse_transform(pred) for pred in preds]
        pred_labels = [label[0] for label in pred_labels]
        confidence_score = [score[0] for score in confs]
        # TODO: save debug file

    # save submit file
    logger.info('Saving the predictions to submission.csv')
    submission_df['labels'] = pred_labels
    submission_df['scores'] = confidence_score
    submission_df['landmarks'] = submission_df.apply(
        postprocessing_omit_low_scores, axis=1)
    del submission_df['labels']
    del submission_df['scores']
    submission_df.to_csv(SUBMISSION_PATH, index=False)

    end_time = datetime.datetime.now()
    logger.info('Duration: {}'.format(end_time - start_time))
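
# Hypothetical illustration only -- the real postprocessing_omit_low_scores used
# above lives elsewhere in the repo. Its name suggests row-level thresholding of
# the kind used for GLD submissions: emit "<label> <score>" when the confidence
# clears a threshold, otherwise an empty prediction (the threshold value is made up):
SCORE_THRESHOLD_SKETCH = 0.5  # illustrative value, not taken from the source


def postprocessing_omit_low_scores_sketch(row):
    if row['scores'] < SCORE_THRESHOLD_SKETCH:
        return ''
    return '{} {}'.format(row['labels'], row['scores'])
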
Example #6
def get_similarities(config_path, image_size=None, crop_size=None):
    # load config file
    m_args, tr_args = load_or_parse_args((ModelArgs, TrainingArgs),
                                         verbose=True,
                                         json_path=config_path)
    image_size = image_size if image_size is not None else tr_args.image_size
    crop_size = crop_size if crop_size is not None else tr_args.crop_size

    logger.debug(
        f'Loading persisted LabelEncoder and num_classes from checkpoints {CHECKPOINT_DIR}'
    )
    label_enc: LabelEncoder = joblib.load(
        filename=os.path.join(CHECKPOINT_DIR, tr_args.label_encoder_filename))
    num_classes = len(label_enc.classes_)

    train_df, _ = load_train_dataframe(tr_args.data_train,
                                       min_class_samples=None)
    sub_df = pd.read_csv(tr_args.data_path / 'sample_submission.csv')

    # create model and load weights from checkpoint
    model = LandmarkModel(
        model_name=m_args.model_name,
        pretrained=False,
        n_classes=num_classes,
        loss_module=m_args.loss_module,
        pooling_name=m_args.pooling_name,
        args_pooling=m_args.args_pooling,
        use_fc=m_args.use_fc,
        fc_dim=m_args.fc_dim,
        dropout=m_args.dropout,
    )
    logger.info("Model params:")
    logger.info(pformat(m_args))
    logger.info('Loading model weights from checkpoint')
    model = load_model_state_from_checkpoint(
        os.path.join(CHECKPOINT_DIR, CHECKPOINT_NAME), model)
    # create train dataloader
    train_dataset = LandmarksImageDataset(
        train_df,
        image_dir=tr_args.data_path,
        mode="train",
        get_img_id=True,
        # transform=transforms.ToTensor(),  # in case on rescaling required
        image_size=image_size,
        crop_size=crop_size)
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,  # keep deterministic order; no sampler is passed here
        sampler=None,
        num_workers=NUM_WORKERS,
        collate_fn=CollateBatchFn(),
        drop_last=False)
    # create test dataloader
    test_loader = get_test_data_loader(sub_df,
                                       image_dir=tr_args.data_path,
                                       image_size=image_size,
                                       crop_size=crop_size,
                                       batch_size=BATCH_SIZE,
                                       num_workers=NUM_WORKERS)
    # generate embeddings
    train_ids, train_embeddings = generate_embeddings(model, train_loader)
    test_ids, test_embeddings = generate_embeddings(model, test_loader)

    # map each train image id to its landmark label for the lookup below
    # (assumes the train dataframe exposes 'id' and 'landmark_id' columns)
    train_id_to_label = dict(zip(train_df['id'], train_df['landmark_id']))
    train_ids_labels_and_scores = [None] * test_embeddings.shape[0]
    # Using (slow) for-loop, as distance matrix doesn't fit in memory
    for test_idx in range(test_embeddings.shape[0]):
        distances = spatial.distance.cdist(
            test_embeddings[np.newaxis, test_idx, :], train_embeddings,
            'cosine')[0]
        # Get the indices of the closest images
        top_k = np.argpartition(distances, NUM_TO_RERANK)[:NUM_TO_RERANK]
        # Get the nearest ids and distances using the previous indices
        nearest = sorted([(train_ids[p], distances[p]) for p in top_k],
                         key=lambda x: x[1])
        # Get the labels and score results
        train_ids_labels_and_scores[test_idx] = [
            (train_id_to_label[train_id], 1.0 - cosine_distance)
            for train_id, cosine_distance in nearest
        ]

    del test_embeddings
    del train_embeddings
    gc.collect()

    return test_ids, train_ids_labels_and_scores
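
# Hedged usage sketch (not part of the original function): a caller could turn
# each per-test neighbour list returned above into a single prediction by summing
# the similarity scores per label and keeping the best-scoring label:
from collections import defaultdict


def aggregate_neighbours_sketch(labels_and_scores):
    """labels_and_scores: list of (label, score) tuples for one test image."""
    totals = defaultdict(float)
    for label, score in labels_and_scores:
        totals[label] += score
    best_label = max(totals, key=totals.get)
    return best_label, totals[best_label]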