Example #1
def experiment_pred(experiment_dir, test_data):
    print(f"Start predict: {experiment_dir}")
    transforms = get_transforms(False, CROP_SIZE)

    pred_df_lst = []
    for fold in config.folds:
        print("Predict fold", fold)
        fold_dir = experiment_dir / f'fold_{fold}'
        model_path = get_best_model_path(fold_dir)
        print("Model path", model_path)
        predictor = Predictor(model_path,
                              transforms,
                              BATCH_SIZE, (config.audio.n_mels, CROP_SIZE),
                              (config.audio.n_mels, CROP_SIZE // 4),
                              device=DEVICE)

        pred_df = pred_test(predictor, test_data)
        pred_df_lst.append(pred_df)

    pred_df = gmean_preds_blend(pred_df_lst)
    return pred_df
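
`gmean_preds_blend` is referenced but not defined here. A minimal sketch, assuming the per-fold DataFrames share an index and hold class probabilities:

import numpy as np
from scipy.stats import gmean

def gmean_preds_blend(pred_df_lst):
    # Hypothetical sketch: element-wise geometric mean across folds,
    # preserving the index and columns of the first fold's DataFrame.
    blend_df = pred_df_lst[0].copy()
    blend_df[:] = gmean(np.stack([df.values for df in pred_df_lst]), axis=0)
    return blend_df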
Example #2
def experiment_pred(experiment_dir, audio_id2spec):
    print(f"Start predict: {experiment_dir}")
    transforms = get_transforms(False, CROP_SIZE)

    pred_lst = []
    for fold in FOLDS:
        print("Predict fold", fold)
        fold_dir = experiment_dir / f'fold_{fold}'
        model_path = get_best_model_path(fold_dir)
        print("Model path", model_path)
        predictor = Predictor(model_path, transforms, BATCH_SIZE,
                              CROP_SIZE, CROP_SIZE, DEVICE)

        pred = fold_pred(predictor, audio_id2spec)
        pred_lst.append(pred)

    audio_id2pred = dict()
    for audio_id in audio_id2spec:
        pred = [p[audio_id] for p in pred_lst]
        audio_id2pred[audio_id] = np.mean(pred, axis=0)

    return audio_id2pred
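
Example #2 averages fold predictions arithmetically, where Example #1 used a geometric mean. A self-contained toy illustration of the dict-averaging step above:

import numpy as np

# Two folds' predictions for one audio id, averaged element-wise.
pred_lst = [{'audio_0': np.array([0.2, 0.8])},
            {'audio_0': np.array([0.4, 0.6])}]
audio_id2pred = {aid: np.mean([p[aid] for p in pred_lst], axis=0)
                 for aid in pred_lst[0]}
print(audio_id2pred)  # {'audio_0': array([0.3, 0.7])}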
Example #3
    model.fit(train_loader,
              val_loader=val_loader,
              num_epochs=150,
              callbacks=callbacks,
              metrics=['multi_accuracy', 'lwlrap'])


if __name__ == "__main__":
    if not SAVE_DIR.exists():
        SAVE_DIR.mkdir(parents=True, exist_ok=True)
    else:
        print(f"Folder {SAVE_DIR} already exists.")

    with open(__file__) as infile, \
            open(SAVE_DIR / 'source.py', 'w') as outfile:
        outfile.write(infile.read())

    folds_data = load_folds_data()
    noisy_data = load_noisy_data()

    for fold in config.folds:
        val_folds = [fold]
        train_folds = list(set(config.folds) - set(val_folds))
        save_fold_dir = SAVE_DIR / f'fold_{fold}'
        base_model_path = get_best_model_path(BASE_DIR / f'fold_{fold}')
        print(f"Base model path: {base_model_path}")
        print(f"Val folds: {val_folds}, Train folds: {train_folds}")
        print(f"Fold save dir {save_fold_dir}")
        train_fold(base_model_path, save_fold_dir, train_folds, val_folds,
                   folds_data, noisy_data)
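
For concreteness, the split above with `config.folds = [0, 1, 2, 3, 4]` and `fold = 2` yields `val_folds = [2]` and `train_folds` covering `[0, 1, 3, 4]`; `set` difference does not guarantee order, so a deterministic variant sorts it:

# Worked example of the fold-split logic (fold list assumed).
folds = [0, 1, 2, 3, 4]
val_folds = [2]
train_folds = sorted(set(folds) - set(val_folds))  # [0, 1, 3, 4]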
Example #4
def train_fold(save_dir, train_folds, val_folds, folds_data):
    train_transform = get_transforms(train=True,
                                     size=CROP_SIZE,
                                     wrap_pad_prob=0.0,
                                     resize_scale=(0.8, 1.0),
                                     resize_ratio=(1.7, 2.3),
                                     resize_prob=0.0,
                                     spec_num_mask=2,
                                     spec_freq_masking=0.15,
                                     spec_time_masking=0.20,
                                     spec_prob=0.0)
    val_transform = get_transforms(train=False, size=CROP_SIZE)

    if MIXER_PROB:
        mixer = get_mixer(mixer_prob=MIXER_PROB,
                          sigmoid_range=(3, 12),
                          alpha_dist='uniform',
                          random_prob=(0.6, 0.4))
    else:
        mixer = None

    train_dataset = BirdsongDataset(folds_data,
                                    folds=train_folds,
                                    transform=train_transform,
                                    mixer=mixer)
    val_dataset = BirdsongDataset(folds_data,
                                  folds=val_folds,
                                  transform=val_transform)
    train_loader = DataLoader(train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              drop_last=True,
                              num_workers=NUM_WORKERS)
    val_loader = DataLoader(val_dataset,
                            batch_size=BATCH_SIZE * 2 // ITER_SIZE,
                            shuffle=False,
                            num_workers=NUM_WORKERS)

    model = BirdsongModel(PARAMS)
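    # Model params are saved with each checkpoint; clearing the backbone's
    # 'pretrained' flag presumably keeps later checkpoint loads from
    # re-downloading ImageNet weights.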
    if 'pretrained' in model.params['nn_module'][1]:
        model.params['nn_module'][1]['pretrained'] = False

    if USE_AMP:
        initialize_amp(model)

    model.set_device(DEVICES)

    num_iterations = (5 * len(train_dataset)) // BATCH_SIZE
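    # CosineAnnealingLR below steps per iteration (step_on_iteration=True),
    # so this T_max anneals the LR over roughly five epochs' worth of batches.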
    callbacks = [
        MonitorCheckpoint(save_dir, monitor='val_loss', max_saves=1),
        CosineAnnealingLR(T_max=num_iterations,
                          eta_min=0,
                          step_on_iteration=True),
        EarlyStopping(monitor='val_loss', patience=12),
        LoggingToFile(save_dir / 'log.txt'),
        LoggingToCSV(save_dir / 'log.csv')
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              num_epochs=EPOCHS,
              callbacks=callbacks,
              metrics=['f1_score'])

    del model

    model_path = get_best_model_path(save_dir)
    model = load_model(model_path)
    val_dataset = BirdsongDataset(folds_data,
                                  folds=val_folds + [config.n_folds],
                                  transform=val_transform)
    val_loader = DataLoader(val_dataset,
                            batch_size=BATCH_SIZE * 2 // ITER_SIZE,
                            shuffle=False,
                            num_workers=NUM_WORKERS)
    model.set_device(DEVICES[0])
    model.validate(val_loader,
                   metrics=['f1_score'],
                   callbacks=[
                       LoggingToFile(save_dir / 'log.txt'),
                       LoggingToCSV(save_dir / 'log.csv')
                   ])
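
`get_best_model_path` is used throughout these examples but never defined. A minimal sketch, assuming checkpoints follow the `-model-{epoch:03d}-{monitor:.6f}.pth` naming visible in Example #6 and that a higher monitor value is better:

from pathlib import Path

def get_best_model_path(dir_path):
    # Hypothetical sketch: rank checkpoints by the monitor value encoded
    # at the end of the filename; None if the directory holds no .pth files.
    model_paths = list(Path(dir_path).glob('*.pth'))
    if not model_paths:
        return None
    return max(model_paths, key=lambda p: float(p.stem.split('-')[-1]))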
Example #5
            target[config.class2index[label]] = 1.

        pred = probs_df.loc[row.fname].values
        lwlrap.accumulate(target[np.newaxis], pred[np.newaxis])

    result = {
        'overall_lwlrap': lwlrap.overall_lwlrap(),
        'per_class_lwlrap': {
            cls: lwl
            for cls, lwl in zip(config.classes, lwlrap.per_class_lwlrap())
        }
    }
    print(result)
    with open(PREDICTION_DIR / 'val_lwlrap.json', 'w') as file:
        json.dump(result, file, indent=2)


if __name__ == "__main__":
    for fold in config.folds:
        print("Predict fold", fold)
        fold_dir = EXPERIMENT_DIR / f'fold_{fold}'
        model_path = get_best_model_path(fold_dir)
        print("Model path", model_path)
        predictor = StackPredictor(model_path, BATCH_SIZE, device=DEVICE)

        print("Val predict")
        pred_val_fold(predictor, fold)

    print("Calculate lwlrap metric on cv")
    calc_lwlrap_on_val()
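
Example #5 opens mid-loop, so the lines that build the per-row target are not shown. A self-contained toy sketch of that multi-hot pattern (class names invented):

import numpy as np

# Toy stand-ins for config.classes / config.class2index from the snippet.
classes = ['Bark', 'Meow', 'Purr']
class2index = {cls: i for i, cls in enumerate(classes)}

# Multi-hot target for a row labelled with two classes.
target = np.zeros(len(classes))
for label in 'Bark,Purr'.split(','):
    target[class2index[label]] = 1.
print(target)  # [1. 0. 1.]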
Example #6
def train_fold(save_dir,
               train_folds,
               val_folds,
               local_rank=0,
               distributed=False,
               pretrain_dir=''):
    folds_data = get_folds_data()

    model = AlaskaModel(PARAMS)
    model.params['nn_module'][1]['pretrained'] = False

    if pretrain_dir:
        pretrain_path = get_best_model_path(pretrain_dir)
        if pretrain_path is not None:
            print(f'Pretrain model path {pretrain_path}')
            load_pretrain_weigths(model, pretrain_path)
        else:
            print(f"Pretrain model not found in '{pretrain_dir}'")

    if USE_AMP:
        initialize_amp(model)

    if distributed:
        model.nn_module = SyncBatchNorm.convert_sync_batchnorm(model.nn_module)
        model.nn_module = DistributedDataParallel(
            model.nn_module.to(local_rank),
            device_ids=[local_rank],
            output_device=local_rank)
        if local_rank:
            model.logger.disabled = True
    else:
        model.set_device(DEVICES)

    if USE_EMA:
        initialize_ema(model, decay=0.9999)
        checkpoint = EmaMonitorCheckpoint
    else:
        checkpoint = MonitorCheckpoint

    for epochs, stage in zip(TRAIN_EPOCHS, STAGE):
        test_transform = get_transforms(train=False)

        if stage == 'train':
            mixer = RandomMixer([BitMix(gamma=0.25), EmptyMix()], p=[0., 1.])
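            # Starts as pure EmptyMix; schedule_mixer_prob below anneals
            # the probabilities toward BitMix as training progresses.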
            train_transform = get_transforms(train=True)
        else:
            mixer = EmptyMix()
            train_transform = get_transforms(train=False)

        train_dataset = AlaskaDataset(folds_data,
                                      train_folds,
                                      transform=train_transform,
                                      mixer=mixer)
        val_dataset = AlaskaDataset(folds_data,
                                    val_folds,
                                    transform=test_transform)
        val_sampler = AlaskaSampler(val_dataset, train=False)

        if distributed:
            train_sampler = AlaskaDistributedSampler(train_dataset)
        else:
            train_sampler = AlaskaSampler(train_dataset, train=True)

        train_loader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  num_workers=NUM_WORKERS,
                                  batch_size=BATCH_SIZE)
        val_loader = DataLoader(val_dataset,
                                sampler=val_sampler,
                                num_workers=NUM_WORKERS,
                                batch_size=VAL_BATCH_SIZE)

        callbacks = []
        if local_rank == 0:
            callbacks += [
                checkpoint(save_dir,
                           monitor='val_weighted_auc',
                           max_saves=5,
                           file_format=stage +
                           '-model-{epoch:03d}-{monitor:.6f}.pth'),
                LoggingToFile(save_dir / 'log.txt'),
                LoggingToCSV(save_dir / 'log.csv', append=True)
            ]

        if stage == 'train':
            callbacks += [
                CosineAnnealingLR(T_max=epochs,
                                  eta_min=get_lr(9e-6, WORLD_BATCH_SIZE))
            ]
        elif stage == 'warmup':
            warmup_iterations = epochs * (len(train_sampler) / BATCH_SIZE)
            callbacks += [
                LambdaLR(lambda x: x / warmup_iterations,
                         step_on_iteration=True)
            ]

        if stage == 'train':

            @argus.callbacks.on_epoch_start
            def schedule_mixer_prob(state):
                bitmix_prob = state.epoch / epochs
                mixer.p = [bitmix_prob, 1 - bitmix_prob]
                state.logger.info(f"Mixer probabilities {mixer.p}")

            callbacks += [schedule_mixer_prob]

        if distributed:

            @argus.callbacks.on_epoch_complete
            def schedule_sampler(state):
                train_sampler.set_epoch(state.epoch + 1)

            callbacks += [schedule_sampler]

        metrics = ['weighted_auc', Accuracy('stegano'), Accuracy('quality')]

        model.fit(train_loader,
                  val_loader=val_loader,
                  num_epochs=epochs,
                  callbacks=callbacks,
                  metrics=metrics)
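
Example #6 is distributed-aware: with `distributed=True` it converts BatchNorm to `SyncBatchNorm` and wraps the module in `DistributedDataParallel`, so it is typically launched with `python -m torch.distributed.launch --nproc_per_node=<N> <script>.py` (script name elided), which hands each process its `local_rank`.

`get_lr(9e-6, WORLD_BATCH_SIZE)` is also not defined in the snippet; a minimal linear-scaling sketch, with the base batch size a pure assumption:

def get_lr(base_lr, batch_size, base_batch_size=32):
    # Hypothetical linear scaling rule: scale the learning rate with the
    # effective (world) batch size relative to an assumed base size.
    return base_lr * batch_size / base_batch_size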