def train_fold(save_dir, train_folds, val_folds, model_path):
    depth_trns = SimpleDepthTransform()
    train_trns = SaltTransform(IMAGE_SIZE, True, 'crop')
    val_trns = SaltTransform(IMAGE_SIZE, False, 'crop')
    train_dataset = SaltDataset(TRAIN_FOLDS_PATH, train_folds, train_trns, depth_trns)
    val_dataset = SaltDataset(TRAIN_FOLDS_PATH, val_folds, val_trns, depth_trns)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                              drop_last=True, num_workers=8)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                            num_workers=8)

    model = load_model(model_path)
    model.loss.lovasz_weight = 0.5
    model.loss.prob_weight = 0.5

    callbacks = [
        MonitorCheckpoint(save_dir, monitor='val_crop_iout', max_saves=3, copy_last=False),
        LoggingToFile(os.path.join(save_dir, 'log.txt')),
        update_lr
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              max_epochs=500,
              callbacks=callbacks,
              metrics=['crop_iout'])
def train_fold(save_dir, train_folds, val_folds, folds_data):
    train_dataset = StackingDataset(folds_data, train_folds,
                                    get_transforms(True), DATASET_SIZE)
    val_dataset = StackingDataset(folds_data, val_folds,
                                  get_transforms(False))

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, drop_last=True,
                              num_workers=NUM_WORKERS)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE * 2,
                            shuffle=False, num_workers=NUM_WORKERS)

    model = StackingModel(PARAMS)

    callbacks = [
        MonitorCheckpoint(save_dir, monitor='val_lwlrap', max_saves=1),
        ReduceLROnPlateau(monitor='val_lwlrap',
                          patience=RS_PARAMS['patience'],
                          factor=RS_PARAMS['factor'],
                          min_lr=1e-8),
        EarlyStopping(monitor='val_lwlrap', patience=30),
        LoggingToFile(save_dir / 'log.txt'),
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              max_epochs=700,
              callbacks=callbacks,
              metrics=['multi_accuracy', 'lwlrap'])
def train_fold(base_model_path, save_dir, train_folds, val_folds,
               folds_data, noisy_data):
    train_transform = get_transforms(train=True, size=CROP_SIZE,
                                     wrap_pad_prob=WRAP_PAD_PROB)

    mixer = RandomMixer([
        SigmoidConcatMixer(sigmoid_range=(3, 12)),
        AddMixer(alpha_dist='uniform')
    ], p=[0.6, 0.4])
    mixer = UseMixerWithProb(mixer, prob=MIXER_PROB)

    curated_dataset = FreesoundDataset(folds_data, train_folds,
                                       transform=train_transform,
                                       mixer=mixer)
    noisy_dataset = FreesoundNoisyDataset(noisy_data,
                                          transform=train_transform,
                                          mixer=mixer)
    train_dataset = RandomDataset([noisy_dataset, curated_dataset],
                                  p=[NOISY_PROB, 1 - NOISY_PROB],
                                  size=DATASET_SIZE)

    val_dataset = FreesoundDataset(folds_data, val_folds,
                                   get_transforms(False, CROP_SIZE))
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, drop_last=True,
                              num_workers=NUM_WORKERS)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE * 2,
                            shuffle=False, num_workers=NUM_WORKERS)

    model = load_model(base_model_path, device=DEVICE)
    model.set_lr(BASE_LR)

    callbacks = [
        MonitorCheckpoint(save_dir, monitor='val_lwlrap', max_saves=3),
        CosineAnnealing(T_0=10, T_mult=2, eta_min=0.00001),
        LoggingToFile(save_dir / 'log.txt'),
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              max_epochs=150,
              callbacks=callbacks,
              metrics=['multi_accuracy', 'lwlrap'])
def test_pipeline(tmpdir, get_batch_function, linear_argus_model_instance):
    model = linear_argus_model_instance
    experiment_dir = Path(tmpdir.join("path/to/pipeline_experiment/"))
    train_dataset = TensorDataset(*get_batch_function(batch_size=4096))
    val_dataset = TensorDataset(*get_batch_function(batch_size=512))
    train_loader = DataLoader(train_dataset, shuffle=True, drop_last=True, batch_size=32)
    val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)

    monitor_checkpoint = MonitorCheckpoint(dir_path=experiment_dir,
                                           monitor='val_loss', max_saves=1)
    callbacks = [
        monitor_checkpoint,
        EarlyStopping(monitor='val_loss', patience=9),
        ReduceLROnPlateau(monitor='val_loss', factor=0.64, patience=3),
        LoggingToFile(experiment_dir / 'log.txt'),
        LoggingToCSV(experiment_dir / 'log.csv')
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              num_epochs=100,
              callbacks=callbacks)

    val_loss = model.validate(val_loader)['val_loss']
    assert val_loss < 0.1

    model_paths = sorted(experiment_dir.glob('*.pth'))
    assert len(model_paths) == 1

    loaded_model = load_model(model_paths[0])
    loaded_val_loss = loaded_model.validate(val_loader)['val_loss']
    assert loaded_val_loss == monitor_checkpoint.best_value

    assert (experiment_dir / 'log.txt').exists()
    assert (experiment_dir / 'log.csv').exists()
def train_fold(save_path, train_folds, val_folds):
    train_loader, val_loader = get_data_loaders(BATCH_SIZE, train_folds, val_folds)

    model = ShipMetaModel(params)

    callbacks = [
        MonitorCheckpoint(save_path, monitor='val_iout', max_saves=2, copy_last=True),
        EarlyStopping(monitor='val_iout', patience=40),
        ReduceLROnPlateau(monitor='val_iout', patience=10, factor=0.2, min_lr=1e-8),
        LoggingToFile(os.path.join(save_path, 'log.txt'))
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              max_epochs=EPOCHS,
              callbacks=callbacks,
              metrics=['iout'])
def train_fold(save_dir, train_folds, val_folds):
    depth_trns = SimpleDepthTransform()
    train_trns = SaltTransform(IMAGE_SIZE, True, 'crop')
    val_trns = SaltTransform(IMAGE_SIZE, False, 'crop')
    train_dataset = SaltDataset(TRAIN_FOLDS_PATH, train_folds, train_trns, depth_trns)
    val_dataset = SaltDataset(TRAIN_FOLDS_PATH, val_folds, val_trns, depth_trns)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                              drop_last=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                            num_workers=4)

    model = SaltMetaModel(PARAMS)

    callbacks = [
        MonitorCheckpoint(save_dir, monitor='val_crop_iout', max_saves=3, copy_last=False),
        EarlyStopping(monitor='val_crop_iout', patience=100),
        ReduceLROnPlateau(monitor='val_crop_iout', patience=30, factor=0.64, min_lr=1e-8),
        LoggingToFile(os.path.join(save_dir, 'log.txt')),
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              max_epochs=700,
              callbacks=callbacks,
              metrics=['crop_iout'])
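# A hedged usage sketch (not part of the original scripts): functions like
# train_fold(save_dir, train_folds, val_folds) above are normally driven by a
# leave-one-fold-out cross-validation loop. FOLDS and SAVE_DIR below are
# hypothetical names introduced only for illustration.
if __name__ == "__main__":
    FOLDS = list(range(5))                           # hypothetical fold ids
    SAVE_DIR = '/workdir/data/experiments/example'   # hypothetical output root
    for fold in FOLDS:
        val_folds = [fold]
        train_folds = [f for f in FOLDS if f != fold]
        fold_dir = os.path.join(SAVE_DIR, f'fold_{fold}')
        print(f"Fold {fold}: train {train_folds}, val {val_folds}")
        train_fold(fold_dir, train_folds, val_folds)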
def train_fold(save_dir, train_folds, val_folds,
               folds_data, noisy_data, corrected_noisy_data):
    train_transform = get_transforms(train=True,
                                     size=CROP_SIZE,
                                     wrap_pad_prob=WRAP_PAD_PROB,
                                     resize_scale=(0.8, 1.0),
                                     resize_ratio=(1.7, 2.3),
                                     resize_prob=0.33,
                                     spec_num_mask=2,
                                     spec_freq_masking=0.15,
                                     spec_time_masking=0.20,
                                     spec_prob=0.5)

    mixer = RandomMixer([
        SigmoidConcatMixer(sigmoid_range=(3, 12)),
        AddMixer(alpha_dist='uniform')
    ], p=[0.6, 0.4])
    mixer = UseMixerWithProb(mixer, prob=MIXER_PROB)

    curated_dataset = FreesoundDataset(folds_data, train_folds,
                                       transform=train_transform,
                                       mixer=mixer)
    noisy_dataset = FreesoundNoisyDataset(noisy_data,
                                          transform=train_transform,
                                          mixer=mixer)
    corr_noisy_dataset = FreesoundCorrectedNoisyDataset(corrected_noisy_data,
                                                        transform=train_transform,
                                                        mixer=mixer)
    dataset_probs = [NOISY_PROB, CORR_NOISY_PROB,
                     1 - NOISY_PROB - CORR_NOISY_PROB]
    print("Dataset probs", dataset_probs)
    print("Dataset lens", len(noisy_dataset), len(corr_noisy_dataset), len(curated_dataset))
    train_dataset = RandomDataset([noisy_dataset, corr_noisy_dataset, curated_dataset],
                                  p=dataset_probs,
                                  size=DATASET_SIZE)

    val_dataset = FreesoundDataset(folds_data, val_folds,
                                   get_transforms(False, CROP_SIZE))
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, drop_last=True,
                              num_workers=NUM_WORKERS)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE * 2,
                            shuffle=False, num_workers=NUM_WORKERS)

    model = FreesoundModel(PARAMS)

    callbacks = [
        MonitorCheckpoint(save_dir, monitor='val_lwlrap', max_saves=1),
        ReduceLROnPlateau(monitor='val_lwlrap', patience=6, factor=0.6, min_lr=1e-8),
        EarlyStopping(monitor='val_lwlrap', patience=18),
        LoggingToFile(save_dir / 'log.txt'),
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              max_epochs=700,
              callbacks=callbacks,
              metrics=['multi_accuracy', 'lwlrap'])
print("Model params:", params) model = ArcfaceModel(params) train_metric_dataset = WhaleDataset(train_val_csv_path, True, **val_transforms) monitor_metric = CosMAPatK(train_metric_dataset, k=5, batch_size=batch_size, num_workers=num_workers) monitor_metric_name = 'val_' + monitor_metric.name callbacks = [ MonitorCheckpoint(experiment_dir, monitor=monitor_metric_name, max_saves=3), EarlyStopping(monitor=monitor_metric_name, patience=50), ReduceLROnPlateau(monitor=monitor_metric_name, patience=10, factor=0.64, min_lr=1e-8), LoggingToFile(join(experiment_dir, 'log.txt')) ] with open(join(experiment_dir, 'source.py'), 'w') as outfile: outfile.write(open(__file__).read()) model.fit(train_loader, val_loader=val_loader, max_epochs=1000, callbacks=callbacks, metrics=['accuracy', monitor_metric])
def train_fold(save_dir, train_folds, val_folds, folds_data):
    train_transform = get_transforms(train=True,
                                     size=CROP_SIZE,
                                     wrap_pad_prob=0.0,
                                     resize_scale=(0.8, 1.0),
                                     resize_ratio=(1.7, 2.3),
                                     resize_prob=0.0,
                                     spec_num_mask=2,
                                     spec_freq_masking=0.15,
                                     spec_time_masking=0.20,
                                     spec_prob=0.0)
    val_transform = get_transforms(train=False, size=CROP_SIZE)

    if MIXER_PROB:
        mixer = get_mixer(mixer_prob=MIXER_PROB,
                          sigmoid_range=(3, 12),
                          alpha_dist='uniform',
                          random_prob=(0.6, 0.4))
    else:
        mixer = None

    train_dataset = BirdsongDataset(folds_data,
                                    folds=train_folds,
                                    transform=train_transform,
                                    mixer=mixer)
    val_dataset = BirdsongDataset(folds_data,
                                  folds=val_folds,
                                  transform=val_transform)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, drop_last=True,
                              num_workers=NUM_WORKERS)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE * 2 // ITER_SIZE,
                            shuffle=False, num_workers=NUM_WORKERS)

    model = BirdsongModel(PARAMS)
    if 'pretrained' in model.params['nn_module'][1]:
        model.params['nn_module'][1]['pretrained'] = False

    if USE_AMP:
        initialize_amp(model)

    model.set_device(DEVICES)

    num_iterations = (5 * len(train_dataset)) // BATCH_SIZE
    callbacks = [
        MonitorCheckpoint(save_dir, monitor='val_loss', max_saves=1),
        CosineAnnealingLR(T_max=num_iterations, eta_min=0, step_on_iteration=True),
        EarlyStopping(monitor='val_loss', patience=12),
        LoggingToFile(save_dir / 'log.txt'),
        LoggingToCSV(save_dir / 'log.csv')
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              num_epochs=EPOCHS,
              callbacks=callbacks,
              metrics=['f1_score'])

    del model

    model_path = get_best_model_path(save_dir)
    model = load_model(model_path)
    val_dataset = BirdsongDataset(folds_data,
                                  folds=val_folds + [config.n_folds],
                                  transform=val_transform)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE * 2 // ITER_SIZE,
                            shuffle=False, num_workers=NUM_WORKERS)
    model.set_device(DEVICES[0])
    model.validate(val_loader,
                   metrics=['f1_score'],
                   callbacks=[
                       LoggingToFile(save_dir / 'log.txt'),
                       LoggingToCSV(save_dir / 'log.csv')
                   ])
if distributed:
    model.nn_module = SyncBatchNorm.convert_sync_batchnorm(model.nn_module)
    model.nn_module = DistributedDataParallel(model.nn_module.to(local_rank),
                                              device_ids=[local_rank],
                                              output_device=local_rank)
    if local_rank:
        model.logger.disabled = True
else:
    model.set_device('cuda')

callbacks = []
if local_rank == 0:
    callbacks += [
        MonitorCheckpoint(dir_path=EXPERIMENT_DIR,
                          monitor='val_dist_accuracy', max_saves=3),
        LoggingToCSV(EXPERIMENT_DIR / 'log.csv'),
        LoggingToFile(EXPERIMENT_DIR / 'log.txt')
    ]

callbacks += [
    EarlyStopping(monitor='val_dist_accuracy', patience=30),
    CosineAnnealingLR(args.epochs),
]

if distributed:
    @argus.callbacks.on_epoch_complete
    def schedule_sampler(state):
        state.data_loader.sampler.set_epoch(state.epoch + 1)
    callbacks += [schedule_sampler]

model.fit(train_loader,
          val_loader=val_loader,
def train_fold(save_dir, train_folds, val_folds,
               local_rank=0, distributed=False, pretrain_dir=''):
    folds_data = get_folds_data()

    model = AlaskaModel(PARAMS)
    model.params['nn_module'][1]['pretrained'] = False

    if pretrain_dir:
        pretrain_path = get_best_model_path(pretrain_dir)
        if pretrain_path is not None:
            print(f'Pretrain model path {pretrain_path}')
            load_pretrain_weigths(model, pretrain_path)
        else:
            print(f"Pretrain model not found in '{pretrain_dir}'")

    if USE_AMP:
        initialize_amp(model)

    if distributed:
        model.nn_module = SyncBatchNorm.convert_sync_batchnorm(model.nn_module)
        model.nn_module = DistributedDataParallel(
            model.nn_module.to(local_rank),
            device_ids=[local_rank],
            output_device=local_rank)
        if local_rank:
            model.logger.disabled = True
    else:
        model.set_device(DEVICES)

    if USE_EMA:
        initialize_ema(model, decay=0.9999)
        checkpoint = EmaMonitorCheckpoint
    else:
        checkpoint = MonitorCheckpoint

    for epochs, stage in zip(TRAIN_EPOCHS, STAGE):
        test_transform = get_transforms(train=False)

        if stage == 'train':
            mixer = RandomMixer([BitMix(gamma=0.25), EmptyMix()], p=[0., 1.])
            train_transform = get_transforms(train=True)
        else:
            mixer = EmptyMix()
            train_transform = get_transforms(train=False)

        train_dataset = AlaskaDataset(folds_data, train_folds,
                                      transform=train_transform, mixer=mixer)
        val_dataset = AlaskaDataset(folds_data, val_folds, transform=test_transform)
        val_sampler = AlaskaSampler(val_dataset, train=False)

        if distributed:
            train_sampler = AlaskaDistributedSampler(train_dataset)
        else:
            train_sampler = AlaskaSampler(train_dataset, train=True)

        train_loader = DataLoader(train_dataset, sampler=train_sampler,
                                  num_workers=NUM_WORKERS, batch_size=BATCH_SIZE)
        val_loader = DataLoader(val_dataset, sampler=val_sampler,
                                num_workers=NUM_WORKERS, batch_size=VAL_BATCH_SIZE)

        callbacks = []
        if local_rank == 0:
            callbacks += [
                checkpoint(save_dir, monitor='val_weighted_auc', max_saves=5,
                           file_format=stage + '-model-{epoch:03d}-{monitor:.6f}.pth'),
                LoggingToFile(save_dir / 'log.txt'),
                LoggingToCSV(save_dir / 'log.csv', append=True)
            ]

        if stage == 'train':
            callbacks += [
                CosineAnnealingLR(T_max=epochs,
                                  eta_min=get_lr(9e-6, WORLD_BATCH_SIZE))
            ]
        elif stage == 'warmup':
            warmup_iterations = epochs * (len(train_sampler) / BATCH_SIZE)
            callbacks += [
                LambdaLR(lambda x: x / warmup_iterations,
                         step_on_iteration=True)
            ]

        if stage == 'train':
            @argus.callbacks.on_epoch_start
            def schedule_mixer_prob(state):
                bitmix_prob = state.epoch / epochs
                mixer.p = [bitmix_prob, 1 - bitmix_prob]
                state.logger.info(f"Mixer probabilities {mixer.p}")
            callbacks += [schedule_mixer_prob]

        if distributed:
            @argus.callbacks.on_epoch_complete
            def schedule_sampler(state):
                train_sampler.set_epoch(state.epoch + 1)
            callbacks += [schedule_sampler]

        metrics = ['weighted_auc', Accuracy('stegano'), Accuracy('quality')]

        model.fit(train_loader,
                  val_loader=val_loader,
                  num_epochs=epochs,
                  callbacks=callbacks,
                  metrics=metrics)
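# A minimal sketch (an assumption, not taken from the original scripts) of how the
# local_rank and distributed arguments consumed by the train_fold above are
# typically obtained when launching one process per GPU with torch.distributed:
import argparse
import os

import torch
import torch.distributed as dist

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()

distributed = int(os.environ.get('WORLD_SIZE', 1)) > 1
if distributed:
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl', init_method='env://')

# train_fold(save_dir, train_folds, val_folds,
#            local_rank=args.local_rank, distributed=distributed)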
                          num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False,
                        num_workers=4)

model = SaltMetaModel(params)

callbacks = [
    MonitorCheckpoint(experiment_dir, monitor='val_crop_iout', max_saves=1, copy_last=False),
    EarlyStopping(monitor='val_crop_iout', patience=100),
    ReduceLROnPlateau(monitor='val_crop_iout', patience=30, factor=0.7, min_lr=1e-8),
    LoggingToFile(os.path.join(experiment_dir, 'log.txt'))
]

with open(os.path.join(experiment_dir, 'random_params.json'), 'w') as outfile:
    json.dump(random_params, outfile)

model.fit(train_loader,
          val_loader=val_loader,
          max_epochs=600,
          callbacks=callbacks,
          metrics=['crop_iout'])
                         image_transform=val_trns)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                          num_workers=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                        num_workers=8, shuffle=False)

model = IterSizeMetaModel(PARAMS)

callbacks = [
    MonitorCheckpoint(f'/workdir/data/experiments/{EXPERIMENT_NAME}',
                      monitor='val_map_at_k', max_saves=10),
    EarlyStopping(monitor='val_map_at_k', patience=50),
    ReduceLROnPlateau(monitor='val_map_at_k', factor=0.64, patience=1,
                      min_lr=0.000001),
    LoggingToFile(f'/workdir/data/experiments/{EXPERIMENT_NAME}/log.txt')
]

model.fit(train_loader,
          val_loader=val_loader,
          max_epochs=1000,
          callbacks=callbacks,
          metrics=['accuracy', MAPatK(k=3)])
def train_folds(save_dir, folds_data):
    random_params = {
        'base_size': int(np.random.choice([64, 128, 256, 512])),
        'reduction_scale': int(np.random.choice([2, 4, 8, 16])),
        'p_dropout': float(np.random.uniform(0.0, 0.5)),
        'lr': float(np.random.uniform(0.0001, 0.00001)),
        'patience': int(np.random.randint(3, 12)),
        'factor': float(np.random.uniform(0.5, 0.8)),
        'batch_size': int(np.random.choice([32, 64, 128])),
    }
    pprint(random_params)

    save_dir.mkdir(parents=True, exist_ok=True)
    with open(save_dir / 'random_params.json', 'w') as outfile:
        json.dump(random_params, outfile)

    params = {
        'nn_module': ('FCNet', {
            'in_channels': len(config.classes) * len(EXPERIMENTS),
            'num_classes': len(config.classes),
            'base_size': random_params['base_size'],
            'reduction_scale': random_params['reduction_scale'],
            'p_dropout': random_params['p_dropout']
        }),
        'loss': 'BCEWithLogitsLoss',
        'optimizer': ('Adam', {'lr': random_params['lr']}),
        'device': 'cuda',
    }

    for fold in config.folds:
        val_folds = [fold]
        train_folds = list(set(config.folds) - set(val_folds))
        save_fold_dir = save_dir / f'fold_{fold}'
        print(f"Val folds: {val_folds}, Train folds: {train_folds}")
        print(f"Fold save dir {save_fold_dir}")

        train_dataset = StackingDataset(folds_data, train_folds,
                                        get_transforms(True), DATASET_SIZE)
        val_dataset = StackingDataset(folds_data, val_folds,
                                      get_transforms(False))

        train_loader = DataLoader(train_dataset,
                                  batch_size=random_params['batch_size'],
                                  shuffle=True, drop_last=True,
                                  num_workers=NUM_WORKERS)
        val_loader = DataLoader(val_dataset,
                                batch_size=random_params['batch_size'] * 2,
                                shuffle=False, num_workers=NUM_WORKERS)

        model = StackingModel(params)

        callbacks = [
            MonitorCheckpoint(save_fold_dir, monitor='val_lwlrap', max_saves=1),
            ReduceLROnPlateau(monitor='val_lwlrap',
                              patience=random_params['patience'],
                              factor=random_params['factor'],
                              min_lr=1e-8),
            EarlyStopping(monitor='val_lwlrap', patience=20),
            LoggingToFile(save_fold_dir / 'log.txt'),
        ]

        model.fit(train_loader,
                  val_loader=val_loader,
                  max_epochs=300,
                  callbacks=callbacks,
                  metrics=['multi_accuracy', 'lwlrap'])
def train_experiment(folds_data, noisy_data, num):
    experiment_dir = SAVE_DIR / f'{num:04}'
    np.random.seed(num)
    random.seed(num)

    random_params = {
        'p_dropout': float(np.random.uniform(0.1, 0.3)),
        'batch_size': int(np.random.choice([128])),
        'lr': float(np.random.choice([0.001, 0.0006, 0.0003])),
        'add_prob': float(np.random.uniform(0.0, 1.0)),
        'noisy_prob': float(np.random.uniform(0.0, 1.0)),
        'lsoft_beta': float(np.random.uniform(0.2, 0.8)),
        'noisy_weight': float(np.random.uniform(0.3, 0.7)),
        'patience': int(np.random.randint(2, 10)),
        'factor': float(np.random.uniform(0.5, 0.8))
    }
    pprint(random_params)

    params = {
        'nn_module': ('SimpleKaggle', {
            'num_classes': len(config.classes),
            'dropout': random_params['p_dropout'],
            'base_size': 64
        }),
        'loss': ('OnlyNoisyLSoftLoss', {
            'beta': random_params['lsoft_beta'],
            'noisy_weight': random_params['noisy_weight'],
            'curated_weight': 1 - random_params['noisy_weight']
        }),
        'optimizer': ('Adam', {'lr': random_params['lr']}),
        'device': 'cuda',
        'amp': {
            'opt_level': 'O2',
            'keep_batchnorm_fp32': True,
            'loss_scale': "dynamic"
        }
    }
    pprint(params)

    try:
        train_transform = get_transforms(True, CROP_SIZE)
        curated_dataset = FreesoundDataset(folds_data, TRAIN_FOLDS,
                                           transform=train_transform,
                                           add_prob=random_params['add_prob'])
        noisy_dataset = FreesoundNoisyDataset(noisy_data,
                                              transform=train_transform)
        train_dataset = CombinedDataset(noisy_dataset, curated_dataset,
                                        noisy_prob=random_params['noisy_prob'],
                                        size=DATASET_SIZE)

        val_dataset = FreesoundDataset(folds_data, VAL_FOLDS,
                                       get_transforms(False, CROP_SIZE))
        train_loader = DataLoader(train_dataset,
                                  batch_size=random_params['batch_size'],
                                  shuffle=True, drop_last=True,
                                  num_workers=NUM_WORKERS)
        val_loader = DataLoader(val_dataset,
                                batch_size=random_params['batch_size'] * 2,
                                shuffle=False, num_workers=NUM_WORKERS)

        model = FreesoundModel(params)

        callbacks = [
            MonitorCheckpoint(experiment_dir, monitor='val_lwlrap', max_saves=1),
            ReduceLROnPlateau(monitor='val_lwlrap',
                              patience=random_params['patience'],
                              factor=random_params['factor'],
                              min_lr=1e-8),
            EarlyStopping(monitor='val_lwlrap', patience=20),
            LoggingToFile(experiment_dir / 'log.txt'),
        ]

        with open(experiment_dir / 'random_params.json', 'w') as outfile:
            json.dump(random_params, outfile)

        model.fit(train_loader,
                  val_loader=val_loader,
                  max_epochs=100,
                  callbacks=callbacks,
                  metrics=['multi_accuracy', 'lwlrap'])
    except KeyboardInterrupt as e:
        raise e
    except BaseException as e:
        print(f"Exception '{e}' with random params '{random_params}'")
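# A hedged sketch of the random-search driver implied by train_experiment above:
# each trial is seeded with its index, so the sampled hyperparameters are
# reproducible per experiment number. The data-loading helpers and the trial
# count below are hypothetical, introduced only for illustration.
if __name__ == "__main__":
    folds_data = get_folds_data()    # hypothetical helper returning curated folds
    noisy_data = get_noisy_data()    # hypothetical helper returning the noisy subset
    for num in range(100):           # number of random-search trials is illustrative
        train_experiment(folds_data, noisy_data, num)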