def test_increasing_decreasing_seq(self, engine, patience, better):
    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=patience,
                                   better=better)
    early_stopping.attach(engine)
    engine.state.logger = logging.getLogger("test_early_stopping")
    engine.state.stopped = False
    increasing_seq = list(range(patience))
    decreasing_seq = increasing_seq[::-1]

    early_stopping.start(engine.state)
    for value in decreasing_seq:
        engine.state.metrics = {'val_loss': value}
        early_stopping.epoch_complete(engine.state)
        assert early_stopping.wait == 0
        assert early_stopping.best_value == value
        assert not engine.state.stopped

    best_value = early_stopping.best_value
    for num, value in enumerate(increasing_seq, 1):
        engine.state.metrics = {'val_loss': value}
        early_stopping.epoch_complete(engine.state)
        assert early_stopping.wait == num
        assert early_stopping.best_value == best_value
        if num == len(increasing_seq):
            assert engine.state.stopped
        else:
            assert not engine.state.stopped
def test_decreasing_with_spikes(self, engine):
    early_stopping = EarlyStopping(monitor='val_loss', patience=2,
                                   better='auto')
    early_stopping.attach(engine)
    engine.state.stopped = False
    decreasing_seq = list(range(30))[::-1]
    # Replace every other value with a spike: a new best still arrives
    # within the patience window, so training should never stop.
    for i in range(0, len(decreasing_seq), 2):
        decreasing_seq[i] = 100

    early_stopping.start(engine.state)
    for value in decreasing_seq:
        engine.state.metrics = {'val_loss': value}
        early_stopping.epoch_complete(engine.state)
        assert not engine.state.stopped
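# A minimal sketch (not the argus implementation) of the patience logic the
# two tests above exercise: `wait` resets to zero on every new best value,
# and training stops once `wait` reaches `patience`. Min-mode only, as for
# 'val_loss'; the class and method names here are illustrative.
class EarlyStoppingSketch:
    def __init__(self, patience):
        self.patience = patience
        self.best_value = float('inf')
        self.wait = 0

    def epoch_complete(self, value):
        if value < self.best_value:
            self.best_value = value
            self.wait = 0
        else:
            self.wait += 1
        return self.wait >= self.patience  # True -> request training stop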
def train_fold(save_dir, train_folds, val_folds, folds_data):
    train_dataset = StackingDataset(folds_data, train_folds,
                                    get_transforms(True), DATASET_SIZE)
    val_dataset = StackingDataset(folds_data, val_folds,
                                  get_transforms(False))

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, drop_last=True,
                              num_workers=NUM_WORKERS)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE * 2,
                            shuffle=False, num_workers=NUM_WORKERS)

    model = StackingModel(PARAMS)

    callbacks = [
        MonitorCheckpoint(save_dir, monitor='val_lwlrap', max_saves=1),
        ReduceLROnPlateau(monitor='val_lwlrap',
                          patience=RS_PARAMS['patience'],
                          factor=RS_PARAMS['factor'],
                          min_lr=1e-8),
        EarlyStopping(monitor='val_lwlrap', patience=30),
        LoggingToFile(save_dir / 'log.txt'),
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              max_epochs=700,
              callbacks=callbacks,
              metrics=['multi_accuracy', 'lwlrap'])
def test_metric_not_found(self, engine):
    early_stopping = EarlyStopping(monitor='val_loss', patience=1,
                                   better='min')
    early_stopping.attach(engine)
    engine.state.metrics = {'val_qwerty': 0.1}
    with pytest.raises(ValueError):
        early_stopping.epoch_complete(engine.state)
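# The test above pins down the error contract: a monitor key missing from
# state.metrics should raise ValueError rather than pass silently. A minimal
# sketch of that lookup (illustrative, not the library code):
def get_monitor_value(metrics, monitor):
    if monitor not in metrics:
        raise ValueError(
            f"Monitor '{monitor}' not found in metrics: {list(metrics)}")
    return metrics[monitor]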
def train_fold(save_path, train_folds, val_folds):
    train_loader, val_loader = get_data_loaders(BATCH_SIZE,
                                                train_folds,
                                                val_folds)

    model = ShipMetaModel(params)

    callbacks = [
        MonitorCheckpoint(save_path, monitor='val_iout', max_saves=2,
                          copy_last=True),
        EarlyStopping(monitor='val_iout', patience=40),
        ReduceLROnPlateau(monitor='val_iout', patience=10, factor=0.2,
                          min_lr=1e-8),
        LoggingToFile(os.path.join(save_path, 'log.txt'))
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              max_epochs=EPOCHS,
              callbacks=callbacks,
              metrics=['iout'])
def test_pipeline(tmpdir, get_batch_function, linear_argus_model_instance):
    model = linear_argus_model_instance
    experiment_dir = Path(tmpdir.join("path/to/pipeline_experiment/"))

    train_dataset = TensorDataset(*get_batch_function(batch_size=4096))
    val_dataset = TensorDataset(*get_batch_function(batch_size=512))
    train_loader = DataLoader(train_dataset, shuffle=True, drop_last=True,
                              batch_size=32)
    val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)

    monitor_checkpoint = MonitorCheckpoint(dir_path=experiment_dir,
                                           monitor='val_loss', max_saves=1)
    callbacks = [
        monitor_checkpoint,
        EarlyStopping(monitor='val_loss', patience=9),
        ReduceLROnPlateau(monitor='val_loss', factor=0.64, patience=3),
        LoggingToFile(experiment_dir / 'log.txt'),
        LoggingToCSV(experiment_dir / 'log.csv')
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              num_epochs=100,
              callbacks=callbacks)

    val_loss = model.validate(val_loader)['val_loss']
    assert val_loss < 0.1

    model_paths = sorted(experiment_dir.glob('*.pth'))
    assert len(model_paths) == 1

    loaded_model = load_model(model_paths[0])
    loaded_val_loss = loaded_model.validate(val_loader)['val_loss']
    assert loaded_val_loss == monitor_checkpoint.best_value

    assert (experiment_dir / 'log.txt').exists()
    assert (experiment_dir / 'log.csv').exists()
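# A hypothetical sketch of the `get_batch_function` fixture this test
# assumes: it should yield (inputs, targets) tensors for a task a linear
# model can drive below val_loss 0.1. Not the actual fixture from the suite.
import torch

IN_FEATURES = 16
_weight = torch.randn(IN_FEATURES, 1)  # shared so train and val share one task

def make_linear_batch(batch_size):
    x = torch.randn(batch_size, IN_FEATURES)
    y = x @ _weight  # noiseless linear targets, easy for a linear model
    return x, y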
def train_fold(save_dir, train_folds, val_folds):
    depth_trns = SimpleDepthTransform()
    train_trns = SaltTransform(IMAGE_SIZE, True, 'crop')
    val_trns = SaltTransform(IMAGE_SIZE, False, 'crop')
    train_dataset = SaltDataset(TRAIN_FOLDS_PATH, train_folds,
                                train_trns, depth_trns)
    val_dataset = SaltDataset(TRAIN_FOLDS_PATH, val_folds,
                              val_trns, depth_trns)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, drop_last=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            shuffle=False, num_workers=4)

    model = SaltMetaModel(PARAMS)

    callbacks = [
        MonitorCheckpoint(save_dir, monitor='val_crop_iout', max_saves=3,
                          copy_last=False),
        EarlyStopping(monitor='val_crop_iout', patience=100),
        ReduceLROnPlateau(monitor='val_crop_iout', patience=30, factor=0.64,
                          min_lr=1e-8),
        LoggingToFile(os.path.join(save_dir, 'log.txt')),
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              max_epochs=700,
              callbacks=callbacks,
              metrics=['crop_iout'])
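# A hypothetical driver for train_fold above, mirroring the leave-one-out
# fold loop used in train_folds further below; SAVE_DIR and FOLDS are
# placeholders, not names from the original script.
import os

SAVE_DIR = '/workdir/experiments/example'
FOLDS = list(range(5))

for fold in FOLDS:
    val_folds = [fold]
    train_folds = [f for f in FOLDS if f != fold]
    train_fold(os.path.join(SAVE_DIR, f'fold_{fold}'),
               train_folds, val_folds)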
}
print("Model params:", params)
model = ArcfaceModel(params)

train_metric_dataset = WhaleDataset(train_val_csv_path, True,
                                    **val_transforms)
monitor_metric = CosMAPatK(train_metric_dataset, k=5,
                           batch_size=batch_size,
                           num_workers=num_workers)
monitor_metric_name = 'val_' + monitor_metric.name

callbacks = [
    MonitorCheckpoint(experiment_dir, monitor=monitor_metric_name,
                      max_saves=3),
    EarlyStopping(monitor=monitor_metric_name, patience=50),
    ReduceLROnPlateau(monitor=monitor_metric_name, patience=10,
                      factor=0.64, min_lr=1e-8),
    LoggingToFile(join(experiment_dir, 'log.txt'))
]

with open(join(experiment_dir, 'source.py'), 'w') as outfile:
    outfile.write(open(__file__).read())

model.fit(train_loader,
          val_loader=val_loader,
          max_epochs=1000,
          callbacks=callbacks,
          metrics=['accuracy', monitor_metric])
def train_fold(save_dir, train_folds, val_folds, folds_data):
    train_transform = get_transforms(train=True,
                                     size=CROP_SIZE,
                                     wrap_pad_prob=0.0,
                                     resize_scale=(0.8, 1.0),
                                     resize_ratio=(1.7, 2.3),
                                     resize_prob=0.0,
                                     spec_num_mask=2,
                                     spec_freq_masking=0.15,
                                     spec_time_masking=0.20,
                                     spec_prob=0.0)
    val_transform = get_transforms(train=False, size=CROP_SIZE)

    if MIXER_PROB:
        mixer = get_mixer(mixer_prob=MIXER_PROB,
                          sigmoid_range=(3, 12),
                          alpha_dist='uniform',
                          random_prob=(0.6, 0.4))
    else:
        mixer = None

    train_dataset = BirdsongDataset(folds_data, folds=train_folds,
                                    transform=train_transform,
                                    mixer=mixer)
    val_dataset = BirdsongDataset(folds_data, folds=val_folds,
                                  transform=val_transform)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, drop_last=True,
                              num_workers=NUM_WORKERS)
    val_loader = DataLoader(val_dataset,
                            batch_size=BATCH_SIZE * 2 // ITER_SIZE,
                            shuffle=False, num_workers=NUM_WORKERS)

    model = BirdsongModel(PARAMS)
    if 'pretrained' in model.params['nn_module'][1]:
        model.params['nn_module'][1]['pretrained'] = False

    if USE_AMP:
        initialize_amp(model)

    model.set_device(DEVICES)

    num_iterations = (5 * len(train_dataset)) // BATCH_SIZE
    callbacks = [
        MonitorCheckpoint(save_dir, monitor='val_loss', max_saves=1),
        CosineAnnealingLR(T_max=num_iterations, eta_min=0,
                          step_on_iteration=True),
        EarlyStopping(monitor='val_loss', patience=12),
        LoggingToFile(save_dir / 'log.txt'),
        LoggingToCSV(save_dir / 'log.csv')
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              num_epochs=EPOCHS,
              callbacks=callbacks,
              metrics=['f1_score'])

    del model

    model_path = get_best_model_path(save_dir)
    model = load_model(model_path)
    val_dataset = BirdsongDataset(folds_data,
                                  folds=val_folds + [config.n_folds],
                                  transform=val_transform)
    val_loader = DataLoader(val_dataset,
                            batch_size=BATCH_SIZE * 2 // ITER_SIZE,
                            shuffle=False, num_workers=NUM_WORKERS)
    model.set_device(DEVICES[0])
    model.validate(val_loader,
                   metrics=['f1_score'],
                   callbacks=[
                       LoggingToFile(save_dir / 'log.txt'),
                       LoggingToCSV(save_dir / 'log.csv')
                   ])
)

# IT IS BETTER TO SPLIT DATA INTO TRAIN|VAL AND USE METRICS ON VAL
val_dataset_paths = [p / "val" for p in DATASET_PATHS]
val_dataset = ConcatDataset(
    [OcrDataset(p, transforms=transforms) for p in val_dataset_paths])

val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                        shuffle=False, num_workers=4)

model = CRNNModel(MODEL_PARAMS)

# YOU CAN ADD CALLBACKS IF NEEDED, FIND MORE IN argus.callbacks
callbacks = [
    MonitorCheckpoint(EXPERIMENT_DIR, monitor="val_str_accuracy_letter",
                      max_saves=6),
    EarlyStopping(monitor='val_loss', patience=200),
]

# YOU CAN IMPLEMENT DIFFERENT METRICS AND USE THEM TO SEE HOW MANY
# CORRECT PREDICTIONS YOU HAVE
metrics = [StringAccuracy(), StringAccuracyLetters()]

model.fit(
    train_loader,
    val_loader=val_loader,
    max_epochs=NUM_EPOCHS,
    metrics=metrics,
    callbacks=callbacks,
    metrics_on_train=True,
)
if local_rank:
    model.logger.disabled = True
else:
    model.set_device('cuda')

callbacks = []
if local_rank == 0:
    callbacks += [
        MonitorCheckpoint(dir_path=EXPERIMENT_DIR,
                          monitor='val_dist_accuracy', max_saves=3),
        LoggingToCSV(EXPERIMENT_DIR / 'log.csv'),
        LoggingToFile(EXPERIMENT_DIR / 'log.txt')
    ]

callbacks += [
    EarlyStopping(monitor='val_dist_accuracy', patience=30),
    CosineAnnealingLR(args.epochs),
]

if distributed:
    @argus.callbacks.on_epoch_complete
    def schedule_sampler(state):
        state.data_loader.sampler.set_epoch(state.epoch + 1)

    callbacks += [schedule_sampler]

model.fit(train_loader,
          val_loader=val_loader,
          num_epochs=args.epochs,
          metrics=[CategoricalAccuracy(distributed, world_size)],
          callbacks=callbacks)
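# A minimal sketch of the loader setup the schedule_sampler callback above
# relies on: sampler.set_epoch() only reshuffles when the loader uses a
# DistributedSampler. Dataset and batch size are placeholders; running this
# requires an initialized torch.distributed process group.
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

train_dataset = TensorDataset(torch.randn(1024, 3, 32, 32),
                              torch.randint(0, 10, (1024,)))
train_sampler = DistributedSampler(train_dataset, shuffle=True)
train_loader = DataLoader(train_dataset, batch_size=64,
                          sampler=train_sampler, num_workers=4)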
                          shuffle=True, drop_last=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                        shuffle=False, num_workers=4)

model = SaltMetaModel(params)

callbacks = [
    MonitorCheckpoint(experiment_dir, monitor='val_crop_iout',
                      max_saves=1, copy_last=False),
    EarlyStopping(monitor='val_crop_iout', patience=100),
    ReduceLROnPlateau(monitor='val_crop_iout', patience=30, factor=0.7,
                      min_lr=1e-8),
    LoggingToFile(os.path.join(experiment_dir, 'log.txt'))
]

with open(os.path.join(experiment_dir, 'random_params.json'), 'w') as outfile:
    json.dump(random_params, outfile)

model.fit(train_loader,
          val_loader=val_loader,
          max_epochs=600,
          callbacks=callbacks,
                         image_transform=val_trns)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                          num_workers=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                        num_workers=8, shuffle=False)

model = IterSizeMetaModel(PARAMS)

callbacks = [
    MonitorCheckpoint(f'/workdir/data/experiments/{EXPERIMENT_NAME}',
                      monitor='val_map_at_k', max_saves=10),
    EarlyStopping(monitor='val_map_at_k', patience=50),
    ReduceLROnPlateau(monitor='val_map_at_k', factor=0.64, patience=1,
                      min_lr=0.000001),
    LoggingToFile(f'/workdir/data/experiments/{EXPERIMENT_NAME}/log.txt')
]

model.fit(train_loader,
          val_loader=val_loader,
          max_epochs=1000,
          callbacks=callbacks,
          metrics=['accuracy', MAPatK(k=3)])
def train_folds(save_dir, folds_data):
    random_params = {
        'base_size': int(np.random.choice([64, 128, 256, 512])),
        'reduction_scale': int(np.random.choice([2, 4, 8, 16])),
        'p_dropout': float(np.random.uniform(0.0, 0.5)),
        'lr': float(np.random.uniform(0.0001, 0.00001)),
        'patience': int(np.random.randint(3, 12)),
        'factor': float(np.random.uniform(0.5, 0.8)),
        'batch_size': int(np.random.choice([32, 64, 128])),
    }
    pprint(random_params)

    save_dir.mkdir(parents=True, exist_ok=True)
    with open(save_dir / 'random_params.json', 'w') as outfile:
        json.dump(random_params, outfile)

    params = {
        'nn_module': ('FCNet', {
            'in_channels': len(config.classes) * len(EXPERIMENTS),
            'num_classes': len(config.classes),
            'base_size': random_params['base_size'],
            'reduction_scale': random_params['reduction_scale'],
            'p_dropout': random_params['p_dropout']
        }),
        'loss': 'BCEWithLogitsLoss',
        'optimizer': ('Adam', {'lr': random_params['lr']}),
        'device': 'cuda',
    }

    for fold in config.folds:
        val_folds = [fold]
        train_folds = list(set(config.folds) - set(val_folds))
        save_fold_dir = save_dir / f'fold_{fold}'
        print(f"Val folds: {val_folds}, Train folds: {train_folds}")
        print(f"Fold save dir {save_fold_dir}")

        train_dataset = StackingDataset(folds_data, train_folds,
                                        get_transforms(True), DATASET_SIZE)
        val_dataset = StackingDataset(folds_data, val_folds,
                                      get_transforms(False))

        train_loader = DataLoader(train_dataset,
                                  batch_size=random_params['batch_size'],
                                  shuffle=True, drop_last=True,
                                  num_workers=NUM_WORKERS)
        val_loader = DataLoader(val_dataset,
                                batch_size=random_params['batch_size'] * 2,
                                shuffle=False, num_workers=NUM_WORKERS)

        model = StackingModel(params)

        callbacks = [
            MonitorCheckpoint(save_fold_dir, monitor='val_lwlrap',
                              max_saves=1),
            ReduceLROnPlateau(monitor='val_lwlrap',
                              patience=random_params['patience'],
                              factor=random_params['factor'],
                              min_lr=1e-8),
            EarlyStopping(monitor='val_lwlrap', patience=20),
            LoggingToFile(save_fold_dir / 'log.txt'),
        ]

        model.fit(train_loader,
                  val_loader=val_loader,
                  max_epochs=300,
                  callbacks=callbacks,
                  metrics=['multi_accuracy', 'lwlrap'])
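# Hypothetical helper for reloading the saved search parameters, e.g. to
# inspect or rerun a finished search; mirrors the json.dump in train_folds.
import json
from pathlib import Path

def load_random_params(save_dir):
    with open(Path(save_dir) / 'random_params.json') as f:
        return json.load(f)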
params = {
    'nn_module': {
        'model_name': 'tf_efficientnet_b0_ns',
        'pretrained': True,
        'num_classes': 10,
        'drop_rate': 0.2,
        'drop_path_rate': 0.2,
    },
    'optimizer': ('AdamW', {'lr': args.lr}),
    'loss': 'CrossEntropyLoss',
    'device': args.device
}

model = CifarModel(params)

callbacks = [
    MonitorCheckpoint(dir_path=EXPERIMENT_DIR, monitor='val_accuracy',
                      max_saves=3),
    EarlyStopping(monitor='val_accuracy', patience=9),
    ReduceLROnPlateau(monitor='val_accuracy', factor=0.64, patience=3),
    LoggingToCSV(EXPERIMENT_DIR / 'log.csv')
]

model.fit(train_loader,
          val_loader=val_loader,
          num_epochs=args.epochs,
          metrics=['accuracy'],
          callbacks=callbacks)
def train_fold(save_dir, train_folds, val_folds,
               folds_data, noisy_data, corrected_noisy_data):
    train_transform = get_transforms(train=True,
                                     size=CROP_SIZE,
                                     wrap_pad_prob=WRAP_PAD_PROB,
                                     resize_scale=(0.8, 1.0),
                                     resize_ratio=(1.7, 2.3),
                                     resize_prob=0.33,
                                     spec_num_mask=2,
                                     spec_freq_masking=0.15,
                                     spec_time_masking=0.20,
                                     spec_prob=0.5)

    mixer = RandomMixer([
        SigmoidConcatMixer(sigmoid_range=(3, 12)),
        AddMixer(alpha_dist='uniform')
    ], p=[0.6, 0.4])
    mixer = UseMixerWithProb(mixer, prob=MIXER_PROB)

    curated_dataset = FreesoundDataset(folds_data, train_folds,
                                       transform=train_transform,
                                       mixer=mixer)
    noisy_dataset = FreesoundNoisyDataset(noisy_data,
                                          transform=train_transform,
                                          mixer=mixer)
    corr_noisy_dataset = FreesoundCorrectedNoisyDataset(
        corrected_noisy_data,
        transform=train_transform,
        mixer=mixer)
    dataset_probs = [NOISY_PROB, CORR_NOISY_PROB,
                     1 - NOISY_PROB - CORR_NOISY_PROB]
    print("Dataset probs", dataset_probs)
    print("Dataset lens", len(noisy_dataset), len(corr_noisy_dataset),
          len(curated_dataset))
    train_dataset = RandomDataset(
        [noisy_dataset, corr_noisy_dataset, curated_dataset],
        p=dataset_probs,
        size=DATASET_SIZE)

    val_dataset = FreesoundDataset(folds_data, val_folds,
                                   get_transforms(False, CROP_SIZE))
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, drop_last=True,
                              num_workers=NUM_WORKERS)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE * 2,
                            shuffle=False, num_workers=NUM_WORKERS)

    model = FreesoundModel(PARAMS)

    callbacks = [
        MonitorCheckpoint(save_dir, monitor='val_lwlrap', max_saves=1),
        ReduceLROnPlateau(monitor='val_lwlrap', patience=6, factor=0.6,
                          min_lr=1e-8),
        EarlyStopping(monitor='val_lwlrap', patience=18),
        LoggingToFile(save_dir / 'log.txt'),
    ]

    model.fit(train_loader,
              val_loader=val_loader,
              max_epochs=700,
              callbacks=callbacks,
              metrics=['multi_accuracy', 'lwlrap'])
def train_experiment(folds_data, noisy_data, num):
    experiment_dir = SAVE_DIR / f'{num:04}'
    np.random.seed(num)
    random.seed(num)

    random_params = {
        'p_dropout': float(np.random.uniform(0.1, 0.3)),
        'batch_size': int(np.random.choice([128])),
        'lr': float(np.random.choice([0.001, 0.0006, 0.0003])),
        'add_prob': float(np.random.uniform(0.0, 1.0)),
        'noisy_prob': float(np.random.uniform(0.0, 1.0)),
        'lsoft_beta': float(np.random.uniform(0.2, 0.8)),
        'noisy_weight': float(np.random.uniform(0.3, 0.7)),
        'patience': int(np.random.randint(2, 10)),
        'factor': float(np.random.uniform(0.5, 0.8))
    }
    pprint(random_params)

    params = {
        'nn_module': ('SimpleKaggle', {
            'num_classes': len(config.classes),
            'dropout': random_params['p_dropout'],
            'base_size': 64
        }),
        'loss': ('OnlyNoisyLSoftLoss', {
            'beta': random_params['lsoft_beta'],
            'noisy_weight': random_params['noisy_weight'],
            'curated_weight': 1 - random_params['noisy_weight']
        }),
        'optimizer': ('Adam', {'lr': random_params['lr']}),
        'device': 'cuda',
        'amp': {
            'opt_level': 'O2',
            'keep_batchnorm_fp32': True,
            'loss_scale': "dynamic"
        }
    }
    pprint(params)

    try:
        train_transform = get_transforms(True, CROP_SIZE)
        curated_dataset = FreesoundDataset(
            folds_data, TRAIN_FOLDS,
            transform=train_transform,
            add_prob=random_params['add_prob'])
        noisy_dataset = FreesoundNoisyDataset(noisy_data,
                                              transform=train_transform)
        train_dataset = CombinedDataset(noisy_dataset, curated_dataset,
                                        noisy_prob=random_params['noisy_prob'],
                                        size=DATASET_SIZE)
        val_dataset = FreesoundDataset(folds_data, VAL_FOLDS,
                                       get_transforms(False, CROP_SIZE))
        train_loader = DataLoader(train_dataset,
                                  batch_size=random_params['batch_size'],
                                  shuffle=True, drop_last=True,
                                  num_workers=NUM_WORKERS)
        val_loader = DataLoader(val_dataset,
                                batch_size=random_params['batch_size'] * 2,
                                shuffle=False, num_workers=NUM_WORKERS)

        model = FreesoundModel(params)

        callbacks = [
            MonitorCheckpoint(experiment_dir, monitor='val_lwlrap',
                              max_saves=1),
            ReduceLROnPlateau(monitor='val_lwlrap',
                              patience=random_params['patience'],
                              factor=random_params['factor'],
                              min_lr=1e-8),
            EarlyStopping(monitor='val_lwlrap', patience=20),
            LoggingToFile(experiment_dir / 'log.txt'),
        ]

        with open(experiment_dir / 'random_params.json', 'w') as outfile:
            json.dump(random_params, outfile)

        model.fit(train_loader,
                  val_loader=val_loader,
                  max_epochs=100,
                  callbacks=callbacks,
                  metrics=['multi_accuracy', 'lwlrap'])
    except KeyboardInterrupt as e:
        raise e
    except BaseException as e:
        print(f"Exception '{e}' with random params '{random_params}'")
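# A hypothetical random-search driver for train_experiment above: each call
# gets its own seed and a numbered directory via f'{num:04}'. Loading of
# folds_data and noisy_data is assumed to happen elsewhere in the original
# script and is not shown here.
if __name__ == "__main__":
    for num in range(100):
        train_experiment(folds_data, noisy_data, num)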