# Imports needed by the test functions in this module; in older ignite
# releases the profilers live in ignite.contrib.handlers instead.
import os
import shutil
import time
from unittest.mock import patch

import pytest
from pytest import approx

from ignite.engine import Engine, Events
from ignite.handlers import BasicTimeProfiler, HandlersTimeProfiler


def test_event_handler_get_batch_completed():
    true_event_handler_time = 0.1
    true_max_epochs = 1
    true_num_iters = 2

    profiler = BasicTimeProfiler()
    dummy_trainer = Engine(_do_nothing_update_fn)
    profiler.attach(dummy_trainer)

    @dummy_trainer.on(Events.GET_BATCH_COMPLETED)
    def delay_get_batch_completed(engine):
        time.sleep(true_event_handler_time)

    dummy_trainer.run(range(true_num_iters), max_epochs=true_max_epochs)
    results = profiler.get_results()
    event_results = results["event_handlers_stats"]["GET_BATCH_COMPLETED"]

    assert event_results["min/index"][0] == approx(true_event_handler_time, abs=1e-1)
    assert event_results["max/index"][0] == approx(true_event_handler_time, abs=1e-1)
    assert event_results["mean"] == approx(true_event_handler_time, abs=1e-1)
    assert event_results["std"] == approx(0.0, abs=1e-1)
    assert event_results["total"] == approx(true_max_epochs * true_num_iters * true_event_handler_time, abs=1e-1)
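# The tests in this module reference a ``_do_nothing_update_fn`` helper that
# is not defined in this excerpt. A minimal sketch, assuming it only needs to
# satisfy the Engine update-function signature:
def _do_nothing_update_fn(engine, batch):
    pass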
def test_dataflow_timer_basic_profiler():
    true_dataflow_time_per_ele = 0.1
    true_max_epochs = 1
    true_num_iters = 2

    def dummy_data_loader(data):
        while True:
            for d in data:
                time.sleep(true_dataflow_time_per_ele)
                yield d

    dummy_data = range(true_num_iters)

    profiler = BasicTimeProfiler()
    dummy_trainer = Engine(_do_nothing_update_fn)
    profiler.attach(dummy_trainer)
    dummy_trainer.run(dummy_data_loader(dummy_data), max_epochs=true_max_epochs, epoch_length=true_num_iters)
    results = profiler.get_results()
    dataflow_results = results["dataflow_stats"]

    assert dataflow_results["min/index"][0] == approx(true_dataflow_time_per_ele, abs=1e-1)
    assert dataflow_results["max/index"][0] == approx(true_dataflow_time_per_ele, abs=1e-1)
    assert dataflow_results["mean"] == approx(true_dataflow_time_per_ele, abs=1e-1)
    assert dataflow_results["std"] == approx(0.0, abs=1e-1)
    assert dataflow_results["total"] == approx(true_num_iters * true_dataflow_time_per_ele, abs=1e-1)
def test_event_handler_iteration_started_basic_profiler():
    true_event_handler_time = 0.1
    true_max_epochs = 1
    true_num_iters = 2

    profiler = BasicTimeProfiler()
    dummy_trainer = Engine(_do_nothing_update_fn)
    profiler.attach(dummy_trainer)

    @dummy_trainer.on(Events.ITERATION_STARTED)
    def delay_iter_start(engine):
        time.sleep(true_event_handler_time)

    dummy_trainer.run(range(true_num_iters), max_epochs=true_max_epochs)
    results = profiler.get_results()
    event_results = results["event_handlers_stats"]["ITERATION_STARTED"]

    assert event_results["min/index"][0] == approx(true_event_handler_time, abs=1e-1)
    assert event_results["max/index"][0] == approx(true_event_handler_time, abs=1e-1)
    assert event_results["mean"] == approx(true_event_handler_time, abs=1e-1)
    assert event_results["std"] == approx(0.0, abs=1e-1)
    assert event_results["total"] == approx(true_max_epochs * true_num_iters * true_event_handler_time, abs=1e-1)
def test_event_handler_total_time():
    true_event_handler_time = 0.125
    true_max_epochs = 1
    true_num_iters = 1

    profiler = BasicTimeProfiler()
    dummy_trainer = Engine(_do_nothing_update_fn)
    profiler.attach(dummy_trainer)

    @dummy_trainer.on(Events.STARTED)
    def delay_start(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.COMPLETED)
    def delay_complete(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.EPOCH_STARTED)
    def delay_epoch_start(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.EPOCH_COMPLETED)
    def delay_epoch_complete(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.ITERATION_STARTED)
    def delay_iter_start(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.ITERATION_COMPLETED)
    def delay_iter_complete(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.GET_BATCH_STARTED)
    def delay_get_batch_started(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.GET_BATCH_COMPLETED)
    def delay_get_batch_completed(engine):
        time.sleep(true_event_handler_time)

    dummy_trainer.run(range(true_num_iters), max_epochs=true_max_epochs)
    results = profiler.get_results()
    event_results = results["event_handlers_stats"]

    # eight delayed handlers, each firing exactly once with one epoch and one iteration
    assert event_results["total_time"].item() == approx(true_event_handler_time * 8, abs=1e-1)
def test_event_handler_completed_basic_profiler():
    true_event_handler_time = 0.1
    true_max_epochs = 2
    true_num_iters = 2

    profiler = BasicTimeProfiler()
    dummy_trainer = Engine(_do_nothing_update_fn)
    profiler.attach(dummy_trainer)

    @dummy_trainer.on(Events.COMPLETED)
    def delay_complete(engine):
        time.sleep(true_event_handler_time)

    dummy_trainer.run(range(true_num_iters), max_epochs=true_max_epochs)
    results = profiler.get_results()
    event_results = results["event_handlers_stats"]["COMPLETED"]

    # COMPLETED fires once per run, regardless of epochs/iterations
    assert event_results["total"] == approx(true_event_handler_time, abs=1e-1)
def test_get_intermediate_results_during_run_basic_profiler(capsys):
    true_event_handler_time = 0.0645
    true_max_epochs = 2
    true_num_iters = 5

    profiler = BasicTimeProfiler()
    dummy_trainer = get_prepared_engine_for_basic_profiler(true_event_handler_time)
    profiler.attach(dummy_trainer)

    @dummy_trainer.on(Events.ITERATION_COMPLETED(every=3))
    def log_results(_):
        results = profiler.get_results()
        profiler.print_results(results)
        captured = capsys.readouterr()
        out = captured.out
        assert "BasicTimeProfiler._" not in out
        assert "nan" not in out
        assert " min/index: (0.0, " not in out, out

    dummy_trainer.run(range(true_num_iters), max_epochs=true_max_epochs)
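# ``get_prepared_engine_for_basic_profiler`` is referenced but not defined in
# this excerpt. A plausible sketch, assuming it simply attaches the same fixed
# sleep to every timed event, mirroring the handlers in the tests above:
def get_prepared_engine_for_basic_profiler(true_event_handler_time):
    dummy_trainer = Engine(_do_nothing_update_fn)

    @dummy_trainer.on(Events.STARTED)
    def delay_start(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.COMPLETED)
    def delay_complete(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.EPOCH_STARTED)
    def delay_epoch_start(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.EPOCH_COMPLETED)
    def delay_epoch_complete(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.ITERATION_STARTED)
    def delay_iter_start(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.ITERATION_COMPLETED)
    def delay_iter_complete(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.GET_BATCH_STARTED)
    def delay_get_batch_started(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.GET_BATCH_COMPLETED)
    def delay_get_batch_completed(engine):
        time.sleep(true_event_handler_time)

    return dummy_trainer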
def test_processing_timer_basic_profiler():
    true_processing_time = 0.1
    true_max_epochs = 2
    true_num_iters = 2

    def train_updater(engine, batch):
        time.sleep(true_processing_time)

    profiler = BasicTimeProfiler()
    dummy_trainer = Engine(train_updater)
    profiler.attach(dummy_trainer)
    dummy_trainer.run(range(true_num_iters), max_epochs=true_max_epochs)
    results = profiler.get_results()
    processing_results = results["processing_stats"]

    assert processing_results["min/index"][0] == approx(true_processing_time, abs=1e-1)
    assert processing_results["max/index"][0] == approx(true_processing_time, abs=1e-1)
    assert processing_results["mean"] == approx(true_processing_time, abs=1e-1)
    assert processing_results["std"] == approx(0.0, abs=1e-1)
    assert processing_results["total"] == approx(true_max_epochs * true_num_iters * true_processing_time, abs=1e-1)
def test_event_handler_completed():
    true_event_handler_time = 0.1
    true_max_epochs = 2
    true_num_iters = 2

    profiler = BasicTimeProfiler()
    dummy_trainer = Engine(_do_nothing_update_fn)
    profiler.attach(dummy_trainer)

    @dummy_trainer.on(Events.COMPLETED)
    def delay_complete(engine):
        time.sleep(true_event_handler_time)

    dummy_trainer.run(range(true_num_iters), max_epochs=true_max_epochs)
    results = profiler.get_results()
    # this variant keys event stats as "Events_<NAME>"
    event_results = results["event_handlers_stats"]["Events_COMPLETED"]

    assert event_results["min/index"][0] == approx(true_event_handler_time, abs=1e-1)
    assert event_results["max/index"][0] == approx(true_event_handler_time, abs=1e-1)
    assert event_results["mean"] == approx(true_event_handler_time, abs=1e-1)
def test_processing_timer():
    true_processing_time = 0.1
    true_max_epochs = 2
    true_num_iters = 2

    def train_updater(engine, batch):
        time.sleep(true_processing_time)

    profiler = BasicTimeProfiler()
    dummy_trainer = Engine(train_updater)
    profiler.attach(dummy_trainer)
    dummy_trainer.run(range(true_num_iters), max_epochs=true_max_epochs)
    results = profiler.get_results()
    processing_results = results["processing_stats"]

    assert processing_results["min/index"][0] == approx(true_processing_time, abs=1e-1)
    assert processing_results["max/index"][0] == approx(true_processing_time, abs=1e-1)
    assert processing_results["mean"] == approx(true_processing_time, abs=1e-1)
    assert processing_results["std"] == approx(0.0, abs=1e-1)
    assert processing_results["total"] == approx(true_max_epochs * true_num_iters * true_processing_time, abs=1e-1)
def test_write_results():
    true_event_handler_time = 0.125
    true_max_epochs = 3
    true_num_iters = 2
    test_folder = "./test_log_folder"

    if os.path.exists(test_folder):
        shutil.rmtree(test_folder)
    os.makedirs(test_folder)

    profiler = BasicTimeProfiler()
    dummy_trainer = Engine(_do_nothing_update_fn)
    profiler.attach(dummy_trainer)

    @dummy_trainer.on(Events.STARTED)
    def delay_start(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.COMPLETED)
    def delay_complete(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.EPOCH_STARTED)
    def delay_epoch_start(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.EPOCH_COMPLETED)
    def delay_epoch_complete(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.ITERATION_STARTED)
    def delay_iter_start(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.ITERATION_COMPLETED)
    def delay_iter_complete(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.GET_BATCH_STARTED)
    def delay_get_batch_started(engine):
        time.sleep(true_event_handler_time)

    @dummy_trainer.on(Events.GET_BATCH_COMPLETED)
    def delay_get_batch_completed(engine):
        time.sleep(true_event_handler_time)

    dummy_trainer.run(range(true_num_iters), max_epochs=true_max_epochs)
    profiler.write_results(test_folder + "/test_log.csv")

    assert os.path.isfile(test_folder + "/test_log.csv")

    file_length = 0
    with open(test_folder + "/test_log.csv") as f:
        for _ in f:
            file_length += 1

    # one row per iteration plus the header row
    assert file_length == (true_max_epochs * true_num_iters) + 1

    # cleanup test log directory
    shutil.rmtree(test_folder)
def test_event_handler_get_batch_started():
    true_event_handler_time = 0.1
    true_max_epochs = 1
    true_num_iters = 2

    profiler = BasicTimeProfiler()
    dummy_trainer = Engine(_do_nothing_update_fn)
    profiler.attach(dummy_trainer)

    @dummy_trainer.on(Events.GET_BATCH_STARTED)
    def delay_get_batch_started(engine):
        time.sleep(true_event_handler_time)

    dummy_trainer.run(range(true_num_iters), max_epochs=true_max_epochs)
    results = profiler.get_results()
    # this variant keys event stats as "Events_<NAME>"
    event_results = results["event_handlers_stats"]["Events_GET_BATCH_STARTED"]

    assert event_results["min/index"][0] == approx(true_event_handler_time, abs=1e-1)
    assert event_results["max/index"][0] == approx(true_event_handler_time, abs=1e-1)
    assert event_results["mean"] == approx(true_event_handler_time, abs=1e-1)
    assert event_results["std"] == approx(0.0, abs=1e-1)
    assert event_results["total"] == approx(true_max_epochs * true_num_iters * true_event_handler_time, abs=1e-1)
def test_print_results_basic_profiler(capsys):
    true_max_epochs = 1
    true_num_iters = 5

    profiler = BasicTimeProfiler()
    dummy_trainer = get_prepared_engine_for_basic_profiler(true_event_handler_time=0.0125)
    profiler.attach(dummy_trainer)
    dummy_trainer.run(range(true_num_iters), max_epochs=true_max_epochs)
    BasicTimeProfiler.print_results(profiler.get_results())

    captured = capsys.readouterr()
    out = captured.out
    assert "BasicTimeProfiler._" not in out
    assert "nan" not in out
def test_print_results(capsys):
    true_event_handler_time = 0.0
    true_max_epochs = 1
    true_num_iters = 1

    profiler = BasicTimeProfiler()
    dummy_trainer = Engine(_do_nothing_update_fn)
    profiler.attach(dummy_trainer)

    @dummy_trainer.on(Events.GET_BATCH_COMPLETED)
    def delay_get_batch_completed(engine):
        time.sleep(true_event_handler_time)

    dummy_trainer.run(range(true_num_iters), max_epochs=true_max_epochs)
    results = profiler.get_results()
    BasicTimeProfiler.print_results(results)

    captured = capsys.readouterr()
    out = captured.out
    assert out[0] == "\n"
def test_write_results_basic_profiler(dirname):
    true_event_handler_time = 0.125
    true_max_epochs = 3
    true_num_iters = 2

    profiler = BasicTimeProfiler()
    dummy_trainer = get_prepared_engine_for_basic_profiler(true_event_handler_time)
    profiler.attach(dummy_trainer)
    dummy_trainer.run(range(true_num_iters), max_epochs=true_max_epochs)

    fp = os.path.join(dirname, "test_log.csv")
    profiler.write_results(fp)

    assert os.path.isfile(fp)

    file_length = 0
    with open(fp) as f:
        for _ in f:
            file_length += 1

    # one row per iteration plus the header row
    assert file_length == (true_max_epochs * true_num_iters) + 1
def test_profilers_wrong_inputs():
    profiler = BasicTimeProfiler()

    with pytest.raises(TypeError, match=r"Argument engine should be ignite.engine.Engine"):
        profiler.attach(None)

    with pytest.raises(RuntimeError, match=r"Need pandas to write results as files"):
        with patch.dict("sys.modules", {"pandas": None}):
            profiler.write_results("")

    profiler = HandlersTimeProfiler()

    with pytest.raises(TypeError, match=r"Argument engine should be ignite.engine.Engine"):
        profiler.attach(None)

    with pytest.raises(RuntimeError, match=r"Need pandas to write results as files"):
        with patch.dict("sys.modules", {"pandas": None}):
            profiler.write_results("")
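# For contrast with the failure modes tested above, a minimal happy-path
# sketch of HandlersTimeProfiler (per-handler timings rather than the
# per-event aggregates of BasicTimeProfiler). This is not part of the
# original suite; write_results additionally requires pandas.
def handlers_time_profiler_usage_sketch(tmp_path):
    profiler = HandlersTimeProfiler()
    dummy_trainer = Engine(_do_nothing_update_fn)
    profiler.attach(dummy_trainer)

    @dummy_trainer.on(Events.ITERATION_COMPLETED)
    def delay_iter_complete(engine):
        time.sleep(0.01)

    dummy_trainer.run(range(2), max_epochs=1)
    results = profiler.get_results()  # list of per-handler stat rows
    HandlersTimeProfiler.print_results(results)
    profiler.write_results(str(tmp_path / "handlers_time.csv"))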
# Imports assumed by this training-script excerpt. Project-local helpers
# (accuracy_segmentation, print_model_summary, _metrics, _metrics_mean,
# _evaluate, _log_metrics, _save_metrics, create_lr_scheduler) are defined
# elsewhere in the project and are not reproduced here.
from collections import defaultdict

import torch as th
import torch.nn as nn

from ignite.contrib.handlers import ProgressBar
from ignite.engine import Engine, Events
from ignite.handlers import BasicTimeProfiler, Checkpoint, DiskSaver, EarlyStopping, TerminateOnNan
from ignite.utils import setup_logger


def create_trainer(loader, model, opt, loss_fn, device, args):
    def _update(engine, batch):
        model.train()
        x = batch['x'].to(engine.state.device, non_blocking=True)
        y = batch['y'].to(engine.state.device, non_blocking=True)
        m = batch['m'].to(engine.state.device, non_blocking=True)
        opt.zero_grad()
        y_pred = model(x)
        softmax = nn.Softmax(dim=1)  # dim made explicit, matching _inference
        masked_loss = softmax(y_pred)
        # masked_loss = y_pred * m
        loss = loss_fn(masked_loss, y)
        # only backprop when the mask covers more than 70% of the elements
        if m.sum().item() / m.numel() > 0.7:
            loss.backward()
            opt.step()
        masked_loss = (masked_loss > 0.5).float()
        acc = accuracy_segmentation(masked_loss[:, 1, :, :, :], y[:, 1, :, :, :])
        return {
            'x': x.detach(),
            'y': y.detach(),
            'm': m.detach(),
            'y_pred': y_pred.detach(),
            'loss': loss.item(),
            'acc': acc,
        }

    def _inference(engine, batch):
        model.eval()
        with th.no_grad():
            x = batch['x'].to(engine.state.device, non_blocking=True)
            y = batch['y'].to(engine.state.device, non_blocking=True)
            m = batch['m'].to(engine.state.device, non_blocking=True)
            y_pred = model(x)
            softmax = nn.Softmax(dim=1)
            masked_loss = softmax(y_pred)
            # masked_loss = y_pred * m
            loss = loss_fn(masked_loss, y)
            masked_loss = (masked_loss[-3:] > 0.5).float()
            acc = accuracy_segmentation(masked_loss[:, 1, :, :, :], y[:, 1, :, :, :])
        return {
            'x': x.detach(),
            'y': y.detach(),
            'm': m.detach(),
            'y_pred': y_pred.detach(),
            'loss': loss.item(),
            'acc': acc,
        }

    # wandb.watch(model, log='all')
    trainer = Engine(_update)
    evaluator = Engine(_inference)

    profiler = BasicTimeProfiler()
    profiler.attach(trainer)

    logdir = args.logdir
    save_ = (not args.devrun) and (not args.nosave)

    # initialize trainer state
    trainer.state.device = device
    trainer.state.hparams = args
    trainer.state.save = save_
    trainer.state.logdir = logdir
    trainer.state.df = defaultdict(dict)
    trainer.state.metrics = dict()
    trainer.state.val_metrics = dict()
    trainer.state.best_metrics = defaultdict(list)
    trainer.state.gradnorm = defaultdict(dict)

    # initialize evaluator state
    evaluator.logger = setup_logger('evaluator')
    evaluator.state.device = device
    evaluator.state.df = defaultdict(dict)
    evaluator.state.metrics = dict()

    # progress bars (one attach per engine, showing both metrics)
    pbar = ProgressBar(persist=True)
    ebar = ProgressBar(persist=False)
    pbar.attach(trainer, ['loss', 'acc'])
    ebar.attach(evaluator, ['loss', 'acc'])

    # model summary
    if args.model_summary:
        trainer.add_event_handler(Events.STARTED, print_model_summary, model)

    # terminate on nan
    trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan(lambda x: x['loss']))

    # metrics
    trainer.add_event_handler(Events.ITERATION_COMPLETED, _metrics)
    evaluator.add_event_handler(Events.ITERATION_COMPLETED, _metrics)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, _metrics_mean)
    evaluator.add_event_handler(Events.COMPLETED, _metrics_mean)
    trainer.add_event_handler(
        # Events.STARTED | Events.EPOCH_COMPLETED,
        Events.EPOCH_COMPLETED,
        _evaluate,
        evaluator,
        loader,
    )

    # logging
    trainer.add_event_handler(Events.EPOCH_COMPLETED, _log_metrics)

    # early stopping
    if args.early_stopping > 0:
        es_p = args.early_stopping
        es_s = lambda engine: -engine.state.metrics['loss']
        evaluator.add_event_handler(
            Events.COMPLETED,
            EarlyStopping(patience=es_p, score_function=es_s, trainer=trainer),
        )

    # lr schedulers
    if args.epoch_length is None:
        el = len(loader['train'])
    else:
        el = args.epoch_length
    if args.lr_scheduler is not None:
        lr_sched = create_lr_scheduler(opt, args, num_steps=el)
        if args.lr_scheduler != 'plateau':
            def _sched_fun(engine):
                lr_sched.step()
        else:
            def _sched_fun(engine):
                e = engine.state.epoch
                v = engine.state.val_metrics[e]['nmse']
                lr_sched.step(v)
        if args.lr_scheduler == 'linearcycle':
            trainer.add_event_handler(Events.ITERATION_STARTED, lr_sched)
        else:
            trainer.add_event_handler(Events.EPOCH_COMPLETED, _sched_fun)

    # FIXME: warmup is modifying opt base_lr -> must create last
    if args.lr_warmup > 0:
        wsched = create_lr_scheduler(opt, args, 'warmup', num_steps=el)
        wsts = wsched.total_steps
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED(event_filter=lambda _, i: i <= wsts),
            lambda _: wsched.step(),
        )

    # saving
    if save_:
        to_save = {'model': model, 'optimizer': opt, 'trainer': trainer, 'evaluator': evaluator}
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED,
            Checkpoint(to_save, DiskSaver(logdir), n_saved=3),
        )
        # handler = Checkpoint(
        #     {'model': model},
        #     DiskSaver(logdir),
        #     n_saved=3,
        #     filename_prefix='best',
        #     score_function=lambda engine: -engine.state.metrics['nmae'],
        #     score_name='val_nmae',
        # )
        # evaluator.add_event_handler(Events.COMPLETED, handler)
        # handler = Checkpoint(
        #     {'model': model},
        #     DiskSaver(logdir),
        #     n_saved=3,
        #     filename_prefix='best',
        #     score_function=lambda engine: -engine.state.metrics['nmse'],
        #     score_name='val_nmse',
        # )
        # evaluator.add_event_handler(Events.COMPLETED, handler)
        # handler = Checkpoint(
        #     {'model': model},
        #     DiskSaver(logdir),
        #     n_saved=3,
        #     filename_prefix='best',
        #     score_function=lambda engine: engine.state.metrics['R2'],
        #     score_name='val_R2',
        # )
        # evaluator.add_event_handler(Events.COMPLETED, handler)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, _save_metrics)

    # timer
    trainer.add_event_handler(
        Events.COMPLETED | Events.TERMINATE,
        lambda _: profiler.write_results(logdir + '/time.csv'),
    )

    return trainer
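# Hypothetical wiring sketch for create_trainer. build_model and make_loaders
# are placeholder names (not project API), the loss choice is an assumption,
# and the project-local handler helpers referenced above must exist for the
# run to succeed.
def main(args):
    device = th.device('cuda' if th.cuda.is_available() else 'cpu')
    loader = make_loaders(args)            # hypothetical: dict with 'train'/'val' loaders
    model = build_model(args).to(device)   # hypothetical model factory
    opt = th.optim.Adam(model.parameters(), lr=args.lr)
    loss_fn = nn.BCELoss()                 # assumption: loss over softmax probabilities
    trainer = create_trainer(loader, model, opt, loss_fn, device, args)
    trainer.run(loader['train'], max_epochs=args.epochs, epoch_length=args.epoch_length)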