def task_deterministic_2(epoch=5):
    """Check that training in 2 steps is the same as training in one step"""
    device = fetch_device()
    state_folder = '/tmp/olympus/tests'
    file_name = f'{state_folder}/93c88038692bf4baf715ca3806d8a46347a646552f08ede113ef68efae6f1579.state'
    state_storage = StateStorage(folder=state_folder)

    # Run all `epoch * 2` epochs in one step
    metrics1 = run_no_interrupts(epoch * 2, params, device, state_storage)
    remove(file_name)

    # Run the first `epoch` epochs and checkpoint
    _ = run_no_interrupts(epoch, params, device, state_storage)
    assert os.path.exists(file_name)

    # Run `epoch * 2` epochs, resuming from the checkpoint above
    metrics2 = run_no_interrupts(epoch * 2, params, device, state_storage)
    remove(file_name)

    for k in keys:
        diff = abs(metrics1[k] - metrics2[k])
        print(f'{k:>30} => {diff}')
        assert diff < 1e-4
def test_model_init(seed):
    params = {
        'optimizer': {
            'lr': 0.011113680070144951,
            'momentum': 0.04081791544572477,
            'weight_decay': 6.2091793568732874e-06
        },
        'model': {
            'initializer': {
                'gain': 1.0
            }
        }
    }
    device = fetch_device()

    model2 = classification_baseline(
        'logreg', 'glorot_uniform', 'sgd', 'none', 'test-mnist', 32,
        device, init_seed=seed, storage=NoStorage())

    model1 = classification_baseline(
        'logreg', 'glorot_uniform', 'sgd', 'none', 'test-mnist', 32,
        device, init_seed=seed, storage=NoStorage())

    model1.init(**params)
    model2.init(**params)

    # Two models built with the same init seed must have identical weights
    for p1, p2 in zip(model1.parameters(), model2.parameters()):
        diff = (p1 - p2).abs().sum()
        assert diff == 0
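
# A tiny driver for the check above -- a sketch only; the seed values below
# are arbitrary examples, not values prescribed by the test suite.
if __name__ == '__main__':
    for seed in (0, 1, 42):
        test_model_init(seed)
        print(f'test_model_init passed for seed={seed}')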
def main_resume(epoch, batch_freq=0):
    global interruption_counter
    interruption_counter = 0

    state_folder = '/tmp/olympus/tests'
    device = fetch_device()

    stdout = sys.stdout
    # sys.stdout = open(os.devnull, "w")

    metrics1 = run_no_interrupts(epoch, params, device)
    metrics2 = run_with_interrupts(epoch, batch_freq, state_folder, params, device)

    # sys.stdout.close()
    sys.stdout = stdout

    print(f'epoch = {epoch}')
    print(f'interrupted (epoch) = {interruption_counter}')
    print(f'interrupted (batch) = {interruption_counter_batch}')
    print(f'{"key":>30} | {"NoInterrupt":>12} | {"Interrupted":>12}')

    for k, v in metrics1.items():
        print(f'{k:>30} | {v:12.4f} | {metrics2.get(k, float("NaN")):12.4f}')

    for k in keys:
        diff = abs(metrics1[k] - metrics2[k])
        print(f'{k} => {diff}')
        assert diff < 1e-4, f'diff for {k} should be lower but it is {diff}'
def create_trained_trial(epochs=5):
    """Create a Task that was trained from scratch without interruption"""
    device = fetch_device()

    task = make_base_task(device, NoStorage())
    task.init(**params)
    task.fit(epochs=epochs)
    return task
def main(**kwargs):
    show_dict(kwargs)

    args = Namespace(**kwargs)
    set_verbose_level(args.verbose)

    device = fetch_device()
    experiment_name = args.experiment_name.format(**kwargs)

    # Save partial results here
    state_storage = StateStorage(
        folder=option('state.storage', '/tmp/olympus/classification'))

    def main_task():
        task = classification_baseline(device=device, storage=state_storage, **kwargs)

        if args.uri is not None:
            logger = metric_logger(args.uri, args.database, experiment_name)
            task.metrics.append(logger)

        return task

    space = main_task().get_space()

    # If the space is not empty, search for the best hyperparameters
    params = {}
    if space:
        show_dict(space)
        hpo = HPOptimizer('hyperband',
                          space=space,
                          fidelity=Fidelity(args.min_epochs, args.epochs).to_dict())

        hpo_task = HPO(hpo, main_task)
        hpo_task.metrics.append(ElapsedRealTime())

        trial = hpo_task.fit(objective='validation_accuracy')
        print(f'HPO is done, objective: {trial.objective}')
        params = trial.params
    else:
        print('No hyperparameter is missing, running the experiment...')

    # -------------------------------------------------
    # Run the experiment with the best hyperparameters
    # -------------------------------------------------
    if params is not None:
        # Train using train + valid for the final result
        final_task = classification_baseline(device=device, **kwargs, hpo_done=True)
        final_task.init(**params)
        final_task.fit(epochs=args.epochs)

        print('=' * 40)
        print('Final Trial Results')
        show_dict(flatten(params))
        final_task.report(pprint=True, print_fun=print)
        print('=' * 40)
def task_deterministic(epoch=5):
    """Check that two runs from scratch produce identical metrics"""
    device = fetch_device()
    state_folder = '/tmp/olympus/tests'
    file_name = f'{state_folder}/93c88038692bf4baf715ca3806d8a46347a646552f08ede113ef68efae6f1579.state'

    metrics1 = run_no_interrupts(epoch, params, device)
    remove(file_name)

    metrics2 = run_no_interrupts(epoch, params, device)
    remove(file_name)

    print(metrics1)
    for k in keys:
        diff = abs(metrics1[k] - metrics2[k])
        print(f'{k:>30} => {diff}')
        assert diff < 1e-4
def create_resumed_trained_trial(epochs=5):
    """Create a Task that was trained, stopped, and resumed"""
    device = fetch_device()

    # Save the trained task's state
    old_task = create_trained_trial(epochs)
    checkpointer = old_task.metrics.get('CheckPointer')
    uid = checkpointer.uid

    state_storage = StateStorage(folder='/tmp/olympus/tests')
    checkpointer.storage = state_storage
    checkpointer.save(old_task)
    # Done

    # Build a fresh task; init resumes automatically from the checkpoint
    new_task = make_base_task(device, state_storage)
    new_task.init(uid=uid, **params)
    assert new_task.resumed()
    return new_task
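
# A sketch of how the two helpers above can be combined into an end-to-end
# check that resuming restores the trained weights exactly. It assumes
# deterministic training (verified by `task_deterministic` elsewhere in this
# suite) and that the Task exposes its model as `task.model` with standard
# torch `parameters()`; adjust the attribute name if it differs.
def check_resume_restores_weights(epochs=5):
    trained = create_trained_trial(epochs)
    resumed = create_resumed_trained_trial(epochs)

    for p1, p2 in zip(trained.model.parameters(), resumed.model.parameters()):
        assert (p1 - p2).abs().sum() == 0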
import torch.nn.functional as F

from olympus.datasets import Dataset, SplitDataset, DataLoader
from olympus.optimizers import Optimizer, LRSchedule
from olympus.models import Model
from olympus.observers import ObserverList, ProgressView, Speed
from olympus.utils import fetch_device, option

epochs = 2
device = fetch_device()
base = option('base_path', '/tmp/olympus')

# Model
model = Model('resnet18', input_size=(1, 28, 28), output_size=(10,))

# Optimizer
optimizer = Optimizer('sgd',
                      params=model.parameters(),
                      weight_decay=0.001,
                      lr=1e-5,
                      momentum=1e-5)

# Schedule
lr_schedule = LRSchedule('exponential', optimizer=optimizer, gamma=0.99)

# Dataset
data = Dataset('fake_mnist', path=f'{base}/data')
splits = SplitDataset(data, split_method='original')

# Dataloader
loader = DataLoader(splits, sampler_seed=1, batch_size=32)
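
# A minimal training loop completing the setup above -- a sketch, assuming the
# Olympus wrappers mirror the torch API (`.to(device)` on the model,
# `zero_grad`/`step` on the optimizer, `step` on the schedule) and that
# `loader.train()` yields (input, target) batches for the training split;
# these are assumptions, not behaviour documented in this file.
model = model.to(device)
for epoch in range(epochs):
    for batch, target in loader.train():
        optimizer.zero_grad()
        loss = F.cross_entropy(model(batch.to(device)), target.to(device))
        loss.backward()
        optimizer.step()
    lr_schedule.step()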
def main(bootstrapping_seed=1,
         sampler_seed=1,
         init_seed=1,
         batch_size=16,
         learning_rate=0.001,
         momentum=0.9,
         weight_decay=1e-4,
         epoch=240,
         half=False,
         hpo_done=False,
         uid=None,
         experiment_name=None,
         client=None,
         clean_on_exit=True,
         _interrupt=0):
    base_folder = option('state.storage', '/tmp')
    storage = StateStorage(folder=base_folder)

    split_method = {
        'split_method': 'bootstrap',
        'ratio': 0.25,  # This means 50% training, 25% valid, 25% test
        'seed': bootstrapping_seed,
        'balanced': False
    }

    task = segmentation_baseline('fcn_resnet18',
                                 'self_init',
                                 'SGD',
                                 dataset='voc-segmentation',
                                 batch_size=batch_size,
                                 device=fetch_device(),
                                 split_method=split_method,
                                 sampler_seed=sampler_seed,
                                 init_seed=init_seed,
                                 storage=storage,
                                 half=half,
                                 hpo_done=hpo_done,
                                 verbose=False,
                                 validate=True)

    hyperparameters = {
        'model': {
            'initializer': {
                'gain': 1.0
            }
        },
        'optimizer': {
            'lr': learning_rate,
            'momentum': momentum,
            'weight_decay': weight_decay
        }
    }
    show_dict(hyperparameters)

    if client is not None:
        task.metrics.append(
            metric_logger(client=client, experiment=experiment_name))

    if _interrupt:
        from studies import InterruptingMetric
        # Will raise an interrupt every `_interrupt` epochs
        task.metrics.append(InterruptingMetric(frequency_epoch=_interrupt))
        storage.time_buffer = 0

    task.init(uid=uid, **hyperparameters)
    task.fit(epochs=epoch)

    # Remove the checkpoint
    if clean_on_exit:
        file_path = storage._file(uid)
        try:
            os.remove(file_path)
            print('Removed checkpoint at', file_path)
        except FileNotFoundError:
            print('No checkpoint at', file_path)

    show_dict(task.metrics.value())
    return float(task.metrics.value()['validation_mean_jaccard_distance'])
def main(task='rte',
         bootstrapping_seed=1,
         sampler_seed=1,
         init_seed=1,
         global_seed=1,
         learning_rate=0.00002,
         beta1=0.9,
         beta2=0.999,
         weight_decay=0.0,
         attention_probs_dropout_prob=0.1,
         hidden_dropout_prob=0.1,
         batch_size=32,
         weight_init='normal',
         warmup=0,
         ratio=0.1,
         init_std=0.2,
         epoch=3,
         half=False,
         hpo_done=False,
         uid=None,
         experiment_name=None,
         client=None,
         clean_on_exit=True,
         _interrupt=0):
    print('seeds: init {} / global {} / sampler {} / bootstrapping {}'.format(
        init_seed, global_seed, sampler_seed, bootstrapping_seed))

    base_folder = option('state.storage', '/tmp/storage')
    storage = StateStorage(folder=base_folder)

    split_method = {
        'split_method': 'bootstrap',
        'ratio': ratio,
        'seed': bootstrapping_seed,
        'balanced': False
    }

    task = classification_baseline("bert-{}".format(task),
                                   'normal',
                                   'adam',
                                   schedule='warmup',
                                   dataset="glue-{}".format(task),
                                   split_method=split_method,
                                   sampler_seed=sampler_seed,
                                   init_seed=init_seed,
                                   batch_size=batch_size,
                                   device=fetch_device(),
                                   storage=storage,
                                   half=half,
                                   hpo_done=hpo_done,
                                   verbose=False,
                                   validate=True)

    hyperparameters = dict(
        model={
            'initializer': {
                'mean': 0.0,
                'std': init_std
            },
            'attention_probs_dropout_prob': attention_probs_dropout_prob,
            'hidden_dropout_prob': hidden_dropout_prob
        },
        optimizer={
            'lr': learning_rate,
            'beta1': beta1,
            'beta2': beta2,
            'weight_decay': weight_decay
        },
        lr_schedule={
            'warmup_steps': warmup,
            'max_steps': epoch * len(task.dataloader),
            'iterations': 'step'
        })
    show_dict(hyperparameters)

    if client is not None:
        task.metrics.append(
            metric_logger(client=client, experiment=experiment_name))

    if _interrupt:
        from studies import InterruptingMetric
        # Will raise an interrupt every `_interrupt` epochs
        task.metrics.append(InterruptingMetric(frequency_epoch=_interrupt))

    task.init(uid=uid, **hyperparameters)

    # NOTE: Seed globally once all special inits are done.
    set_seeds(global_seed)

    task.fit(epochs=epoch)

    # Remove the checkpoint
    if clean_on_exit:
        file_path = storage._file(uid)
        try:
            os.remove(file_path)
            print('Removed checkpoint at', file_path)
        except FileNotFoundError:
            print('No checkpoint at', file_path)

    return task.metrics.value().get('validation_error_rate', None)
def main():
    from sspace.space import compute_identity

    args = arguments()

    tickers = [
        #  1      2      3      4       5      6      7      8      9      10
        'MO',  'AEP', 'BA',  'BMY',  'CPB', 'CAT', 'CVX', 'KO',  'CL',  'COP',   # 1
        'ED',  'CVS', 'DHI', 'DHR',  'DRI', 'DE',  'D',   'DTE', 'ETN', 'EBAY',  # 2
        'F',   'BEN', 'HSY', 'HBAN', 'IBM', 'K',   'GIS', 'MSI', 'NSC', 'TXN'    # 3
    ]

    start, end = '2000-01-01', '2019-05-10'

    device = fetch_device()

    task = finance_baseline(tickers, start, end, args.optimizer,
                            args.batch_size, device, args.window)

    lr = 1e-8
    uid = compute_identity(
        dict(tickers=tickers,
             start=start,
             end=end,
             window=args.window,
             lr=lr,
             epochs=args.epochs), 16)

    if args.uri is not None:
        logger = metric_logger(args.uri, args.database,
                               f'{DEFAULT_EXP_NAME}_{uid}')
        task.metrics.append(logger)

    if args.storage is not None:
        storage = StateStorage(
            folder=option('state.storage', '/home/setepenre/zshare/tmp'))
        task.metrics.append(
            CheckPointer(storage=storage,
                         time_buffer=5,
                         keep_best='validation_loss',
                         save_init=True))

    optimizer = task.optimizer.defaults
    optimizer['lr'] = lr

    task.init(optimizer=optimizer, uid=uid)
    task.fit(args.epochs)

    stats = task.metrics.value()
    print(stats)
    return float(stats['validation_loss'])
def main(bootstrapping_seed=1,
         sampler_seed=1,
         transform_seed=1,
         init_seed=1,
         learning_rate=0.1,
         momentum=0.9,
         weight_decay=5e-4,
         gamma=0.99,
         weight_init='glorot_uniform',
         epoch=120,
         half=False,
         hpo_done=False,
         uid=None,
         experiment_name=None,
         client=None,
         clean_on_exit=True,
         _interrupt=0):
    base_folder = option('state.storage', '/tmp')
    storage = StateStorage(folder=base_folder, time_buffer=5 * 60)
    print(base_folder)

    sampling_method = {
        'split_method': 'bootstrap',
        'ratio': 0.1666,
        'seed': bootstrapping_seed,
        'balanced': True
    }

    batch_size = 128

    task = classification_baseline('vgg11',
                                   'glorot_uniform',
                                   'sgd',
                                   schedule='exponential',
                                   dataset='cifar10',
                                   batch_size=batch_size,
                                   device=fetch_device(),
                                   data_augment=True,
                                   split_method=sampling_method,
                                   sampler_seed=sampler_seed,
                                   transform_seed=transform_seed,
                                   init_seed=init_seed,
                                   storage=storage,
                                   half=half,
                                   hpo_done=hpo_done,
                                   verbose=False,
                                   validate=True)

    hyperparameters = dict(model={'initializer': {'gain': 1.0}},
                           optimizer=dict(lr=learning_rate,
                                          momentum=momentum,
                                          weight_decay=weight_decay),
                           lr_schedule=dict(gamma=gamma))
    show_dict(hyperparameters)

    if client is not None:
        task.metrics.append(
            metric_logger(client=client, experiment=experiment_name))

    if _interrupt:
        from studies import InterruptingMetric
        # Will raise an interrupt every `_interrupt` epochs
        task.metrics.append(InterruptingMetric(frequency_epoch=_interrupt))
        storage.time_buffer = 0

    task.init(uid=uid, **hyperparameters)
    task.fit(epochs=epoch)

    # Remove the checkpoint
    if clean_on_exit:
        file_path = storage._file(uid)
        try:
            os.remove(file_path)
            print('Removed checkpoint at', file_path)
        except FileNotFoundError:
            print('No checkpoint at', file_path)

    show_dict(task.metrics.value())
    return float(task.metrics.value()['validation_error_rate'])