def generate(seeds, variables, defaults, add_reference=True):
    configs = dict()
    for variable in variables:
        configs[variable] = []
        for seed in seeds:
            kwargs = copy.copy(defaults)
            kwargs[variable] = int(seed)
            # We want duplicates across the study's different sources of
            # variation, so each config gets a uid that depends on the
            # variable name.
            kwargs['_variable'] = variable
            uid = compute_identity(kwargs, IDENTITY_SIZE)
            kwargs.pop('_variable')
            kwargs['uid'] = uid
            configs[variable].append(kwargs)

    if add_reference:
        variable = 'reference'
        configs[variable] = []
        for i in range(len(seeds)):
            kwargs = copy.copy(defaults)
            kwargs['_repetition'] = i
            kwargs['_variable'] = variable
            uid = compute_identity(kwargs, IDENTITY_SIZE)
            kwargs.pop('_repetition')
            kwargs.pop('_variable')
            kwargs['uid'] = uid
            configs[variable].append(kwargs)

    return configs

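# Usage sketch for generate() (hypothetical argument values; compute_identity
# and IDENTITY_SIZE come from this module's imports):
#
#   configs = generate(
#       seeds=[1, 2, 3],
#       variables=['init_seed', 'sampler_seed'],
#       defaults={'lr': 0.01, 'batch_size': 32},
#   )
#   # configs maps each variable name (plus 'reference') to a list of
#   # kwargs dicts, one per seed, each carrying a deterministic 'uid'.
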
def generate_simulated_fix(data, config, variables, objective, hpo_budget,
                           num_replicates, early_stopping=True):
    # NOTE: The uid of each trial must be updated after sampling new HPs.
    # The first seed is reserved for fitting the surrogate model; the
    # remaining ones seed the simulated HPO runs.
    seeds = numpy.random.RandomState(config['seed']).randint(
        2**30, size=num_replicates + 1)
    space = config['space']
    X, y = convert_data_to_xy(data, space, objective, early_stopping)
    X, y = cutoff(X, y, percentile=0.85)
    model = fit_model(X, y, space, seed=seeds[0])

    configs = []
    for seed in seeds[1:]:
        params = simulate_hpo(model, space, hpo_budget, seed)
        replicate = copy.deepcopy(config['defaults'])
        replicate.update(params)
        # Drop the stale uid before computing the identity of the new HPs.
        replicate.pop('uid', None)
        replicate['uid'] = compute_identity(replicate, IDENTITY_SIZE)
        configs.append(replicate)

    return configs

def test_doc(name, i):
    a_doc = copy.copy(defaults)
    a_doc[name] = i
    a_doc['_variable'] = name
    a_doc['uid'] = compute_identity(a_doc, 16)
    a_doc.pop('_variable')
    return a_doc

def main(study='lin_reg'):
    batch_sizes = [32, 64, 128, 256]
    lrs = [0.005, 0.1, 0.05, 0.01]
    seeds = [0, 1, 2, 3, 4, 5]
    epochs = [30]

    def arguments():
        for batch_size in batch_sizes:
            for lr in lrs:
                for epoch in epochs:
                    for seed in seeds:
                        yield dict(epochs=epoch, batch_size=batch_size,
                                   lr=lr, seed=seed)

    with HPOWorkGroup('mongo://127.0.0.1:27017', 'olympus', None) as group:
        group.launch_workers(2)
        group.clear_queue()
        group.client.monitor().clear('OLYMETRIC', group.experiment)
        for kwargs in arguments():
            bs = kwargs['batch_size']
            lr = kwargs['lr']
            for j in range(10):
                # The uid depends on the repetition index 'rs'; drop any
                # stale uid first so identities do not chain across
                # repetitions.
                kwargs.pop('uid', None)
                kwargs['rs'] = j
                kwargs['uid'] = compute_identity(kwargs, 16)
                kwargs.pop('rs')
                group.queue_work(tiny_task, namespace=f'{study}-{bs}-{lr}',
                                 **kwargs)
        group.wait()

def test_generate_bayesopt():
    defaults = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    budget = 200
    fidelity = 'fidelity(1, 10)'
    num_experiments = 10
    search_space = {
        'a': 'uniform(-1, 1)',
        'b': 'uniform(-1, 1)',
        'c': 'uniform(-1, 1)',
        'lr': 'uniform(-1, 1)'
    }
    configs = generate_bayesopt(budget, fidelity, search_space,
                                range(num_experiments))
    assert len(configs) == num_experiments
    for i in range(num_experiments):
        rng = numpy.random.RandomState(i)
        assert configs[i]['name'] == 'robo'
        assert configs[i]['space'] == search_space
        assert configs[i]['n_init'] == 20
        assert configs[i]['count'] == budget
        assert configs[i]['model_seed'] == rng.randint(2**30)
        assert configs[i]['prior_seed'] == rng.randint(2**30)
        assert configs[i]['init_seed'] == rng.randint(2**30)
        assert configs[i]['maximizer_seed'] == rng.randint(2**30)
        assert configs[i]['namespace'] == f'bayesopt-s-{i}'
        assert configs[i].pop('uid') == compute_identity(configs[i], 16)

def fetch_registered(client, namespace, hpo, seed):
    registered = set()
    for message in client.monitor().messages(
            WORK_QUEUE, env(namespace, hpo, seed), mtype=HPO_ITEM):
        registered.add(
            compute_identity(message.message['kwargs'], IDENTITY_SIZE))
    return registered

def generate_nudged_grid_search(budget, fidelity, search_space, seeds):
    configs = generate_grid_search(budget, fidelity, search_space, [])
    config = configs[0]
    config['namespace'] = f'grid-search-nudged-p-{config["n_points"]}'
    config['nudge'] = 0.5
    # Drop the uid computed by generate_grid_search before recomputing,
    # otherwise the stale uid would leak into the new identity.
    config.pop('uid')
    config['uid'] = compute_identity(config, IDENTITY_SIZE)
    return [config]

def test_doc(name, i, j):
    a_doc = copy.copy(defaults)
    a_doc[name] = int(i)
    a_doc['variable'] = name
    a_doc['repetition'] = j
    a_doc['uid'] = compute_identity(a_doc, 16)
    a_doc.pop('repetition')
    a_doc.pop('variable')
    return a_doc

def init(self, model, uid=None):
    self.model.init(**model)

    # Get a unique identifier for this configuration
    if uid is None:
        uid = compute_identity(model, size=16)

    # Broadcast a signal that the model is ready,
    # so we can set up logging, data, etc.
    self.metrics.new_trial(model, uid)

def generate_hyperband(budget, fidelity, search_space, seeds):
    configs = []
    # TODO: Compute the budget based on the cumulative number of epochs.
    #       Allow infinite repetitions and stop when the corresponding
    #       budget is reached.
    for seed in seeds:
        config = {'name': 'hyperband', 'seed': seed}
        config['uid'] = compute_identity(config, IDENTITY_SIZE)
        config['namespace'] = f'hyperband-s-{seed}'
        configs.append(config)
    # Hyperband is disabled until the budget computation above is done.
    return []  # configs

def test_doc(name, i):
    a_doc = copy.copy(defaults)
    if name == 'reference':
        a_doc['_repetition'] = i
    else:
        a_doc[name] = i
    a_doc['_variable'] = name
    a_doc['uid'] = compute_identity(a_doc, 16)
    if name == 'reference':
        a_doc.pop('_repetition')
    a_doc.pop('_variable')
    return a_doc

def test_doc(name, i, j, interrupt):
    a_doc = copy.copy(defaults)
    a_doc[name] = i
    if interrupt:
        a_doc['_interrupt'] = True
    a_doc['variable'] = name
    a_doc['repetition'] = j
    a_doc.pop('uid', None)
    a_doc['uid'] = compute_identity(a_doc, 16)
    a_doc.pop('repetition')
    a_doc.pop('variable')
    return a_doc

def test_generate_biased_replicates_last_epoch():
    space = {
        'a': 'uniform(lower=-1, upper=1)',
        'b': 'uniform(lower=-1, upper=1)',
        'c': 'uniform(lower=-1, upper=1)'
    }
    variables = {'d': 2, 'e': 1}
    defaults = {'d': 1, 'e': 2}
    seed = 2
    num_experiments = 5
    hpo = 'random_search'
    objective = 'obj'
    num_replicates = 10
    fidelity = Fidelity(1, 1, name='epoch').to_dict()
    surrogate_budget = 10
    hpo_budget = 5

    configs = generate_hpos(
        list(range(num_experiments)), [hpo], budget=surrogate_budget,
        fidelity=fidelity, search_space=space, namespace=NAMESPACE,
        defaults=defaults)
    randomize_seeds(configs['random_search'], variables, seed)

    data = build_data(surrogate_budget, variables, defaults, space)
    replicates = generate_biased_replicates(
        data,
        configs['random_search'][f'{NAMESPACE}-random-search-s-0'],
        variables, objective, num_replicates, hpo_budget,
        early_stopping=False)

    best_trial_index = 6
    rng = numpy.random.RandomState(
        configs['random_search'][f'{NAMESPACE}-random-search-s-0']['seed'])
    for replicate in replicates:
        should_be = copy.deepcopy(defaults)
        for param in space.keys():
            assert replicate[param] == float(
                data.sel(order=best_trial_index)[param].values)
            should_be[param] = replicate[param]
        for variable in variables:
            assert replicate[variable] == rng.randint(2**30)
            should_be[variable] = replicate[variable]
        assert replicate['uid'] == compute_identity(should_be, IDENTITY_SIZE)

def randomize_seeds(configs, variables, seed, compute_id=False):
    rng = numpy.random.RandomState(seed)
    for config in configs.values():
        config.setdefault('defaults', {})
        for variable in variables:
            config['defaults'][variable] = rng.randint(2**30)
        if compute_id:
            # Drop any stale uid before computing the new identity.
            config['defaults'].pop('uid', None)
            config['defaults']['uid'] = compute_identity(
                config['defaults'], IDENTITY_SIZE)

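# Determinism sketch for randomize_seeds(): a RandomState built from the same
# `seed` always yields the same sequence, so the per-variable seeds (and the
# uids derived from them) are reproducible across runs, e.g.:
#
#   rng = numpy.random.RandomState(2)
#   rng.randint(2**30)  # same first value on every run for seed=2
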
def generate(num_experiments, num_repro, objective, variables, defaults,
             resumable):
    # TODO: Add a resume test as well:
    #       run 5 times to completion,
    #       run 5 times interrupted halfway then resumed.
    # NOTE: Make the uid depend on the repetition number, otherwise there
    #       will be collisions in checkpoints.
    # NOTE: Set the checkpointer buffer to 0 to make sure checkpoints are
    #       written.
    # NOTE: Not all tasks need checkpoints. Do not launch checkpoint tests
    #       for them.
    configs = dict()
    for variable in variables:
        configs[variable] = []
        for seed in range(1, num_experiments + 1):
            for repetition in range(1, num_repro + 1):
                kwargs = copy.copy(defaults)
                kwargs[variable] = int(seed)
                kwargs['variable'] = variable
                kwargs['repetition'] = repetition
                uid = compute_identity(kwargs, IDENTITY_SIZE)
                kwargs.pop('repetition')
                kwargs.pop('variable')
                kwargs['uid'] = uid
                configs[variable].append(kwargs)

                if resumable:
                    # Duplicate the config with an interrupt flag; the flag
                    # changes the uid, so the interrupted run gets its own
                    # checkpoint (see sketch below).
                    kwargs = copy.copy(kwargs)
                    kwargs['_interrupt'] = True
                    kwargs['variable'] = variable
                    kwargs['repetition'] = repetition
                    kwargs.pop('uid')
                    uid = compute_identity(kwargs, IDENTITY_SIZE)
                    kwargs.pop('repetition')
                    kwargs.pop('variable')
                    kwargs['uid'] = uid
                    configs[variable].append(kwargs)

    return configs

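# Sketch of the resumable duplication above (hypothetical values): the
# '_interrupt' flag is part of the identity, so the interrupted copy gets a
# distinct uid and therefore a separate checkpoint:
#
#   base = {'lr': 0.01, 'variable': 'seed', 'repetition': 1}
#   interrupted = dict(base, _interrupt=True)
#   # compute_identity(base, IDENTITY_SIZE)
#   #     != compute_identity(interrupted, IDENTITY_SIZE)
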
def generate_random_search(budget, fidelity, search_space, seeds):
    configs = []
    for seed in seeds:
        config = {'name': 'random_search', 'seed': seed, 'pool_size': 20}
        config['namespace'] = f'random-search-s-{seed}'
        config['count'] = budget
        config['fidelity'] = fidelity
        config['space'] = search_space
        config['uid'] = compute_identity(config, IDENTITY_SIZE)
        configs.append(config)
    return configs

def generate_hpos(seeds, hpos, budget, fidelity, search_space, namespace,
                  defaults):
    configs = dict()
    for hpo in hpos:
        configs[hpo] = dict()
        hpo_configs = generate_hpo_configs[hpo](budget, fidelity,
                                                search_space, seeds)
        for config in hpo_configs:
            config['namespace'] = env(namespace, config['namespace'])
            config['defaults'] = copy.deepcopy(defaults)
            # The namespace and defaults changed, so drop the stale uid and
            # recompute the identity.
            config.pop('uid')
            config['uid'] = compute_identity(config, IDENTITY_SIZE)
            configs[hpo][config['namespace']] = config
    return configs

def generate_noisy_grid_search(budget, fidelity, search_space, seeds):
    configs = []
    for seed in seeds:
        seed_configs = generate_grid_search(budget, fidelity, search_space,
                                            [])
        for config in seed_configs:
            config['name'] = 'noisy_grid_search'
            config['seed'] = seed
            config['count'] = budget
            config['namespace'] = (
                f'noisy-grid-search-p-{config["n_points"]}-s-{seed}')
            config.pop('uid')
            config['uid'] = compute_identity(config, IDENTITY_SIZE)
            configs.append(config)
    return configs

def limit_to_var(configs, ref_config, var):
    new_configs = []
    for config in configs:
        # Make sure we have the HPs from config
        new_config = copy.deepcopy(config)
        # But update variables with default values
        new_config.update(ref_config)
        # And bring back the single var we want to vary
        new_config[var] = config[var]
        # Update the corresponding uid
        new_config.pop('uid', None)
        new_config['uid'] = compute_identity(new_config, IDENTITY_SIZE)
        new_configs.append(new_config)
    return new_configs

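# Usage sketch for limit_to_var() (hypothetical dicts): with
#
#   configs = [{'lr': 0.1, 'seed': 7, 'uid': '...'}]
#   ref_config = {'lr': 0.01, 'seed': 0}
#   limit_to_var(configs, ref_config, 'seed')
#
# every key is reset to its reference value except 'seed', which keeps the
# value from each config, and the uid is recomputed accordingly.
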
def generate_grid_search(budget, fidelity, search_space, seeds):
    # Grid search is deterministic, so the seeds argument is ignored; it is
    # kept for interface uniformity with the other generators.
    configs = []
    dim = len(search_space)
    # Find the smallest number of points per dimension whose full grid
    # covers the budget.
    n_points = 2
    while n_points ** dim < budget:
        n_points += 1
    config = {'name': 'grid_search', 'n_points': n_points, 'seed': 1,
              'pool_size': 20}
    config['namespace'] = f'grid-search-p-{n_points}'
    config['space'] = search_space
    config['fidelity'] = fidelity
    config['uid'] = compute_identity(config, IDENTITY_SIZE)
    configs.append(config)
    return configs

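# Worked example of the n_points search above: with 4 dimensions and a budget
# of 200, 2**4 = 16 and 3**4 = 81 fall short, while 4**4 = 256 >= 200, so
# n_points = 4 (this is what test_generate_grid_search below asserts).
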
def create_and_register_new_point(self, point, **variables):
    sample = dict()
    for i, (name, dim) in enumerate(self.orion_space.items()):
        sample[name] = point[i]
        if dim.prior_name == 'reciprocal':
            sample[name] = numpy.exp(sample[name])
    sample.update(variables)
    sample = unflatten(sample)
    sample[self.identity] = compute_identity(sample,
                                             self.space._identity_size)
    trial = Trial(sample)
    self.trials[sample[self.identity]] = trial
    return sample, trial

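# Note on the exp() above: a 'reciprocal' (log-uniform) dimension is
# presumably optimized in log space, so the raw point value is mapped back
# with numpy.exp. A hypothetical illustration:
#
#   log_lr = numpy.log(1e-3)   # the value the optimizer manipulates
#   lr = numpy.exp(log_lr)     # recovered hyperparameter, 1e-3
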
def test_generate_simulated_replicates():
    num_replicates = 10
    fake_simulated_replicates = []
    variables = ['d', 'e']
    for i in range(num_replicates):
        replicate = {'a': i, 'b': i, 'c': i, 'd': 1, 'e': 1}
        replicate['uid'] = compute_identity(replicate, IDENTITY_SIZE)
        fake_simulated_replicates.append(replicate)

    replicates = generate_simulated_replicates(
        fake_simulated_replicates, {'seed': 1}, variables)

    assert len(replicates) == num_replicates
    for fix_replicate, var_replicate in zip(fake_simulated_replicates,
                                            replicates):
        # The HPs are preserved while the variables are re-randomized.
        assert fix_replicate['a'] == var_replicate['a']
        assert fix_replicate['b'] == var_replicate['b']
        assert fix_replicate['c'] == var_replicate['c']
        assert fix_replicate['d'] != var_replicate['d']
        assert fix_replicate['e'] != var_replicate['e']

def test_generate_random_search():
    defaults = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    budget = 200
    fidelity = 'fidelity(1, 10)'
    num_experiments = 10
    search_space = {
        'a': 'uniform(-1, 1)',
        'b': 'uniform(-1, 1)',
        'c': 'uniform(-1, 1)',
        'lr': 'uniform(-1, 1)'
    }
    configs = generate_random_search(budget, fidelity, search_space,
                                     range(num_experiments))
    assert len(configs) == num_experiments
    for i in range(num_experiments):
        assert configs[i]['name'] == 'random_search'
        assert configs[i]['space'] == search_space
        assert configs[i]['seed'] == i
        assert configs[i]['namespace'] == f'random-search-s-{i}'
        assert configs[i].pop('uid') == compute_identity(configs[i], 16)

def test_generate_grid_search():
    defaults = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
    budget = 200
    fidelity = 'fidelity(1, 10)'
    num_experiments = 10
    search_space = {
        'a': 'uniform(-1, 1)',
        'b': 'uniform(-1, 1)',
        'c': 'uniform(-1, 1)',
        'd': 'uniform(-1, 1)'
    }
    configs = generate_grid_search(budget, fidelity, search_space,
                                   range(num_experiments))
    assert len(configs) == 1
    assert configs[0]['name'] == 'grid_search'
    assert configs[0]['n_points'] == 4
    assert configs[0]['space'] == search_space
    assert configs[0]['seed'] == 1
    assert configs[0]['namespace'] == 'grid-search-p-4'
    assert configs[0].pop('uid') == compute_identity(configs[0], 16)

def build_data(budget, variables, defaults, space):
    epochs = 5
    objectives = numpy.arange(budget * (epochs + 1))
    numpy.random.RandomState(0).shuffle(objectives)
    objectives = objectives.reshape((epochs + 1, budget, 1))
    params = Space.from_dict(space).sample(budget, seed=1)

    trials = OrderedDict()
    for trial_params in params:
        config = copy.deepcopy(
            dict(list(variables.items()) + list(defaults.items())))
        config.update(trial_params)
        config['uid'] = compute_identity(config, IDENTITY_SIZE)
        # NOTE: We don't need objectives here.
        trials[config['uid']] = Trial(config)

    metrics = dict()
    for trial_i, trial_uid in enumerate(trials.keys()):
        metrics[trial_uid] = [{
            'epoch': i,
            'obj': objectives[i, trial_i, 0]
        } for i in range(epochs + 1)]

    param_names = list(sorted(space.keys()))
    return create_valid_curves_xarray(trials, metrics,
                                      sorted(variables.keys()), epochs,
                                      param_names, seed=1)

def test_randomize_seeds():
    space = {
        'a': 'uniform(lower=-1, upper=1)',
        'b': 'uniform(lower=-1, upper=1)',
        'c': 'uniform(lower=-1, upper=1)'
    }
    variables = ['d', 'e']
    defaults = {}
    seed = 2
    num_experiments = 5
    hpo = 'random_search'
    fidelity = Fidelity(1, 1, name='epoch').to_dict()
    configs = generate_hpos(
        list(range(num_experiments)), [hpo], budget=200, fidelity=fidelity,
        search_space=space, namespace=NAMESPACE, defaults=defaults)

    randomize_seeds(configs['random_search'], variables, seed,
                    compute_id=True)
    rng = numpy.random.RandomState(seed)
    for config in configs['random_search'].values():
        for variable in variables:
            assert config['defaults'][variable] == rng.randint(2**30)
        uid = config['defaults'].pop('uid')
        assert uid == compute_identity(config['defaults'], IDENTITY_SIZE)

    randomize_seeds(configs['random_search'], variables, seed,
                    compute_id=False)
    rng = numpy.random.RandomState(seed)
    for config in configs['random_search'].values():
        for variable in variables:
            assert config['defaults'][variable] == rng.randint(2**30)
        assert 'uid' not in config['defaults']

def generate_bayesopt(budget, fidelity, search_space, seeds):
    configs = []
    for seed in seeds:
        # Derive the four component seeds deterministically from the
        # experiment seed.
        rng = numpy.random.RandomState(seed)
        config = {
            'name': 'robo',
            'model_type': 'gp_mcmc',
            'maximizer': 'random',
            'n_init': 20,
            'count': budget,
            'acquisition_func': 'log_ei',
            'model_seed': rng.randint(2**30),
            'prior_seed': rng.randint(2**30),
            'init_seed': rng.randint(2**30),
            'maximizer_seed': rng.randint(2**30)
        }
        config['fidelity'] = fidelity
        config['namespace'] = f'bayesopt-s-{seed}'
        config['space'] = search_space
        config['uid'] = compute_identity(config, IDENTITY_SIZE)
        configs.append(config)
    return configs

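# The draw order above is part of the contract: a verifier can re-derive the
# seeds by consuming the RandomState in the same order, exactly as
# test_generate_bayesopt does:
#
#   rng = numpy.random.RandomState(seed)
#   expected = [rng.randint(2**30) for _ in range(4)]
#   # == [model_seed, prior_seed, init_seed, maximizer_seed]
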
def sample(self, count=1, **variables):
    samples = []
    submitted_count = len(self.trials)
    for point in self.grid[submitted_count:submitted_count + count]:
        sample = dict(zip(self.orion_space.keys(), point))
        sample.update(variables)
        sample = unflatten(sample)
        sample[self.identity] = compute_identity(sample,
                                                 self.space._identity_size)
        samples.append(sample)
        self.seed_time += 1

    # Register all the samples
    trials = []
    for s in samples:
        t = Trial(s)
        trials.append(t)
        self.trials[s[self.identity]] = t

    self.new_trials(trials)
    return samples

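# Note on the slicing above: len(self.trials) tracks how many grid points
# were already handed out, so consecutive sample() calls walk the grid
# without repetition, e.g. sample(2) then sample(3) covers grid[0:5].
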
def reset_pool_size(configs):
    for config in configs.values():
        config['pool_size'] = None
        config.pop('uid', None)
        config['uid'] = compute_identity(config, IDENTITY_SIZE)

def main():
    from sspace.space import compute_identity

    args = arguments()
    tickers = [
        # 1     2      3      4       5      6     7      8      9      10
        'MO',  'AEP', 'BA',  'BMY',  'CPB', 'CAT', 'CVX', 'KO',  'CL',  'COP',   # 1
        'ED',  'CVS', 'DHI', 'DHR',  'DRI', 'DE',  'D',   'DTE', 'ETN', 'EBAY',  # 2
        'F',   'BEN', 'HSY', 'HBAN', 'IBM', 'K',   'GIS', 'MSI', 'NSC', 'TXN'
    ]
    start, end = '2000-01-01', '2019-05-10'

    device = fetch_device()
    task = finance_baseline(tickers, start, end, args.optimizer,
                            args.batch_size, device, args.window)

    lr = 1e-8
    uid = compute_identity(
        dict(tickers=tickers, start=start, end=end, window=args.window,
             lr=lr, epochs=args.epochs), 16)

    if args.uri is not None:
        logger = metric_logger(args.uri, args.database,
                               f'{DEFAULT_EXP_NAME}_{uid}')
        task.metrics.append(logger)

    if args.storage is not None:
        storage = StateStorage(
            folder=option('state.storage', '/home/setepenre/zshare/tmp'))
        task.metrics.append(
            CheckPointer(storage=storage, time_buffer=5,
                         keep_best='validation_loss', save_init=True))

    optimizer = task.optimizer.defaults
    optimizer['lr'] = lr
    task.init(optimizer=optimizer, uid=uid)
    task.fit(args.epochs)

    stats = task.metrics.value()
    print(stats)
    return float(stats['validation_loss'])