def test_adding_ideal_exclusions(datasets_db):
    """Test that adding ideal exclusions to single phase datasets works"""
    datasets_db.insert(dataset_single_valid)
    datasets_db.insert(dataset_multi_valid_ternary)
    assert len(datasets_db.all()) == 2
    for ds in datasets_db.all():
        assert "excluded_model_contributions" not in ds
    add_ideal_exclusions(datasets_db)
    assert len(datasets_db.all()) == 2
    num_with_excluded_mod_contributions = 0
    for ds in datasets_db.all():
        if "excluded_model_contributions" in ds:
            num_with_excluded_mod_contributions += 1
            assert ds["excluded_model_contributions"] == ["idmix"]
    assert num_with_excluded_mod_contributions == 1
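
# A minimal sketch (hypothetical, abbreviated dataset dicts; real ESPEI
# datasets also carry components, conditions, values, and a reference) of the
# state the test above asserts: after ``add_ideal_exclusions`` runs, the
# single-phase dataset carries ``excluded_model_contributions == ["idmix"]``
# while the multi-phase (ZPF) dataset is left untouched.
def _sketch_ideal_exclusion_effect():
    single_phase_ds = {"phases": ["CUMG2"], "output": "HM_MIX"}         # hypothetical
    multi_phase_ds = {"phases": ["FCC_A1", "LIQUID"], "output": "ZPF"}  # hypothetical
    # Expected effect of add_ideal_exclusions on these two datasets:
    single_phase_ds["excluded_model_contributions"] = ["idmix"]
    assert "excluded_model_contributions" not in multi_phase_ds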
def run_espei(run_settings):
    """Wrapper around the ESPEI fitting procedure, taking only a settings dictionary.

    Parameters
    ----------
    run_settings : dict
        Dictionary of input settings

    Returns
    -------
    Either a Database (for generate parameters only) or a tuple of (Database, sampler)
    """
    run_settings = get_run_settings(run_settings)
    system_settings = run_settings['system']
    output_settings = run_settings['output']
    generate_parameters_settings = run_settings.get('generate_parameters')
    mcmc_settings = run_settings.get('mcmc')

    # handle verbosity
    verbosity = {
        0: logging.WARNING,
        1: logging.INFO,
        2: TRACE,
        3: logging.DEBUG,
    }
    logging.basicConfig(level=verbosity[output_settings['verbosity']], filename=output_settings['logfile'])

    log_version_info()

    # load datasets and handle i/o
    logging.log(TRACE, 'Loading and checking datasets.')
    dataset_path = system_settings['datasets']
    datasets = load_datasets(sorted(recursive_glob(dataset_path, '*.json')))
    if len(datasets.all()) == 0:
        logging.warning('No datasets were found in the path {}. This should be a directory containing dataset files ending in `.json`.'.format(dataset_path))
    apply_tags(datasets, system_settings.get('tags', dict()))
    add_ideal_exclusions(datasets)
    logging.log(TRACE, 'Finished checking datasets')

    with open(system_settings['phase_models']) as fp:
        phase_models = json.load(fp)

    if generate_parameters_settings is not None:
        refdata = generate_parameters_settings['ref_state']
        excess_model = generate_parameters_settings['excess_model']
        ridge_alpha = generate_parameters_settings['ridge_alpha']
        aicc_penalty = generate_parameters_settings['aicc_penalty_factor']
        input_dbf = generate_parameters_settings.get('input_db', None)
        if input_dbf is not None:
            input_dbf = Database(input_dbf)
        dbf = generate_parameters(
            phase_models, datasets, refdata, excess_model,
            ridge_alpha=ridge_alpha, dbf=input_dbf,
            aicc_penalty_factor=aicc_penalty,
        )
        dbf.to_file(output_settings['output_db'], if_exists='overwrite')

    if mcmc_settings is not None:
        tracefile = output_settings['tracefile']
        probfile = output_settings['probfile']
        # check that the MCMC output files do not already exist
        # only matters if we are actually running MCMC
        if os.path.exists(tracefile):
            raise OSError('Tracefile "{}" exists and would be overwritten by a new run. Use the ``output.tracefile`` setting to set a different name.'.format(tracefile))
        if os.path.exists(probfile):
            raise OSError('Probfile "{}" exists and would be overwritten by a new run. Use the ``output.probfile`` setting to set a different name.'.format(probfile))

        # scheduler setup
        if mcmc_settings['scheduler'] == 'dask':
            _raise_dask_work_stealing()  # check for work-stealing
            from distributed import LocalCluster
            cores = mcmc_settings.get('cores', multiprocessing.cpu_count())
            if cores > multiprocessing.cpu_count():
                cores = multiprocessing.cpu_count()
                logging.warning("The number of cores chosen is larger than available. "
                                "Defaulting to run on the {} available cores.".format(cores))
            # TODO: make dask-scheduler-verbosity a YAML input so that users can debug.
            #       Should have the same log levels as verbosity.
            scheduler = LocalCluster(n_workers=cores, threads_per_worker=1, processes=True, memory_limit=0)
            client = ImmediateClient(scheduler)
            client.run(logging.basicConfig, level=verbosity[output_settings['verbosity']], filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" % (scheduler, sum(client.ncores().values())))
            try:
                bokeh_server_info = client.scheduler_info()['services']['bokeh']
                logging.info("bokeh server for dask scheduler at localhost:{}".format(bokeh_server_info))
            except KeyError:
                logging.info("Install bokeh to use the dask bokeh server.")
        elif mcmc_settings['scheduler'] == 'None':
            client = None
            logging.info("Not using a parallel scheduler. ESPEI is running MCMC on a single core.")
        else:  # we were passed a scheduler file name
            _raise_dask_work_stealing()  # check for work-stealing
            client = ImmediateClient(scheduler_file=mcmc_settings['scheduler'])
            client.run(logging.basicConfig, level=verbosity[output_settings['verbosity']], filename=output_settings['logfile'])
            logging.info("Running with dask scheduler: %s [%s cores]" % (client.scheduler, sum(client.ncores().values())))

        # get a Database
        if mcmc_settings.get('input_db'):
            dbf = Database(mcmc_settings.get('input_db'))

        # load the restart trace if needed
        if mcmc_settings.get('restart_trace'):
            restart_trace = np.load(mcmc_settings.get('restart_trace'))
        else:
            restart_trace = None

        # load the remaining mcmc fitting parameters
        iterations = mcmc_settings.get('iterations')
        save_interval = mcmc_settings.get('save_interval')
        chains_per_parameter = mcmc_settings.get('chains_per_parameter')
        chain_std_deviation = mcmc_settings.get('chain_std_deviation')
        deterministic = mcmc_settings.get('deterministic')
        prior = mcmc_settings.get('prior')
        data_weights = mcmc_settings.get('data_weights')
        syms = mcmc_settings.get('symbols')

        # set up and run the EmceeOptimizer
        optimizer = EmceeOptimizer(dbf, scheduler=client)
        optimizer.save_interval = save_interval
        all_symbols = syms if syms is not None else database_symbols_to_fit(dbf)
        optimizer.fit(all_symbols, datasets, prior=prior, iterations=iterations,
                      chains_per_parameter=chains_per_parameter,
                      chain_std_deviation=chain_std_deviation,
                      deterministic=deterministic, restart_trace=restart_trace,
                      tracefile=tracefile, probfile=probfile,
                      mcmc_data_weights=data_weights)
        optimizer.commit()
        optimizer.dbf.to_file(output_settings['output_db'], if_exists='overwrite')
        # close the scheduler, if possible
        if hasattr(client, 'close'):
            client.close()
        return optimizer.dbf, optimizer.sampler
    return dbf
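
# A minimal usage sketch of calling ``run_espei`` directly with a settings
# dictionary (the same structure as an ESPEI YAML input file). The file and
# directory names below are hypothetical; ``get_run_settings`` is expected to
# validate the dictionary and fill in defaults for omitted keys. With only a
# ``generate_parameters`` block, ``run_espei`` returns a pycalphad Database;
# adding an ``mcmc`` block would return a ``(Database, sampler)`` tuple instead.
if __name__ == '__main__':
    example_settings = {
        'system': {
            'phase_models': 'my-phases.json',  # hypothetical phase models file
            'datasets': 'my-input-datasets',   # hypothetical dataset directory
        },
        'generate_parameters': {
            'excess_model': 'linear',
            'ref_state': 'SGTE91',
        },
        'output': {
            'output_db': 'my-tdb.tdb',         # hypothetical output database
        },
    }
    dbf = run_espei(example_settings)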